## shop7 買取ホムラ

In [1]:
from pathlib import Path
import pandas as pd
import os, re, json, pathlib
from typing import Dict, Optional, List, Iterable, Union
from django.utils import timezone

_AIR_PAT = re.compile(r"(iPhone)\s*(Air)(?:\s*(Pro\s*Max|Pro|Plus|mini))?", re.I)

_NUM_MODEL_PAT = re.compile(r"(iPhone)\s*(\d{2})(?:\s*(Pro\s*Max|Pro|Plus|mini))?", re.I)

def _parse_capacity_gb(text: str) -> Optional[int]:
    if not text:
        return None
    t = str(text)
    m = re.search(r"(\d+(?:\.\d+)?)\s*TB", t, flags=re.I)
    if m:
        return int(round(float(m.group(1)) * 1024))
    m = re.search(r"(\d{2,4})\s*GB", t, flags=re.I)
    if m:
        return int(m.group(1))
    return None

def to_int_yen(s: object) -> Optional[int]:
    if s is None: return None
    txt = str(s).strip()
    if not re.search(r"\d", txt): return None
    # 范围 "105,000～110,000"
    parts = re.split(r"[~～\-–—]", txt)
    candidates = []
    for p in parts:
        # 排除 12-14 位纯数字（像 JAN/电话）
        if re.fullmatch(r"\d{12,14}", p.strip()):
            continue
        digits = re.sub(r"[^\d万]", "", p)
        if not digits:
            continue
        if "万" in digits:
            m = re.search(r"([\d\.]+)万", digits)
            base = float(m.group(1)) if m else 0.0
            candidates.append(int(base * 10000))
        else:
            candidates.append(int(re.sub(r"[^\d]", "", digits)))
    if not candidates:
        return None
    val = max(candidates)
    # 合理区间过滤
    if val < 1000 or val > 5_000_000:
        return None
    return val

def _norm(s: str) -> str:
    return (s or "").strip()


def _price_from_shop7(x: object) -> Optional[int]:
    """data2 -> price_new：去掉“新品/未開封/货币符号/逗号”，区间取最大"""
    if x is None:
        return None
    s = str(x)
    s = s.replace("新品", "").replace("新\u54c1", "")
    s = s.replace("未開封", "").replace("未开封", "")
    return to_int_yen(s)



def _load_iphone17_info_df_for_shop2() -> pd.DataFrame:
    """
    读取 AppleStockChecker/data/iphone17_info.csv 或 settings / env 指定的路径。
    输出列：part_number, model_name_norm, capacity_gb
    """
    path = "/Users/syu/PycharmProjects/YamagotiProjects/AppleStockChecker/data/iphone17_info.csv"
    pth = Path(path)
    if not pth.exists():
        raise FileNotFoundError(f"未找到 iphone17_info：{pth}")

    if re.search(r"\.(xlsx|xlsm|xls|ods)$", str(pth), re.I):
        df = pd.read_excel(pth)
    else:
        df = pd.read_csv(pth, encoding="utf-8-sig")

    need = {"part_number", "model_name", "capacity_gb","color"}
    missing = need - set(df.columns)
    if missing:
        raise ValueError(f"iphone17_info 缺少必要列：{missing}")

    df = df.copy()
    # df["model_name_norm"] = df["model_name"].map(_normalize_model_generic)
    df["capacity_gb"] = pd.to_numeric(df["capacity_gb"], errors="coerce").astype("Int64")
    df = df.dropna(subset=["model_name", "capacity_gb", "part_number","color"])
    return df[["part_number", "model_name", "capacity_gb","color"]]

def _normalize_model_generic(text: str) -> str:
    """
    统一型号主体：
      - iPhone17/16 + 后缀（Pro/Pro Max/Plus/mini）
      - iPhone Air（含“17 air”→ Air）
      - 允许紧凑写法：17pro / 17promax / 16Pro / 16Plus ...
    输出：'iPhone 17 Pro Max' / 'iPhone 17 Pro' / 'iPhone Air' / ...
    """
    if not text:
        return ""
    t = str(text).replace("\u3000", " ")
    t = re.sub(r"\s+", " ", t)

    # 日文别名到英文
    t = (t.replace("プロマックス", "Pro Max")
           .replace("プロ", "Pro")
           .replace("プラス", "Plus")
           .replace("ミニ", "mini")
           .replace("エアー", "Air")
           .replace("エア", "Air"))

    # ❗ 在“数字后立即跟英文”的位置补一个空格：17pro -> 17 pro
    t = re.sub(r"(\d{2})(?=[A-Za-z])", r"\1 ", t)

    # 标准化大小写/形态：pro-max / ProMax / promáx → Pro Max；pro → Pro；plus → Plus；mini → mini
    t = re.sub(r"(?i)\bpro\s*max\b", "Pro Max", t)
    t = re.sub(r"(?i)\bpro\b", "Pro", t)
    t = re.sub(r"(?i)\bplus\b", "Plus", t)
    t = re.sub(r"(?i)\bmini\b", "mini", t)

    # 若没有 iPhone 前缀但出现纯数字代号，补上
    if "iPhone" not in t and re.search(r"\b1[0-9]\b", t):
        t = re.sub(r"\b(1[0-9])\b", r"iPhone \1", t, count=1)

    # 特例：'17 air' → iPhone Air（防止被当成 iPhone 17）
    t = re.sub(r"(?i)\biPhone\s+17\s+Air\b", "iPhone Air", t)

    # 去容量/SIM/括号噪声
    t = re.sub(r"(\d+(?:\.\d+)?\s*TB|\d{2,4}\s*GB)", "", t, flags=re.I)
    t = re.sub(r"SIMフリ[ーｰ–-]?|シムフリ[ーｰ–-]?|sim\s*free", "", t, flags=re.I)
    t = re.sub(r"[（）\(\)\[\]【】].*?[（）\(\)\[\]【】]", "", t)
    t = re.sub(r"\s+", " ", t).strip()

    # 1) 数字代号机型
    m = _NUM_MODEL_PAT.search(t)
    if m:
        base = f"{m.group(1)} {m.group(2)}"
        suf  = (m.group(3) or "").strip()
        return f"{base} {suf}".strip()

    # 2) Air
    m2 = _AIR_PAT.search(t)
    if m2:
        # 当前返回主体 'iPhone Air'；若以后真有 Air Plus 等可在此扩展
        return "iPhone Air"

    return ""


# def _norm_model_for_shop7(text: str) -> str:
#     """
#     在 _normalize_model_generic 之前做一点“shop7 特有”的宽松处理：
#       - ‘promax/ProMax/pro-max’ → ‘Pro Max’
#       - ‘17 air’ → ‘iPhone Air’
#       - 没有 iPhone 前缀但有 '17' 的，补成 ‘iPhone 17 ...’
#     然后交给 _normalize_model_generic 做最终归一。
#     """
#     if not text:
#         return ""
#     t = str(text).replace("\u3000", " ")
#     t = re.sub(r"\s+", " ", t)
#
#     # 日文/英文后缀标准化
#     t = (t.replace("プロマックス", "Pro Max")
#            .replace("プロ", "Pro")
#            .replace("プラス", "Plus")
#            .replace("ミニ", "mini")
#            .replace("エアー", "Air")
#            .replace("エア", "Air"))
#
#     # promax 连写/大小写
#     t = re.sub(r"(?i)pro[-\s]?max", "Pro Max", t)
#
#     # 若没有 iPhone 前缀但出现 "17 air" / "17 pro max" / "17 pro" / "17 plus"
#     # 先把 "17 air" 显式改成 "iPhone Air"（Air 没有数字后缀）
#     if re.search(r"(?i)\b17\s+air\b", t):
#         # 去掉“17 ”，以免 _normalize_model_generic 误识别为 iPhone 17
#         t = re.sub(r"(?i)\b17\s+air\b", "iPhone Air", t)
#
#     # 若没有 iPhone 单词但有纯数字代号（例如 "17 Pro Max 256GB"）
#     if "iPhone" not in t and re.search(r"\b1[0-9]\b", t):
#         t = re.sub(r"\b(1[0-9])\b", r"iPhone \1", t, count=1)
#
#     return _normalize_model_generic(t)


In [2]:
import re
import time
from typing import Dict, List, Tuple, Optional

def clean_shop7(df: pd.DataFrame) -> pd.DataFrame:
    print("DEBUG: shop7:買取ホムラ ----------> 进入清洗器 时间:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    _SHORT_MODEL_REPLACEMENTS = [
    (re.compile(r'(?i)\b17\s*pro\s*max\b'), "iPhone 17 Pro Max"),
    (re.compile(r'(?i)\b17promax\b'), "iPhone 17 Pro Max"),
    (re.compile(r'(?i)\b17\s*pro\b'), "iPhone 17 Pro"),
    (re.compile(r'(?i)\b17pro\b'), "iPhone 17 Pro"),
    (re.compile(r'(?i)\b17\s*air\b'), "iPhone 17 Air"),
    (re.compile(r'(?i)\b17air\b'), "iPhone 17 Air"),
    (re.compile(r'(?i)\bi\s*phone\s*17\b'), "iPhone 17"),
    (re.compile(r'(?i)\b17\b'), "iPhone 17"),  # 小心：放最后做兜底
]
    def _norm_model_for_shop7(s: Optional[str]) -> str:
        """
        针对 shop7 的 model 字段做宽松归一化：
          - 跳过纯数字的行（返回空字符串）
          - 将 '17pro', '17promax', '17 pro max' 等短写扩展为 'iPhone 17 Pro Max'
          - 最后调用 _normalize_model_generic 生成最终的归一字符串（与 info 表匹配）
        返回 '' 表示无法识别（将被跳过）
        """
        if s is None:
            return ""
        txt = str(s).strip()
        if not txt:
            return ""

        # 跳过行号/序号（仅数字或少量标点）
        if re.fullmatch(r'[\d\-\.\s]+', txt):
            # 代表这是序号/编号行，例如 "1", "2" 等
            return ""

        # 统一全角空格 & 多余空白
        txt = re.sub(r'[\u3000\s]+', ' ', txt).strip()

        # 先把常用短写替换成标准形式
        expanded = txt
        for patt, repl in _SHORT_MODEL_REPLACEMENTS:
            expanded = patt.sub(repl, expanded)

        # 可能存在像 "17pro 256GB" 或 "17promax 256GB" 这样的，
        # 上面的替换会把它变成 "iPhone 17 Pro 256GB" 之类，交给 normalize 处理。
        try:
            norm = _normalize_model_generic(expanded)
        except Exception:
            # 若 _normalize_model_generic 出错，退化为简单返回 expanded（再由上级判定是否匹配）
            norm = expanded

        # 如果 normalize 后为空或仅数字，认为无法识别
        if not norm or re.fullmatch(r'[\d\-\.\s]+', str(norm).strip()):
            return ""
        print(norm)
        return norm

    info_df = _load_iphone17_info_df_for_shop2()  # part_number, model_name_norm, capacity_gb, color

    # 必要列检查
    need_cols = ["data", "data2", "time-scraped"]
    for c in need_cols:
        if c not in df.columns:
            raise ValueError(f"shop7 清洗器缺少必要列：{c}")

    # 先把 time-scraped 为空的行排除，避免时间解析报错
    df = df.copy().reset_index(drop=True)
    mask_time_ok = df["time-scraped"].astype(str).str.strip().ne("") & df["time-scraped"].notna()
    df = df[mask_time_ok].reset_index(drop=True)
    if df.empty:
        print("DEBUG: 输入 df 为空或所有行 time-scraped 缺失，返回空 DataFrame")
        return pd.DataFrame(columns=["part_number","shop_name","price_new","recorded_at"])

    # data -> 机型&容量
    model_norm_series = df["data2"].map(_norm_model_for_shop7)
    print(model_norm_series)
    cap_gb_series     = df["data2"].map(_parse_capacity_gb)

    # 价格/时间（注意 data2 里是价格，我们仅在行有 price 时才处理该行）
    price_series  = df["data3"].map(_price_from_shop7)
    recorded_at   = df["time-scraped"]

    # ------------- 构建 (model_norm, cap) -> { color_norm: part_number } -------------
    info2 = info_df.copy()
    if "color" not in info2.columns:
        raise ValueError("info_df 缺少 'color' 列，无法进行颜色映射")
    info2["model_name_norm"] = info2["model_name"].map(_normalize_model_generic)
    info2["capacity_gb"] = pd.to_numeric(info2["capacity_gb"], errors="coerce").astype("Int64")
    info2["color_norm"] = info2["color"].map(lambda x: _norm(str(x)))

    pn_map: Dict[Tuple[str, int], Dict[str, str]] = {}
    for _, r in info2.iterrows():
        m = r["model_name_norm"]
        cap = r["capacity_gb"]
        col = r["color_norm"]
        pn = str(r["part_number"])
        if pd.isna(cap) or not m or not col:
            continue
        key = (m, int(cap))
        pn_map.setdefault(key, {})
        pn_map[key][col] = pn

    print(f"DEBUG: 建立了 pn_map，包含 {len(pn_map)} 个 (model,cap) 条目")

    # ----------------- 颜色减价解析函数（shop7 专用） -----------------
    DELTA_RE = re.compile(
        r"(?P<labels>[^\d¥￥円\+\-−－]+?)\s*(?P<sign>[+\-−－])\s*(?P<amount>[0-9０-９,，]+)",
        re.UNICODE
    )

    FW_TO_ASC = str.maketrans({
        "０":"0","１":"1","２":"2","３":"3","４":"4","５":"5","６":"6","７":"7","８":"8","９":"9",
        "，":",","．":".","－":"-","＋":"+","　":" "
    })

    def _to_int_amount(s: str) -> Optional[int]:
        if s is None:
            return None
        t = str(s).translate(FW_TO_ASC)
        m = re.search(r"([0-9][0-9,]*)", t)
        if not m:
            return None
        try:
            return int(m.group(1).replace(",", ""))
        except Exception:
            return None

    def _parse_color_deltas_shop7(text: str) -> Dict[str, int]:
        res: Dict[str, int] = {}
        if not text or not str(text).strip():
            return res
        s = str(text).strip()
        parts = []
        # 尝试先用 DELTA_RE 匹配整段里的所有有金额的片段；若没则按分隔符拆
        found = False
        for m in DELTA_RE.finditer(s):
            found = True
            labels_part = m.group("labels") or ""
            sign = m.group("sign") or "+"
            amt_txt = m.group("amount")
            amt = _to_int_amount(amt_txt)
            if amt is None:
                continue
            delta = -int(amt) if sign in ("-", "−", "－") else int(amt)
            # labels_part 可能包含多个颜色，用常见分隔符拆
            for tok in re.split(r"[／/、，,・\s]+", labels_part):
                tok = tok.strip()
                if not tok:
                    continue
                key = _norm(tok)
                res[key] = delta
        if not found:
            # 退化处理：没有显式金额匹配的情况下，尝试找类似 "シルバー/ディープブルー-3000" 形式
            # 以最后出现的 +/- 数字为金额，前面子串作为标签
            # 例如 "シルバー/ディープブルー-3000"
            m2 = re.search(r"(?P<labels>.+?)[\s]*([+\-−－])\s*(?P<amount>[0-9０-９,，]+)", s)
            if m2:
                labels_part = m2.group("labels") or ""
                sign = m2.group(2) or "+"
                amt_txt = m2.group("amount")
                amt = _to_int_amount(amt_txt)
                if amt is not None:
                    delta = -int(amt) if sign in ("-", "−", "－") else int(amt)
                    for tok in re.split(r"[／/、，,・\s]+", labels_part):
                        tok = tok.strip()
                        if tok:
                            res[_norm(tok)] = delta
        # Debug print for parsed deltas
        if res:
            print(f"DEBUG: 解析到 color deltas from '{text}': {res}")
        else:
            print(f"DEBUG: 未解析到 color deltas from '{text}'")
        return res

    # ----------------- 主循环：遍历含价格的行 -----------------
    rows: List[dict] = []
    n = len(df)
    for i in range(n):
        base_price = price_series.iat[i]
        if base_price is None:
            # 不是机种行（或者是下行的颜色行），跳过
            continue

        model_text_raw = df["data"].iat[i] if df["data"].iat[i] is not None else ""
        m = model_norm_series.iat[i]
        c = cap_gb_series.iat[i]
        t = recorded_at.iat[i]
        print(f"DEBUG: 处理行 i={i}, model_raw='{model_text_raw}', model_norm='{m}', cap='{c}', base_price={base_price}")

        if not m or pd.isna(c):
            print(f"DEBUG: 跳过 i={i} 因为 model/cap 缺失")
            continue
        c = int(c)
        key = (m, c)
        color_to_pn = pn_map.get(key)
        print(f"DEBUG: 对应的 color->pn 映射: {color_to_pn}")
        if not color_to_pn:
            print(f"DEBUG: info 表中未找到该机型容量 key={key}，跳过 i={i}")
            continue

        # 检查下一行是否为颜色减价行：下一行 data 非空 且 data2 为空
        deltas: Dict[str, int] = {}
        j = i + 1
        if j < n:
            nxt_data = df["data2"].iat[j] if df["data2"].iat[j] is not None else ""
            nxt_data2 = df["data3"].iat[j] if "data3" in df.columns and df["data2"].iat[j] is not None else ""
            if str(nxt_data).strip() and (str(nxt_data2).strip() == "" or pd.isna(nxt_data2)):
                print(f"DEBUG: 发现潜在颜色行 at i+1={j}: '{nxt_data}'")
                deltas = _parse_color_deltas_shop7(nxt_data)
            else:
                print(f"DEBUG: 下一行 i+1={j} 不是颜色行 (data='{nxt_data}' data2='{nxt_data2}')")

        # 生成每个颜色的 price_new
        for col_norm, pn in color_to_pn.items():
            delta = 0
            if deltas:
                if col_norm in deltas:
                    delta = deltas[col_norm]
                    print(f"DEBUG: 直接匹配 col_norm='{col_norm}' 得到 delta={delta} for pn={pn}")
                else:
                    # 进一步尝试在 info2 中查原始 color 文本匹配 deltas 的 key
                    matches = info2[
                        (info2["model_name_norm"] == m) &
                        (info2["capacity_gb"].astype("Int64") == c) &
                        (info2["part_number"].astype(str) == str(pn))
                    ]
                    raw_color = matches["color"].iat[0] if not matches.empty else ""
                    matched = False
                    for lbl_norm, dval in deltas.items():
                        # 尝试在 raw_color 的归一化字符串中查找 lbl_norm
                        if lbl_norm and lbl_norm in _norm(raw_color):
                            delta = dval
                            matched = True
                            print(f"DEBUG: 通过 raw_color='{raw_color}' 的归一化匹配 lbl='{lbl_norm}' -> delta={delta} for pn={pn}")
                            break
                        # 也尝试原文子串匹配
                        if lbl_norm and lbl_norm in raw_color:
                            delta = dval
                            matched = True
                            print(f"DEBUG: 通过 raw_color 原文匹配 lbl='{lbl_norm}' -> delta={delta} for pn={pn}")
                            break
                    if not matched and deltas:
                        print(f"DEBUG: 未匹配到颜色调整 for pn={pn} (raw_color='{raw_color}'), 使用 delta=0")

            price_final = int(base_price + delta)
            print(f"DEBUG: 输出 -> pn={pn}, base={base_price}, delta={delta}, final={price_final}")
            rows.append({
                "part_number": str(pn),
                "shop_name": "買取ホムラ",
                "price_new": price_final,
                "recorded_at": t,
            })

    out = pd.DataFrame(rows, columns=["part_number","shop_name","price_new","recorded_at"])
    if not out.empty:
        out = out.dropna(subset=["part_number","price_new"]).reset_index(drop=True)
        out["part_number"] = out["part_number"].astype(str)
        out["price_new"] = pd.to_numeric(out["price_new"], errors="coerce").astype("Int64")
    print("DEBUG: 完成 clean_shop7, 产出行数:", len(out))
    return out


In [3]:
df_7 = pd.read_csv("/Users/syu/PycharmProjects/YamagotiProjects/shop7.csv")
df_7

Unnamed: 0,web-scraper-order,data,data2,data3,data4,time-scraped
0,1761615275-1,1,【未開封】商品名,,,2025-10-28 10:34:35
1,1761615275-2,2,17pro 256GB,,,2025-10-28 10:34:35
2,1761615275-3,3,,,,2025-10-28 10:34:35
3,1761615275-4,4,17pro 512GB,,,2025-10-28 10:34:35
4,1761615275-5,5,,,,2025-10-28 10:34:35
...,...,...,...,...,...,...
127,1761615275-128,128,,,,2025-10-28 10:34:35
128,1761615275-129,149,SE3 64GB,,,2025-10-28 10:34:35
129,1761615275-130,150,,,,2025-10-28 10:34:35
130,1761615275-131,151,SE3 128GB,,,2025-10-28 10:34:35


In [4]:
res = clean_shop7(df_7)
res

DEBUG: shop7:買取ホムラ ----------> 进入清洗器 时间: 2025-10-28 10:38:04
iPhone 17 Pro
iPhone 17 Pro
iPhone 17 Pro
iPhone 17 Pro Max
iPhone 17 Pro Max
iPhone 17 Pro Max
iPhone 17 Pro Max
iPhone Air
iPhone Air
iPhone Air
iPhone 17
iPhone 17
iPhone 16 Pro
iPhone 16 Pro
iPhone 16 Pro
iPhone 16 Pro
iPhone 16 Pro Max
iPhone 16 Pro Max
iPhone 16 Pro Max
iPhone 16
iPhone 16
iPhone 16
iPhone 16 Plus
iPhone 16 Plus
iPhone 16 Plus
iPhone 16
iPhone 16
iPhone 16
iPhone 15 Pro
iPhone 15 Pro
iPhone 15 Pro
iPhone 15 Pro
iPhone 15 Pro Max
iPhone 15 Pro Max
iPhone 15 Pro Max
iPhone 15
iPhone 15
iPhone 15
iPhone 15 Plus
iPhone 15 Plus
iPhone 15 Plus
iPhone 14 Pro
iPhone 14 Pro
iPhone 14 Pro
iPhone 14 Pro
iPhone 14 Pro Max
iPhone 14 Pro Max
iPhone 14 Pro Max
iPhone 14 Pro Max
iPhone 14
iPhone 14
iPhone 14
iPhone 14 Plus
iPhone 14 Plus
iPhone 14 Plus
0                   
1      iPhone 17 Pro
2                   
3      iPhone 17 Pro
4                   
           ...      
127                 
128                 
1

Unnamed: 0,part_number,shop_name,price_new,recorded_at


In [54]:
def _build_color_map_shop14(info_df: pd.DataFrame) -> Dict[Tuple[str, int], Dict[str, Tuple[str, str]]]:
    """
    构建 (model_norm, cap_gb) -> { color_norm: (part_number, color_raw) }
    """
    df = info_df.copy()
    df["model_name_norm"] = df["model_name"].map(_normalize_model_generic)
    df["capacity_gb"] = pd.to_numeric(df["capacity_gb"], errors="coerce").astype("Int64")
    df["color_norm"] = df["color"].map(lambda x: _norm(str(x)))
    cmap: Dict[Tuple[str, int], Dict[str, Tuple[str, str]]] = {}
    for _, r in df.iterrows():
        m = r["model_name_norm"]
        cap = r["capacity_gb"]
        if not m or pd.isna(cap):
            continue
        key = (m, int(cap))
        cmap.setdefault(key, {})
        cmap[key][_norm(str(r["color"]))] = (str(r["part_number"]), str(r["color"]))
    return cmap
FAMILY_SYNONYMS_shop14 = {
    # blue family
    "blue": ["ブルー", "青"],
    "ブルー": ["ブルー", "青"],
    "青": ["ブルー", "青"],

    # black
    "black": ["ブラック", "黒"],
    "ブラック": ["ブラック", "黒"],
    "黒": ["ブラック", "黒"],

    # white
    "white": ["ホワイト", "白"],
    "ホワイト": ["ホワイト", "白"],
    "白": ["ホワイト", "白"],

    # green
    "green": ["グリーン", "緑"],
    "グリーン": ["グリーン", "緑"],
    "緑": ["グリーン", "緑"],

    # red
    "red": ["レッド", "赤"],
    "レッド": ["レッド", "赤"],
    "赤": ["レッド", "赤"],

    # pink
    "pink": ["ピンク"],
    "ピンク": ["ピンク"],

    # purple
    "purple": ["パープル", "紫"],
    "パープル": ["パープル", "紫"],
    "紫": ["パープル", "紫"],

    # yellow
    "yellow": ["イエロー", "黄"],
    "イエロー": ["イエロー", "黄"],
    "黄": ["イエロー", "黄"],

    # orange / silver / gold / gray / natural
    "orange": ["オレンジ", "橙"],
    "オレンジ": ["オレンジ", "橙"],
    "橙": ["オレンジ", "橙"],

    "silver": ["シルバー", "銀"],
    "シルバー": ["シルバー", "銀"],
    "銀": ["シルバー", "銀"],

    "gold": ["ゴールド", "金"],
    "ゴールド": ["ゴールド", "金"],
    "金": ["ゴールド", "金"],

    "gray": ["グレー", "グレイ", "灰"],
    "グレー": ["グレー", "グレイ", "灰"],
    "グレイ": ["グレー", "グレイ", "灰"],
    "灰": ["グレー", "グレイ", "灰"],

    "natural": ["ナチュラル"],
    "ナチュラル": ["ナチュラル"],
}
COLOR_DELTA_RE_shop14 = re.compile(
    r"""(?P<label>[^：:\-\s/、／]+)\s*
        (?P<sep>[：:\-])?\s*          # ← 这里改为可选 ?!
        (?P<sign>[+\-−－])?\s*
        (?P<amount>\d[\d,]*)\s*(円)?
    """,
    re.UNICODE | re.VERBOSE,
)

# 新的分隔正则：保留全角分隔符等，但对 ASCII 逗号只在 *非数字两侧* 时作为分隔符
_SPLIT_TOKENS_SAFE_RE = re.compile(
    r"""
    [／/、，]                 # 全角/斜杠类分隔符（始终切分）
    |(?<!\d),(?!\d)          # ASCII 逗号：仅当其两侧不是数字时切分（避免拆千位分隔）
    |(?:\s+\+\s+)            # " + " 形式
    |(?:\s*;\s*)             # 分号
    """,
    re.UNICODE | re.VERBOSE,
)

_COLOR_ABS_PRICE_RE = re.compile(
    r"""^\s*
        (?P<label>[^：:\-\s/、／¥円]+?)    # 颜色标签（非贪心，避免包含金额）
        \s*(?:[:：]?\s*)                   # 可选分隔符
        (?:¥|￥)?\s*                       # 可选货币符号
        (?P<amount>\d{1,3}(?:[,\uFF0C]\d{3})*|\d+)  # 支持千位逗号（ASCII or fullwidth）或无逗号数字
        \s*(?:円)?\s*$
    """,
    re.UNICODE | re.VERBOSE,
)
def _label_matches_color_shop14(label_raw: str, color_raw: str, color_norm: str) -> bool:
    """
    宽松匹配 label 是否命中颜色：
    1) 归一化精确相等
    2) label_raw 是 color_raw 的子串
    3) 同义族：label 无论是英文还是日文（如 “blue”“ブルー”“青”“銀”“橙”），
       都先取出该族的“日文关键词集合”，只要其中任意一个出现在 color_raw 中即命中。
    """
    label_norm = _norm(label_raw)

    # 1) 精确相等（归一化后）
    if label_norm == color_norm:
        return True

    # 2) 原文子串
    if label_raw and str(label_raw) in str(color_raw):
        return True

    # 3) 同义族匹配（正向键 + 反向值）
    # 3.1 直接以 label_raw/label_norm 作为键
    keys = {label_raw.strip().lower(), label_norm, label_raw.strip()}
    candidates = set()
    for k in keys:
        if k in FAMILY_SYNONYMS_shop14:
            candidates.update(FAMILY_SYNONYMS_shop14[k])

    # 3.2 若还没命中，将 label 当作“族内词”去反查家族，再收集该家族的全部关键词
    if not candidates:
        for fam, tokens in FAMILY_SYNONYMS_shop14.items():
            if any((t == label_raw) or (t == label_norm) or (t in str(label_raw)) for t in tokens):
                candidates.update(tokens)
                break

    # 家族里的任一关键词是 color_raw 的子串即可
    return any(tok in str(color_raw) for tok in candidates)
def _extract_color_deltas_shop14(text: str) -> List[Tuple[str, int]]:
    """
    从 '减价条件2' 提取若干 (label_raw, delta_int)。
    允许多组，使用 '/', '／', '、', ',', '，', ';' 等分隔。
    例：
      '青-3000'          -> [('青', -3000)]
      '橙/銀+1000'       -> [('橙', +1000), ('銀', +1000)]
      'ブルー：-2,000円' -> [('ブルー', -2000)]
    """
    out: List[Tuple[str, int]] = []
    if not text:
        return out
    # 先分段，再逐段匹配
    parts = [p.strip() for p in SPLIT_TOKENS_RE.split(str(text)) if p and p.strip()]
    for part in parts:
        m = COLOR_DELTA_RE_shop14.search(part)
        if not m:
            continue
        label = m.group("label").strip()
        sep = m.group("sep")
        sign = m.group("sign")
        amt = to_int_yen(m.group("amount"))
        if amt is None:
            continue
        # 有显式 sign 用之；否则以分隔符是否为负号判断
        if sign:
            negative = sign in ("-", "−", "－")
        else:
            negative = sep in ("-", "−", "－")
        delta = -int(amt) if negative else int(amt)
        out.append((label, delta))
    return out

def _clean_remark_frag(x) -> str:
    """把单列 remark 做清理：去 None/nan，统一空格，去 BOM，去多余标点尾巴等。"""
    if x is None:
        return ""
    s = str(x).strip()
    if not s:
        return ""
    # pandas 的 nan/NaN/None 字符串化为 'nan'，把它当空
    if s.lower() == "nan":
        return ""
    # 去 BOM / 不可见空白
    s = s.lstrip("\ufeff").replace("\u3000", " ")
    # 把多空格压成一个
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _has_all_colors(text: str) -> Optional[int]:
    """
    若文本含“全色”，且可选出现 '全色 ± 金額'，返回统一 delta；
    若仅出现 '全色' 无金额，返回 0；
    若未出现 '全色'，返回 None。
    """
    if not text:
        return None
    s = str(text)
    if "全色" not in s:
        return None
    # 试图解析 "全色 ± n円"
    m = re.search(r"全色\s*[：:\-]?\s*([+\-−－])?\s*(\d[\d,]*)\s*円", s)
    if m:
        sign = m.group(1) or "+"
        amt = to_int_yen(m.group(2)) or 0
        if sign in ("-", "−", "－"):
            amt = -amt
        return int(amt)
    return 0
SPLIT_TOKENS_RE = re.compile(r"[／/、，,]|(?:\s+\+\s+)|(?:\s*;\s*)")
COLOR_ABS_PRICE_RE = re.compile(
    r"""(?P<label>[^：:\-\s/、／¥円]+)\s*      # 颜色标签（不能以 +/- 开头）
        (?:[:：]?\s*)                         # 可选分隔
        (?:¥|￥)?\s*                          # 可选货币符号
        (?P<amount>\d[\d,]*)\s*               # 金额
        (?:円)?\s*$                           # 可选 '円'
    """,
    re.UNICODE | re.VERBOSE,
)
def _norm_label(lbl: str) -> str:
    """去除空白并统一全角空格/NBSP，保留原文字顺序用作匹配用 key"""
    if lbl is None:
        return ""
    s = str(lbl)
    # 去掉左右空白并规范全角空格为半角
    s = s.strip().replace("\u3000", " ").replace("\xa0", " ").strip()
    # 把中间多空格合并
    s = re.sub(r"\s+", " ", s)
    return s

def _extract_color_abs_prices(text: str) -> List[Tuple[str, int]]:
    """
    从 text 中抽取 (label_raw, abs_price) 绝对价。
    修复点：不会在数字千位分隔符处拆分（保留 229,000 完整）。
    支持多标签共用金额：'青/銀327000'、'青 銀 327000' 等。
    """
    out: List[Tuple[str, int]] = []
    if not text:
        return out

    pending_labels: List[str] = []

    # 先把非可见 BOM / nan 文本规范化
    s_all = str(text).strip()
    if s_all.lower() == "nan" or s_all == "":
        return out

    # 逐片段处理（使用更安全的切分）
    parts = [p.strip() for p in _SPLIT_TOKENS_SAFE_RE.split(s_all) if p and p.strip()]
    if not parts:
        parts = [s_all]

    for part in parts:
        # 如果片段里同时含有 + 或 - （显式差额），跳过（差额解析会处理）
        if any(ch in part for ch in ("+", "-", "−", "－")):
            # 但也要考虑像 "青 229,000" 这种包含空格和逗号的正常绝对价 -> 上面条件不会触发
            # 所以这里是安全的
            continue

        m = _COLOR_ABS_PRICE_RE.search(part)
        if m:
            label_raw = _norm_label(m.group("label"))
            amt_txt = m.group("amount")
            # 把千分符去掉（支持 ASCII comma 和 全角逗号）
            amt_clean = re.sub(r"[,\uFF0C]", "", amt_txt)
            try:
                amt_val = int(amt_clean)
            except Exception:
                # 额外容错：用 to_int_yen 作为 fallback（如果你有该工具）
                try:
                    amt_val = int(to_int_yen(amt_txt) or 0)
                except Exception:
                    continue

            if label_raw:
                out.append((label_raw, amt_val))
                # 把 pending 的标签也一并赋值（多标签共用金额情形）
                for pl in pending_labels:
                    pln = _norm_label(pl)
                    if pln:
                        out.append((pln, amt_val))
                pending_labels = []
            continue

        # 没有找到金额：这个片段可能只是标签（或多标签连着）
        # 用斜杠或全角/半角逗号/顿号分割出标签候选
        for tok in re.split(r"[／/、，;；,]", part):
            tok = _norm_label(tok)
            if tok:
                pending_labels.append(tok)

    return out


def clean_shop14(df: pd.DataFrame) -> pd.DataFrame:
    print("shop14:買取楽園---------->进入清洗器时间：", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    for c in ["name", "data6", "price2", "减价条件2", "time-scraped"]:
        if c not in df.columns:
            raise ValueError(f"shop14 清洗器缺少必要列：{c}")

    info_df = _load_iphone17_info_df_for_shop2()
    cmap_all = _build_color_map_shop14(info_df)

    rows: List[dict] = []

    for idx, row in df.iterrows():
        status = str(row.get("data6") or "")
        if "未開封" not in status:
            print(f"[{idx}] skip: data6 不包含 未開封 -> '{status}'")
            continue

        model_text = str(row.get("name") or "").strip()
        if not model_text:
            print(f"[{idx}] skip: name 为空")
            continue

        model_norm = _normalize_model_generic(model_text)
        cap_gb = _parse_capacity_gb(model_text)
        if not model_norm or pd.isna(cap_gb):
            print(f"[{idx}] skip: 无法解析 model/capacity -> name='{model_text}', model_norm='{model_norm}', cap_gb='{cap_gb}'")
            continue
        cap_gb = int(cap_gb)

        key = (model_norm, cap_gb)
        color_map = cmap_all.get(key)
        if not color_map:
            print(f"[{idx}] skip: info_df 中无该机型容量 -> {key}")
            continue

        base_price = to_int_yen(row.get("price2"))
        if base_price is None:
            print(f"[{idx}] skip: 无法解析基准价 price2='{row.get('price2')}'")
            continue
        base_price = int(base_price)

        # ===== 清洗并合并 remark 列（过滤掉 nan/空）
        part_a = _clean_remark_frag(row.get("减价条件2"))
        part_b = _clean_remark_frag(row.get("23432")) if "23432" in row.index else ""
        # 也支持旧字段名：若某些文件用不同名字，可以按需加
        combined = " ".join([p for p in (part_a, part_b) if p]).strip()

        # rec_at = parse_dt_aware(row.get("time-scraped"))
        rec_at = row.get("time-scraped")

        print(f"[{idx}] model='{model_text}' -> norm='{model_norm}', cap={cap_gb}, base_price={base_price}, combined_remark='{combined}'")

        # 先看“全色”
        all_delta = _has_all_colors(combined)
        if all_delta is not None:
            final_price = base_price + all_delta
            print(f"[{idx}] 全色调整 detected: all_delta={all_delta}, final_price={final_price}")
            for _col_norm, (pn, _raw) in color_map.items():
                rows.append({
                    "part_number": pn,
                    "shop_name": "買取楽園",
                    "price_new": int(final_price),
                    "recorded_at": rec_at,
                })
            continue

        # 关键点：分别在每个单独列上也尝试解析，并将结果合并
        # 这样像你的例子 'nan 青 229,000'，若 '青 229,000' 在单列里可以命中
        abs_list = []
        labels_and_deltas = []

        # parse each source fragment separately (优先解析每个 fragment)
        for frag in (part_a, part_b):
            if not frag:
                continue
            a = _extract_color_abs_prices(frag)
            d = _extract_color_deltas_shop14(frag)
            if a:
                abs_list.extend(a)
            if d:
                labels_and_deltas.extend(d)

        # 兼容再尝试对合并字符串解析（兜底）
        if not abs_list:
            abs_list = _extract_color_abs_prices(combined)
        if not labels_and_deltas:
            labels_and_deltas = _extract_color_deltas_shop14(combined)

        color_abs: Dict[str, int] = {}
        color_deltas: Dict[str, int] = {}

        print(f"[{idx}] parsed abs_list={abs_list}, labels_and_deltas={labels_and_deltas}")

        # 绝对价匹配
        if abs_list:
            for col_norm, (pn, col_raw) in color_map.items():
                for label_raw, abs_price in abs_list:
                    if _label_matches_color_shop14(label_raw, col_raw, col_norm):
                        color_abs[col_norm] = abs_price
                        print(f"[{idx}] abs match -> color_raw='{col_raw}' (norm={col_norm}) abs_price={abs_price}")

        # 差额匹配
        if labels_and_deltas:
            for col_norm, (pn, col_raw) in color_map.items():
                for label_raw, delta in labels_and_deltas:
                    if _label_matches_color_shop14(label_raw, col_raw, col_norm):
                        color_deltas[col_norm] = delta
                        print(f"[{idx}] delta match -> color_raw='{col_raw}' (norm={col_norm}) delta={delta}")

        # 生成 price
        for col_norm, (pn, col_raw) in color_map.items():
            if col_norm in color_abs:
                price_val = color_abs[col_norm]
                reason = "abs"
            else:
                price_val = base_price + color_deltas.get(col_norm, 0)
                reason = f"base+delta({color_deltas.get(col_norm,0)})" if col_norm in color_deltas else "base"
            print(f"[{idx}] -> color='{col_raw}' (norm={col_norm}) pn={pn} price={price_val} reason={reason}")
            rows.append({
                "part_number": pn,
                "shop_name": "買取楽園",
                "price_new": int(price_val),
                "recorded_at": rec_at,
            })

    out = pd.DataFrame(rows, columns=["part_number", "shop_name", "price_new", "recorded_at"])
    if not out.empty:
        out = out.dropna(subset=["part_number", "price_new"]).reset_index(drop=True)
        out["part_number"] = out["part_number"].astype(str)
        out["price_new"] = pd.to_numeric(out["price_new"], errors="coerce").astype("Int64")
    return out

## shop14 買取楽園

In [55]:

df_14 = pd.read_csv("/Users/syu/PycharmProjects/YamagotiProjects/shop14 (1).csv")
df_14

Unnamed: 0,web-scraper-order,web-scraper-start-url,data5,price2,data6,name,减价条件,减价条件2,23432,time-scraped
0,1761554891-1,https://www.keitairakuen.com/product-category/...,"¥176,500","新品: ¥176,500",SIM FREE 未開封,iPhone17 Pro 256GB,,,,2025-10-27 17:48:11
1,1761554891-2,https://www.keitairakuen.com/product-category/...,"¥162,500","新品: ¥162,500",SIM FREE　開封,iPhone17 Pro 256GB,"交換品 10,000",,,2025-10-27 17:48:11
2,1761554891-3,https://www.keitairakuen.com/product-category/...,"¥212,000","新品: ¥212,000",SIM FREE 未開封,iPhone17 Pro 512GB,,,,2025-10-27 17:48:11
3,1761554891-4,https://www.keitairakuen.com/product-category/...,"¥195,500","新品: ¥195,500",SIM FREE　開封,iPhone17 Pro 512GB,"交換品 10,000",,,2025-10-27 17:48:11
4,1761554891-5,https://www.keitairakuen.com/product-category/...,"¥245,000","新品: ¥245,000",SIM FREE 未開封,iPhone17 Pro 1TB,,,,2025-10-27 17:48:11
5,1761554891-6,https://www.keitairakuen.com/product-category/...,"¥229,500","新品: ¥229,500",SIM FREE　開封,iPhone17 Pro 1TB,"交換品 10,000",,,2025-10-27 17:48:11
6,1761554891-7,https://www.keitairakuen.com/product-category/...,"¥198,000","新品: ¥198,000",SIM FREE 未開封,iPhone17 Pro Max 256GB,,,,2025-10-27 17:48:11
7,1761554891-8,https://www.keitairakuen.com/product-category/...,"¥183,500","新品: ¥183,500",SIM FREE　開封,iPhone17 Pro Max 256GB,"交換品 10,000",,,2025-10-27 17:48:11
8,1761554891-9,https://www.keitairakuen.com/product-category/...,"¥230,500","新品: ¥230,500",SIM FREE 未開封,iPhone17 Pro Max 512GB,,,"青 229,000",2025-10-27 17:48:11
9,1761554891-10,https://www.keitairakuen.com/product-category/...,"¥217,000","新品: ¥217,000",SIM FREE　開封,iPhone17 Pro Max 512GB,"交換品 10,000",,,2025-10-27 17:48:11


In [56]:
res = clean_shop14(df_14)
res

shop14:買取楽園---------->进入清洗器时间： 2025-10-27 18:17:31
[0] model='iPhone17 Pro 256GB' -> norm='iPhone 17 Pro', cap=256, base_price=176500, combined_remark=''
[0] parsed abs_list=[], labels_and_deltas=[]
[0] -> color='シルバー' (norm=シルバー) pn=MG854J/A price=176500 reason=base
[0] -> color='コズミックオレンジ' (norm=コズミックオレンジ) pn=MG864J/A price=176500 reason=base
[0] -> color='ディープブルー' (norm=ディープブルー) pn=MG874J/A price=176500 reason=base
[1] skip: data6 不包含 未開封 -> 'SIM FREE　開封'
[2] model='iPhone17 Pro 512GB' -> norm='iPhone 17 Pro', cap=512, base_price=212000, combined_remark=''
[2] parsed abs_list=[], labels_and_deltas=[]
[2] -> color='シルバー' (norm=シルバー) pn=MG894J/A price=212000 reason=base
[2] -> color='コズミックオレンジ' (norm=コズミックオレンジ) pn=MG8A4J/A price=212000 reason=base
[2] -> color='ディープブルー' (norm=ディープブルー) pn=MG8C4J/A price=212000 reason=base
[3] skip: data6 不包含 未開封 -> 'SIM FREE　開封'
[4] model='iPhone17 Pro 1TB' -> norm='iPhone 17 Pro', cap=1024, base_price=245000, combined_remark=''
[4] parsed abs_list=[],

Unnamed: 0,part_number,shop_name,price_new,recorded_at
0,MG854J/A,買取楽園,176500,2025-10-27 17:48:11
1,MG864J/A,買取楽園,176500,2025-10-27 17:48:11
2,MG874J/A,買取楽園,176500,2025-10-27 17:48:11
3,MG894J/A,買取楽園,212000,2025-10-27 17:48:11
4,MG8A4J/A,買取楽園,212000,2025-10-27 17:48:11
5,MG8C4J/A,買取楽園,212000,2025-10-27 17:48:11
6,MG8D4J/A,買取楽園,245000,2025-10-27 17:48:11
7,MG8E4J/A,買取楽園,245000,2025-10-27 17:48:11
8,MG8F4J/A,買取楽園,245000,2025-10-27 17:48:11
9,MFY84J/A,買取楽園,198000,2025-10-27 17:48:11


## shop9モバステ

In [14]:
def clean_shop9(df: pd.DataFrame) -> pd.DataFrame:
    print("shop9:アキモバ---------->进入清洗器时间：", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    """
    增强的 shop9 清洗器，支持 price 字段中包含：
      - 绝对价（例：'橙175,000/青,銀174,000'）
      - 差额（例：'青,黒-2,000円'）
      - 全色（'全色'）
    输出列与以前一致：part_number, shop_name (固定 'アキモバ'), price_new, recorded_at
    """
    info_df = _load_iphone17_info_df_for_shop2()

    col_model = "機種名"
    col_price = "買取価格"
    col_time  = "time-scraped"
    for need in (col_model, col_price, col_time):
        if need not in df.columns:
            raise ValueError(f"shop9 清洗器缺少必要列：{need}")

    # --- helpers ----------------------------------------------------------
    SPLIT_LABELS = re.compile(r"[／/、，,・\s]+")  # 分隔多个颜色标签的符号集

    # 把全角数字/逗号/符号等做成半角并返回整数（或 None）
    def _norm_amount_to_int(s: str) -> Optional[int]:
        if s is None:
            return None
        # 简单半角化和去噪
        tt = str(s).replace("　", " ").replace("，", ",").replace("．", ".")
        # remove full-width digits -> use translate map if available, else fallback
        tt = tt.translate(str.maketrans({
            '０':'0','１':'1','２':'2','３':'3','４':'4','５':'5','６':'6','７':'7','８':'8','９':'9',
            '－':'-','＋':'+','¥':'','￥':''
        }))
        m = re.search(r"([0-9][0-9,]*)", tt)
        if not m:
            return None
        try:
            return int(m.group(1).replace(",", ""))
        except Exception:
            return None

    # 安全分割：始终切分全角 / 、 ／ ; 但对 ASCII 逗号只在两侧不是数字时才切分（避免分隔千位逗号）
    _SAFE_PART_SPLIT_RE = re.compile(r"[／/、，;；\n]|(?<!\d),(?!\d)")

    # 更稳健的金额捕获（支持 ASCII comma 或 全角逗号 作为千位分隔）
    _ABS_PRICE_RE = re.compile(
        r"""
        ^\s*
        (?P<label>[^：:\-\+\s/、，,・¥￥円]+(?:[／/、，,・\s]+[^：:\-\+\s/、，,・¥￥円]+)*)?   # 标签（可能含空格或 / 等分隔的多个词）
        \s*
        (?:(?:[:：]?\s*)?)
        (?:¥|￥)?\s*
        (?P<amount>\d{1,3}(?:[,\uFF0C]\d{3})*|\d+)\s*(?:円)?    # 支持千位逗号（ASCII或全角）或无逗号数字
        \s*$
        """,
        re.UNICODE | re.VERBOSE,
    )

    # 从文本中提取差额：标签 + (必带 + 或 -) + 数字
    DELTA_RE = re.compile(
        r"""(?P<labels>[^+\-−－\d¥￥円]+?)\s*(?P<sign>[+\-−－])\s*(?P<amount>[０-９0-9][０-９0-9,，]*)\s*(?:円)?""",
        re.VERBOSE | re.UNICODE
    )

    def _extract_abs_prices(text: str) -> List[Tuple[str, int]]:
            """
            返回 [(label_raw, abs_price_int), ...]。
            - 不会在数字的千位逗号处切分（保留 229,000）。
            - 支持“先标签后金额”的情形：'青,銀229,000' -> pending ['青'] then see '銀229,000' -> assign 229000 to both.
            - 支持 '橙230,500/青,銀229,000'、'青/銀327000'、'シルバー：206000円' 等。
            """
            out: List[Tuple[str, int]] = []
            if not text:
                return out

            s = str(text).strip()
            if s.lower() == "nan" or s == "":
                return out

            pending: List[str] = []

            # 切片（避免拆千位逗号）
            parts = [p.strip() for p in _SAFE_PART_SPLIT_RE.split(s) if p and p.strip()]
            if not parts:
                parts = [s]

            for part in parts:
                # 如果包含 +/- 符号，这个片段交给差额解析
                if any(ch in part for ch in ("+", "-", "−", "－")):
                    # 但要注意像 "青 229,000" 这种含空格+数字不会触发这里（安全）
                    continue

                m = _ABS_PRICE_RE.match(part)
                if m:
                    label_raw = (m.group("label") or "").strip()
                    amt_txt = m.group("amount") or ""
                    # 清理千分符（ASCII comma 和 全角逗号）
                    amt_clean = re.sub(r"[,\uFF0C]", "", amt_txt)
                    try:
                        amt_val = int(amt_clean)
                    except Exception:
                        amt_val = _norm_amount_to_int(amt_txt)
                    if amt_val is None:
                        # 无效金额则跳过该片段（但保留 label 到 pending）
                        if label_raw:
                            # 把 label 分成若干 token 并缓存
                            toks = [t.strip() for t in re.split(r"[／/、，,・\s]+", label_raw) if t.strip()]
                            pending.extend(toks)
                        continue

                    # 当前片段若含标签，把标签拆开并赋值；否则若 pending 不空，把 pending 全部赋值
                    if label_raw:
                        toks = [t.strip() for t in re.split(r"[／/、，,・\s]+", label_raw) if t.strip()]
                    else:
                        toks = []

                    # 如果没有当前标签，但 pending 有值，则把 pending 全部赋值
                    if not toks and pending:
                        for pl in pending:
                            out.append((pl, int(amt_val)))
                        pending = []
                    else:
                        # 把当前标签及 pending 全部赋值（pending 也许是前面单独的标签）
                        for pl in pending:
                            out.append((pl, int(amt_val)))
                        for tok in toks:
                            out.append((tok, int(amt_val)))
                        pending = []
                    continue

                # 若无法匹配金额，则把这个片段作为标签缓存（可能是 '青' 或 '青,銀' 里的一部分）
                for tok in re.split(r"[／/、，,・\s]+", part):
                    tok = tok.strip()
                    if tok:
                        pending.append(tok)

            return out

    def _extract_deltas(text: str) -> List[Tuple[str, int]]:
        """返回 [(label_raw, delta_int), ...]。处理 '青,黒-2,000円' / 'シルバー-3,000/ディープブルー-3,000' 等情况。"""
        out: List[Tuple[str, int]] = []
        if not text:
            return out
        s = str(text)
        # 全局查找所有带符号的匹配（支持多标签共享同一金额，如 'シルバー/ディープブルー-3000'）
        for m in DELTA_RE.finditer(s):
            labels_part = m.group("labels") or ""
            sign = m.group("sign") or "+"
            amt_txt = m.group("amount")
            amt = _norm_amount_to_int(amt_txt)
            if amt is None:
                continue
            if sign in ("-", "−", "－"):
                delta = -int(amt)
            else:
                delta = int(amt)
            # labels_part 可能包含多个标签
            toks = [t.strip() for t in re.split(r"[／/、，,・\s]+", labels_part) if t.strip()]
            if not toks:
                continue
            for tok in toks:
                out.append((tok, delta))
        # 退化处理：如果没有找到任何带符号的匹配，但字符串中包含 '全色'，返回全色 0（或查找后面是否带数）
        if not out and "全色" in s:
            # 查看是否有数值附带（例如 '全色-1000' 的情况已被上面捕获）
            m = re.search(r"全色\s*[：:\-]?\s*([+\-−－])?\s*([０-９0-9][０-９0-9,，]*)", s)
            if m:
                sign = m.group(1) or "+"
                amt = _norm_amount_to_int(m.group(2))
                if amt is None:
                    amt = 0
                out.append(("全色", -amt if sign in ("-", "−", "－") else amt))
            else:
                out.append(("全色", 0))
        return out

    # ---------------------------------------------------------------------

    # 先把 info_df 分组，建立 (model_norm, cap) -> { color_norm: part_number }
    info_df2 = info_df.copy()
    info_df2["model_name_norm"] = info_df2["model_name"].map(_normalize_model_generic)
    info_df2["capacity_gb"] = pd.to_numeric(info_df2["capacity_gb"], errors="coerce").astype("Int64")
    info_df2["color_norm"] = info_df2["color"].map(lambda x: _norm(str(x)))

    pn_map: Dict[Tuple[str, int], Dict[str, str]] = {}
    for _, r in info_df2.iterrows():
        m = r["model_name_norm"]
        cap = r["capacity_gb"]
        col = r["color_norm"]
        pn = str(r["part_number"])
        if pd.isna(cap) or not m or not col:
            continue
        key = (m, int(cap))
        pn_map.setdefault(key, {})
        pn_map[key][col] = pn

    # 处理输入列
    model_norm = df[col_model].map(_normalize_model_generic)
    cap_gb     = df[col_model].map(_parse_capacity_gb)
    # recorded_at = df[col_time].map(lambda x: parse_dt_aware(x))  # shop9 你说不需要时区修正，但 parse_dt_aware 会处理，保留即可
    recorded_at = df[col_time]

    rows = []
    for i in range(len(df)):
        m = model_norm.iat[i]
        c = cap_gb.iat[i]
        t = recorded_at.iat[i]
        raw_price_cell = df[col_price].iat[i]

        if not m or pd.isna(c):
            continue
        c = int(c)

        key = (m, c)
        color_to_pn = pn_map.get(key)
        if not color_to_pn:
            # info 表中没有该机型/容量，跳过
            continue

        s = str(raw_price_cell) if raw_price_cell is not None else ""
        # 先尝试解析绝对价（优先）
        abs_list = _extract_abs_prices(s)  # [(label, price), ...]
        deltas = _extract_deltas(s)       # [(label, delta), ...]
        base_price = to_int_yen(s)        # 基准价（若文本里有明显基价，会返回第一个数）
        # 优化逻辑：
        # - 如果 abs_list 存在，则对匹配到的颜色使用绝对价；未出现的颜色使用 base_price（若无 base，则跳过那些未给价的颜色）
        # - 否则若 deltas 中包含 ('全色', val) 或 '全色'，对所有颜色用 base + val
        # - 否则对有 delta 的颜色采用 base+delta，其他颜色采用 base
        color_abs_map: Dict[str, int] = {}
        color_delta_map: Dict[str, int] = {}
        # fill abs map: 把 label 映射到 color_norm
        if abs_list:
            for label_raw, amt in abs_list:
                lbl_norm = _norm(label_raw)
                # label 可能是多个逗号分隔形式，拆开再映射
                for tok in [t for t in re.split(r"[／/、，,・\s]+", label_raw) if t and t.strip()]:
                    n = _norm(tok)
                    # 找到 info_df 中与之匹配的颜色 key（宽松匹配：直接相等或子串）
                    matched = None
                    for col_norm, pn in color_to_pn.items():
                        if n == col_norm or tok in col_norm or tok in pn or tok in str(col_norm):
                            matched = col_norm
                            break
                        # 也可匹配 unicode exact substring of raw color, but we use available col_norm keys
                    if matched:
                        color_abs_map[matched] = int(amt)
            # 可能 abs_list 给到的是部分颜色；对未给到绝对价但 info 中有颜色，若 base_price 存在则回退到 base_price
        if deltas:
            for label_raw, delta in deltas:
                if label_raw == "全色":
                    color_delta_map["ALL"] = int(delta)
                    continue
                for tok in [t for t in re.split(r"[／/、，,・\s]+", label_raw) if t and t.strip()]:
                    n = _norm(tok)
                    matched = None
                    for col_norm in color_to_pn.keys():
                        # 宽松匹配：归一化相同或标签是颜色原文的子串
                        if n == col_norm or tok in col_norm or tok in str(col_norm):
                            matched = col_norm
                            break
                    if matched:
                        color_delta_map[matched] = int(delta)

        # 生成输出行
        # 1) 如果存在 ALL（全色），以 base + ALL（若 base 缺失则跳过）
        if "ALL" in color_delta_map:
            if base_price is None:
                # 没有基价但声明了全色差额，无法计算 -> 跳过整个机型
                continue
            final = int(base_price + color_delta_map["ALL"])
            for col_norm, pn in color_to_pn.items():
                rows.append({
                    "part_number": pn,
                    "shop_name": "アキモバ",
                    "price_new": int(final),
                    "recorded_at": t,
                })
            continue

        # 2) 若存在绝对价，优先使用其对应颜色；其余颜色用 base_price（若 base 不存在则跳过未指定颜色）
        if color_abs_map:
            for col_norm, pn in color_to_pn.items():
                if col_norm in color_abs_map:
                    price_val = color_abs_map[col_norm]
                    rows.append({
                        "part_number": pn,
                        "shop_name": "アキモバ",
                        "price_new": int(price_val),
                        "recorded_at": t,
                    })
                else:
                    # 回退：若 base_price 存在，用 base；否则跳过
                    if base_price is not None:
                        rows.append({
                            "part_number": pn,
                            "shop_name": "アキモバ",
                            "price_new": int(base_price),
                            "recorded_at": t,
                        })
            continue

        # 3) 否则按差额映射（部分颜色可能有 delta）
        if base_price is None:
            # 没有基价也没有绝对价 -> 跳过
            continue
        for col_norm, pn in color_to_pn.items():
            delta = color_delta_map.get(col_norm, 0)
            rows.append({
                "part_number": pn,
                "shop_name": "アキモバ",
                "price_new": int(base_price + delta),
                "recorded_at": t,
            })

    out = pd.DataFrame(rows, columns=["part_number","shop_name","price_new","recorded_at"])
    if not out.empty:
        out = out.dropna(subset=["part_number","price_new"]).reset_index(drop=True)
        out["part_number"] = out["part_number"].astype(str)
        out["price_new"] = pd.to_numeric(out["price_new"], errors="coerce").astype("Int64")
    return out

In [53]:
# @register_cleaner("shop9")
def clean_shop9(df: pd.DataFrame) -> pd.DataFrame:
    import time
    print("shop9:アキモバ---------->进入清洗器时间：", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    info_df = _load_iphone17_info_df_for_shop2()

    col_model = "機種名"
    col_price = "買取価格"
    col_color = "色・詳細等"
    col_time  = "time-scraped"

    for need in (col_model, col_price, col_color, col_time):
        if need not in df.columns:
            raise ValueError(f"shop9 清洗器缺少必要列：{need}")

    # 同义表（可扩充）
    FAMILY_SYNONYMS_SHOP9 = {
        "blue": ["ブルー", "青", "ディープブルー", "ディープ ブルー"],
        "ブルー": ["ブルー", "青", "ディープブルー"],
        "青": ["ブルー", "青", "ディープブルー"],
        "ディープブルー": ["ディープブルー", "ブルー", "青"],
        "silver": ["シルバー", "銀"],
        "シルバー": ["シルバー", "銀"],
        "銀": ["シルバー", "銀"],
        "black": ["ブラック", "黒"],
        "ブラック": ["ブラック", "黒"],
        "黒": ["ブラック", "黒"],
        "orange": ["オレンジ", "橙", "コズミックオレンジ"],
        "オレンジ": ["オレンジ", "橙"],
        "橙": ["オレンジ", "橙"],
        "white": ["ホワイト", "白"],
        "ホワイト": ["ホワイト", "白"],
    }
    SYNONYM_LOOKUP = {}
    for k, toks in FAMILY_SYNONYMS_SHOP9.items():
        for t in toks:
            SYNONYM_LOOKUP[_norm(str(t))] = [ _norm(str(x)) for x in toks ]

    # 正则与辅助
    SPLIT_SEPS = r"[／/、，,・\s]+"  # 分隔多个颜色标签的符号集
    # 全局抓取：labels（可以有多个标签） + 金额（允许千分位逗号 & 全角数字）
    GLOBAL_LABEL_AMOUNT_RE = re.compile(
        r"""(?P<labels>(?:[^\d¥￥円/、，,;；]+?(?:[／/、，,・\s]+[^\d¥￥円/、，,;；]+?)*))
            \s*(?:[:：]?\s*)?
            (?:¥|￥)?\s*(?P<amount>[０-９0-9][０-９0-9,，]*)(?:円)?
        """,
        re.VERBOSE | re.UNICODE,
    )
    # 差额（含符号）
    DELTA_RE = re.compile(
        r"""(?P<labels>[^+\-−－\d¥￥円]+?)\s*(?P<sign>[+\-−－])\s*(?P<amount>[０-９0-9][０-９0-9,，]*)\s*(?:円)?""",
        re.VERBOSE | re.UNICODE
    )

    def _norm_amount_to_int(s: str) -> Optional[int]:
        if s is None:
            return None
        tt = str(s).replace("　", " ").replace("，", ",").replace("．", ".")
        tt = tt.translate(str.maketrans({
            '０':'0','１':'1','２':'2','３':'3','４':'4','５':'5','６':'6','７':'7','８':'8','９':'9',
            '－':'-','＋':'+','¥':'','￥':''
        }))
        m = re.search(r"([0-9][0-9,]*)", tt)
        if not m:
            return None
        try:
            return int(m.group(1).replace(",", ""))
        except Exception:
            return None

    def _is_pure_number_token(tok: str) -> bool:
        tok2 = tok.replace(",", "").replace("，", "").strip()
        return bool(re.fullmatch(r"[0-9,]+", tok2))

    def _extract_abs_prices(text: str) -> List[Tuple[str, int]]:
        """
        使用全局正则抓取 'labels + amount' 的片段（labels 可含多个以 / 、 等分隔）。
        例如：
          '未開 橙230,500/青,銀229,000' -> [('橙',230500), ('青',229000), ('銀',229000)]
        """
        out: List[Tuple[str, int]] = []
        if not text:
            return out
        s = str(text)
        for m in GLOBAL_LABEL_AMOUNT_RE.finditer(s):
            labels_part = m.group("labels") or ""
            amt_txt = m.group("amount")
            amt = _norm_amount_to_int(amt_txt)
            if amt is None:
                continue
            toks = [t.strip() for t in re.split(SPLIT_SEPS, labels_part) if t.strip()]
            for tok in toks:
                if _is_pure_number_token(tok):
                    continue
                out.append((tok, int(amt)))
        # fallback: 若找不到任何 labels+amount，但存在「単独标签」与后面单独金额（少见），
        # 则尝试简单的 "label amount" 的查找（已被 GLOBAL 捕获的大多数会命中）
        if not out:
            # 尝试形式 like '青 229,000'
            m2 = re.finditer(r"(?P<label>[^\d¥￥円/、，,;；]+?)\s*(?:¥|￥)?\s*(?P<amount>[０-９0-9][０-９0-9,，]*)", s)
            for m in m2:
                label = m.group("label").strip()
                amt = _norm_amount_to_int(m.group("amount"))
                if label and amt is not None and not _is_pure_number_token(label):
                    out.append((label, int(amt)))
        return out

    def _extract_deltas(text: str) -> List[Tuple[str, int]]:
        out: List[Tuple[str, int]] = []
        if not text:
            return out
        s = str(text)
        for m in DELTA_RE.finditer(s):
            labels_part = m.group("labels") or ""
            sign = m.group("sign") or "+"
            amt_txt = m.group("amount")
            amt = _norm_amount_to_int(amt_txt)
            if amt is None:
                continue
            delta = -int(amt) if sign in ("-", "−", "－") else int(amt)
            toks = [t.strip() for t in re.split(SPLIT_SEPS, labels_part) if t.strip()]
            for tok in toks:
                if _is_pure_number_token(tok):
                    continue
                out.append((tok, delta))
        # 全色 fallback
        if not out and "全色" in s:
            m = re.search(r"全色\s*[：:\-]?\s*([+\-−－])?\s*([０-９0-9][０-９0-9,，]*)", s)
            if m:
                sign = m.group(1) or "+"
                amt = _norm_amount_to_int(m.group(2))
                if amt is None:
                    amt = 0
                out.append(("全色", -amt if sign in ("-", "−", "－") else amt))
            else:
                out.append(("全色", 0))
        return out

    # build pn map
    info_df2 = info_df.copy()
    info_df2["model_name_norm"] = info_df2["model_name"].map(_normalize_model_generic)
    info_df2["capacity_gb"] = pd.to_numeric(info_df2["capacity_gb"], errors="coerce").astype("Int64")
    info_df2["color_norm"] = info_df2["color"].map(lambda x: _norm(str(x)))

    pn_map: Dict[Tuple[str, int], Dict[str, str]] = {}
    for _, r in info_df2.iterrows():
        m = r["model_name_norm"]
        cap = r["capacity_gb"]
        col = r["color_norm"]
        pn = str(r["part_number"])
        if pd.isna(cap) or not m or not col:
            continue
        key = (m, int(cap))
        pn_map.setdefault(key, {})
        pn_map[key][col] = pn

    # process rows
    model_norm = df[col_model].map(_normalize_model_generic)
    cap_gb     = df[col_model].map(_parse_capacity_gb)
    # recorded_at = df[col_time].map(lambda x: parse_dt_aware(x))
    recorded_at = df[col_time]

    rows = []
    for i in range(len(df)):
        raw_model = df[col_model].iat[i]
        m = model_norm.iat[i]
        c = cap_gb.iat[i]
        t = recorded_at.iat[i]
        raw_price_cell = df[col_price].iat[i]
        raw_color_cell = df[col_color].iat[i]

        print(f"[DEBUG row={i}] raw_model={raw_model!r} -> norm={m!r}, cap={c!r}, raw_price={raw_price_cell!r}, raw_color={raw_color_cell!r}")

        if not m or pd.isna(c):
            print(f"[DEBUG row={i}] skip: model/cap missing")
            continue
        c = int(c)

        key = (m, c)
        color_to_pn = pn_map.get(key)
        if not color_to_pn:
            print(f"[DEBUG row={i}] skip: no pn_map for key={key}")
            continue

        s_color = str(raw_color_cell) if raw_color_cell is not None else ""
        s_price = str(raw_price_cell) if raw_price_cell is not None else ""
        # parse from color-col first (优先)
        abs_list = _extract_abs_prices(s_color)
        deltas = _extract_deltas(s_color)
        base_price = to_int_yen(s_price) or to_int_yen(s_color)

        # if not found in color-col, try price-col
        if not abs_list and not deltas:
            abs_list = _extract_abs_prices(s_price)
            deltas = _extract_deltas(s_price)
            if base_price is None:
                base_price = to_int_yen(s_price)

        # final fallback: whole row
        if not abs_list and not deltas:
            full_row_parts = []
            for col in df.columns:
                try:
                    v = df[col].iat[i]
                except Exception:
                    v = df.iloc[i].get(col)
                if v is None:
                    continue
                sv = str(v).strip()
                if sv and sv.lower() != "nan":
                    full_row_parts.append(sv)
            s_full = " ".join(full_row_parts)
            if s_full and s_full != s_color and s_full != s_price:
                print(f"[DEBUG row={i}] fallback parsing from full row: {s_full!r}")
                abs_list = _extract_abs_prices(s_full)
                deltas = _extract_deltas(s_full)
                if base_price is None:
                    base_price = to_int_yen(s_full)

        print(f"[DEBUG row={i}] parsed abs_list={abs_list}, deltas={deltas}, base_price={base_price}")

        # label -> col_norm matching（宽松 + 同义表）
        def _match_label_to_colnorm(tok: str) -> Optional[str]:
            if not tok:
                return None
            tok_norm = _norm(tok)
            # direct equal
            for col_norm in color_to_pn.keys():
                if tok_norm == col_norm:
                    return col_norm
            # synonyms
            candidates = set()
            if tok_norm in SYNONYM_LOOKUP:
                candidates.update(SYNONYM_LOOKUP[tok_norm])
            candidates.add(tok_norm)
            for cand in candidates:
                for col_norm in color_to_pn.keys():
                    if cand == col_norm or cand in col_norm or col_norm in cand:
                        return col_norm
            # fallback substring
            tok_short = re.sub(r"[\s\u3000\-]+", "", tok_norm)
            for col_norm in color_to_pn.keys():
                if tok_short in col_norm or col_norm in tok_short:
                    return col_norm
            return None

        color_abs_map: Dict[str, int] = {}
        color_delta_map: Dict[str, int] = {}

        for label_raw, amt in abs_list:
            toks = [t.strip() for t in re.split(SPLIT_SEPS, label_raw) if t.strip()]
            for tok in toks:
                if _is_pure_number_token(tok):
                    print(f"[DEBUG row={i}] abs skip numeric token={tok!r}")
                    continue
                matched = _match_label_to_colnorm(tok)
                if matched:
                    color_abs_map[matched] = int(amt)
                    print(f"[DEBUG row={i}] abs match: token={tok!r} -> color_norm={matched!r} price={amt}")
                else:
                    print(f"[DEBUG row={i}] abs NO-match token={tok!r}")

        for label_raw, delta in deltas:
            if label_raw == "全色":
                color_delta_map["ALL"] = int(delta)
                print(f"[DEBUG row={i}] delta ALL -> {delta}")
                continue
            toks = [t.strip() for t in re.split(SPLIT_SEPS, label_raw) if t.strip()]
            for tok in toks:
                if _is_pure_number_token(tok):
                    print(f"[DEBUG row={i}] delta skip numeric token={tok!r}")
                    continue
                matched = _match_label_to_colnorm(tok)
                if matched:
                    color_delta_map[matched] = int(delta)
                    print(f"[DEBUG row={i}] delta match: token={tok!r} -> color_norm={matched!r} delta={delta}")
                else:
                    print(f"[DEBUG row={i}] delta NO-match token={tok!r}")

        # 输出生成逻辑：ALL -> ABS -> delta/base
        if "ALL" in color_delta_map:
            if base_price is None:
                print(f"[DEBUG row={i}] ALL present but base price missing -> skip")
                continue
            final = int(base_price + color_delta_map["ALL"])
            for col_norm, pn in color_to_pn.items():
                rows.append({"part_number": pn, "shop_name": "アキモバ", "price_new": int(final), "recorded_at": t})
                print(f"[DEBUG row={i}] -> color={col_norm} pn={pn} price={final} reason=ALL")
            continue

        if color_abs_map:
            for col_norm, pn in color_to_pn.items():
                if col_norm in color_abs_map:
                    val = color_abs_map[col_norm]
                    rows.append({"part_number": pn, "shop_name": "アキモバ", "price_new": int(val), "recorded_at": t})
                    print(f"[DEBUG row={i}] -> color={col_norm} pn={pn} price={val} reason=ABS")
                else:
                    if base_price is not None:
                        rows.append({"part_number": pn, "shop_name": "アキモバ", "price_new": int(base_price), "recorded_at": t})
                        print(f"[DEBUG row={i}] -> color={col_norm} pn={pn} price={base_price} reason=BASE-FALLBACK")
                    else:
                        print(f"[DEBUG row={i}] -> color={col_norm} pn={pn} skipped (no abs, no base)")
            continue

        if base_price is None:
            print(f"[DEBUG row={i}] no base/abs -> skip")
            continue

        for col_norm, pn in color_to_pn.items():
            delta = color_delta_map.get(col_norm, 0)
            val = int(base_price + delta)
            rows.append({"part_number": pn, "shop_name": "アキモバ", "price_new": val, "recorded_at": t})
            print(f"[DEBUG row={i}] -> color={col_norm} pn={pn} price={val} reason={'BASE+DELTA' if delta else 'BASE'}")

    out = pd.DataFrame(rows, columns=["part_number","shop_name","price_new","recorded_at"])
    if not out.empty:
        out = out.dropna(subset=["part_number","price_new"]).reset_index(drop=True)
        out["part_number"] = out["part_number"].astype(str)
        out["price_new"] = pd.to_numeric(out["price_new"], errors="coerce").astype("Int64")
    return out


In [54]:
df_9 = pd.read_csv("/Users/syu/PycharmProjects/YamagotiProjects/shop9 (1).csv")
df_9

Unnamed: 0,web-scraper-order,web-scraper-start-url,機種名,買取価格,色・詳細等,time-scraped
0,1761627842-1,https://akiba-mobile.co.jp/,,,,2025-10-28 14:04:02
1,1761627842-2,https://akiba-mobile.co.jp/,iPhone17 Pro 256GB,"177,000円",未開封品 全色,2025-10-28 14:04:02
2,1761627842-3,https://akiba-mobile.co.jp/,iPhone17 Pro 512GB,"212,000円",未開封品 全色,2025-10-28 14:04:02
3,1761627842-4,https://akiba-mobile.co.jp/,iPhone17 Pro 1TB,"245,000円",未開封品 全色,2025-10-28 14:04:02
4,1761627842-5,https://akiba-mobile.co.jp/,iPhone17 Pro Max 256GB,"198,000円",未開封品 全色,2025-10-28 14:04:02
5,1761627842-6,https://akiba-mobile.co.jp/,iPhone17 Pro Max 512GB,"230,500円","未開 橙,銀230,500/青229,000",2025-10-28 14:04:02
6,1761627842-7,https://akiba-mobile.co.jp/,iPhone17 ProMax 1TB,"263,000円",未開封品 全色,2025-10-28 14:04:02
7,1761627842-8,https://akiba-mobile.co.jp/,iPhone17 Pro Max 2TB,"314,000円",未開封品 全色,2025-10-28 14:04:02
8,1761627842-9,https://akiba-mobile.co.jp/,iPhone Air 256GB,"127,000円","未開封品 青,黒-2,000円",2025-10-28 14:04:02
9,1761627842-10,https://akiba-mobile.co.jp/,iPhone17 256GB,"128,500円",未開封品 全色,2025-10-28 14:04:02


In [55]:
res = clean_shop9(df_9)
res

shop9:アキモバ---------->进入清洗器时间： 2025-10-28 14:08:53
[DEBUG row=0] raw_model=nan -> norm='', cap=np.float64(nan), raw_price=nan, raw_color=nan
[DEBUG row=0] skip: model/cap missing
[DEBUG row=1] raw_model='iPhone17 Pro 256GB' -> norm='iPhone 17 Pro', cap=np.float64(256.0), raw_price='177,000円', raw_color='未開封品 全色'
[DEBUG row=1] parsed abs_list=[], deltas=[('全色', 0)], base_price=177000
[DEBUG row=1] delta ALL -> 0
[DEBUG row=1] -> color=シルバー pn=MG854J/A price=177000 reason=ALL
[DEBUG row=1] -> color=コズミックオレンジ pn=MG864J/A price=177000 reason=ALL
[DEBUG row=1] -> color=ディープブルー pn=MG874J/A price=177000 reason=ALL
[DEBUG row=2] raw_model='iPhone17 Pro 512GB' -> norm='iPhone 17 Pro', cap=np.float64(512.0), raw_price='212,000円', raw_color='未開封品 全色'
[DEBUG row=2] parsed abs_list=[], deltas=[('全色', 0)], base_price=212000
[DEBUG row=2] delta ALL -> 0
[DEBUG row=2] -> color=シルバー pn=MG894J/A price=212000 reason=ALL
[DEBUG row=2] -> color=コズミックオレンジ pn=MG8A4J/A price=212000 reason=ALL
[DEBUG row=2] -> 

Unnamed: 0,part_number,shop_name,price_new,recorded_at
0,MG854J/A,アキモバ,177000,2025-10-28 14:04:02
1,MG864J/A,アキモバ,177000,2025-10-28 14:04:02
2,MG874J/A,アキモバ,177000,2025-10-28 14:04:02
3,MG894J/A,アキモバ,212000,2025-10-28 14:04:02
4,MG8A4J/A,アキモバ,212000,2025-10-28 14:04:02
5,MG8C4J/A,アキモバ,212000,2025-10-28 14:04:02
6,MG8D4J/A,アキモバ,245000,2025-10-28 14:04:02
7,MG8E4J/A,アキモバ,245000,2025-10-28 14:04:02
8,MG8F4J/A,アキモバ,245000,2025-10-28 14:04:02
9,MFY84J/A,アキモバ,198000,2025-10-28 14:04:02


In [None]:
## shop12

## shop12 トゥインクル

In [50]:

def clean_shop12(df: pd.DataFrame) -> pd.DataFrame:
    import time
    print("shop12:トゥインクル---------->进入清洗器时间：", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    """
    解析规则改进：
      - 若備考1 含多行（有 \n），仅跳过含“開封/開封品/※開封”那一行，不影响其它行的匹配；
      - 支持绝对价 (Silver ¥230,500)、支持差额 (Blue-2000円)、支持 '全色'；
      - 使用宽松的颜色同义表匹配 info 表颜色；
      - debug 输出详细显示每行解析过程和最终结果。
    """
    # 必要列检查
    for c in ["モデルナンバー", "備考1", "買取価格", "time-scraped"]:
        if c not in df.columns:
            raise ValueError(f"shop12 清洗器缺少必要列：{c}")

    # 载入 info 表并构建 color map
    info_df = _load_iphone17_info_df_for_shop2()
    def _build_color_map(info_df: pd.DataFrame) -> Dict[Tuple[str, int], Dict[str, Tuple[str, str]]]:
        df2 = info_df.copy()
        df2["model_name_norm"] = df2["model_name"].map(_normalize_model_generic)
        df2["capacity_gb"] = pd.to_numeric(df2["capacity_gb"], errors="coerce").astype("Int64")
        df2["color_norm"] = df2["color"].map(lambda x: _norm(str(x)))
        cmap: Dict[Tuple[str, int], Dict[str, Tuple[str, str]]] = {}
        for _, r in df2.iterrows():
            m = r["model_name_norm"]
            cap = r["capacity_gb"]
            if not m or pd.isna(cap):
                continue
            key = (m, int(cap))
            cmap.setdefault(key, {})
            cmap[key][_norm(str(r["color"]))] = (str(r["part_number"]), str(r["color"]))
        return cmap

    cmap_all = _build_color_map(info_df)

    # 同义表（可按需扩充）
    FAMILY_SYNONYMS = {
        "blue": ["ブルー", "青", "ディープブルー"],
        "ブルー": ["ブルー", "青", "ディープブルー"],
        "青": ["ブルー", "青", "ディープブルー"],
        "ディープブルー": ["ディープブルー", "ブルー", "青"],
        "silver": ["シルバー", "銀", "Silver"],
        "シルバー": ["シルバー", "銀", "Silver"],
        "銀": ["シルバー", "銀"],
        "black": ["ブラック", "黒", "Black"],
        "ブラック": ["ブラック", "黒"],
        "黒": ["ブラック", "黒"],
        "white": ["ホワイト", "白", "White"],
        "ホワイト": ["ホワイト", "白"],
        "orange": ["オレンジ", "橙"],
        "オレンジ": ["オレンジ", "橙"],
    }
    EN_TO_JP = {
    "silver": ["シルバー", "銀"],
    "blue":   ["ブルー", "青", "ディープブルー"],
    "black":  ["ブラック", "黒"],
    "white":  ["ホワイト", "白"],
    "gold":   ["ゴールド", "金"],
    "green":  ["グリーン", "緑"],
    "red":    ["レッド", "赤"],
    "pink":   ["ピンク"],
    "purple": ["パープル", "紫"],
    "yellow": ["イエロー", "黄"],
    "orange": ["オレンジ", "橙"],
    "gray":   ["グレー", "グレイ", "灰"],
    "natural":["ナチュラル"],
}
    # 便于查找的反查表（归一化键 -> 家族关键词列表）
    SYN_LOOKUP: Dict[str, List[str]] = {}
    for k, toks in FAMILY_SYNONYMS.items():
        SYN_LOOKUP[_norm(k)] = [_norm(t) for t in toks]

    # 正则与工具
    SPLIT_SEPS = r"[／/、，,・\s]+"  # 用于拆分多个颜色标签
    ABS_RE = re.compile(
        r"""(?P<labels>[^\d¥￥円:：/、，,;；※]+?)\s*(?:[:：]?\s*)?(?:¥|￥)?\s*(?P<amount>[０-９0-9][０-９0-9,，]*)\s*(?:円)?""",
        re.UNICODE | re.VERBOSE,
    )
    DELTA_RE = re.compile(
        r"""(?P<labels>[^+\-−－\d¥￥円/、，,;；※]+?)\s*(?P<sign>[+\-−－])\s*(?P<amount>[０-９0-9][０-９0-9,，]*)\s*(?:円)?""",
        re.UNICODE | re.VERBOSE,
    )

    def _norm_amount_to_int(s: str) -> Optional[int]:
        if s is None:
            return None
        tt = str(s).replace("　", " ").replace("，", ",").replace("．", ".")
        tt = tt.translate(str.maketrans({
            '０':'0','１':'1','２':'2','３':'3','４':'4','５':'5','６':'6','７':'7','８':'8','９':'9',
            '－':'-','＋':'+','¥':'','￥':''
        }))
        m = re.search(r"([0-9][0-9,]*)", tt)
        if not m:
            return None
        try:
            return int(m.group(1).replace(",", ""))
        except Exception:
            return None

    def _line_is_opened(ln: str) -> bool:
        """判断该子行是否属于开封价说明（应跳过）"""
        if not ln:
            return False
        s = str(ln)
        return ("開封" in s) or ("※開封" in s) or ("開封品" in s) or ("開封済" in s)

    def _extract_abs_prices(text: str) -> List[Tuple[str, int]]:
        """
        从文本（可能含换行）提取 [(label_raw, absolute_price), ...]。
        仅处理不含“開封”字样的子行。
        """
        out: List[Tuple[str, int]] = []
        if not text:
            return out
        s = str(text)
        lines = [ln for ln in re.split(r"[\r\n]+", s) if ln is not None]
        for ln in lines:
            if not ln or _line_is_opened(ln):
                # 跳过开封行
                if _line_is_opened(ln):
                    print(f"[DEBUG] skip opened-line for abs: {ln!r}")
                continue
            # 在该行寻找 label+amount 匹配（可能多个）
            for m in ABS_RE.finditer(ln):
                labels_part = m.group("labels") or ""
                amt_txt = m.group("amount")
                amt = _norm_amount_to_int(amt_txt)
                if amt is None:
                    continue
                toks = [t.strip() for t in re.split(SPLIT_SEPS, labels_part) if t.strip()]
                for tok in toks:
                    # 忽略纯数字 token
                    if re.fullmatch(r"[0-9,，]+", tok.replace(" ", "")):
                        continue
                    out.append((tok, int(amt)))
        return out

    def _extract_deltas(text: str) -> List[Tuple[str, int]]:
        """
        抽取差额：label +/- amount。跳过含“開封”的子行/片段。
        """
        out: List[Tuple[str, int]] = []
        if not text:
            return out
        s = str(text)
        lines = [ln for ln in re.split(r"[\r\n]+", s) if ln is not None]
        for ln in lines:
            if not ln:
                continue
            if _line_is_opened(ln):
                print(f"[DEBUG] skip opened-line for delta: {ln!r}")
                continue
            for m in DELTA_RE.finditer(ln):
                labels_part = m.group("labels") or ""
                sign = m.group("sign") or "+"
                amt_txt = m.group("amount")
                amt = _norm_amount_to_int(amt_txt)
                if amt is None:
                    continue
                delta = -int(amt) if sign in ("-", "−", "－") else int(amt)
                toks = [t.strip() for t in re.split(SPLIT_SEPS, labels_part) if t.strip()]
                for tok in toks:
                    if re.fullmatch(r"[0-9,，]+", tok.replace(" ", "")):
                        continue
                    out.append((tok, delta))
        # 全色 fallback（如果没有找到差额，但某些非开封行包含 全色）
        if not out:
            # 搜索所有非开封子行是否包含全色
            s_all = str(text)
            for ln in re.split(r"[\r\n]+", s_all):
                if not ln or _line_is_opened(ln):
                    continue
                if "全色" in ln:
                    m = re.search(r"全色\s*[：:\-]?\s*([+\-−－])?\s*([０-９0-9][０-９0-9,，]*)?", ln)
                    if m:
                        sign = m.group(1) or "+"
                        amt = m.group(2)
                        amt_v = _norm_amount_to_int(amt) if amt else 0
                        out.append(("全色", -amt_v if sign in ("-", "−", "－") else amt_v))
                    else:
                        out.append(("全色", 0))
                    break
        return out

    def _label_matches_color(label_raw: str, color_raw: str, color_norm: str) -> bool:
        """
        更稳健的颜色匹配：
          - 英文标签走纯小写通道（不经 _norm），用 EN_TO_JP 直接映射再比对；
          - 日文/中文标签走 _norm + 同义族（SYN_LOOKUP/FAMILY_SYNONYMS）；
          - 然后做双向原文/归一子串兜底；
          - 带调试打印（去掉/注释即可）。
        """
        if not label_raw or not color_raw:
            return False

        lbl_raw = str(label_raw).strip()
        cr_raw  = str(color_raw).strip()

        # ===== 1) 英文直译通道（不经 _norm，避免 ASCII 被吞/变形）=====
        label_lower = lbl_raw.lower()
        if label_lower in EN_TO_JP:
            for jp in EN_TO_JP[label_lower]:
                if jp in cr_raw:
                    # print(f"[match EN] '{lbl_raw}' -> '{jp}' in color_raw='{cr_raw}'")
                    return True
                if _norm(jp) == color_norm:
                    # print(f"[match EN] '{lbl_raw}' -> norm('{jp}') == color_norm='{color_norm}'")
                    return True
            # 英文直译没中，继续日文流程

        # ===== 2) 日文/中文通道：_norm + 同义族 =====
        ln = _norm(lbl_raw)        # 归一化标签
        cn = color_norm            # 归一化 info 颜色键

        # 2-1 精确归一化等值
        if ln == cn:
            # print(f"[match JP exact] '{lbl_raw}'(norm={ln}) == color_norm={cn}")
            return True

        # 2-2 原文子串/归一子串（双向）
        if lbl_raw in cr_raw or ln in cn or cn in ln:
            # print(f"[match substr] '{lbl_raw}' in '{cr_raw}' or norm-substr")
            return True

        # 2-3 家族同义（先走预构建的 SYN_LOOKUP；没有则走 FAMILY_SYNONYMS）
        if 'SYN_LOOKUP' in globals() and ln in SYN_LOOKUP:
            for cand in SYN_LOOKUP[ln]:
                if cand == cn or cand in _norm(cr_raw) or cand in cr_raw:
                    # print(f"[match SYN] '{lbl_raw}' -> cand='{cand}' matches color_raw/norm")
                    return True
        if ln in FAMILY_SYNONYMS:
            for tok in FAMILY_SYNONYMS[ln]:
                if tok in cr_raw or _norm(tok) == cn:
                    # print(f"[match FAMILY] '{lbl_raw}' -> tok='{tok}' matches color")
                    return True

        # ===== 3) 最后兜底：把空白去掉后做子串（处理 'ディープ ブルー' 之类）=====
        ln_short = re.sub(r"[\s\u3000]+", "", ln)
        cn_short = re.sub(r"[\s\u3000]+", "", cn)
        if ln_short and (ln_short in cn_short or cn_short in ln_short):
            # print(f"[match short-substr] '{ln_short}' ~ '{cn_short}'")
            return True

        print(f"[no match] label_raw='{label_raw}' color_raw='{color_raw}' color_norm='{color_norm}'")
        return False
    rows: List[dict] = []

    for idx, row in df.iterrows():
        price_base = to_int_yen(row.get("買取価格"))
        if price_base is None:
            # 跳过无价行（标题/分隔）
            continue

        model_text = str(row.get("モデルナンバー") or "").strip()
        if not model_text:
            continue

        model_norm = _normalize_model_generic(model_text)
        cap_gb = _parse_capacity_gb(model_text)
        if not model_norm or pd.isna(cap_gb):
            print(f"[DEBUG row={idx}] 跳过（model/cap 解析失败） model={model_text!r}")
            continue
        cap_gb = int(cap_gb)

        key = (model_norm, cap_gb)
        color_map = cmap_all.get(key)
        if not color_map:
            print(f"[DEBUG row={idx}] 跳过（info 表无该型号/容量） key={key}")
            continue

        remark_raw = row.get("備考1") or ""
        abs_list = _extract_abs_prices(remark_raw)
        delta_list = _extract_deltas(remark_raw)

        print(f"[DEBUG row={idx}] model={model_text!r} price_base={price_base} remark={remark_raw!r}")
        print(f"[DEBUG row={idx}] parsed abs_list={abs_list}, delta_list={delta_list}")

        # 映射 label -> color_norm
        color_abs_map: Dict[str, int] = {}
        color_delta_map: Dict[str, int] = {}

        for label_raw, amt in abs_list:
            matched = None
            for col_norm, (pn, col_raw) in color_map.items():
                if _label_matches_color(label_raw, col_raw, col_norm):
                    matched = col_norm
                    break
            if matched:
                color_abs_map[matched] = int(amt)
                print(f"[DEBUG row={idx}] abs match: {label_raw!r} -> color_norm={matched}, price={amt}")
            else:
                print(f"[DEBUG row={idx}] abs NO-match: {label_raw!r}")

        for label_raw, delta in delta_list:
            if label_raw == "全色":
                color_delta_map["ALL"] = int(delta)
                print(f"[DEBUG row={idx}] delta ALL = {delta}")
                continue
            matched = None
            for col_norm, (pn, col_raw) in color_map.items():
                if _label_matches_color(label_raw, col_raw, col_norm):
                    matched = col_norm
                    break
            if matched:
                color_delta_map[matched] = int(delta)
                print(f"[DEBUG row={idx}] delta match: {label_raw!r} -> color_norm={matched}, delta={delta}")
            else:
                print(f"[DEBUG row={idx}] delta NO-match: {label_raw!r}")

        # 生成输出：优先绝对价 -> 再考虑 全色差额 -> 否则按基价+差额
        if "ALL" in color_delta_map:
            final_price = int(price_base + color_delta_map["ALL"])
            for col_norm, (pn, col_raw) in color_map.items():
                rows.append({"part_number": pn, "shop_name": "トゥインクル", "price_new": final_price, "recorded_at": row.get("time-scraped")})
                print(f"[DEBUG row={idx}] OUT: color={col_norm} pn={pn} price={final_price} reason=ALL")
            continue

        if color_abs_map:
            # 绝对价覆盖部分颜色，未列出的颜色用 base price
            for col_norm, (pn, col_raw) in color_map.items():
                if col_norm in color_abs_map:
                    val = color_abs_map[col_norm]
                    rows.append({"part_number": pn, "shop_name": "トゥインクル", "price_new": int(val), "recorded_at": row.get("time-scraped")})
                    print(f"[DEBUG row={idx}] OUT: color={col_norm} pn={pn} price={val} reason=ABS")
                else:
                    rows.append({"part_number": pn, "shop_name": "トゥインクル", "price_new": int(price_base), "recorded_at": row.get("time-scraped")})
                    print(f"[DEBUG row={idx}] OUT: color={col_norm} pn={pn} price={price_base} reason=BASE-FALLBACK")
            continue

        # 否则按差额映射（可能部分颜色有 delta；其它用 base）
        for col_norm, (pn, col_raw) in color_map.items():
            delta = color_delta_map.get(col_norm, 0)
            val = int(price_base + delta)
            # rows.append({"part_number": pn, "shop_name": "トゥインクル", "price_new": val, "recorded_at": row.get("time-scraped")})
            rows.append({"part_number": pn, "shop_name": "トゥインクル", "price_new": val, "recorded_at": parse_dt_aware(row.get("time-scraped"))})
            print(f"[DEBUG row={idx}] OUT: color={col_norm} pn={pn} price={val} reason={'BASE+DELTA' if delta else 'BASE'}")

    out = pd.DataFrame(rows, columns=["part_number", "shop_name", "price_new", "recorded_at"])
    if not out.empty:
        out = out.dropna(subset=["part_number", "price_new"]).reset_index(drop=True)
        out["part_number"] = out["part_number"].astype(str)
        out["price_new"] = pd.to_numeric(out["price_new"], errors="coerce").astype("Int64")
    return out



In [51]:
df_12 = pd.read_csv("/Users/syu/PycharmProjects/YamagotiProjects/shop12 (1).csv")
df_12

Unnamed: 0,web-scraper-order,web-scraper-start-url,モデルナンバー,備考1,買取価格,time-scraped
0,1761623484-1,https://www.twinkle-mobile.co.jp/index.php?Cat...,モデルナンバー,備考,買取価格,2025-10-28 12:51:24
1,1761623484-2,https://www.twinkle-mobile.co.jp/index.php?Cat...,【apple store版＆キャリア版】iPhone17 Pro&iPhone17 Pro Max,,,2025-10-28 12:51:24
2,1761623484-3,https://www.twinkle-mobile.co.jp/index.php?Cat...,iPhone17 Pro 256GB 未開封,"全色\n※開封品 ¥162,000","￥176,500",2025-10-28 12:51:24
3,1761623484-4,https://www.twinkle-mobile.co.jp/index.php?Cat...,iPhone17 Pro 512GB 未開封,"全色\n※開封品 ¥195,000","￥212,000",2025-10-28 12:51:24
4,1761623484-5,https://www.twinkle-mobile.co.jp/index.php?Cat...,iPhone17 Pro 1TB 未開封,"全色\n※開封品 ¥229,500","￥245,000",2025-10-28 12:51:24
5,1761623484-6,https://www.twinkle-mobile.co.jp/index.php?Cat...,iPhone17 Pro Max 256GB 未開封,"全色\n※開封品 ¥183,000","￥198,000",2025-10-28 12:51:24
6,1761623484-7,https://www.twinkle-mobile.co.jp/index.php?Cat...,iPhone17 Pro Max 512GB 未開封,"Silver ¥230,500\nBlue ¥229,000\n※開封品 ¥216,500","￥230,500",2025-10-28 12:51:24
7,1761623484-8,https://www.twinkle-mobile.co.jp/index.php?Cat...,iPhone17 Pro Max 1TB 未開封,"Silver ¥260,000\nBlue ¥260,000\n※開封品 ¥252,000","￥263,000",2025-10-28 12:51:24
8,1761623484-9,https://www.twinkle-mobile.co.jp/index.php?Cat...,iPhone17 Pro Max 2TB 未開封,"全色\n※開封品 ¥303,000","￥314,000",2025-10-28 12:51:24
9,1761623484-10,https://www.twinkle-mobile.co.jp/index.php?Cat...,【apple store版＆キャリア版】iPhone Air,,,2025-10-28 12:51:24


In [52]:
res = clean_shop12(df_12)
res

shop12:トゥインクル---------->进入清洗器时间： 2025-10-28 14:00:27
[DEBUG] skip opened-line for abs: '※開封品 ¥162,000'
[DEBUG] skip opened-line for delta: '※開封品 ¥162,000'
[DEBUG row=2] model='iPhone17 Pro 256GB 未開封' price_base=176500 remark='全色\n※開封品 ¥162,000'
[DEBUG row=2] parsed abs_list=[], delta_list=[('全色', 0)]
[DEBUG row=2] delta ALL = 0
[DEBUG row=2] OUT: color=シルバー pn=MG854J/A price=176500 reason=ALL
[DEBUG row=2] OUT: color=コズミックオレンジ pn=MG864J/A price=176500 reason=ALL
[DEBUG row=2] OUT: color=ディープブルー pn=MG874J/A price=176500 reason=ALL
[DEBUG] skip opened-line for abs: '※開封品 ¥195,000'
[DEBUG] skip opened-line for delta: '※開封品 ¥195,000'
[DEBUG row=3] model='iPhone17 Pro 512GB 未開封' price_base=212000 remark='全色\n※開封品 ¥195,000'
[DEBUG row=3] parsed abs_list=[], delta_list=[('全色', 0)]
[DEBUG row=3] delta ALL = 0
[DEBUG row=3] OUT: color=シルバー pn=MG894J/A price=212000 reason=ALL
[DEBUG row=3] OUT: color=コズミックオレンジ pn=MG8A4J/A price=212000 reason=ALL
[DEBUG row=3] OUT: color=ディープブルー pn=MG8C4J/A price

Unnamed: 0,part_number,shop_name,price_new,recorded_at
0,MG854J/A,トゥインクル,176500,2025-10-28 12:51:24
1,MG864J/A,トゥインクル,176500,2025-10-28 12:51:24
2,MG874J/A,トゥインクル,176500,2025-10-28 12:51:24
3,MG894J/A,トゥインクル,212000,2025-10-28 12:51:24
4,MG8A4J/A,トゥインクル,212000,2025-10-28 12:51:24
5,MG8C4J/A,トゥインクル,212000,2025-10-28 12:51:24
6,MG8D4J/A,トゥインクル,245000,2025-10-28 12:51:24
7,MG8E4J/A,トゥインクル,245000,2025-10-28 12:51:24
8,MG8F4J/A,トゥインクル,245000,2025-10-28 12:51:24
9,MFY84J/A,トゥインクル,198000,2025-10-28 12:51:24


## Shop10 ドラゴンモバイル