In [5]:
import json
import os
import re
import unicodedata

import pandas as pd

In [6]:
# Input: MovieLens movie table (CSV) read by build_mapping().
ML_MOVIES_PATH = 'ml-25m/movies.csv'
# Input: local movie list (JSON) read by build_mapping().
MY_MOVIES_PATH = 'movies.json'
# Output: JSON file build_mapping() writes the id mapping to.
MAPPING_OUTPUT = 'id_mapping.json'

def normalize_title(title):
    """Return a canonical key used to compare movie titles.

    The key is built by folding accented characters to their base letters,
    removing any parenthesised four-digit year, lower-casing, and dropping
    every character that is not an ASCII letter or digit.

    Args:
        title: The title to normalise. Non-string values are converted
            with ``str()`` first.

    Returns:
        A string containing only ``a-z`` and ``0-9``; it may be empty when
        the title has no such characters.
    """
    # Decompose accented characters into base letter + combining mark so the
    # final filter keeps the base letter ("Amélie" -> "amelie", not "amlie").
    title = unicodedata.normalize('NFKD', str(title))
    # Remove the year (e.g. "Toy Story (1995)" -> "Toy Story").
    title = re.sub(r'\s*\(\d{4}\)', '', title)
    # Lower-case, then strip punctuation, whitespace and combining marks.
    return re.sub(r'[^a-z0-9]', '', title.lower())

def build_mapping():
    """Build and save a mapping from local movie ids to MovieLens movie ids.

    Reads the MovieLens movie table from ``ML_MOVIES_PATH`` and the local
    movie list from ``MY_MOVIES_PATH``, pairs entries whose titles are equal
    after ``normalize_title``, and writes the resulting
    ``{local_id: movieId}`` dictionary to ``MAPPING_OUTPUT`` as JSON.

    If either input file is missing, an error message is printed and the
    function returns without writing anything.

    Returns:
        None. Progress and the match count are reported via ``print``.
    """
    print("正在建立 ID 映射表...")

    # 1. Load the MovieLens data.
    if not os.path.exists(ML_MOVIES_PATH):
        print(f"錯誤：找不到 {ML_MOVIES_PATH}，請下載 ml-25m dataset。")
        return
    ml_df = pd.read_csv(ML_MOVIES_PATH)

    # 2. Load the local movie list (the IMSDb data).
    if not os.path.exists(MY_MOVIES_PATH):
        print(f"錯誤：找不到 {MY_MOVIES_PATH}")
        return
    with open(MY_MOVIES_PATH, 'r', encoding='utf-8') as f:
        my_movies = json.load(f)

    # 3. Build the MovieLens lookup table {normalized_title: ml_id}.
    #    Iterating the two columns directly avoids the per-row Series that
    #    DataFrame.iterrows() constructs.
    #    NOTE: the year is discarded by normalize_title, so films sharing a
    #    title (e.g. remakes) collide and the last row in the CSV wins.
    ml_lookup = {}
    for title, movie_id in zip(ml_df['title'], ml_df['movieId']):
        norm = normalize_title(title)
        # Titles with no ASCII letters/digits all normalise to '' and would
        # otherwise match each other spuriously; leave them out.
        if norm:
            ml_lookup[norm] = movie_id

    # 4. Match local movies against the lookup table.
    mapping = {}  # {imsdb_slug: ml_id}
    matched_count = 0

    for m in my_movies:
        imsdb_id = m['id']
        ml_id = ml_lookup.get(normalize_title(m['title']))
        if ml_id is not None:
            # movieId comes from pandas as a NumPy integer; cast to a plain
            # int so json can serialise it.
            mapping[imsdb_id] = int(ml_id)
            matched_count += 1

    # 5. Save the mapping.
    with open(MAPPING_OUTPUT, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, indent=2)

    print(f"映射完成！成功匹配: {matched_count} / {len(my_movies)} 部電影")
    print(f"映射表已儲存至: {MAPPING_OUTPUT}")


# Run the mapping step when this cell is executed.
build_mapping()

正在建立 ID 映射表...
映射完成！成功匹配: 1025 / 1223 部電影
映射表已儲存至: id_mapping.json
