In [9]:
# 1. (Original heading: install and configure the Kaggle CLI — upload your kaggle.json to /content.)
#    This cell itself only switches the working directory to /content and lists the files there.
%cd /content
!ls

allbut.pl     sample_data  u2.test  u4.test  ua.test  u.genre	    u.user
lastfm1k.zip  u1.base	   u3.base  u5.base  ub.base  u.info
mku.sh	      u1.test	   u3.test  u5.test  ub.test  u.item
README	      u2.base	   u4.base  ua.base  u.data   u.occupation


In [11]:
import os, json, random
import pandas as pd
from collections import defaultdict

# ── 1. Paths & parameters ────────────────────────────────────
# Seed Python's global RNG first so every random draw made later is repeatable.
random.seed(42)

# Directory holding the raw data files (they sit directly under /content).
RAW_DIR = '/content'
# Root folder for all generated files, with a sub-folder for the LightGCN export.
OUT_DIR = '/content/ml100k_proc'
LGN_DIR = os.path.join(OUT_DIR, 'lgn')

# A sample needs at least MIN_HIST history items.
MIN_HIST = 5
# Only the last MAX_HIST interactions of each user are kept.
MAX_HIST = 30
# Length of each candidate list (one positive plus sampled negatives).
CANDIDATES = 20

# Create both output folders up front; exist_ok makes re-running the cell harmless.
for out_dir in (OUT_DIR, LGN_DIR):
    os.makedirs(out_dir, exist_ok=True)

# ── 2. Load the interaction log (u.data) ─────────────────────
# u.data is read as a header-less, tab-separated table with four columns.
data_path = os.path.join(RAW_DIR, 'u.data')
df = pd.read_csv(data_path, sep='\t',
                 names=['user_id','item_id','rating','timestamp'])
# Order the log chronologically. A stable sort keeps rows that share a
# timestamp in their original file order; the default quicksort gives no such
# guarantee, so tie order (and therefore each user's sequence and held-out
# item) could differ between NumPy builds or machines.
df = df.sort_values('timestamp', kind='stable')
print("Loaded interactions:", df.shape)

# ── 3. Load item metadata (u.item) ───────────────────────────
# Each line is pipe-separated; field 0 is the integer item id and field 1 the
# title. Only the title is kept, wrapped in a dict keyed by 'title'.
item_path = os.path.join(RAW_DIR, 'u.item')
items = {}
with open(item_path, encoding='latin-1') as meta_file:
    for raw_line in meta_file:
        fields = raw_line.strip().split('|')
        movie_id = int(fields[0])
        items[movie_id] = dict(title=fields[1])
print("Loaded item_meta:", len(items))

# ── 4. Build & filter per-user sequences ─────────────────────
# Collect every user's item ids into one list, in the row order of df.
all_user_seqs = df.groupby('user_id')['item_id'].apply(list).to_dict()

user_seqs = {}
for user, full_seq in all_user_seqs.items():
    # Drop users that have no more than MIN_HIST interactions ...
    if len(full_seq) > MIN_HIST:
        # ... and keep only the most recent MAX_HIST items of the rest.
        user_seqs[user] = full_seq[-MAX_HIST:]
print("Active users:", len(user_seqs))

# ── 5. Build (history → next) samples & split Train/Cal/Test ──
# Every position t >= MIN_HIST in a user's sequence yields one sample: the
# first t items are the history and item t is the prediction target.
samples = [
    (u, seq[:t], seq[t])
    for u, seq in user_seqs.items()
    for t in range(MIN_HIST, len(seq))
]

# Group the samples by user (order within a user stays chronological).
by_user = defaultdict(list)
for u, h, n in samples:
    by_user[u].append((h, n))

train_samps, cal_samps, test_samps = [], [], []
for u, recs in by_user.items():
    *rest, last = recs
    # Test: the user's final sample.
    test_samps.append((u, last[0], last[1]))
    if not rest:
        # A user with exactly MIN_HIST + 1 interactions produces a single
        # sample; it goes to Test and nothing is left for Cal or Train.
        # (Indexing rest[-1] here would raise IndexError.)
        continue
    *train_recs, cal_rec = rest
    # Cal: the second-to-last sample.
    cal_samps.append((u, cal_rec[0], cal_rec[1]))
    # Train: everything before that (each tuple carries the user id too).
    for hist, nxt in train_recs:
        train_samps.append((u, hist, nxt))

print(f"Train={len(train_samps)}, Cal={len(cal_samps)}, Test={len(test_samps)}")

# ── 6. Negative-sampling function ────────────────────────────
# Ids of every item found in the metadata table; negatives are drawn from these.
all_items = list(items.keys())
def sample_cands(hist, nxt, item_pool=None, n_cands=None):
    """Return a shuffled candidate list containing ``nxt`` plus sampled negatives.

    Args:
        hist: item ids in the sample's history; never drawn as negatives.
        nxt: the ground-truth next item; placed in the result exactly once.
        item_pool: iterable of item ids to draw negatives from. Defaults to
            the module-level ``all_items``.
        n_cands: total length of the returned list. Defaults to the
            module-level ``CANDIDATES``.

    Returns:
        A list of ``n_cands`` item ids in random order.

    Negatives are drawn without replacement when enough eligible items exist.
    Otherwise they are drawn with replacement, so the list may then contain
    duplicate negatives. If no eligible item exists at all while at least one
    negative is required, ``random.choices`` raises ``IndexError``.

    Uses the global ``random`` state, so results depend on the current seed
    and on how many draws were made before the call.
    """
    if item_pool is None:
        item_pool = all_items
    if n_cands is None:
        n_cands = CANDIDATES
    n_negs = n_cands - 1

    # Eligible negatives: everything except the history and the positive.
    # NOTE(review): only ``hist`` and ``nxt`` are excluded, so other items the
    # user interacted with (outside this history window) can still be drawn.
    pool = list(set(item_pool) - set(hist) - {nxt})
    if len(pool) >= n_negs:
        negs = random.sample(pool, n_negs)
    else:
        negs = random.choices(pool, k=n_negs)

    cands = [nxt] + negs
    # Shuffle so the positive's position carries no information.
    random.shuffle(cands)
    return cands

# ── 7. Write JSONL ───────────────────────────────────────────
def write_jsonl(name, splits):
    """Write one split to ``<OUT_DIR>/<name>.jsonl``, one JSON object per line.

    Args:
        name: split name; used as the file stem and in the final log line.
        splits: sized iterable of ``(user_id, history, next_item)`` triples.

    For every triple a candidate list is drawn with ``sample_cands`` and the
    item ids are resolved to titles through the module-level ``items`` mapping.
    """
    def titles_of(item_ids):
        # Look up the title of each id, preserving order.
        return [items[item_id]['title'] for item_id in item_ids]

    target = os.path.join(OUT_DIR, f"{name}.jsonl")
    with open(target, 'w', encoding='utf-8') as out:
        for user, history, label in splits:
            candidates = sample_cands(history, label)
            record = {
                'user_id': user,
                'history': history,
                'history_titles': titles_of(history),
                'candidates': candidates,
                'candidates_titles': titles_of(candidates),
                'label': label,
                'label_title': items[label]['title'],
            }
            # ensure_ascii=False writes non-ASCII title characters unescaped.
            line = json.dumps(record, ensure_ascii=False)
            out.write(line + '\n')
    print(f"Wrote {name}.jsonl ({len(splits)} samples)")

# Write the three splits in a fixed order (train, cal, test). The order matters
# for reproducibility because every write draws from the global RNG.
split_data = {'train': train_samps, 'cal': cal_samps, 'test': test_samps}
for split, data in split_data.items():
    write_jsonl(split, data)

# ── 8. Write LightGCN format & summary statistics ────────────
# One line per user, "<user> <item> <item> ...": every item but the last goes
# to train.txt and the last item goes to test.txt.
# NOTE(review): the train line therefore also contains the item that serves as
# the Cal label in the JSONL split — confirm this overlap is intended.
lgn_train_path = os.path.join(LGN_DIR, 'train.txt')
lgn_test_path = os.path.join(LGN_DIR, 'test.txt')
with open(lgn_train_path, 'w') as ft, open(lgn_test_path, 'w') as fe:
    for u, seq in user_seqs.items():
        train_items = " ".join(str(item) for item in seq[:-1])
        ft.write(f"{u} {train_items}\n")
        fe.write(f"{u} {seq[-1]}\n")
print("Wrote LightGCN files")

# Read each written split back and print a few sanity statistics.
for split in ['train','cal','test']:
    df_s = pd.read_json(os.path.join(OUT_DIR, f"{split}.jsonl"), lines=True)
    hist_lens = df_s['history'].apply(len)
    cand_lens = df_s['candidates'].apply(len)

    print(f"\n=== {split.upper()} STATISTICS ===")
    print("Samples:", len(df_s))
    print("Users:", df_s['user_id'].nunique())
    # Distinct item ids appearing in any history or candidate list.
    items_all = set(df_s['history'].explode()) | set(df_s['candidates'].explode())
    print("Items:", len(items_all))
    stats = hist_lens.describe()[['min','25%','50%','75%','max']]
    print("History lengths:", stats.to_dict())
    # .tolist() converts NumPy scalars to plain ints so the output reads
    # "[20]" rather than "[np.int64(20)]".
    print("Candidate list lengths:", sorted(cand_lens.unique().tolist()))
    # Index of the positive inside each candidate list. It should look roughly
    # uniform because the candidates are shuffled. A plain zip avoids building
    # a Series per row as DataFrame.apply(axis=1) does.
    pos = pd.Series(
        [cands.index(label)
         for cands, label in zip(df_s['candidates'], df_s['label'])],
        dtype='int64',
    )
    print("Label position distribution:", pos.value_counts().sort_index().to_dict())


Loaded interactions: (100000, 4)
Loaded item_meta: 1682
Active users: 943
Train=20450, Cal=943, Test=943
Wrote train.jsonl (20450 samples)
Wrote cal.jsonl (943 samples)
Wrote test.jsonl (943 samples)
Wrote LightGCN files

=== TRAIN STATISTICS ===
Samples: 20450
Users: 943
Items: 1682
History lengths: {'min': 5.0, '25%': 10.0, '50%': 15.0, '75%': 21.0, 'max': 27.0}
Candidate list lengths: [np.int64(20)]
Label position distribution: {0: 950, 1: 1063, 2: 1037, 3: 992, 4: 1032, 5: 995, 6: 1011, 7: 1055, 8: 1027, 9: 1028, 10: 1083, 11: 1048, 12: 1007, 13: 1003, 14: 1048, 15: 1006, 16: 1002, 17: 1038, 18: 978, 19: 1047}

=== CAL STATISTICS ===
Samples: 943
Users: 943
Items: 1682
History lengths: {'min': 18.0, '25%': 28.0, '50%': 28.0, '75%': 28.0, 'max': 28.0}
Candidate list lengths: [np.int64(20)]
Label position distribution: {0: 44, 1: 37, 2: 50, 3: 48, 4: 48, 5: 54, 6: 53, 7: 39, 8: 51, 9: 34, 10: 54, 11: 42, 12: 54, 13: 54, 14: 36, 15: 58, 16: 52, 17: 42, 18: 53, 19: 40}

=== TEST STATIS