In [1]:
# === Parameters ===
# Tunable settings read by the cells below.
raw_dir     = './data/raw'        # directory containing ratings.dat and movies.dat
out_dir     = './data/processed'  # directory where the *.jsonl splits and item_meta.json are written
min_history = 5                   # shortest history a (history, next) sample may have
cal_ratio   = 0.1                 # NOTE(review): not referenced by any cell visible here — confirm whether it is still needed
candidates  = 20                  # candidate-list size per sample: 1 positive + (candidates - 1) negatives

In [2]:
import os, json, random
import pandas as pd
from collections import defaultdict

# Fix the state of the global `random` module so later draws start from a
# known point, then make sure the input and output directories exist.
random.seed(42)
for _directory in (raw_dir, out_dir):
    os.makedirs(_directory, exist_ok=True)


In [4]:
# 1) Load the ratings log
ratings_path = os.path.join(raw_dir, 'ratings.dat')
if os.path.exists(ratings_path):
    # The file has no header row and uses the multi-character separator '::',
    # so column names are supplied here and the Python parsing engine is used.
    df_ratings = pd.read_csv(
        ratings_path,
        sep='::',
        engine='python',
        names=['user_id','item_id','rating','timestamp'],
    )
else:
    raise FileNotFoundError(f"{ratings_path} not found. 请确认文件已上传/挂载。")
print(f"Loaded {len(df_ratings)} interactions")
print(f"Users: {df_ratings['user_id'].nunique()}, Items: {df_ratings['item_id'].nunique()}")


Loaded 1000209 interactions
Users: 6040, Items: 3706


In [5]:
# 2) Load movie metadata
# item_meta maps int movie id -> {'title': str, 'genres': [str, ...]}
item_meta = {}
movies_path = os.path.join(raw_dir, 'movies.dat')
if not os.path.exists(movies_path):
    raise FileNotFoundError(f"{movies_path} not found.")
with open(movies_path, 'r', encoding='ISO-8859-1') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. an extra newline at the end of the file)
            # instead of failing on the unpack below.
            continue
        # Each record is "id::title::genres". Peel the id off the front and the
        # genres off the back so a title that itself contains '::' stays intact.
        m, rest = line.split('::', 1)
        title, genres = rest.rsplit('::', 1)
        item_meta[int(m)] = {'title': title, 'genres': genres.split('|')}
print(f"Loaded metadata for {len(item_meta)} movies")

Loaded metadata for 3883 movies


In [6]:
# 3) Build per-user interaction sequences
max_history = 30   # keep only the most recent `max_history` items per user
user_seqs = {}     # user_id -> item ids ordered by timestamp (truncated)
seq_lens = []      # length of every kept sequence, for the summary below

# Sort once with a *stable* algorithm: ratings that share a timestamp keep
# their order from the ratings file, so tie order no longer depends on the
# sort implementation. groupby preserves row order inside each group, so each
# user's items come out in timestamp order.
ratings_by_time = df_ratings.sort_values('timestamp', kind='mergesort')
for u, user_items in ratings_by_time.groupby('user_id')['item_id']:
    full_seq = user_items.tolist()
    # A user needs at least min_history items of history plus one target.
    if len(full_seq) >= min_history + 1:
        trunc_seq = full_seq[-max_history:]
        user_seqs[u] = trunc_seq
        seq_lens.append(len(trunc_seq))

print(f"Users with ≥{min_history+1} interactions: {len(user_seqs)}")
if seq_lens:
    print(f"Truncated sequence lengths (≤{max_history}): "
          f"min={min(seq_lens)}, max={max(seq_lens)}, avg={sum(seq_lens)/len(seq_lens):.1f}")
else:
    # min()/max() raise ValueError on an empty list; report the situation instead.
    print("No user sequences were built; length statistics skipped.")


Users with ≥6 interactions: 6040
Truncated sequence lengths (≤30): min=20, max=30, avg=29.3


In [7]:
# 4) Generate (history, next) samples
# For every user, each position `cut` from min_history onward yields one sample:
# the first `cut` items are the history and the item at `cut` is the target.
samples = [
    (user, items[:cut], items[cut])
    for user, items in user_seqs.items()
    for cut in range(min_history, len(items))
]
print(f"Total samples (history→next): {len(samples)}")

Total samples (history→next): 146639


In [8]:
# 5) Split Train / Cal / Test (per user: last sample -> Test,
#    second-to-last -> Cal, everything earlier -> Train)
from collections import defaultdict
import random

# Group samples by user; each user's list keeps the order in which the
# samples appear in `samples`.
by_user = defaultdict(list)
for u, h, n in samples:
    by_user[u].append((h, n))

train_samps, cal_samps, test_samps = [], [], []
for u, seqs in by_user.items():
    # seqs is [(hist1,nxt1), …, (hist_last-1,nxt_last-1), (hist_last,nxt_last)]
    *rest, last = seqs

    # Test: last interaction
    hist_last, nxt_last = last
    test_samps.append((u, hist_last, nxt_last))

    # A user with exactly one sample has nothing left for Cal or Train;
    # indexing rest[-1] on the empty list would raise IndexError.
    if not rest:
        continue

    # Cal: second-to-last interaction
    hist_cal, nxt_cal = rest[-1]
    cal_samps.append((u, hist_cal, nxt_cal))

    # Train: all earlier interactions (empty when the user has only two samples)
    train_samps.extend((u, hist_t, nxt_t) for hist_t, nxt_t in rest[:-1])

print(f"Train samples: {len(train_samps)}")
print(f"Cal   samples: {len(cal_samps)} ")
print(f"Test  samples: {len(test_samps)}")


Train samples: 134559
Cal   samples: 6040 
Test  samples: 6040


In [9]:
# 6) History-length statistics for the training split
hist_lens = [len(history) for _user, history, _target in train_samps]
print(f"Train history lengths: min={min(hist_lens)}, max={max(hist_lens)}, avg={sum(hist_lens)/len(hist_lens):.1f}")


Train history lengths: min=5, max=27, avg=15.7


In [10]:
# 7) Negative-sampling function
# Pool that negatives are drawn from: every distinct item_id that occurs in
# the ratings log, converted to a plain Python list.
all_items = df_ratings['item_id'].unique().tolist()

def sample_cands(hist, nxt, item_pool=None, n_candidates=None, rng=None):
    """
    Build a shuffled candidate list: the positive item plus sampled negatives.

    Args:
        hist: items in the user's history; they are never used as negatives.
        nxt: the positive (next) item; it appears exactly once in the result.
        item_pool: iterable of item ids to draw negatives from. Defaults to
            the notebook-level ``all_items``.
        n_candidates: total list length (1 positive + n_candidates - 1
            negatives). Defaults to the notebook-level ``candidates``.
        rng: object providing ``sample``, ``choices`` and ``shuffle``, e.g.
            ``random.Random(seed)``. Defaults to the global ``random`` module.

    Returns:
        A list of ``n_candidates`` item ids in random order. Negatives are
        distinct when enough eligible items exist; otherwise they are drawn
        with replacement and may repeat.

    Raises:
        IndexError: if at least one negative is requested but no eligible
            item remains after excluding ``hist`` and ``nxt``.
    """
    pool = all_items if item_pool is None else item_pool
    total = candidates if n_candidates is None else n_candidates
    rng = random if rng is None else rng

    M_neg = total - 1
    # Eligible negatives: everything in the pool except the history and the positive.
    negs = list(set(pool) - set(hist) - {nxt})
    if len(negs) < M_neg:
        # Not enough distinct items: fall back to sampling with replacement.
        negs = rng.choices(negs, k=M_neg)
    else:
        negs = rng.sample(negs, M_neg)
    # Combine and shuffle so the positive's position carries no signal.
    items = [nxt] + negs
    rng.shuffle(items)
    return items

# Example usage
# NOTE: this call draws from the shared `random` module state, so running it
# affects which negatives later sample_cands calls return.
print(sample_cands([1, 2, 3], 4))  # Outputs candidates in random order


[3244, 3261, 1220, 2828, 370, 131, 3265, 1912, 3904, 139, 2430, 437, 2987, 110, 589, 4, 2624, 980, 1079, 474]


In [11]:
# 8) Write JSONL with titles, genres, and label title
def write_split(name, splits):
    """
    Write one split to ``<out_dir>/<name>.jsonl``, one JSON object per line.

    Args:
        name: file stem of the split, e.g. 'train'.
        splits: sequence of (user_id, history, next_item) tuples.

    Each record holds the user id, the history with its titles and genres, a
    candidate list from ``sample_cands`` with its titles and genres, and the
    label with its title.

    Raises:
        KeyError: if a history, candidate or label id is missing from ``item_meta``.
    """
    path = os.path.join(out_dir, f"{name}.jsonl")
    # ensure_ascii=False emits raw non-ASCII characters, so pin the file
    # encoding to UTF-8 instead of relying on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        for u, h, n in splits:
            cands = sample_cands(h, n)
            rec = {
                'user_id':            u,
                'history':            h,
                'history_titles':     [item_meta[i]['title'] for i in h],
                'history_genres':     [item_meta[i]['genres'] for i in h],
                'candidates':         cands,
                'candidates_titles':  [item_meta[i]['title'] for i in cands],
                'candidates_genres':  [item_meta[i]['genres'] for i in cands],
                'label':              n,
                'label_title':        item_meta[n]['title']
            }
            f.write(json.dumps(rec, ensure_ascii=False) + '\n')
    print(f"Wrote {len(splits)} samples to {path}")

# Generate files. Each write_split call draws negatives from the shared
# `random` state, so the train -> cal -> test order is kept as is.
for _split_name, _split_samples in (
    ('train', train_samps),
    ('cal',   cal_samps),
    ('test',  test_samps),
):
    write_split(_split_name, _split_samples)


Wrote 134559 samples to ./data/processed/train.jsonl
Wrote 6040 samples to ./data/processed/cal.jsonl
Wrote 6040 samples to ./data/processed/test.jsonl


In [12]:
# 9) Write movie metadata
meta_path = os.path.join(out_dir, 'item_meta.json')
# ensure_ascii=False emits raw non-ASCII characters, so pin the file encoding
# to UTF-8 instead of relying on the platform default.
# Note: JSON object keys are strings, so the int movie ids are stored as "1", "2", ...
with open(meta_path, 'w', encoding='utf-8') as f:
    json.dump(item_meta, f, ensure_ascii=False, indent=2)
print(f"Wrote {len(item_meta)} movie metadata entries to {meta_path}")

Wrote 3883 movie metadata entries to ./data/processed/item_meta.json


In [13]:
# Checking and Analyzing the Processed SarRec Datasets with Graceful File Checks

import os
import pandas as pd
from collections import defaultdict

# Paths
train_path = 'data/processed/train.jsonl'
cal_path   = 'data/processed/cal.jsonl'
test_path  = 'data/processed/test.jsonl'

# Load every split that exists on disk; a missing file is recorded as None
# so the rest of the cell can still run.
dfs = {}
for name, path in (('Train', train_path), ('Calibration', cal_path), ('Test', test_path)):
    if not os.path.exists(path):
        print(f"Warning: {path} not found. Skipping {name}.")
        dfs[name] = None
        continue
    dfs[name] = pd.read_json(path, lines=True)
    print(f"Loaded {len(dfs[name])} samples for {name} from {path}")

# Preview and summary statistics, only when the Train split was loaded.
if dfs['Train'] is not None:
    from IPython.display import display

    df = dfs['Train']

    print("\n=== Train Sample ===")
    display(df.head())

    print("\n--- Train Stats ---")
    print("Total samples:", len(df))
    print("Unique users:", df['user_id'].nunique())

    # Distinct item ids seen in either a history or a candidate list.
    history_items = df['history'].explode().tolist()
    candidate_items = df['candidates'].explode().tolist()
    items = history_items + candidate_items
    print("Unique items:", len(set(items)))

    # Per-row history length (stored as a new column on df).
    df['hist_len'] = df['history'].map(len)
    print("History length stats:\n", df['hist_len'].describe())

    # Per-row candidate-list length (stored as a new column on df).
    df['cand_len'] = df['candidates'].map(len)
    print("Candidate lengths:", df['cand_len'].unique())

    def _label_position(row):
        """Return the index of the row's label inside its candidate list."""
        return row['candidates'].index(row['label'])

    # Where the positive item sits in each candidate list.
    positions = df.apply(_label_position, axis=1)
    print("Label position distribution:\n", positions.value_counts().sort_index())



Loaded 134559 samples for Train from data/processed/train.jsonl
Loaded 6040 samples for Calibration from data/processed/cal.jsonl
Loaded 6040 samples for Test from data/processed/test.jsonl

=== Train Sample ===


Unnamed: 0,user_id,history,history_titles,history_genres,candidates,candidates_titles,candidates_genres,label,label_title
0,1,"[1028, 1097, 914, 2797, 1287]","[Mary Poppins (1964), E.T. the Extra-Terrestri...","[[Children's, Comedy, Musical], [Children's, D...","[1498, 943, 2678, 432, 3545, 389, 1605, 2762, ...","[Inventing the Abbotts (1997), Ghost and Mrs. ...","[[Drama, Romance], [Drama, Romance], [Drama], ...",2762,"Sixth Sense, The (1999)"
1,1,"[1028, 1097, 914, 2797, 1287, 2762]","[Mary Poppins (1964), E.T. the Extra-Terrestri...","[[Children's, Comedy, Musical], [Children's, D...","[301, 1591, 1702, 2812, 1232, 2868, 1633, 3097...","[Picture Bride (1995), Spawn (1997), Flubber (...","[[Drama, Romance], [Action, Adventure, Sci-Fi,...",1246,Dead Poets Society (1989)
2,1,"[1028, 1097, 914, 2797, 1287, 2762, 1246]","[Mary Poppins (1964), E.T. the Extra-Terrestri...","[[Children's, Comedy, Musical], [Children's, D...","[3164, 585, 933, 1795, 1176, 1085, 1393, 3285,...","[Alley Cats, The (1968), Brady Bunch Movie, Th...","[[Drama], [Comedy], [Comedy, Romance, Thriller...",661,James and the Giant Peach (1996)
3,1,"[1028, 1097, 914, 2797, 1287, 2762, 1246, 661]","[Mary Poppins (1964), E.T. the Extra-Terrestri...","[[Children's, Comedy, Musical], [Children's, D...","[483, 2366, 1185, 470, 48, 2840, 3777, 3879, 2...","[King of the Hill (1993), King Kong (1933), My...","[[Drama], [Action, Adventure, Horror], [Drama]...",2918,Ferris Bueller's Day Off (1986)
4,1,"[1028, 1097, 914, 2797, 1287, 2762, 1246, 661,...","[Mary Poppins (1964), E.T. the Extra-Terrestri...","[[Children's, Comedy, Musical], [Children's, D...","[3362, 3, 531, 2409, 2372, 3431, 2189, 81, 385...","[Dog Day Afternoon (1975), Grumpier Old Men (1...","[[Comedy, Crime, Drama], [Comedy, Romance], [C...",531,"Secret Garden, The (1993)"



--- Train Stats ---
Total samples: 134559
Unique users: 6040
Unique items: 3706
History length stats:
 count    134559.000000
mean         15.744551
std           6.568758
min           5.000000
25%          10.000000
50%          16.000000
75%          21.000000
max          27.000000
Name: hist_len, dtype: float64
Candidate lengths: [20]
Label position distribution:
 0     6740
1     6681
2     6630
3     6693
4     6585
5     6714
6     6813
7     6659
8     6726
9     6682
10    6701
11    6882
12    6763
13    6895
14    6712
15    6668
16    6703
17    6828
18    6739
19    6745
Name: count, dtype: int64
