# MovieLens Data Preprocessing

In [213]:
import pandas as pd
import numpy as np

In [214]:
interaction_path = 'serendipity-sac2018/training.csv'
seren_path = 'serendipity-sac2018/answers.csv'
meta_path = 'serendipity-sac2018/movies.csv'

In [215]:
interactions_df = pd.read_csv(interaction_path)
seren_df = pd.read_csv(seren_path)
meta_df = pd.read_csv(meta_path)

In [216]:
print(f"原始交互数据: {interactions_df.shape}")
print(f"原始意外发现数据: {seren_df.shape}")
print(f"原始电影元数据: {meta_df.shape}")

原始交互数据: (9997850, 4)
原始意外发现数据: (2150, 22)
原始电影元数据: (49174, 8)


In [217]:
from datetime import datetime

# 定义日期字符串和对应的格式
date_str = "2017-06-01 00:00:00"
date_format = "%Y-%m-%d %H:%M:%S"

# 将字符串解析为 datetime 对象
dt = datetime.strptime(date_str, date_format)

timestamp = dt.timestamp() * 1000

print("日期:", dt)
print("时间戳:", timestamp)

日期: 2017-06-01 00:00:00
时间戳: 1496300400000.0


In [218]:
interactions_df = interactions_df[interactions_df['timestamp'] > timestamp]
len(interactions_df)

1088697

In [219]:
user_counts = interactions_df['userId'].value_counts()
valid_users = user_counts[user_counts >= 5].index
item_counts = interactions_df['movieId'].value_counts()
valid_items = item_counts[item_counts >= 5].index
filtered_interactions = interactions_df[interactions_df['userId'].isin(valid_users) & interactions_df['movieId'].isin(valid_items)]
print(f"过滤后的交互数据: {filtered_interactions.shape}")
print(f"过滤后的用户数: {len(valid_users)}")
print(f"过滤后的电影数: {len(valid_items)}")

过滤后的交互数据: (1049659, 4)
过滤后的用户数: 10657
过滤后的电影数: 11541


In [220]:
seren_users = seren_df['userId'].unique()
all_users = np.union1d(valid_users, seren_users)

In [221]:
missing_users = np.setdiff1d(seren_users, valid_users)
if len(missing_users) > 0:
    print(f"有 {len(missing_users)} 个用户在answers.csv中但在training.csv中交互少于5次")
    # 对于这些用户，我们仍然保留他们的所有交互记录
    missing_interactions = interactions_df[interactions_df['userId'].isin(missing_users)]
    filtered_interactions = pd.concat([filtered_interactions, missing_interactions])

有 90 个用户在answers.csv中但在training.csv中交互少于5次


In [222]:
# 获取所有唯一的用户ID和电影ID
unique_user_ids = filtered_interactions['userId'].unique()
unique_movie_ids = filtered_interactions['movieId'].unique()

In [223]:
# 创建映射字典
user_id_map = {old_id: new_id for new_id, old_id in enumerate(unique_user_ids, 1)}
movie_id_map = {old_id: new_id for new_id, old_id in enumerate(unique_movie_ids, 1)}

In [224]:
# 应用映射到交互数据
filtered_interactions_mapped = filtered_interactions.copy()
filtered_interactions_mapped['user_id'] = filtered_interactions_mapped['userId'].map(user_id_map)
filtered_interactions_mapped['item_id'] = filtered_interactions_mapped['movieId'].map(movie_id_map)

In [225]:
# 应用映射到意外发现数据
seren_df_mapped = seren_df.copy()
seren_df_mapped['user_id'] = seren_df_mapped['userId'].map(user_id_map)
seren_df_mapped['item_id'] = seren_df_mapped['movieId'].map(movie_id_map)

In [226]:
# 应用映射到电影元数据
meta_df_mapped = meta_df.copy()
# 只保留出现在交互数据中的电影
meta_df_mapped = meta_df_mapped[meta_df_mapped['movieId'].isin(unique_movie_ids)]
meta_df_mapped['item_id'] = meta_df_mapped['movieId'].map(movie_id_map)

In [227]:
# 5. 按时间戳排序，为每个用户构建交互序列
filtered_interactions_mapped = filtered_interactions_mapped.sort_values(['user_id', 'timestamp'])

# 6. 检查映射后的数据
print(f"映射后的交互数据: {filtered_interactions_mapped.shape}")
print(f"映射后的用户数: {len(filtered_interactions_mapped['user_id'].unique())}")
print(f"映射后的电影数: {len(filtered_interactions_mapped['item_id'].unique())}")

映射后的交互数据: (1049724, 6)
映射后的用户数: 10684
映射后的电影数: 11544


In [228]:
density  = filtered_interactions_mapped.shape[0] / (len(filtered_interactions_mapped['user_id'].unique()) * len(filtered_interactions_mapped['item_id'].unique()))
print(f"Density: {density * 100:.2f}%")

Density: 0.85%


In [229]:
# 7. 保存处理后的数据
# 保存主要交互数据
filtered_interactions_mapped[['user_id', 'item_id', 'rating', 'timestamp']].to_csv('ml/processed_interactions.csv', index=False)

# 保存意外发现数据
if not seren_df_mapped.empty:
    # 确保映射的item_id不是NaN（某些电影可能在过滤后的交互中不存在）
    seren_df_mapped = seren_df_mapped.dropna(subset=['user_id', 'item_id'])
    seren_df_mapped[['user_id', 'item_id', 'rating', 'timestamp', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 'q']].to_csv('ml/processed_serendipity.csv', index=False)

# 保存电影元数据
meta_df_mapped[['item_id', 'title', 'genres']].to_csv('ml/processed_movies.csv', index=False)

print("数据预处理完成!")

数据预处理完成!


In [230]:
# 5. 定义serendipity (方案4 - 相关但出乎意料)
# s5: "这部电影与我通常观看的电影不同"
# s6: "我对MovieLens向我推荐这部电影感到惊讶"
# s7: "我很高兴看了这部电影"
seren_df_mapped['serendipity'] = (
    ((seren_df_mapped['s5'] >= 3) | (seren_df_mapped['s6'] >= 3)) & 
    (seren_df_mapped['s7'] >= 3)
).astype(int)

# 6. 输出为user-item-serendipity格式
# 删除映射失败的记录(NaN)
output_df = seren_df_mapped.dropna(subset=['user_id', 'item_id'])

# 提取需要的列并排序
serendipity_output = output_df[['user_id', 'item_id', 'serendipity']].copy()
serendipity_output = serendipity_output.sort_values(['user_id', 'item_id'])

# 确保user_id和item_id是整数
serendipity_output['user_id'] = serendipity_output['user_id'].astype(int)
serendipity_output['item_id'] = serendipity_output['item_id'].astype(int)

# 7. 保存结果
serendipity_output.to_csv('ml/user_item_serendipity.csv', index=False)

In [231]:
# 8. 输出统计信息
total_interactions = len(serendipity_output)
serendipity_count = serendipity_output['serendipity'].sum()
serendipity_percentage = (serendipity_count / total_interactions) * 100

print(f"总交互数: {total_interactions}")
print(f"Serendipity数量: {serendipity_count}")
print(f"Serendipity比例: {serendipity_percentage:.2f}%")

总交互数: 1548
Serendipity数量: 941
Serendipity比例: 60.79%
