## 1. 加载数据集

In [1]:
import pandas as pd
ratings = pd.read_csv('ratings.dat',
    sep='::', engine='python',
    names=['userId','movieId','rating','timestamp']
)
movies = pd.read_csv('movies.dat',
    sep='::', engine='python',
    names=['movieId','title','genres'], encoding='ISO-8859-1'
)
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

## 2. 协同过滤 (Collaborative Filtering, CF)

In [2]:
from surprise import Dataset, Reader, KNNBasic
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)
trainset = data.build_full_trainset()
algo = KNNBasic(sim_options={'name':'cosine','user_based':False})
algo.fit(trainset)

# 假设为用户 1 进行推荐，可修改
user_id = 1
inner_uid = trainset.to_inner_uid(user_id)
rated = {iid for (iid, _) in trainset.ur[inner_uid]}
predictions = [
    (trainset.to_raw_iid(iid), algo.predict(user_id, trainset.to_raw_iid(iid)).est)
    for iid in trainset.all_items() if iid not in rated
]
n_cf = sorted(predictions, key=lambda x: x[1], reverse=True)
top_n_cf = n_cf[:10]
print("使用 KNN 计算预测得分 Top10 推荐:")
for mid, score in top_n_cf:
    title = movies.loc[movies.movieId==int(mid),'title'].values[0]
    print(f"{title}, 预测评分: {score:.2f}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
使用 KNN 计算预测得分 Top10 推荐:
Loves of Carmen, The (1948), 预测评分: 5.00
Voyage to the Beginning of the World (1997), 预测评分: 5.00
Back Stage (2000), 预测评分: 4.67
Chain of Fools (2000), 预测评分: 4.67
Silence of the Palace, The (Saimt el Qusur) (1994), 预测评分: 4.67
Song of Freedom (1936), 预测评分: 4.67
Smoking/No Smoking (1993), 预测评分: 4.67
Naked Man, The (1998), 预测评分: 4.50
Schlafes Bruder (Brother of Sleep) (1995), 预测评分: 4.50
Project Moon Base (1953), 预测评分: 4.50


## 3. 将电影所属的类别向量化。将用户给出高评分的电影的类型进行加权平均，得到用户喜好的电影种类向量，然后依据电影的类型与用户喜欢类型之间的相似度来推荐。

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(token_pattern='[^|]+')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
high = ratings[(ratings.userId==user_id)&(ratings.rating>=4.0)]
indices = [movies.index[movies.movieId==mid][0] for mid in high.movieId if not movies.index[movies.movieId==mid].empty]
if indices:
    profile = np.asarray(tfidf_matrix[indices].mean(axis=0)).ravel()
    cb_scores = cosine_similarity(profile.reshape(1,-1), tfidf_matrix).flatten()
else:
    cb_scores = np.zeros(movies.shape[0])
top_n_cb = [(movies.movieId[idx], cb_scores[idx]) for idx in np.argsort(cb_scores)[::-1][:10]]
print("根据与已评分电影类型的相似度 Top10 推荐:")
for mid, score in top_n_cb:
    title = movies.loc[movies.movieId==mid,'title'].values[0]
    print(f"{title}, 相似度分数: {score:.2f}")

根据与已评分电影类型的相似度 Top10 推荐:
Wide Awake (1998), 相似度分数: 0.71
Babe (1995), 相似度分数: 0.71
Pollyanna (1960), 相似度分数: 0.71
Aladdin (1992), 相似度分数: 0.71
Steamboat Willie (1940), 相似度分数: 0.71
Jungle Book, The (1967), 相似度分数: 0.71
Lady and the Tramp (1955), 相似度分数: 0.68
Little Mermaid, The (1989), 相似度分数: 0.68
Little Princess, A (1995), 相似度分数: 0.68
Prancer (1989), 相似度分数: 0.68


## 4. 训练一个 MLP 来进行最终评分

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# 从公共数据集中提取出用于训练 MLP 的数据
train_data = []
cf_cache = {
    (uid, iid): algo.predict(uid, iid).est
    for uid, iid, _ in trainset.build_testset()
}

for uid, iid, true_r in trainset.build_testset():
    cf_pred = cf_cache[(uid, iid)]  # 查字典，不调用 algo.predict
    idx = movies.index[movies.movieId==iid][0]
    cb_pred = cb_scores[idx]
    train_data.append(([cf_pred, cb_pred], true_r))

# 构建 Dataset
class RecDataset(Dataset):
    def __init__(self, data):
        self.features = torch.tensor([d[0] for d in data], dtype=torch.float32)
        self.targets  = torch.tensor([d[1] for d in data], dtype=torch.float32).unsqueeze(1)
    def __len__(self):
        return len(self.targets)
    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

ds = RecDataset(train_data)
loader = DataLoader(ds, batch_size=256, shuffle=True)

# 定义 MLP 模型
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 16, bias=True),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(16, 1)
        )
    def forward(self, x):
        return self.net(x)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
model = MLP().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# 训练 MLP 模型
model.train()
for epoch in range(5):
    total_loss = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        loss = criterion(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * x.size(0)
    print(f"Epoch {epoch+1}, MSE: {total_loss/len(ds):.4f}")

Using device: cuda
Epoch 1, MSE: 1.4758
Epoch 2, MSE: 0.8493
Epoch 3, MSE: 0.8052
Epoch 4, MSE: 0.8035
Epoch 5, MSE: 0.8028


## 5. 根据 MLP 的预测结果进行推荐

In [None]:
# 利用 MLP 生成预测分
model.eval()
nn_scores = {}
with torch.no_grad():
    for mid, cf_pred in n_cf:
        idx = movies.index[movies.movieId==mid][0]
        cb_pred = cb_scores[idx]
        feats = torch.tensor([[cf_pred, cb_pred]], dtype=torch.float32).to(device)
        nn_scores[mid] = model(feats).item()

top_nn = sorted(nn_scores.items(), key=lambda x: x[1], reverse=True)[:10]
print("\n最终评分 Top10 推荐:")
for mid, score in top_nn:
    title = movies.loc[movies.movieId==mid,'title'].values[0]
    print(f"{title} (MovieID: {mid}), 预测融合评分: {score:.2f}")


最终评分 Top10 推荐:
Loves of Carmen, The (1948) (MovieID: 3209), 预测融合评分: 4.75
Voyage to the Beginning of the World (1997) (MovieID: 1915), 预测融合评分: 4.75
Back Stage (2000) (MovieID: 3890), 预测融合评分: 4.58
Smoking/No Smoking (1993) (MovieID: 3530), 预测融合评分: 4.51
Chain of Fools (2000) (MovieID: 3323), 预测融合评分: 4.49
Silence of the Palace, The (Saimt el Qusur) (1994) (MovieID: 127), 预测融合评分: 4.44
Song of Freedom (1936) (MovieID: 3382), 预测融合评分: 4.44
Outside Ozona (1998) (MovieID: 2438), 预测融合评分: 4.36
Project Moon Base (1953) (MovieID: 3779), 预测融合评分: 4.33
Vampyros Lesbos (Las Vampiras) (1970) (MovieID: 3216), 预测融合评分: 4.32
