In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split  # 新增

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("使用设备：", device)

# Load the ratings table and the movie table from the ml-32m directory.
ratings_df = pd.read_csv("ml-32m/ratings.csv")
movies_df = pd.read_csv("ml-32m/movies.csv")

# Distinct raw ids, in order of first appearance in the ratings table.
unique_users = ratings_df['userId'].unique()
unique_items = ratings_df['movieId'].unique()
num_users = len(unique_users)
num_items = len(unique_items)

# Raw id -> contiguous 0-based index (the position of the id in the arrays above).
user2idx = dict(zip(unique_users, range(num_users)))
item2idx = dict(zip(unique_items, range(num_items)))

# Attach the contiguous indices to every rating row for later use.
ratings_df['user_idx'] = ratings_df['userId'].map(user2idx)
ratings_df['movie_idx'] = ratings_df['movieId'].map(item2idx)

print("用户数量：", num_users, "电影数量：", num_items)

# -----------------------------
# Data split: 80% train, 10% validation, 10% test
# -----------------------------
# First carve off 10% of all ratings as the test set.
train_val_df, test_df = train_test_split(ratings_df, test_size=0.1, random_state=42)
# Then hold out exactly as many validation rows as there are test rows.
# An absolute count avoids the rounding drift of a truncated fraction such as
# 0.1111 (an approximation of 1/9), so validation and test have equal size.
train_df, val_df = train_test_split(train_val_df, test_size=len(test_df), random_state=42)

print("训练集大小：", len(train_df))
print("验证集大小：", len(val_df))
print("测试集大小：", len(test_df))

# -----------------------------
# 自定义 Dataset
# -----------------------------
class MovieRatingDataset(Dataset):
    """Map-style dataset yielding ``(user_idx, movie_idx, rating)`` tensors.

    The three columns are converted to tensors once, at construction time, so
    ``__getitem__`` is a plain tensor index. Looking a row up with
    ``DataFrame.iloc`` for every sample builds a pandas Series each time and
    is far slower on large rating tables.

    Args:
        df: DataFrame with ``user_idx``, ``movie_idx`` and ``rating`` columns.
            The index columns are cast to int64 and the rating to float32.
            The frame's own index is discarded.
    """

    def __init__(self, df):
        # Kept so existing code that inspects ``dataset.df`` keeps working.
        self.df = df.reset_index(drop=True)
        # torch.tensor copies the data, so the tensors do not alias the frame.
        self._users = torch.tensor(self.df['user_idx'].to_numpy(), dtype=torch.long)
        self._items = torch.tensor(self.df['movie_idx'].to_numpy(), dtype=torch.long)
        self._ratings = torch.tensor(self.df['rating'].to_numpy(), dtype=torch.float)

    def __len__(self):
        """Return the number of rating rows."""
        return self._ratings.shape[0]

    def __getitem__(self, index):
        """Return the ``(user, item, rating)`` tensors for row ``index``."""
        return self._users[index], self._items[index], self._ratings[index]

# Wrap each split in a dataset (train, validation, test - in that order).
train_dataset, val_dataset, test_dataset = (
    MovieRatingDataset(split) for split in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=256)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=256)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=256)

# -----------------------------
# 定义神经协同过滤模型（NCF）
# -----------------------------
class NCF(nn.Module):
    """Neural collaborative filtering rating predictor.

    User and item indices are embedded, the two embeddings are concatenated
    and passed through a 64 -> 32 -> 1 MLP. The output is squashed with a
    sigmoid and scaled by 5, so predictions lie in the open interval (0, 5).

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * 2, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, user, item):
        """Predict ratings for paired ``user`` / ``item`` index tensors.

        Returns a tensor with the same shape as ``user`` (for a 1-D batch of
        length B the result has shape ``(B,)``, including when B == 1).
        """
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        x = torch.cat([user_emb, item_emb], dim=-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        rating = torch.sigmoid(x) * 5.0
        # Drop only the trailing feature axis. A bare squeeze() would also
        # collapse a batch of size 1 into a 0-dim tensor, which no longer
        # matches the (1,) target passed to the loss.
        return rating.squeeze(-1)

# Seed PyTorch so weight initialisation and DataLoader shuffling are repeatable
# (42 matches the random_state used for the data split).
torch.manual_seed(42)

model = NCF(num_users, num_items, embedding_dim=32).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# -----------------------------
# Model training
# -----------------------------
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", ncols=100)
    for step, (user, item, rating) in enumerate(progress_bar):
        user, item, rating = user.to(device), item.to(device), rating.to(device)
        optimizer.zero_grad()
        # Align the prediction with the target shape so a final batch of a
        # single sample does not rely on broadcasting inside the loss.
        output = model(user, item).reshape(rating.shape)
        loss = criterion(output, rating)
        loss.backward()
        optimizer.step()
        # Read the scalar once: every .item() call forces a device sync.
        batch_loss = loss.item()
        # The criterion returns a batch mean; weight it by the batch size so
        # the epoch figure is a true per-sample average.
        epoch_loss += batch_loss * user.size(0)
        progress_bar.set_postfix({'step': step, 'loss': batch_loss})
    epoch_loss /= len(train_dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

    # Score the held-out validation split after every epoch so that
    # over-fitting is visible; no gradients are needed here.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for user, item, rating in val_loader:
            user, item, rating = user.to(device), item.to(device), rating.to(device)
            output = model(user, item).reshape(rating.shape)
            val_loss += criterion(output, rating).item() * user.size(0)
    val_loss /= max(len(val_dataset), 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}")

# -----------------------------
# 定义推荐函数（与原代码基本保持一致）
# -----------------------------
def recommend_movies(model, user_id, ratings_df, movies_df, user2idx, item2idx, top_n=5):
    """Recommend the ``top_n`` unrated movies with the highest predicted rating.

    Args:
        model: Rating model, called as ``model(user_tensor, item_tensor)``.
            It is switched to eval mode and left in that mode.
        user_id: Raw user id; must be a key of ``user2idx``.
        ratings_df: Ratings table with ``userId`` and ``movie_idx`` columns.
        movies_df: Movie table with a ``movieId`` column.
        user2idx: Mapping from raw user id to user index.
        item2idx: Mapping from raw movie id to item index; its length is taken
            as the number of candidate items.
        top_n: Maximum number of movies to return.

    Returns:
        The rows of ``movies_df`` for the recommended movies, ordered from the
        highest to the lowest predicted rating, or ``None`` (after printing a
        message) when the user has already rated every item.

    Raises:
        KeyError: If ``user_id`` is not present in ``user2idx``.
    """
    model.eval()
    user_idx = user2idx[user_id]

    # Run on whatever device the model's weights live on instead of relying
    # on a module-level ``device`` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Candidates are all item indices the user has not rated. setdiff1d keeps
    # this near-linear, unlike a per-item membership test against a list.
    rated_items = ratings_df.loc[ratings_df["userId"] == user_id, "movie_idx"].to_numpy()
    candidate_indices = np.setdiff1d(np.arange(len(item2idx)), rated_items)
    if candidate_indices.size == 0:
        print("用户已经对所有电影评分，无可推荐项目。")
        return None

    item_tensor = torch.as_tensor(candidate_indices, dtype=torch.long, device=target_device)
    user_tensor = torch.full_like(item_tensor, int(user_idx))
    with torch.no_grad():
        preds = model(user_tensor, item_tensor)
    # reshape(-1) keeps a single-candidate (0-dim) prediction sortable.
    preds = preds.cpu().numpy().reshape(-1)
    top_indices = np.argsort(preds)[::-1][:top_n]
    recommended_item_indices = candidate_indices[top_indices]

    idx2item = {idx: movie for movie, idx in item2idx.items()}
    recommended_movie_ids = [idx2item[i] for i in recommended_item_indices]

    # isin() returns rows in movies_df order; re-sort them by predicted rank
    # so the best recommendation comes first.
    rank = {movie_id: position for position, movie_id in enumerate(recommended_movie_ids)}
    recommendations = movies_df[movies_df["movieId"].isin(recommended_movie_ids)]
    order = recommendations["movieId"].map(rank).to_numpy().argsort(kind="stable")
    return recommendations.iloc[order]

# -----------------------------
# 测试推荐功能
# -----------------------------
# Smoke test: ask for five recommendations for the first user seen in the ratings.
user_id_to_recommend = unique_users[0]
recs = recommend_movies(
    model=model,
    user_id=user_id_to_recommend,
    ratings_df=ratings_df,
    movies_df=movies_df,
    user2idx=user2idx,
    item2idx=item2idx,
    top_n=5,
)
# recommend_movies returns None when there is nothing left to recommend.
if recs is not None:
    print(f"\n对用户 {user_id_to_recommend} 的电影推荐：")
    print(recs)

In [None]:
model_path = "model/ncf_model.pth"
def save_model(model, optimizer, epoch, filepath=model_path):
    """Write a training checkpoint to ``filepath``.

    The checkpoint is a dict with the keys ``epoch``, ``model_state_dict``
    and ``optimizer_state_dict``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch counter recorded alongside the weights.
        filepath: Destination passed to ``torch.save``. When it is a path,
            missing parent directories are created first.
    """
    import os  # local import: the notebook's import cell does not pull in os

    # torch.save does not create directories, so a fresh checkout without the
    # "model/" folder would otherwise fail here. Non-path targets (for example
    # file-like objects) are passed through untouched.
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filepath))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }
    torch.save(checkpoint, filepath)
    print(f"模型已保存到 {filepath}")
# Persist the trained weights and optimizer state, tagged with the number of epochs run.
save_model(model, optimizer, epoch=num_epochs, filepath=model_path)

In [None]:
import math

def evaluate_model(model, dataloader, criterion, device):
    """Average ``criterion`` over ``dataloader`` and report it with its square root.

    Each batch's loss is weighted by the batch size before averaging. The
    result is reported as MSE / RMSE, which is what it is when ``criterion``
    is a mean-reduced squared error.

    Args:
        model: Module called as ``model(user, item)``; put into eval mode.
        dataloader: Iterable of ``(user, item, rating)`` tensor batches.
        criterion: Loss callable taking ``(prediction, rating)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mse, rmse)``; both are 0.0 when the dataloader yields no samples.
    """
    model.eval()
    loss_sum, n_seen = 0.0, 0

    with torch.no_grad():
        for batch in dataloader:
            user, item, rating = (part.to(device) for part in batch)
            n_batch = user.size(0)
            batch_loss = criterion(model(user, item), rating)
            loss_sum += batch_loss.item() * n_batch
            n_seen += n_batch

    # Guard against an empty loader rather than dividing by zero.
    if n_seen > 0:
        mse = loss_sum / n_seen
    else:
        mse = 0.0
    rmse = math.sqrt(mse)
    print(f"Evaluation Metrics: MSE = {mse:.4f}, RMSE = {rmse:.4f}")
    return mse, rmse

# Example: score the trained model on the held-out test split.
mse, rmse = evaluate_model(
    model=model,
    dataloader=test_loader,
    criterion=criterion,
    device=device,
)