# 数据读取

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# Options common to the three MovieLens .dat files: "::" field separator,
# parsed with the python engine, and no header row in the file.
dat_read_options = dict(sep='::', engine='python', header=None)

# Load the user table with explicit column names.
user_columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
users = pd.read_csv('users.dat', names=user_columns, **dat_read_options)
print("Users 数据集预览：")
print(users.head())

# Load the movie table (decoded as latin1) with explicit column names.
movie_columns = ['MovieID', 'Title', 'Genres']
movies = pd.read_csv('movies.dat', names=movie_columns, encoding='latin1',
                     **dat_read_options)

# Replace each "|"-delimited genre string with the list of its parts.
movies['Genres'] = movies['Genres'].apply(lambda genre_string: genre_string.split('|'))

print("\nMovies 数据集预览：")
print(movies.head())

# Load the ratings table.
rating_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_csv('ratings.dat', names=rating_columns, **dat_read_options)
print("Ratings 数据集预览：")
print(ratings.head())

# Implicit feedback only needs the (UserID, MovieID) pairs; Rating and
# Timestamp are dropped here.
data = [
    (user_id, movie_id)
    for user_id, movie_id in zip(ratings['UserID'], ratings['MovieID'])
]


Users 数据集预览：
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455

Movies 数据集预览：
   MovieID                               Title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                             Genres  
0   [Animation, Children's, Comedy]  
1  [Adventure, Children's, Fantasy]  
2                 [Comedy, Romance]  
3                   [Comedy, Drama]  
4                          [Comedy]  
Ratings 数据集预览：
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408

# 数据集划分

In [2]:
import random

def SplitData(data, M, k, seed):
    """
    Randomly split user-behaviour records into a training and a test set.

    Each record independently draws an integer uniformly from 0..M-1; a
    record whose draw equals ``k`` goes to the test set, every other record
    goes to the training set.

    The draws come from a private ``random.Random(seed)`` generator. It
    produces the same sequence as seeding the module-level generator with
    ``seed``, but leaves the global ``random`` state untouched, so calling
    this function does not silently change later random draws elsewhere.

    Parameters:
    - data: iterable of (user, item) pairs
    - M: number of buckets, e.g. 8
    - k: bucket used as the test set for this run, 0 <= k < M
    - seed: random seed that makes the split reproducible

    Returns:
    - train: list of [user, item] records in the training set
    - test: list of [user, item] records in the test set
    """
    rng = random.Random(seed)
    train, test = [], []
    for user, item in data:
        # One uniform draw per record, in input order, decides its bucket.
        bucket = test if rng.randint(0, M - 1) == k else train
        bucket.append([user, item])
    return train, test


# One experiment: M=8 buckets with bucket k=0 held out as the test set
# (loop over k to obtain several results).
M = 8
k = 0
seed = 42
train_data, test_data = SplitData(data, M, k, seed)

print(f"\n训练集大小: {len(train_data)}, 测试集大小: {len(test_data)}")


# ---------------------------
# Index maps and indexed DataFrames for both splits
# ---------------------------
# User index map: position of each distinct UserID in the users table.
user2index = {uid: pos for pos, uid in enumerate(users['UserID'].unique())}
# Movie index map: position of each distinct MovieID in the movies table.
movie2index = {mid: pos for pos, mid in enumerate(movies['MovieID'].unique())}

# Wrap the two splits in DataFrames with named columns.
pair_columns = ["UserID", "MovieID"]
train_df = pd.DataFrame(train_data, columns=pair_columns)
test_df = pd.DataFrame(test_data, columns=pair_columns)

# Add the index columns and the implicit-feedback label (always 1) to each split.
for split_df in (train_df, test_df):
    split_df['UserIndex'] = split_df['UserID'].map(user2index)
    split_df['ItemIndex'] = split_df['MovieID'].map(movie2index)
    split_df['Interaction'] = 1

num_users = len(user2index)
num_items = len(movie2index)
print(f"\n用户数: {num_users}, 电影数: {num_items}")



训练集大小: 874948, 测试集大小: 125261

用户数: 6040, 电影数: 3883


# 正负样本采样

In [22]:

def sample_negative_samples(train_df, num_items, neg_ratio=10, seed=42):
    """
    Extend a positives-only training frame with sampled negative examples.

    For every user, up to ``neg_ratio`` times as many items as the user has
    distinct positive items are drawn, without replacement, from the items
    the user has not interacted with. If fewer unseen items exist than
    requested, all of them are used.

    The input frame is left untouched (no column is added or overwritten),
    and sampling uses a private ``random.Random(seed)`` generator, so the
    global ``random`` state is not modified. For a given seed the sampled
    negatives are the same as with a globally seeded generator.

    Parameters:
    - train_df: DataFrame of positive examples with at least the columns
      [UserIndex, ItemIndex]
    - num_items: total number of items; candidates are range(num_items)
    - neg_ratio: negatives per positive, e.g. 4 samples four times as many
      negatives as positives (default 10)
    - seed: random seed

    Returns:
    - DataFrame with columns [UserIndex, ItemIndex, Interaction]: one row
      per input row with Interaction=1, followed by the sampled negatives
      with Interaction=0.
    """
    rng = random.Random(seed)

    # user -> set of positive items, users kept in order of first appearance
    user_pos_items = {}
    for row in train_df.itertuples():
        user_pos_items.setdefault(row.UserIndex, set()).add(row.ItemIndex)

    # Every input row is a positive example; build the records directly
    # instead of writing an Interaction column into the caller's frame.
    expanded_data = [
        (u, i, 1) for u, i in zip(train_df['UserIndex'], train_df['ItemIndex'])
    ]

    # Per user, sample unseen items as negatives.
    for u, pos_items in user_pos_items.items():
        num_neg = len(pos_items) * neg_ratio

        candidates = [it for it in range(num_items) if it not in pos_items]
        if len(candidates) <= num_neg:
            # Not enough unseen items to sample from: take them all.
            neg_items = candidates
        else:
            neg_items = rng.sample(candidates, num_neg)

        expanded_data.extend((u, it, 0) for it in neg_items)

    return pd.DataFrame(expanded_data, columns=['UserIndex', 'ItemIndex', 'Interaction'])

# Build the training frame used by the model: the positives in train_df plus
# sampled negatives, with neg_ratio=4 and seed=42.
new_train_df = sample_negative_samples(train_df, num_items, neg_ratio=4, seed=42)

# LFM模型

In [3]:
class LFM(nn.Module):
    """Latent factor model: one embedding table for users, one for items.

    The score of a (user, item) pair is the dot product of the two
    corresponding embedding rows.
    """

    def __init__(self, num_users, num_items, latent_factors):
        super().__init__()
        self.user_factors = nn.Embedding(num_users, latent_factors)
        self.item_factors = nn.Embedding(num_items, latent_factors)
        # Re-draw both weight tables from a normal distribution with
        # std 0.1 (user table first, then item table).
        for table in (self.user_factors, self.item_factors):
            nn.init.normal_(table.weight, std=0.1)

    def forward(self, user_indices, item_indices):
        """Return one score per (user index, item index) pair."""
        user_emb = self.user_factors(user_indices)
        item_emb = self.item_factors(item_indices)
        # Element-wise product reduced over dim 1 gives the per-pair dot product.
        return torch.sum(user_emb * item_emb, dim=1)

## 训练

In [29]:
# ---------------------------
# 训练 LFM 模型（利用 PyTorch GPU 加速）
# ---------------------------
def train_lfm_pytorch(new_train_df, num_users, num_items, latent_factors, lr, reg, epochs,
                      batch_size=None):
    """
    Train an LFM model with SGD on (user, item, interaction) triples.

    The model is fitted with mean-squared-error loss against the
    Interaction column, using SGD with momentum 0.9 and ``reg`` as the
    weight decay (the regularisation strength, i.e. lambda). Training runs
    on CUDA when it is available, otherwise on the CPU.

    Parameters:
    - new_train_df: DataFrame with columns [UserIndex, ItemIndex, Interaction]
    - num_users, num_items: sizes of the user and item embedding tables
    - latent_factors: embedding dimension
    - lr: learning rate
    - reg: weight decay passed to the optimizer
    - epochs: number of passes over the data
    - batch_size: optional mini-batch size. ``None`` (the default) keeps the
      original behaviour of one optimizer step per epoch on the whole data
      set. With a positive value the samples are reshuffled every epoch and
      one step is taken per mini-batch, which gives many more parameter
      updates per epoch than a single full-batch step.

    Returns:
    - the trained LFM model (left on the training device)

    Raises:
    - ValueError: if batch_size is given but is not a positive number
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer or None, got {batch_size!r}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"训练设备: {device}")

    model = LFM(num_users, num_items, latent_factors).to(device)
    # Alternative optimizer that was tried:
    #   optim.Adam(model.parameters(), lr=lr, weight_decay=reg)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=reg)

    criterion = nn.MSELoss()

    # Move the whole training set to the device once.
    user_indices = torch.LongTensor(new_train_df['UserIndex'].values).to(device)
    item_indices = torch.LongTensor(new_train_df['ItemIndex'].values).to(device)
    targets = torch.FloatTensor(new_train_df['Interaction'].values).to(device)

    num_samples = targets.shape[0]
    # A batch that covers everything is the same as the full-batch path.
    full_batch = batch_size is None or batch_size >= num_samples

    for epoch in range(epochs):
        model.train()
        if full_batch:
            optimizer.zero_grad()
            preds = model(user_indices, item_indices)
            loss = criterion(preds, targets)
            loss.backward()
            optimizer.step()
            epoch_loss = loss.item()
        else:
            # Fresh random order every epoch; one optimizer step per slice.
            order = torch.randperm(num_samples, device=device)
            weighted_loss_sum = 0.0
            for start in range(0, num_samples, batch_size):
                batch = order[start:start + batch_size]
                optimizer.zero_grad()
                preds = model(user_indices[batch], item_indices[batch])
                loss = criterion(preds, targets[batch])
                loss.backward()
                optimizer.step()
                # Weight by batch length so the epoch figure is a per-sample mean.
                weighted_loss_sum += loss.item() * batch.numel()
            epoch_loss = weighted_loss_sum / num_samples
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")
    return model


# 评测

In [18]:
from collections import defaultdict
import math

def get_user_interacted_items(train_df):
    """
    Group the rows of ``train_df`` by user.

    Returns a defaultdict mapping each UserIndex to the set of ItemIndex
    values that appear with it: {user: set(item)}.
    """
    items_by_user = defaultdict(set)
    for record in train_df.itertuples():
        items_by_user[record.UserIndex].add(record.ItemIndex)
    return items_by_user

def get_item_popularity(train_df):
    """
    Count how many rows of ``train_df`` refer to each item.

    Returns a defaultdict(int) mapping ItemIndex to its number of
    occurrences, used as the item's popularity.
    """
    counts = defaultdict(int)
    for record in train_df.itertuples():
        counts[record.ItemIndex] += 1
    return counts

def recommend_topN_vectorized(model, user, user_interacted_items, num_items, N=10):
    """
    Build the Top-N recommendation list for one user in a single batched
    model call.

    Parameters:
      - model: trained LFM model
      - user: the user's UserIndex
      - user_interacted_items: collection of item indices to exclude
      - num_items: total number of items; candidates come from range(num_items)
      - N: length of the recommendation list

    Returns:
      - list of at most N item indices ordered by predicted score, highest
        first; an empty list when every item is excluded
    """
    target_device = next(model.parameters()).device

    # Candidates are all item indices the user has not interacted with.
    unseen = [idx for idx in range(num_items) if idx not in user_interacted_items]
    if len(unseen) == 0:
        return []

    # Pair the same user index with every candidate so one forward pass
    # scores all of them.
    users_t = torch.LongTensor([user] * len(unseen)).to(target_device)
    items_t = torch.LongTensor(unseen).to(target_device)

    model.eval()
    with torch.no_grad():
        raw_scores = model(users_t, items_t)
    scores = raw_scores.cpu().numpy()

    # Stable sort by score, descending, then keep the first N items.
    ranked = sorted(zip(unseen, scores), key=lambda pair: pair[1], reverse=True)
    return [item for item, _ in ranked[:N]]


def evaluate_lfm_topN(model, train_df, test_df, num_users, num_items, N=10):
    """
    Compute Top-N recommendation metrics for an LFM model.

    Every user that appears in the training or the test frame receives a
    Top-N list built from the items they have not interacted with in
    training. ``num_users`` is accepted but not used by the computation.

    Returns a tuple (precision, recall, coverage, popularity):
      - precision: hits / (N * number of evaluated users)
      - recall: hits / number of (user, item) pairs in the test set
      - coverage: distinct recommended items / number of items
      - popularity: mean of log(1 + training popularity) over all
        recommended items
    """
    def _items_by_user(frame):
        # {UserIndex: set of ItemIndex} for one frame
        grouped = {}
        for record in frame.itertuples():
            grouped.setdefault(record.UserIndex, set()).add(record.ItemIndex)
        return grouped

    train_user_item = _items_by_user(train_df)
    test_user_item = _items_by_user(test_df)

    # Training popularity: number of distinct users per item.
    item_popularity = {}
    for seen_items in train_user_item.values():
        for item in seen_items:
            item_popularity[item] = item_popularity.get(item, 0) + 1

    # Evaluate every user that occurs in either split.
    eval_users = set([*train_user_item, *test_user_item])

    hit = 0
    recommended_slots = 0   # N per evaluated user
    relevant_total = 0      # number of test interactions
    recommended_items_set = set()
    total_pop = 0
    total_recommended = 0

    for user in eval_users:
        rec_items = recommend_topN_vectorized(
            model, user, train_user_item.get(user, set()), num_items, N=N
        )
        true_items = test_user_item.get(user, set())

        hit += len(true_items.intersection(rec_items))
        recommended_slots += N
        relevant_total += len(true_items)
        recommended_items_set.update(rec_items)

        for item in rec_items:
            total_pop += math.log(1 + item_popularity.get(item, 0))
        total_recommended += len(rec_items)

    # The small epsilon keeps the ratios defined when a denominator is zero.
    precision = hit / (recommended_slots + 1e-10)
    recall = hit / (relevant_total + 1e-10)
    coverage = len(recommended_items_set) / len(range(num_items))
    popularity = total_pop / (total_recommended + 1e-10)

    return precision, recall, coverage, popularity


In [34]:
# Train the LFM on the negative-sampled frame: 100 latent factors,
# learning rate 0.02, weight decay 0.01, 1000 epochs.
model = train_lfm_pytorch(new_train_df, num_users, num_items, latent_factors=100, lr=0.02, reg=0.01, epochs=1000)

训练设备: cuda
Epoch 1/1000, Loss: 0.2133
Epoch 2/1000, Loss: 0.2133
Epoch 3/1000, Loss: 0.2133
Epoch 4/1000, Loss: 0.2133
Epoch 5/1000, Loss: 0.2132
Epoch 6/1000, Loss: 0.2132
Epoch 7/1000, Loss: 0.2132
Epoch 8/1000, Loss: 0.2131
Epoch 9/1000, Loss: 0.2131
Epoch 10/1000, Loss: 0.2130
Epoch 11/1000, Loss: 0.2130
Epoch 12/1000, Loss: 0.2129
Epoch 13/1000, Loss: 0.2129
Epoch 14/1000, Loss: 0.2128
Epoch 15/1000, Loss: 0.2127
Epoch 16/1000, Loss: 0.2127
Epoch 17/1000, Loss: 0.2126
Epoch 18/1000, Loss: 0.2126
Epoch 19/1000, Loss: 0.2125
Epoch 20/1000, Loss: 0.2124
Epoch 21/1000, Loss: 0.2124
Epoch 22/1000, Loss: 0.2123
Epoch 23/1000, Loss: 0.2122
Epoch 24/1000, Loss: 0.2122
Epoch 25/1000, Loss: 0.2121
Epoch 26/1000, Loss: 0.2120
Epoch 27/1000, Loss: 0.2120
Epoch 28/1000, Loss: 0.2119
Epoch 29/1000, Loss: 0.2118
Epoch 30/1000, Loss: 0.2118
Epoch 31/1000, Loss: 0.2117
Epoch 32/1000, Loss: 0.2116
Epoch 33/1000, Loss: 0.2116
Epoch 34/1000, Loss: 0.2115
Epoch 35/1000, Loss: 0.2114
Epoch 36/1000, Los

In [36]:
N = 20  # length of each recommendation list
# Evaluate the trained model: recommendations exclude the items in train_df
# and are scored against the interactions in test_df.
precision, recall, coverage, popularity = evaluate_lfm_topN(
    model, train_df, test_df, num_users, num_items, N
)

print(f"Precision={precision:.4f}, Recall={recall:.4f}, Coverage={coverage:.4f}, Popularity={popularity:.4f}")

Precision=0.0058, Recall=0.0056, Coverage=0.9992, Popularity=4.1984
