# 数据读取

In [15]:
import pandas as pd
import random
import math
from collections import defaultdict

# The three .dat files are all read the same way: no header row, '::' as the
# field separator, and the python parsing engine.
def _read_dat(path, columns, **read_kwargs):
    """Read one '::'-separated, header-less file into a DataFrame with the given column names."""
    return pd.read_csv(path, sep='::', engine='python', header=None,
                       names=columns, **read_kwargs)


# Users table.
users = _read_dat('users.dat', ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
print("Users 数据集预览：")
print(users.head())

# Movies table; this file is decoded as latin1.
movies = _read_dat('movies.dat', ['MovieID', 'Title', 'Genres'], encoding='latin1')

# Turn each '|'-joined genre string into a list of genre names.
movies['Genres'] = [genres.split('|') for genres in movies['Genres']]

print("\nMovies 数据集预览：")
print(movies.head())

# Ratings table.
ratings = _read_dat('ratings.dat', ['UserID', 'MovieID', 'Rating', 'Timestamp'])
print("Ratings 数据集预览：")
print(ratings.head())

# Implicit feedback only needs the (UserID, MovieID) pairs;
# Rating and Timestamp are ignored from here on.
data = list(zip(ratings['UserID'], ratings['MovieID']))


Users 数据集预览：
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455

Movies 数据集预览：
   MovieID                               Title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                             Genres  
0   [Animation, Children's, Comedy]  
1  [Adventure, Children's, Fantasy]  
2                 [Comedy, Romance]  
3                   [Comedy, Drama]  
4                          [Comedy]  
Ratings 数据集预览：
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408

# 数据集划分

In [16]:
import random

def SplitData(data, M, k, seed):
    """
    Randomly split user-behaviour records into a training set and a test set.

    Every record independently draws an integer uniformly from 0..M-1.
    Records that draw ``k`` go to the test set and all others go to the
    training set, so roughly 1/M of the records end up in the test set.

    The draws come from a private ``random.Random(seed)`` instance. It yields
    the same sequence as seeding the module-level generator with ``seed``,
    but it does not disturb the global ``random`` state used by other code.

    Parameters:
    - data: iterable of (user, item) pairs
    - M: number of partitions, e.g. 8
    - k: partition number used as the test set, 0 <= k < M
    - seed: random seed that makes the split reproducible

    Returns:
    - train: list of [user, item] records in the training set
    - test: list of [user, item] records in the test set
    """
    rng = random.Random(seed)  # local generator: no global reseeding side effect
    train, test = [], []
    for user, item in data:
        # One uniform draw in [0, M-1] per record decides where it goes.
        bucket = test if rng.randint(0, M - 1) == k else train
        bucket.append([user, item])
    return train, test


# One experiment: 8 partitions with partition 0 held out as the test set
# (looping k over 0..M-1 would give one result per partition).
M, k, seed = 8, 0, 42
train_data, test_data = SplitData(data, M, k, seed)

print(f"\n训练集大小: {len(train_data)}, 测试集大小: {len(test_data)}")




训练集大小: 874948, 测试集大小: 125261


# 用户-物品集 & 用户相似度

## 用户物品集

In [17]:
# ---------------------------
# Build the user -> items dictionary (training set)
# ---------------------------
def build_user_item_dict(train_data):
    """
    Group the training records by user.

    train_data is an iterable of (user, item) pairs. The result is a
    defaultdict(set) of the form {user: set(items)}; repeated pairs
    collapse into a single set entry.
    """
    items_by_user = defaultdict(set)
    for record in train_data:
        user, item = record
        items_by_user[user].add(item)
    return items_by_user

# Training-set interactions grouped per user: {user: set(items)}.
user_item = build_user_item_dict(train_data)


## 用户相似度

In [18]:
# ---------------------------
# Compute user-user similarity
# ---------------------------
def calc_user_similarity(user_item):
    """
    Compute the similarity between every pair of users sharing at least one item.

    Cosine similarity on item sets:
        sim(u, v) = |I(u) ∩ I(v)| / sqrt(|I(u)| * |I(v)|)

    user_item maps each user to the set of items they interacted with.
    Returns a defaultdict(dict) of the form {u: {v: similarity}}; users
    without any shared item with another user do not appear as keys.
    """
    # Inverted index: item -> set of users who interacted with it.
    users_by_item = defaultdict(set)
    for user, items in user_item.items():
        for item in items:
            users_by_item[item].add(user)

    # shared[u][v] = number of items both u and v interacted with.
    shared = defaultdict(lambda: defaultdict(int))
    for audience in users_by_item.values():
        for u in audience:
            for v in audience:
                if u != v:
                    shared[u][v] += 1

    # Normalise each overlap count by the geometric mean of the two set sizes.
    similarity = defaultdict(dict)
    for u, partners in shared.items():
        size_u = len(user_item[u])
        for v, overlap in partners.items():
            similarity[u][v] = overlap / math.sqrt(size_u * len(user_item[v]))
    return similarity

# Pairwise user similarities for the training set, as {u: {v: similarity}}.
user_similarity = calc_user_similarity(user_item)

In [None]:
# Sparse-matrix accelerated version (NumPy/SciPy on the CPU; no GPU is used)
import numpy as np
import scipy.sparse as sp
import math
from collections import defaultdict

def calc_user_similarity_sparse(user_item):
    """
    Compute user-user similarity with sparse-matrix arithmetic.

    Cosine similarity on item sets:
        sim(u, v) = |I(u) ∩ I(v)| / sqrt(|I(u)| * |I(v)|)

    The co-occurrence counts are obtained as A · Aᵀ, where A is the binary
    user-item matrix. The product is kept sparse and only its non-zero
    entries are visited, so no dense n_users × n_users array is allocated
    and pairs without a shared item are never touched. Because a non-zero
    co-occurrence implies both users have at least one item, the
    normalisation can never divide by zero.

    Parameters:
      user_item: {user: set(item)}

    Returns:
      user_similarity: {user: {other_user: similarity}}. Every user in
      user_item is present as a key; users without a positive similarity
      to anyone map to an empty dict. Neighbours are inserted in the order
      of user_item's keys.
    """
    # Index maps for users and items.
    users = list(user_item.keys())
    user_index = {u: idx for idx, u in enumerate(users)}

    items = set()
    for it_set in user_item.values():
        items.update(it_set)
    item_index = {i: idx for idx, i in enumerate(items)}

    n_users = len(users)
    n_items = len(item_index)

    user_similarity = {u: {} for u in users}
    if n_users == 0 or n_items == 0:
        # Nothing can co-occur; skip building degenerate matrices.
        return user_similarity

    # Binary user-item matrix (rows: users, columns: items).
    row = []
    col = []
    for u, it_set in user_item.items():
        u_idx = user_index[u]
        for i in it_set:
            row.append(u_idx)
            col.append(item_index[i])
    ones = np.ones(len(row), dtype=np.int64)  # implicit feedback: each interaction counts as 1
    user_item_mat = sp.csr_matrix((ones, (row, col)), shape=(n_users, n_items))

    # Sparse co-occurrence matrix: entry (u, v) = number of shared items.
    co_occurrence = user_item_mat.dot(user_item_mat.T).tocsr()
    # Sort column indices so neighbours are emitted in ascending user index.
    co_occurrence.sort_indices()
    co_occurrence = co_occurrence.tocoo()

    # Number of interactions per user (row sums of the binary matrix).
    user_counts = np.asarray(user_item_mat.sum(axis=1), dtype=np.float64).ravel()

    # Keep off-diagonal entries with a positive co-occurrence count.
    keep = (co_occurrence.row != co_occurrence.col) & (co_occurrence.data > 0)
    rows = co_occurrence.row[keep]
    cols = co_occurrence.col[keep]
    sims = co_occurrence.data[keep] / np.sqrt(user_counts[rows] * user_counts[cols])

    for i, j, sim in zip(rows.tolist(), cols.tolist(), sims):
        user_similarity[users[i]][users[j]] = sim
    return user_similarity

# Example usage, assuming user_item has already been built from the training data:
# user_item = build_user_item_dict(train_data)
# NOTE: this rebinds user_similarity, replacing any value computed earlier by calc_user_similarity.
user_similarity = calc_user_similarity_sparse(user_item)


# 推荐UserCF

In [None]:
# ---------------------------
# Recommendation function (UserCF)
# ---------------------------
def recommend(user, user_item, user_similarity, K=20, N=10):
    """
    Recommend up to N items for the target user.

    K: number of most-similar neighbours taken into account.

    Each item a neighbour interacted with, and the target user did not,
    is scored by the sum of the similarities of the neighbours who have it.
    Returns the item ids ordered by descending score; the list is empty
    when the user has no entry in user_similarity.
    """
    # No similarity entry for this user: nothing to recommend.
    if user not in user_similarity:
        return []

    already_seen = user_item.get(user, set())

    # The K neighbours with the highest similarity.
    by_similarity = sorted(user_similarity[user].items(),
                           key=lambda pair: pair[1], reverse=True)
    top_neighbors = by_similarity[:K]

    scores = defaultdict(float)
    for neighbor, sim in top_neighbors:
        # Walk over everything this neighbour interacted with.
        for item in user_item.get(neighbor, set()):
            # The "filtering" step: only items new to the target user are scored.
            if item not in already_seen:
                scores[item] += sim  # other weighting schemes could be plugged in here

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [item for item, _ in ranked[:N]]

# 评测函数

In [None]:
# ---------------------------
# Evaluation function
# ---------------------------
def evaluate(user_item, test_data, user_similarity, K=20, N=10):
    """
    Evaluate the recommender on the test set.

    Parameters:
    - user_item: training interactions, {user: set(items)}
    - test_data: iterable of (user, item) pairs held out for testing
    - user_similarity: {u: {v: similarity}}, passed through to recommend()
    - K: number of neighbours used by recommend()
    - N: length of the recommendation list requested per user

    Returns a tuple (precision, recall, coverage, popularity):
    - precision: hits / (N * number of evaluated users). The denominator
      grows by N for every user, including users for whom fewer than N
      items (or none) were recommended.
    - recall: hits / total number of (user, item) pairs in the test set.
    - coverage: distinct recommended items / distinct items in the training
      set; 0.0 when the training set contains no items.
    - popularity: mean of log(1 + training popularity) over all recommended
      items.
    """
    # Test-set interactions grouped per user.
    test_user_item = defaultdict(set)
    for user, item in test_data:
        test_user_item[user].add(item)

    # Item popularity = number of training users who interacted with the item.
    item_popularity = defaultdict(int)
    for items in user_item.values():
        for item in items:
            item_popularity[item] += 1
    all_items = set(item_popularity.keys())

    hit = 0
    total_precision = 0  # precision denominator
    total_recall = 0     # recall denominator
    recommended_items_set = set()

    # Accumulators for the average popularity of recommended items.
    total_popularity = 0
    total_rec = 0

    # Evaluate every user seen in either the training or the test set.
    users = set(list(user_item.keys()) + list(test_user_item.keys()))

    for user in users:
        rec_items = recommend(user, user_item, user_similarity, K=K, N=N)
        # Items this user actually interacted with in the test set.
        true_items = test_user_item.get(user, set())
        # Hits: recommended items that appear among the true test items.
        hit += len(set(rec_items) & true_items)

        total_precision += N
        total_recall += len(true_items)

        recommended_items_set.update(rec_items)

        for item in rec_items:
            # The log damps the influence of extremely popular items.
            total_popularity += math.log(1 + item_popularity.get(item, 0))
            total_rec += 1

    precision = hit / (total_precision + 1e-10)
    recall = hit / (total_recall + 1e-10)
    # Guard the one division that had no protection against an empty training set.
    coverage = len(recommended_items_set) / len(all_items) if all_items else 0.0
    popularity = total_popularity / (total_rec + 1e-10)

    return precision, recall, coverage, popularity

# 实验运行和结果输出

In [22]:
# ---------------------------
# Run one experiment and print the evaluation metrics
# ---------------------------
K, N = 80, 10   # K: number of neighbours, N: length of the recommendation list
precision, recall, coverage, popularity = evaluate(user_item, test_data, user_similarity, K=K, N=N)
print(
    f"\n评测结果：",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"Coverage:  {coverage:.4f}",
    f"Popularity:{popularity:.4f}",
    sep="\n",
)


评测结果：
Precision: 0.2510
Recall:    0.1210
Coverage:  0.2047
Popularity:7.2850
