In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### 生成文本表示

In [4]:
from transformers import BertTokenizer, BertModel
import torch

#tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
#model = BertModel.from_pretrained('bert-base-chinese').cuda()

model_path = "../../dataroot/models/bert-base-chinese"           #从本地文件夹装载模型的方法
tokenizer = BertTokenizer.from_pretrained("../../dataroot/models/bert-base-chinese")
model = BertModel.from_pretrained(model_path)


In [5]:
print(tokenizer.tokenize('I have a good time, thank you.'))

bert = BertModel.from_pretrained(model_path)

print('load bert model over')

['[UNK]', 'have', 'a', 'good', 'time', ',', 'than', '##k', 'you', '.']
load bert model over


In [6]:
# 读loaded_data取保存的 CSV 文件
loaded_data = pd.read_csv('data\selected_book_top_1200_data_tag.csv')

tag_embedding_dict = {}

with torch.no_grad():
    for index, rows in tqdm(loaded_data.iterrows()):
        # 将标签列表转换为字符串
        tags_str = " ".join(rows.Tags)
        # 使用BERT中文模型对标签进行编码
        inputs = tokenizer(tags_str, truncation=True, return_tensors='pt',max_length=512)
        outputs = model(inputs.input_ids.cpu(), inputs.token_type_ids.cpu(), inputs.attention_mask.cpu())
        # 使用最后一层的平均隐藏状态作为标签的向量表示
        tag_embedding = outputs.last_hidden_state.mean(dim=1).cpu()
        tag_embedding_dict[rows.Book] = tag_embedding


94it [01:07,  1.38it/s]


KeyboardInterrupt: 

In [26]:
import pickle

# 将映射表存储为二进制文件
with open('data/tag_embedding_dict.pkl', 'wb') as f:
    pickle.dump(tag_embedding_dict, f)


In [27]:
# 从二进制文件中读取映射表
with open('data/tag_embedding_dict.pkl', 'rb') as f:
    tag_embedding_dict = pickle.load(f)

In [28]:
# 读loaded_data取保存的 CSV 文件
loaded_data = pd.read_csv('data\\book_score.csv')

# 显示加载的数据
print(loaded_data)

           User     Book  Rate                       Time         Tag
0       1398478  1467022     0  2011-03-29T12:48:35+08:00         NaN
1       1398478  1777823     0  2011-02-02T21:58:55+08:00         NaN
2       1398478  1902628     0  2011-01-31T15:57:58+08:00         NaN
3       1398478  1878708     0  2011-01-26T11:27:59+08:00         NaN
4       1398478  4238362     0  2011-01-21T13:04:15+08:00         NaN
...         ...      ...   ...                        ...         ...
637249  4507957  1125186     4  2009-07-04T08:02:13+08:00  张爱玲,半生缘,爱情
637250  4507957  1002299     5  2009-07-04T08:01:28+08:00  金庸,武侠,笑傲江湖
637251  4507957  1001136     4  2009-07-04T07:55:17+08:00     彼得・潘,童话
637252  4507957  1021615     5  2009-07-04T07:53:54+08:00   小王子,童话,经典
637253  4507957  1962929     5  2009-06-29T22:13:37+08:00          爱情

[637254 rows x 5 columns]


In [29]:
class BookRatingDataset(Dataset):
    def __init__(self, data, user_to_idx, book_to_idx, tag_embedding_dict):
        self.data = data
        self.user_to_idx = user_to_idx # 用户ID到索引的映射字典
        self.book_to_idx = book_to_idx
        self.tag_embedding_dict = tag_embedding_dict

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        user = self.user_to_idx[row['User']]
        book = self.book_to_idx[row['Book']]
        rating = row['Rate'].astype('float32')
        text_embedding = self.tag_embedding_dict.get(row['Book'])
        return user, book, rating, text_embedding

# 自定义矩阵分解模型
class MatrixFactorization(nn.Module):
    # 初始化模型参数
    def __init__(self, num_users, num_books, embedding_dim, hidden_state):
        super(MatrixFactorization, self).__init__()
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.book_embeddings = nn.Embedding(num_books, embedding_dim)
        self.linear_embedding = nn.Linear(hidden_state, embedding_dim)
        self.output = nn.Linear(embedding_dim, 6)
    # 定义前向传播
    def forward(self, user, book, tag_embedding):
        user_embedding = self.user_embeddings(user)
        book_embedding = self.book_embeddings(book)
        tag_embedding_proj = self.linear_embedding(tag_embedding)
        book_intergrate = book_embedding + tag_embedding_proj
        return (user_embedding * book_intergrate).sum(dim = 1)
        
def create_id_mapping(id_list):
    # 从ID列表中删除重复项并创建一个排序的列表
    unique_ids = sorted(set(id_list))
    
    # 创建将原始ID映射到连续索引的字典
    id_to_idx = {id: idx for idx, id in enumerate(unique_ids)}
    
    # 创建将连续索引映射回原始ID的字典
    idx_to_id = {idx: id for id, idx in id_to_idx.items()}
    
    return id_to_idx, idx_to_id

# 按用户分组计算NDCG
def compute_ndcg(group):
    true_ratings = group['true'].tolist()
    pred_ratings = group['pred'].tolist()
    return ndcg_score([true_ratings], [pred_ratings], k = 50)

In [30]:
user_ids = loaded_data['User'].unique()
book_ids = loaded_data['Book'].unique()

user_to_idx, idx_to_user = create_id_mapping(user_ids) #因为有两个返回值，所以用两个变量接收
book_to_idx, idx_to_book = create_id_mapping(book_ids)

# 划分训练集和测试集
train_data, test_data = train_test_split(loaded_data, test_size=0.5, random_state=42)

# 创建训练集和测试集的数据集对象
train_dataset = BookRatingDataset(train_data, user_to_idx, book_to_idx, tag_embedding_dict)
test_dataset = BookRatingDataset(test_data, user_to_idx, book_to_idx, tag_embedding_dict)

# 创建训练集和测试集的数据加载器
train_dataloader = DataLoader(train_dataset, batch_size=4096, shuffle=True, drop_last = True)
test_dataloader = DataLoader(test_dataset, batch_size=4096, shuffle=False, drop_last = True)

num_users = loaded_data['User'].nunique()  
num_books = loaded_data['Book'].nunique() 
embedding_dim, hidden_state = 32, 768

model = MatrixFactorization(num_users, num_books, embedding_dim, hidden_state).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

### 训练

In [31]:
num_epochs = 20
lambda_u, lambda_b = 0.001, 0.001

for epoch in range(num_epochs):
    model.train()
    total_loss_train, total_loss_test = 0.0, 0.0

    # 训练模型
    for idx, (user_ids, book_ids, ratings, tag_embedding) in tqdm(enumerate(train_dataloader)):
        """
        train_dataloader返回的是一个迭代器，每次迭代返回一个batch的数据
        enumerate(train_dataloader) 将数据加载器包装为一个可迭代对象，并同时返回迭代的索引值和对应的数据批次
        idx 是当前迭代的索引值，从 0 开始计数
        (user_ids, book_ids, ratings, tag_embedding) 是从 train_dataloader 中获取的一个批次的数据。这里假设训练数据是由用户ID、图书ID、评分和标签嵌入等组成的元组或列表。
        """
        # 在循环的每次迭代中，使用变量 user_ids、book_ids、ratings 和 tag_embedding 来访问当前批次中的用户ID、图书ID、评分和标签嵌入数据。
        # 在循环体内进行模型的前向传播、损失计算、梯度更新等操作。
        # 使用user_ids, book_ids, ratings进行训练

        optimizer.zero_grad()
        
        predictions = model(user_ids.to(device), book_ids.to(device), tag_embedding.squeeze(1).to(device))
        loss = criterion(predictions, ratings.to(device)) + lambda_u * model.user_embeddings.weight.norm(2) + lambda_b * model.book_embeddings.weight.norm(2)
        
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        
        # if idx % 100 == 0:
        #     print(f'Step {idx}, Loss: {loss.item()}')

    output_loss_train = total_loss_train / (idx + 1) 

    results = []
    model.eval()

    # 对模型进行评估
    with torch.no_grad():
        for idx, (user_ids, item_ids, true_ratings, tag_embedding) in enumerate(test_dataloader):
            pred_ratings = model(user_ids.to(device), item_ids.to(device), tag_embedding.squeeze(1).to(device))

            loss = criterion(pred_ratings, ratings.to(device))
            total_loss_test += loss.item()

            # 将结果转换为 numpy arrays
            user_ids_np = user_ids.long().cpu().numpy().reshape(-1, 1)
            pred_ratings_np = pred_ratings.cpu().numpy().reshape(-1, 1)
            true_ratings_np = true_ratings.numpy().reshape(-1, 1)

            # 将这三个 arrays 合并成一个 2D array
            batch_results = np.column_stack((user_ids_np, pred_ratings_np, true_ratings_np))

            # 将这个 2D array 添加到 results
            results.append(batch_results)

        # 将结果的 list 转换为一个大的 numpy array
        results = np.vstack(results)

        # 将结果转换为DataFrame
        results_df = pd.DataFrame(results, columns=['user', 'pred', 'true'])
        results_df['user'] = results_df['user'].astype(int)

        ndcg_scores = results_df.groupby('user').apply(compute_ndcg)

        # 计算平均NDCG
        avg_ndcg = ndcg_scores.mean()
        print(f'Epoch {epoch}, Train loss: {output_loss_train}, Test loss:, {total_loss_test / (idx + 1)}, Average NDCG: {avg_ndcg}')

77it [00:18,  4.19it/s]


Epoch 0, Train loss: 57.88074151571695, Test loss:, 31.893903187343053, Average NDCG: 0.6682999768602526


77it [00:18,  4.17it/s]


Epoch 1, Train loss: 26.415488899528206, Test loss:, 23.89678011312113, Average NDCG: 0.6684748914826798


77it [00:18,  4.24it/s]


Epoch 2, Train loss: 16.6050007807744, Test loss:, 17.09768617307985, Average NDCG: 0.6699033616216872


77it [00:18,  4.20it/s]


Epoch 3, Train loss: 9.859439081959911, Test loss:, 13.525147202727082, Average NDCG: 0.6709516525830644


77it [00:19,  4.02it/s]


Epoch 4, Train loss: 6.8573274736280565, Test loss:, 11.090482538396662, Average NDCG: 0.6727843730797698


77it [00:18,  4.25it/s]


Epoch 5, Train loss: 5.216031761912556, Test loss:, 10.037197199734775, Average NDCG: 0.6741032475330729


77it [00:17,  4.33it/s]


Epoch 6, Train loss: 4.2635035236160475, Test loss:, 9.180692016304313, Average NDCG: 0.6761095150469112


77it [00:18,  4.27it/s]


Epoch 7, Train loss: 3.6802716533859057, Test loss:, 8.554117846798587, Average NDCG: 0.6775534713287976


77it [00:17,  4.32it/s]


Epoch 8, Train loss: 3.2833151136125838, Test loss:, 8.041985499394404, Average NDCG: 0.6793426813382448


77it [00:18,  4.27it/s]


Epoch 9, Train loss: 3.032642736063375, Test loss:, 7.7694041636083035, Average NDCG: 0.6810151405064631


77it [00:17,  4.29it/s]


Epoch 10, Train loss: 2.8556215329603716, Test loss:, 7.654856700401802, Average NDCG: 0.6821635804094829


77it [00:17,  4.32it/s]


Epoch 11, Train loss: 2.729936497552054, Test loss:, 7.6966022082737515, Average NDCG: 0.6838406504915127


77it [00:17,  4.31it/s]


Epoch 12, Train loss: 2.69841937275676, Test loss:, 7.381534285359568, Average NDCG: 0.6850980623034627


77it [00:17,  4.33it/s]


Epoch 13, Train loss: 2.651248953559182, Test loss:, 7.399419177662242, Average NDCG: 0.6861964079368137


77it [00:18,  4.08it/s]


Epoch 14, Train loss: 2.59612952888786, Test loss:, 7.301198160493529, Average NDCG: 0.6868285206577462


77it [00:19,  3.94it/s]


Epoch 15, Train loss: 2.5233867725768646, Test loss:, 7.6227633179008185, Average NDCG: 0.6873710815623847


77it [00:19,  3.91it/s]


Epoch 16, Train loss: 2.5309912910709134, Test loss:, 7.4007358365244675, Average NDCG: 0.6884088626556116


77it [00:19,  3.90it/s]


Epoch 17, Train loss: 2.503222131109857, Test loss:, 7.173255957566298, Average NDCG: 0.688401650197915


77it [00:18,  4.11it/s]


Epoch 18, Train loss: 2.519645811675431, Test loss:, 7.113330816293692, Average NDCG: 0.689379363514275


77it [00:22,  3.40it/s]


Epoch 19, Train loss: 2.5000851371071557, Test loss:, 7.462568091107653, Average NDCG: 0.6902590802917931
