In [28]:
import pandas as pd
import torch
import torch.nn as nn
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import numpy as np
# Đọc dữ liệu
df = pd.read_csv('./Data/Data_final/ratings.csv')

# Chuyển đổi người dùng và sản phẩm thành các chỉ số (index)
user_ids = df['UserID'].unique().tolist()
item_ids = df['MovieID'].unique().tolist()

user2idx = {x: i for i, x in enumerate(user_ids)}
item2idx = {x: i for i, x in enumerate(item_ids)}

df['user'] = df['UserID'].apply(lambda x: user2idx[x])
df['item'] = df['MovieID'].apply(lambda x: item2idx[x])

# Chuyển đổi rating thành nhãn nhị phân (implicit feedback)
df['label'] = (df['Rating'] >= 4).astype(float)

# Chia dữ liệu thành tập huấn luyện và kiểm tra
train, test = train_test_split(df[['user', 'item', 'label']], test_size=0.2, random_state=42)

train_data = torch.utils.data.TensorDataset(
    torch.tensor(train['user'].values, dtype=torch.long),
    torch.tensor(train['item'].values, dtype=torch.long),
    torch.tensor(train['label'].values, dtype=torch.float32)
)

test_data = torch.utils.data.TensorDataset(
    torch.tensor(test['user'].values, dtype=torch.long),
    torch.tensor(test['item'].values, dtype=torch.long),
    torch.tensor(test['label'].values, dtype=torch.float32)
)

train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=64, shuffle=False)


In [29]:
import torch.nn.functional as F

class NCF(pl.LightningModule):
    def __init__(self, num_users, num_items, factors=8, layers=[64, 32, 16, 8], lr=0.001):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, factors)
        self.item_embedding = nn.Embedding(num_items, factors)
        self.fc_layers = nn.Sequential()
        input_size = factors * 2  # Vì chúng ta ghép nối vector nhúng của người dùng và sản phẩm

        for i in range(len(layers)):
            self.fc_layers.add_module(f'fc_layer{i}', nn.Linear(input_size, layers[i]))
            self.fc_layers.add_module(f'relu_layer{i}', nn.ReLU())
            input_size = layers[i]
            
        self.output_layer = nn.Linear(input_size, 1)
        self.lr = lr
    
    def forward(self, user, item):
        user_embedding = self.user_embedding(user)
        item_embedding = self.item_embedding(item)
        x = torch.cat([user_embedding, item_embedding], dim=-1)
        x = self.fc_layers(x)
        output = torch.sigmoid(self.output_layer(x))
        return output
    
    def training_step(self, batch, batch_idx):
        user, item, label = batch
        prediction = self(user, item)
        loss = F.binary_cross_entropy(prediction, label.unsqueeze(1))
        return loss
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


In [30]:
# Huấn luyện lại mô hình
num_users = len(user2idx)
num_items = len(item2idx)

model = NCF(num_users, num_items)
trainer = Trainer(max_epochs=20)
trainer.fit(model, train_dataloader)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type       | Params
----------------------------------------------
0 | user_embedding | Embedding  | 8.0 K 
1 | item_embedding | Embedding  | 41.8 K
2 | fc_layers      | Sequential | 3.8 K 
3 | output_layer   | Linear     | 9     
----------------------------------------------
53.7 K    Trainable params
0         Non-trainable params
53.7 K    Total params
0.215     Total estimated model params size (MB)
/home/tuyen/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 19: 100%|██████████| 1276/1276 [00:06<00:00, 197.87it/s, v_num=7]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 1276/1276 [00:06<00:00, 197.20it/s, v_num=7]


In [31]:
import numpy as np

def hit_ratio_at_k(predictions, labels, k):
    _, top_k_indices = predictions.topk(min(k, predictions.size(1)), dim=1)
    hits = 0
    for i in range(labels.size(0)):
        if labels[i] in top_k_indices[i]:
            hits += 1
    return hits / labels.size(0)

def evaluate(model, dataloader, k=10):
    model.eval()
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for user, item, label in dataloader:
            predictions = model(user, item)
            all_predictions.append(predictions)
            all_labels.append(label)
    
    all_predictions = torch.cat(all_predictions)
    all_labels = torch.cat(all_labels)
    return hit_ratio_at_k(all_predictions, all_labels, k)

# Đánh giá mô hình
hit_ratio = evaluate(model, test_dataloader, k=10)
print(f'Hit Ratio @10: {hit_ratio:.4f}')


Hit Ratio @10: 0.4485
