In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import joblib

In [2]:
# ----------------------------
# STEP 1: Load the data
# ----------------------------
def load_data(base_dir='ml-1m'):
    """Read the ratings, users and movies tables found under ``base_dir``.

    All three files are headerless and use ``::`` as the field separator,
    so column names are supplied explicitly and the Python parsing engine
    is requested (the separator is longer than one character).

    Args:
        base_dir: directory containing ``ratings.dat``, ``users.dat`` and
            ``movies.dat``.

    Returns:
        A ``(ratings, users, movies)`` tuple of DataFrames.
    """
    # (file name, column names, extra read_csv keyword arguments)
    table_specs = (
        ('ratings.dat', ['userId', 'movieId', 'rating', 'timestamp'], {}),
        ('users.dat', ['userId', 'gender', 'age', 'occupation', 'zip'], {}),
        # only the movies file is read with an explicit latin-1 encoding
        ('movies.dat', ['movieId', 'title', 'genres'], {'encoding': 'latin-1'}),
    )
    ratings, users, movies = (
        pd.read_csv(os.path.join(base_dir, file_name), sep='::', engine='python',
                    names=columns, **extra_kwargs)
        for file_name, columns, extra_kwargs in table_specs
    )
    return ratings, users, movies

# Read the ratings, users and movies tables from load_data's default directory.
ratings_df, users_df, movies_df = load_data()

In [3]:
# ----------------------------
# STEP 2: Preprocessing & encoding
# ----------------------------
# One encoder per id space; each maps the raw ids onto integer labels that
# are later used as embedding row indices.
user_encoder, item_encoder = LabelEncoder(), LabelEncoder()

ratings_df['user'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['item'] = item_encoder.fit_transform(ratings_df['movieId'])

# Every fitted class appears in the column it was fitted on, so the number
# of classes equals the number of distinct encoded values.
n_users = len(user_encoder.classes_)
n_items = len(item_encoder.classes_)

for summary_line in (f"✔️ Số lượng người dùng: {n_users}",
                     f"✔️ Số lượng phim: {n_items}"):
    print(summary_line)

✔️ Số lượng người dùng: 6040
✔️ Số lượng phim: 3706


In [4]:
# ----------------------------
# STEP 3: Dataset & DataLoader
# ----------------------------
class RatingDataset(Dataset):
    """Dataset yielding ``(user, item, rating)`` tensors, one per DataFrame row.

    The ``user``, ``item`` and ``rating`` columns of ``df`` are copied into
    tensors once at construction; indexing afterwards only slices them.
    """

    @staticmethod
    def _column_tensor(df, column, dtype):
        """Copy one DataFrame column into a tensor of the requested dtype."""
        return torch.tensor(df[column].values, dtype=dtype)

    def __init__(self, df):
        # indices are int64 so they can be fed to embedding lookups;
        # ratings are float32 regression targets
        self.users = self._column_tensor(df, 'user', torch.long)
        self.items = self._column_tensor(df, 'item', torch.long)
        self.ratings = self._column_tensor(df, 'rating', torch.float32)

    def __len__(self):
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        return tuple(column[idx] for column in (self.users, self.items, self.ratings))

# Wrap the ratings frame and serve it in reshuffled mini-batches of 256 rows.
dataset = RatingDataset(df=ratings_df)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=256)

In [5]:
# ----------------------------
# STEP 4: NCF model
# ----------------------------
class NCF(nn.Module):
    """Embedding + MLP rating predictor.

    Each user index and item index is looked up in its own embedding table;
    the two embeddings are concatenated and passed through two ReLU hidden
    layers (128 and 64 units) followed by a linear layer with one output.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: width of each embedding vector.
    """

    def __init__(self, num_users, num_items, emb_size=50):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # the MLP input is the user and item embeddings side by side
        self.fc1 = nn.Linear(emb_size * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.output = nn.Linear(64, 1)

    def forward(self, user, item):
        """Predict one score per (user, item) pair.

        Args:
            user: integer tensor of user indices.
            item: integer tensor of item indices, same shape as ``user``.

        Returns:
            A float tensor with the same shape as ``user`` (e.g. ``(batch,)``
            for 1-D index tensors).
        """
        u = self.user_emb(user)
        i = self.item_emb(item)
        x = torch.cat([u, i], dim=-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove a batch dimension of size 1, returning a 0-d
        # tensor that no longer matches a (1,)-shaped target.
        return self.output(x).squeeze(-1)

# Seed PyTorch's global RNG before anything draws from it, so that weight
# initialisation (and later draws from the same generator, such as the
# shuffled DataLoader order) repeat on a fresh kernel with the same setup.
torch.manual_seed(42)

model = NCF(n_users, n_items)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# default reduction is 'mean': the loss is the average squared error per batch
loss_fn = nn.MSELoss()

In [6]:
# ----------------------------
# STEP 5: Train the model
# ----------------------------
n_epochs = 5
model.train()
for epoch in range(n_epochs):
    epoch_loss = 0.0  # running sum of squared errors over the samples seen
    n_seen = 0        # number of samples seen so far in this epoch
    for users, items, ratings in tqdm(dataloader, desc=f"Epoch {epoch+1}/{n_epochs}"):
        optimizer.zero_grad()
        preds = model(users, items)
        loss = loss_fn(preds, ratings)
        loss.backward()
        optimizer.step()

        # loss is a per-batch mean; weight it by the batch size so a short
        # final batch does not count as much as a full one in the epoch mean
        batch_size = ratings.size(0)
        epoch_loss += loss.item() * batch_size
        n_seen += batch_size
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError
    print(f"📉 Epoch {epoch+1}: Loss = {epoch_loss / max(n_seen, 1):.4f}")

Epoch 1/5: 100%|██████████| 3908/3908 [00:20<00:00, 193.55it/s]


📉 Epoch 1: Loss = 1.0071


Epoch 2/5: 100%|██████████| 3908/3908 [00:21<00:00, 184.25it/s]


📉 Epoch 2: Loss = 0.8467


Epoch 3/5: 100%|██████████| 3908/3908 [00:19<00:00, 198.12it/s]


📉 Epoch 3: Loss = 0.8267


Epoch 4/5: 100%|██████████| 3908/3908 [00:20<00:00, 192.39it/s]


📉 Epoch 4: Loss = 0.8155


Epoch 5/5: 100%|██████████| 3908/3908 [00:20<00:00, 191.59it/s]

📉 Epoch 5: Loss = 0.8071





In [7]:
# ----------------------------
# STEP 6: Save the model
# ----------------------------
# Persist the learned parameters (the state_dict, not the module object) ...
torch.save(model.state_dict(), 'ncf_model.pt')
# ... and the two fitted encoders alongside them.
for fitted_encoder, target_path in ((user_encoder, 'user_encoder.pkl'),
                                    (item_encoder, 'item_encoder.pkl')):
    joblib.dump(fitted_encoder, target_path)
print("✅ Đã lưu mô hình vào ncf_model.pt")

✅ Đã lưu mô hình vào ncf_model.pt
