In [21]:
import pandas as pd
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

In [22]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [23]:
# Load the ratings table. Later cells read its userId, movieId and rating columns.
df = pd.read_csv('datasets/ratings.csv')

In [24]:
class MovieDataset(Dataset):
    """Map-style dataset over parallel sequences of user indices, movie indices and ratings.

    Args:
        users: indexable sequence of integer user indices.
        movies: indexable sequence of integer movie indices, aligned with ``users``.
        ratings: indexable sequence of numeric ratings, aligned with ``users``.
    """

    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        """Return the number of samples (length of ``users``)."""
        return len(self.users)

    def __getitem__(self, item):
        """Return the sample at position ``item`` as a dict of 0-d tensors.

        Keys:
            'users':   torch.long user index.
            'movies':  torch.long movie index.
            'ratings': torch.float32 rating.
        """
        return {
            'users': torch.tensor(self.users[item], dtype=torch.long),
            'movies': torch.tensor(self.movies[item], dtype=torch.long),
            # float32 rather than long: an integer cast truncates fractional
            # ratings (3.5 -> 3) and corrupts the regression target.
            'ratings': torch.tensor(self.ratings[item], dtype=torch.float32),
        }

In [25]:
class RecommendationSystemModel(nn.Module):
    """Embedding-based rating regressor.

    A user embedding and a movie embedding are concatenated and passed through
    a single hidden layer (Linear -> ReLU -> Dropout) followed by a linear
    layer with one output per (user, movie) pair.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
        embedding_size: width of each embedding vector.
        hidden_dim: width of the hidden layer.
        dropout_rate: dropout probability applied after the hidden activation.
    """

    def __init__(self, num_users, num_movies, embedding_size=256, hidden_dim=256, dropout_rate=0.2):
        super().__init__()
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.hidden_dim = hidden_dim

        # Embedding tables, one row per user / per movie.
        self.users_embedding = nn.Embedding(self.num_users, self.embedding_size)
        self.movies_embedding = nn.Embedding(self.num_movies, self.embedding_size)

        # Hidden layer takes the concatenated (user, movie) embedding pair.
        self.fc1 = nn.Linear(in_features=2 * self.embedding_size, out_features=self.hidden_dim)
        self.fc2 = nn.Linear(in_features=self.hidden_dim, out_features=1)

        # Regularisation and non-linearity.
        self.dropout = nn.Dropout(dropout_rate)
        self.act = nn.ReLU()

    def forward(self, users, movies):
        """Return one prediction per pair, shape ``(batch, 1)``.

        ``users`` and ``movies`` are 1-D integer index tensors of equal length.
        """
        # Concatenate the two embedding tensors along the feature dimension.
        pair_features = torch.cat(
            (self.users_embedding(users), self.movies_embedding(movies)), dim=1
        )
        hidden = self.dropout(self.act(self.fc1(pair_features)))
        return self.fc2(hidden)

In [26]:
# Re-index the raw user and movie IDs with label encoders so that they can be
# used as row indices into the model's embedding tables (the table sizes are
# later taken from len(lbl_user.classes_) / len(lbl_movie.classes_)).
# NOTE(review): the two assignments below overwrite df's columns in place.
# Re-running this cell would refit the encoders on already-encoded values, after
# which classes_ would no longer hold the original IDs. Reload df before re-running.
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()
df.userId = lbl_user.fit_transform(df.userId.values)
df.movieId = lbl_movie.fit_transform(df.movieId.values)

df_train, df_valid = model_selection.train_test_split(df, test_size=0.1, random_state=42, stratify=df.rating.values)
# test_size=0.1 means 10% of the rows are held out as df_valid.
# random_state seeds the random number generator so the split is reproducible: using the same random_state yields the same split on every run, which is useful for debugging and for comparing models.
# stratify keeps the class proportions equal in both subsets. Here df.rating.values (the rating values) is the stratification key, so the rating distribution is preserved in both the training and the validation set.

# Wrap each split's columns (as NumPy arrays) in a MovieDataset.
train_dataset = MovieDataset(
    users=df_train.userId.values,
    movies=df_train.movieId.values,
    ratings=df_train.rating.values
)

valid_dataset = MovieDataset(
    users=df_valid.userId.values,
    movies=df_valid.movieId.values,
    ratings=df_valid.rating.values
)

In [27]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2
)

# Validation data is only read for evaluation, so shuffling brings no benefit.
# A fixed order keeps evaluation runs reproducible.
validation_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=2
)

In [28]:
# Regression objective: mean squared error between predicted and true rating.
loss_fn = nn.MSELoss()

# Embedding table sizes come from the number of distinct encoded users / movies.
model = RecommendationSystemModel(
    len(lbl_user.classes_),
    len(lbl_movie.classes_),
    embedding_size=128,
    hidden_dim=256,
    dropout_rate=0.1,
)
model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# StepLR multiplies the learning rate by gamma=0.7 (a 30% reduction) once every
# step_size=3 calls to sch.step(). Gradually lowering the learning rate can help
# the model settle closer to a minimum of the loss. It only takes effect if the
# training loop actually calls sch.step().
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

In [None]:
import sys

EPOCHS = 2

def log_progress(epoch, step, total_loss, log_progress_step, data_size, losses):
    """Rewrite the progress line on stderr and record the average loss.

    Args:
        epoch: zero-based epoch index (displayed one-based, next to EPOCHS).
        step: progress counter shown as ``step/data_size``.
        total_loss: loss accumulated since the previous call.
        log_progress_step: divisor used to turn ``total_loss`` into an average.
        data_size: total shown as the denominator of the progress counter.
        losses: list that the computed average is appended to (mutated in place).
    """
    avg_loss = total_loss / log_progress_step
    # The leading carriage return makes each call overwrite the previous line.
    status_line = (
        f"\r{epoch+1:02d}/{EPOCHS:02d} | Step: {step}/{data_size} | Avg Loss: {avg_loss:<6.9f}"
    )
    stream = sys.stderr
    stream.write(status_line)
    stream.flush()
    losses.append(avg_loss)

total_loss = 0
loss_progress_step = 100    # log whenever the per-epoch sample count is a multiple of this
losses = []
train_dataset_size = len(train_dataset)
print(f"Training on {train_dataset_size} samples...")

# Number of batch losses accumulated in total_loss since the last log line.
batches_since_log = 0
last_batch_index = len(train_loader) - 1

model.train()
for e in range(EPOCHS):
    step_count = 0
    for i, train_data in enumerate(train_loader):
        output = model(
            train_data['users'].to(device), train_data['movies'].to(device)
        )
        # Drop only the trailing singleton dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also collapse a batch of size 1 to a 0-d tensor.
        output = output.squeeze(-1)
        ratings = train_data['ratings'].to(device=device, dtype=torch.float32)
        loss = loss_fn(output, ratings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is already the batch mean, so accumulate it as a scalar.
        total_loss += loss.item()
        batches_since_log += 1

        # Increment step count by the actual size of the batch
        step_count += len(train_data['users'])

        # Check if it's time to log progress
        if step_count % loss_progress_step == 0 or i == last_batch_index:
            # Average over the batches actually accumulated. The logging interval
            # is counted in samples, not batches, so dividing by
            # loss_progress_step would misreport the mean.
            log_progress(e, step_count, total_loss, batches_since_log, train_dataset_size, losses)
            total_loss = 0
            batches_since_log = 0

    # Advance the learning-rate schedule once per epoch. Without this call the
    # scheduler never changes the learning rate.
    sch.step()

Training on 22500085 samples...


In [None]:
from sklearn.metrics import mean_squared_error

y_pred = []
y_true = []

model.eval()    # disable dropout for evaluation

with torch.no_grad():
    for valid_data in validation_loader:
        output = model(
            valid_data['users'].to(device),
            valid_data['movies'].to(device)
        )
        # Flatten (batch, 1) predictions so y_pred and y_true are both flat
        # lists of Python floats.
        y_pred.extend(output.squeeze(-1).cpu().tolist())
        # Targets never need to go to the device; they are only collected here.
        y_true.extend(valid_data['ratings'].to(torch.float32).tolist())

# Root Mean Square Error. The square root is taken explicitly because the
# `squared=False` argument is deprecated/removed in recent scikit-learn releases.
rms = mean_squared_error(y_true, y_pred) ** 0.5
print(f'RMSE: {rms:.4f}')

In [None]:
from collections import defaultdict

def calculate_precision_recall(user_ratings, k, threshold):
    """Compute precision@k and recall@k for one user's predictions.

    Args:
        user_ratings: iterable of ``(estimated_rating, true_rating)`` pairs.
            The input is not modified.
        k: number of top-ranked items (by estimated rating) to consider.
        threshold: an item is relevant when its true rating is >= threshold and
            recommended when its estimated rating is >= threshold.

    Returns:
        ``(precision, recall)``. Precision is 1 when nothing in the top k is
        recommended; recall is 1 when the user has no relevant items.
    """
    # Sort a copy so the caller's list keeps its original order.
    ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
    top_k = ranked[:k]

    n_rel = sum(true_r >= threshold for _, true_r in ranked)
    n_rec_k = sum(est >= threshold for est, _ in top_k)
    n_rel_and_rec_k = sum(
        (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
    )

    precision = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
    recall = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    return precision, recall

# Maps each (encoded) user index to a list of (predicted_rating, true_rating) pairs.
user_ratings_comparison = defaultdict(list)

model.eval()    # make sure dropout is disabled even if this cell is run on its own
with torch.no_grad():
    for valid_data in validation_loader:
        users = valid_data['users'].to(device)
        movies = valid_data['movies'].to(device)
        output = model(users, movies)

        # Convert whole batches to Python lists at once instead of calling
        # .item() on every element.
        batch_users = valid_data['users'].tolist()
        batch_preds = output.squeeze(-1).cpu().tolist()
        batch_true = valid_data['ratings'].tolist()

        for user, pred, true in zip(batch_users, batch_preds, batch_true):
            user_ratings_comparison[user].append((pred, true))


user_precisions = dict()
user_based_recall = dict()

k = 50
# Relevance cut-off, expressed on the rating scale. The previous value of 50
# appears to lie outside that scale, so no item was ever relevant or recommended
# and both metrics defaulted to 1.0 for every user.
# NOTE(review): 3.5 assumes a 0.5-5 star scale (MovieLens-style) - confirm.
threshold = 3.5

for user_id, user_ratings in user_ratings_comparison.items():
    precision, recall = calculate_precision_recall(user_ratings, k, threshold)
    user_precisions[user_id] = precision
    user_based_recall[user_id] = recall


# Unweighted mean over users.
average_precision = sum(user_precisions.values()) / len(user_precisions)
average_recall = sum(user_based_recall.values()) / len(user_based_recall)

print(f"precision @ {k}: {average_precision:.4f}")
print(f"recall @ {k}: {average_recall:.4f}")

In [9]:
# Put the model in evaluation mode (disables dropout) before generating recommendations.
model.eval()
# Movie metadata; its movieId and title columns are used to map movie IDs to titles.
# NOTE(review): the ratings file is read from 'datasets/ratings.csv' but this path
# has no 'datasets/' prefix - confirm the location of movies.csv.
df_movies = pd.read_csv('movies.csv')

def get_user_recommendations(user_id, num_recomendations=10):
    """Return the titles of the movies with the highest predicted rating for a user.

    Every movie known to ``lbl_movie`` is scored for the given user with the
    global ``model`` and the best-scoring titles are returned, best first.

    Args:
        user_id: user index as fed to the model's user embedding, i.e. the
            label-encoded index rather than the raw ID from the ratings file.
            NOTE(review): callers holding raw IDs presumably need
            ``lbl_user.transform([raw_id])[0]`` first - confirm.
        num_recomendations: maximum number of titles to return. Values <= 0
            yield an empty list.

    Returns:
        List of movie titles; movies absent from ``df_movies`` appear as
        'Unknown Title'.

    The model is expected to already be in evaluation mode.
    """
    # classes_[i] is the ORIGINAL movie ID whose encoded index is i.
    original_movie_ids = lbl_movie.classes_
    n_movies = len(original_movie_ids)

    top_n = min(max(int(num_recomendations), 0), n_movies)
    if top_n == 0:
        return []

    # Pair this user with every encoded movie index.
    user_tensor = torch.tensor([int(user_id)] * n_movies).to(device)
    movie_tensor = torch.tensor(range(n_movies)).to(device)

    with torch.no_grad():
        predictions = model(user_tensor, movie_tensor)

    # squeeze(-1) keeps a 1-D result even when there is a single movie.
    scores = predictions.squeeze(-1).cpu().numpy()

    movie_id_to_title = dict(zip(df_movies['movieId'], df_movies['title']))

    # The IDs paired with the scores are already original movie IDs, so they are
    # looked up directly. Passing them through inverse_transform again would
    # treat an original ID as an encoded index and return the wrong movie.
    movie_predictions = sorted(
        zip(original_movie_ids, scores), key=lambda x: x[1], reverse=True
    )

    return [
        movie_id_to_title.get(movie_id, 'Unknown Title')
        for movie_id, _ in movie_predictions[:top_n]
    ]