In [1]:
!pip install pytorch_lightning



In [2]:
!pip install tabml



In [1]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.multiprocessing
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import tabml.datasets

GLOBAL_SEED = 42  # number of life
torch.manual_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
torch.multiprocessing.set_sharing_strategy('file_system')

In [4]:
df_dict = tabml.datasets.download_movielen_1m()
users, movies, ratings = df_dict["users"], df_dict["movies"], df_dict["ratings"]
ratings["Rating"] = ratings["Rating"] - 3  # rating range (-2, 2)
train_ratings, validation_ratings = train_test_split(
    ratings, test_size=0.1, random_state=GLOBAL_SEED
)

In [5]:
# map movie id and user id to indexes.
movie_index_by_id = {id: idx for idx, id in enumerate(movies["MovieID"])}
user_index_by_id = {id: idx for idx, id in enumerate(users["UserID"])}


class MLDataset(Dataset):
    def __init__(self, ratings: pd.DataFrame):
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, index):
        user_id = self.ratings["UserID"].iloc[index]
        movie_id = self.ratings["MovieID"].iloc[index]
        rating = self.ratings["Rating"].iloc[index]
        user_index = user_index_by_id[user_id]
        movie_index = movie_index_by_id[movie_id]
        return user_index, movie_index, rating


training_data = MLDataset(train_ratings)
validation_data = MLDataset(validation_ratings)
batch_size = 1024
train_dataloader = DataLoader(
    training_data, batch_size=batch_size, shuffle=True, num_workers=5
)
validation_dataloader = DataLoader(
    validation_data, batch_size=batch_size, shuffle=False, num_workers=5
)



In [6]:
!pip install jdc



In [7]:
import jdc

LR = 1
WEIGHT_DECAY = 5e-5


class MatrixFactorization(pl.LightningModule):
    """Pytorch lighting class for Matrix Factorization training.

    Attributes:
        n_users: number of users.
        n_items: number of items.
        n_factors: number of latent factors (or embedding size)
    """

    def __init__(self, n_users: int, n_items: int, n_factors: int = 40):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.n_factors = n_factors
        self.user_biases = nn.Embedding(n_users, 1)
        self.item_biases = nn.Embedding(n_items, 1)
        self.bias = nn.Parameter(data=torch.rand(1))
        self.user_embeddings = nn.Embedding(n_users, n_factors)
        self.item_embeddings = nn.Embedding(n_items, n_factors)
        self.training_outputs = []
        self.validation_outputs = []

    def forward(self, users, items):
        """
        Forward pass through the model. For a single user and item, this
        looks like:
        bias + user_bias + item_bias + user_embeddings.dot(item_embeddings)

        Arguments:
            users: Array of user indices
            items : Array of item indices
        Returns:
            preds: Predicted ratings.
        """
        # select users and items from the batch
        batch_user_embs = self.user_embeddings(users)
        batch_item_embs = self.item_embeddings(items)

        preds = torch.reshape(
            torch.diag(
                torch.matmul(batch_user_embs, torch.transpose(batch_item_embs, 0, 1))
            ),
            (-1, 1),
        )
        # add bias
        preds += self.user_biases(users) + self.item_biases(items) + self.bias

        return torch.clip(preds.squeeze(), min=-2, max=2)

    def training_step(self, batch, batch_idx):
        users, items, rating = batch
        rating = rating.to(torch.float32)
        output = self.forward(users, items)
        loss = F.mse_loss(rating, output)
        self.log("batch_loss", loss)
        return {"loss": loss}  # for computing avg_loss in training_epoch_end

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
        return optimizer

In [8]:
def validation_step(self, batch, batch_idx):
    users, items, rating = batch
    rating = rating.to(torch.float32)
    output = self.forward(users, items)
    loss = F.mse_loss(rating, output)
    self.log("batch_loss", loss)
    self.validation_outputs.append({"loss": loss})

def on_training_epoch_end(self):
    avg_loss = torch.stack([x["loss"] for x in self.training_outputs]).mean()
    self.logger.experiment.add_scalars(
        "Loss", {"Train": avg_loss}, self.current_epoch
    )
    self.logger.experiment.add_scalars(
        "RMSE", {"Train": avg_loss ** 0.5}, self.current_epoch
    )
    self.training_outputs.clear()

def on_validation_epoch_end(self):
    avg_loss = torch.stack([x["loss"] for x in self.validation_outputs]).mean()
    self.logger.experiment.add_scalars(
        "Loss", {"Val": avg_loss}, self.current_epoch
    )
    self.logger.experiment.add_scalars(
        "RMSE", {"Val": avg_loss ** 0.5}, self.current_epoch
    )
    self.validation_outputs.clear()

In [9]:
# for tensorboard
from pytorch_lightning.loggers import TensorBoardLogger
logger = TensorBoardLogger("mf_tb_logs", name=f"lr{LR}_wd{WEIGHT_DECAY}")

n_users = len(user_index_by_id)
n_movies = len(movie_index_by_id)
n_factors = 40
model = MatrixFactorization(n_users=n_users, n_items=n_movies, n_factors=n_factors)
trainer = pl.Trainer(max_epochs=100, logger=logger, accelerator='auto')
trainer.fit(model, train_dataloader, validation_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:72: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.

  | Name            | Type      | Params
----------------------------------------------
0 | user_biases     | Embedding | 6.0 K 
1 | item_biases     | Embedding | 3.9 K 
2 | user_embeddings | Embedding | 241 K 
3 | item_embeddings | Embedding | 155 K 
  | other params    | n/a       | 1     
----------------------------------------------
406 K     Trainable params
0         Non-trainable params
406 K     Total params
1.627     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [10]:
def eval_model(model, train_dataloader):
    loss = 0
    for users, items, rating in train_dataloader:
        pred = model(users, items)
        loss += F.mse_loss(pred, rating)
    RMSE = (loss / len(train_dataloader))**.5
    return RMSE
    
print("Train RMSE: {:.3f}".format(eval_model(model, train_dataloader)))
print("Validation RMSE: {:.3f}".format(eval_model(model, validation_dataloader)))

Train RMSE: 0.797
Validation RMSE: 0.880


In [21]:
def recommend_movies(model, user_id, n_recommendations=5):
    # Convert the user_id to the corresponding user_index
    user_index = torch.tensor([user_index_by_id[user_id]])

    # Initialize a list to store the predicted ratings
    predicted_ratings = []

    # Use the model to predict the user's rating for each movie
    for movie_id in range(n_movies):
        item_index = torch.tensor([movie_id])
        predicted_rating = model(user_index, item_index)
        predicted_ratings.append(predicted_rating.item())

    # Convert the list of ratings to a tensor
    predicted_ratings = torch.tensor(predicted_ratings)

    # Get the indices of the top rated movies
    top_movie_indices = predicted_ratings.topk(n_recommendations).indices

    # Convert the indices back to movie IDs
    top_movie_ids = [movies["MovieID"].iloc[idx.item()] for idx in top_movie_indices]

    return top_movie_ids


user_id = 1  # replace with the ID of the user
recommended_movie_ids = recommend_movies(model, user_id)
print("Recommended movies for user", user_id, ":", recommended_movie_ids)


Recommended movies for user 1 : [527, 318, 858, 2028, 1198]


In [25]:
torch.save(model.state_dict(), 'model.pth')

In [30]:
model1 = MatrixFactorization(n_users=n_users, n_items=n_movies, n_factors=n_factors)
model1.load_state_dict(torch.load('model.pth'))

<All keys matched successfully>

In [35]:
user_id = 1  # replace with the ID of the user
recommended_movie_ids = recommend_movies(model1, user_id)
print("Recommended movies for user", user_id, ":", recommended_movie_ids)
    

Recommended movies for user 1 : [527, 318, 858, 2028, 1198]


In [29]:
def eval_model(model, train_dataloader):
    loss = 0
    for users, items, rating in train_dataloader:
        pred = model(users, items)
        loss += F.mse_loss(pred, rating)
    RMSE = (loss / len(train_dataloader))**.5
    return RMSE
    
print("Train RMSE: {:.3f}".format(eval_model(model1, train_dataloader)))
print("Validation RMSE: {:.3f}".format(eval_model(model1, validation_dataloader)))

Train RMSE: 0.797
Validation RMSE: 0.880


In [31]:
users, movies, ratings

(      UserID Gender  Age  Occupation Zip-code
 0          1      F    1          10    48067
 1          2      M   56          16    70072
 2          3      M   25          15    55117
 3          4      M   45           7    02460
 4          5      M   25          20    55455
 ...      ...    ...  ...         ...      ...
 6035    6036      F   25          15    32603
 6036    6037      F   45           1    76006
 6037    6038      F   56           1    14706
 6038    6039      F   45           0    01060
 6039    6040      M   25           6    11106
 
 [6040 rows x 5 columns],
       MovieID                               Title  \
 0           1                    Toy Story (1995)   
 1           2                      Jumanji (1995)   
 2           3             Grumpier Old Men (1995)   
 3           4            Waiting to Exhale (1995)   
 4           5  Father of the Bride Part II (1995)   
 ...       ...                                 ...   
 3878     3948             Mee