In [1]:
%cd ..

/Users/westford14/Desktop/projects/watchlist/watchlist-recommender


In [2]:
from ast import literal_eval

import torch
import torch.nn as nn
import torch.optim as optim
import lightning as L
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ratings = pd.read_csv("data/ratings.csv")
ratings = ratings.sample(n=1_000_000)
movies = pd.read_csv("data/movies_metadata.csv")

  movies = pd.read_csv("data/movies_metadata.csv")


#### Loading

In [4]:
ratings = ratings[["userId", "movieId", "rating"]].rename(columns={
    "userId": "user_id",
    "movieId": "movie_id",
})

In [5]:
movies = movies[["id", "original_title", "overview", "genres"]]
movies = movies.rename(columns={
    "id": "movie_id",
    "original_title": "title"
})

#### Cleaning

In [6]:
genres = [
    'Animation',
    'Comedy',
    'Family',
    'Adventure',
    'Fantasy',
    'Romance',
    'Drama',
    'Action',
    'Crime',
    'Thriller',
    'Horror',
    'History',
    'Science Fiction',
    'Mystery',
    'War',
    'Foreign',
    'Music',
    'Documentary',
    'Western'
]

In [7]:
def cleaner(x, genres=genres):
    ret = []
    for y in x:
        if y in genres:
            ret.append(y)
    return ret

In [8]:
movies["genres"] = movies["genres"].apply(lambda x: [y["name"] for y in literal_eval(x)])

In [9]:
movies["genres"] = movies["genres"].apply(cleaner)

In [10]:
movies["text"] = movies["title"] + movies["overview"] + movies["genres"].apply(lambda x: " ".join(x))

In [11]:
movies = movies[["movie_id", "text"]]

In [12]:
movies = movies.dropna()

In [13]:
movies["movie_id"] = pd.to_numeric(movies["movie_id"], errors="coerce")

In [14]:
movies = movies.dropna()

In [15]:
vectorizer = TfidfVectorizer(stop_words='english', strip_accents="ascii")
tfidf_output = vectorizer.fit_transform(movies["text"].values.tolist())

In [16]:
mapper = UMAP(n_components=5).fit(
    tfidf_output
)

In [17]:
umap_output = mapper.transform(tfidf_output)

In [18]:
umap_df = pd.DataFrame(umap_output, columns=[f"feature_{x}" for x in range(5)])

In [19]:
fulL_df = pd.concat([movies.reset_index(drop=True), umap_df], axis=1)

In [20]:
full_dataset = fulL_df.merge(ratings, on="movie_id")

In [21]:
full_dataset = full_dataset.drop(columns=["text"])

#### Training

In [107]:
class MatrixFactorization(L.LightningModule):
    def __init__(self, num_users, num_movies, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.text_embedding = nn.Embedding(num_movies, embedding_dim)
        self.embedding_dim = embedding_dim

    def forward(self, user_id, movie_id, text_features):
        user_vector = self.user_embedding(user_id)
        movie_vector = self.movie_embedding(movie_id)
        m = nn.ZeroPad1d((0, self.embedding_dim - text_features.shape[1]))
        text_vector = m(text_features)
        return (user_vector * movie_vector * text_vector).sum(1)

    def training_step(self, batch, batch_idx):
        user_id, item_id, rating, text_features = batch
        prediction = self(user_id, item_id, text_features)
        loss = nn.functional.mse_loss(prediction, rating)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        user_id, item_id, rating, text_features = batch
        prediction = self(user_id, item_id, text_features)
        loss = nn.functional.mse_loss(prediction, rating)
        self.log('validation_loss', loss)
        return {'validation_loss': loss, 'y_hat': prediction, 'y': rating}

    def configure_optimizers(self):
        return optim.SGD(self.parameters(), lr=0.01)

In [108]:
class MovieLensDataset(Dataset):
    def __init__(self, dataframe=None):
        self.dataframe = dataframe
        self.num_users = self.dataframe['user_id'].nunique()
        self.num_items = self.dataframe['movie_id'].nunique()
        self.text_items = self.dataframe['movie_id'].nunique()

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        user_id = self.dataframe["user_id"].iloc[idx]
        movie_id = self.dataframe["movie_id"].iloc[idx]
        rating = self.dataframe["rating"].iloc[idx]
        text_features = self.dataframe[[f"feature_{x}" for x in range(5)]].iloc[idx]
        return (
            torch.tensor(user_id, dtype=torch.long),
            torch.tensor(movie_id, dtype=torch.long),
            torch.tensor(rating, dtype=torch.float),
            torch.tensor(text_features, dtype=torch.float)
        )

In [109]:
from sklearn.model_selection import train_test_split

In [110]:
train_df, test_df = train_test_split(full_dataset, test_size=0.7)
test_df, val_df = train_test_split(test_df, test_size=0.5)

train_dataset = MovieLensDataset(train_df)
val_dataset = MovieLensDataset(val_df)
test_dataset = MovieLensDataset(test_df)

In [111]:
train_data = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_data = DataLoader(val_dataset, batch_size=512, shuffle=True)
test_data = DataLoader(test_dataset, batch_size=512, shuffle=True)

In [119]:
import warnings
warnings.filterwarnings("ignore")

In [120]:
from lightning.pytorch.loggers import TensorBoardLogger

In [121]:
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint

In [125]:
model = MatrixFactorization(dataset.num_users, dataset.num_items, embedding_dim=20)

In [126]:
checkpoint_callback = ModelCheckpoint(
    monitor='validation_loss',
    dirpath='./checkpoints',  # saving to specific directory
    filename='best-checkpoint-nn-mf',  # consistent naming
    save_top_k=1,  # only keep the best
    mode='min',
    verbose=True
)

In [127]:
logger = TensorBoardLogger("lightning_logs", name="nn_mf")
trainer = L.Trainer(
    max_epochs=50,
    precision=16,
    logger=logger,
    enable_progress_bar=False,
    callbacks=[
        EarlyStopping(monitor="validation_loss", mode="min", min_delta=0.01),
        checkpoint_callback   
    ]
)
trainer.fit(model, train_data, val_data)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name            | Type      | Params | Mode 
------------------------------------------------------
0 | user_embedding  | Embedding | 2.8 M  | train
1 | movie_embedding | Embedding | 96.2 K | train
2 | text_embedding  | Embedding | 96.2 K | train
------------------------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
11.813    Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 7:  60%|████████████████████████████████████████████████████████████████                                           | 154/257 [1:38:20<1:05:46,  0.03it/s, v_num=6]


Epoch 0, global step 257: 'validation_loss' reached 70.74042 (best 70.74042), saving model to '/Users/westford14/Desktop/projects/watchlist/watchlist-recommender/checkpoints/best-checkpoint-nn-mf-v2.ckpt' as top 1
Epoch 1, global step 514: 'validation_loss' reached 52.82177 (best 52.82177), saving model to '/Users/westford14/Desktop/projects/watchlist/watchlist-recommender/checkpoints/best-checkpoint-nn-mf-v2.ckpt' as top 1
Epoch 2, global step 771: 'validation_loss' reached 44.06007 (best 44.06007), saving model to '/Users/westford14/Desktop/projects/watchlist/watchlist-recommender/checkpoints/best-checkpoint-nn-mf-v2.ckpt' as top 1
Epoch 3, global step 1028: 'validation_loss' reached 38.81806 (best 38.81806), saving model to '/Users/westford14/Desktop/projects/watchlist/watchlist-recommender/checkpoints/best-checkpoint-nn-mf-v2.ckpt' as top 1
Epoch 4, global step 1285: 'validation_loss' reached 35.32833 (best 35.32833), saving model to '/Users/westford14/Desktop/projects/watchlist/wa

NameError: name 'exit' is not defined

### Content Only

In [None]:
class ContentModel(L.LightningModule):
    def __init__(self, num_movies, embedding_dim):
        super().__init__()
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.text_embedding = nn.Embedding(num_movies, embedding_dim)
        self.embedding_dim = embedding_dim

    def forward(self, movie_id, text_features):
        movie_vector = self.movie_embedding(movie_id)
        m = nn.ZeroPad1d((0, self.embedding_dim - text_features.shape[1]))
        text_vector = m(text_features)
        return (movie_vector * text_vector).sum(1)

    def training_step(self, batch, batch_idx):
        item_id, rating, text_features = batch
        prediction = self(item_id, text_features)
        loss = nn.functional.mse_loss(prediction, rating)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        item_id, rating, text_features = batch
        prediction = self(user_id, item_id, text_features)
        loss = nn.functional.mse_loss(prediction, rating)
        self.log('validation_loss', loss)
        return {'validation_loss': loss, 'y_hat': prediction, 'y': rating}

    def configure_optimizers(self):
        return optim.SGD(self.parameters(), lr=0.01)

In [None]:
class TMDBDataset(Dataset):
    def __init__(self, dataframe=None):
        self.dataframe = dataframe
        self.num_items = self.dataframe['movie_id'].nunique()
        self.text_items = self.dataframe['movie_id'].nunique()

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        movie_id = self.dataframe["movie_id"].iloc[idx]
        rating = self.dataframe["rating"].iloc[idx]
        text_features = self.dataframe[[f"feature_{x}" for x in range(5)]].iloc[idx]
        return (
            torch.tensor(movie_id, dtype=torch.long),
            torch.tensor(rating, dtype=torch.float),
            torch.tensor(text_features, dtype=torch.float)
        )

In [None]:
train_df, test_df = train_test_split(full_dataset, test_size=0.7)
test_df, val_df = train_test_split(test_df, test_size=0.5)

train_dataset = MovieLensDataset(train_df)
val_dataset = MovieLensDataset(val_df)
test_dataset = MovieLensDataset(test_df)

#### Evaluation / Prediction

In [None]:
best_model = MatrixFactorization.load_from_checkpoint(
    "./checkpoints/best-checkpoint-nn-mf-v2.ckpt"
)

In [None]:
trainer = L.Trainer(enable_progress_bar=True)
trainer.test(dataloaders=test_data)

In [None]:
model = MatrixFactorization.load_from_checkpoint(
    checkpoints_path[0], num_users=num_users, num_items=num_items, embedding_dim=embedding_dim
).cpu()

In [None]:
def predict_for_user(model, user_id, num_items, text_data, embedding_size=20, top_nb=10):
    model.eval()  # Set model to evaluation mode
    user_id_tensor = torch.tensor([user_id] * num_items, dtype=torch.long)
    movie_id_tensor = torch.tensor(range(num_items), dtype=torch.long)

    m = nn.ZeroPad1d((0, embedding_size - text_data.shape[0]))
    text_vector = m(text_data)
    
    with torch.no_grad():  # Disable gradient calculation
        predictions = model(user_id_tensor, item_id_tensor, movie_id_tensor, text_vector)
    
    top = sorted(enumerate(predictions), reverse=True, key=lambda i: i[1])
    print(f"Predicted rating for User `{user_id + 1}` with best:")
    for item_id, pred in top[:top_nb]:
        print(f"\t-> Item {item_id + 1}: {pred.item():.2f}")