In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import imdb
import seaborn as sns
import matplotlib.pyplot as plt
import re
from datetime import datetime
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset

In [43]:
class RecommendationModel(pl.LightningModule):
    """Hybrid recommender combining ID embeddings with dense side features.

    Three branches are computed and concatenated before the output head:

    * user and movie embeddings, concatenated and passed through two
      ReLU-activated linear layers (output width 10);
    * movie features through one ReLU-activated linear layer (output width 10);
    * user context through one ReLU-activated linear layer
      (output width ``context_count``).

    The head ends in a sigmoid, so predictions lie in (0, 1). Training and
    validation both use mean squared error.

    Args:
        lr: Learning rate passed to Adam.
        users_count: Number of rows in the user embedding table; every user
            index fed to ``forward`` must be smaller than this.
        movies_count: Number of rows in the movie embedding table; every movie
            index fed to ``forward`` must be smaller than this.
        embedding_size: Width of each embedding vector.
        features_count: Number of columns in the movie-feature input.
        context_count: Number of columns in the user-context input.
    """

    def __init__(self, lr, users_count, movies_count, embedding_size, features_count, context_count):
        super().__init__()
        self.loss_fn = torch.nn.MSELoss()
        self.lr = lr

        # Embedding branch.
        self.user_embedding = nn.Embedding(users_count, embedding_size)
        self.movie_embedding = nn.Embedding(movies_count, embedding_size)
        self.um_dense = nn.Linear(2 * embedding_size, embedding_size)
        self.um_dense_2 = nn.Linear(embedding_size, 10)

        # Side-information branches.
        self.movie_features = nn.Linear(features_count, 10)
        self.user_context = nn.Linear(context_count, context_count)

        # The head consumes the concatenation of all three branches, so its
        # input width must follow context_count instead of assuming 2.
        self.final_fc1 = nn.Linear(10 + 10 + context_count, 10)
        self.output = nn.Linear(10, 1)

    def forward(self, x):
        """Predict a score in (0, 1) for each sample of the batch.

        Args:
            x: Mapping with the keys ``'user'``, ``'movie'``,
                ``'movie_features'`` and ``'user_context'``. The first two are
                used as embedding indices; the last two are cast to float.
                Branch outputs are concatenated along dim 1, so a leading
                batch dimension is presumably expected -- confirm against the
                data loader.

        Returns:
            Tensor of sigmoid-activated predictions (last dimension of size 1).
        """
        # Out-of-range indices raise here directly; the previous bare
        # `except` + pdb breakpoint masked that error and stalled batch runs.
        emb_user = self.user_embedding(x['user'])
        emb_movie = self.movie_embedding(x['movie'])

        um_cat = torch.cat((emb_user, emb_movie), 1)
        um_fc1 = F.relu(self.um_dense(um_cat))
        um_fc2 = F.relu(self.um_dense_2(um_fc1))

        feat_fc = F.relu(self.movie_features(x['movie_features'].float()))
        con_fc = F.relu(self.user_context(x['user_context'].float()))

        um_cat_2 = torch.cat((um_fc2, feat_fc, con_fc), 1)
        # NOTE(review): there is no non-linearity between final_fc1 and output,
        # so the two layers compose into a single affine map -- confirm this is
        # intended.
        final_fc = self.final_fc1(um_cat_2)

        return torch.sigmoid(self.output(final_fc))

    def _shared_step(self, batch, log_name):
        """Compute the MSE loss for one batch and log it under ``log_name``."""
        inputs, targets = batch
        preds = self(inputs)
        loss = self.loss_fn(preds.float(), targets.float())
        self.log(log_name, loss, on_epoch=True)
        return loss

    def training_step(self, batch, batch_nb):
        """Return the training loss for ``batch``; logged as ``train_loss``."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_nb):
        """Return the validation loss for ``batch``; logged as ``validation_loss``."""
        return self._shared_step(batch, "validation_loss")

    def configure_optimizers(self):
        """Optimise all parameters with Adam at the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

class RecommendationDataset(Dataset):
    """Train or test split of a ratings CSV, served as index tensors.

    The CSV is read in full, split 80/20 with a fixed ``random_state`` of 42
    (so separate 'train' and 'test' instances built from the same file see
    complementary rows), and the requested split is converted to tensors.

    Args:
        data_path: Path of the CSV file. It must contain the columns
            ``'user'``, ``'product'``, ``'y'`` and every name listed in
            ``features`` and ``contexts``.
        features: Column names used as movie features.
        contexts: Column names used as user context.
        dataset_type: ``'train'`` or ``'test'``.

    Raises:
        ValueError: If ``dataset_type`` is neither ``'train'`` nor ``'test'``.

    Attributes set on the instance:
        all_users / all_movies: unique ``'user'`` / ``'product'`` values of the
        *whole* file (not only the selected split).
    """

    def __init__(self, data_path, features, contexts, dataset_type = 'train'):
        # Fail fast, before any I/O: an unknown split name used to surface
        # later as an UnboundLocalError on the temporary arrays.
        if dataset_type not in ('train', 'test'):
            raise ValueError(
                f"dataset_type must be 'train' or 'test', got {dataset_type!r}")

        full_df = pd.read_csv(data_path)
        self.all_users = list(full_df['user'].unique())
        self.all_movies = list(full_df['product'].unique())

        X_train, X_test, y_train, y_test = train_test_split(full_df.loc[:, full_df.columns != 'y'],
                                                            full_df['y'],
                                                            test_size=.2,
                                                            random_state=42)

        if dataset_type == 'train':
            X_split, y_split = X_train, y_train
        else:
            X_split, y_split = X_test, y_test

        # NOTE(review): every column is cast to int64, which would truncate
        # fractional feature/context/target values -- confirm the CSV is
        # integer-coded.
        self.x_user = torch.tensor(X_split['user'].to_numpy(), dtype=torch.int64)
        self.x_movie = torch.tensor(X_split['product'].to_numpy(), dtype=torch.int64)
        self.x_feats_tensors = torch.tensor(X_split[features].to_numpy(), dtype=torch.int64)
        self.x_contexts_tensors = torch.tensor(X_split[contexts].to_numpy(), dtype=torch.int64)
        # Targets are stored as a column so each item has shape (1,).
        self.y_data = torch.tensor(y_split.to_numpy(), dtype=torch.int64).reshape(-1, 1)

    def __len__(self):
        """Number of rows in the selected split."""
        return len(self.x_user)

    def __getitem__(self, idx):
        """Return ``(inputs, target)`` for row ``idx``.

        ``inputs`` is a dict with the keys ``'user'``, ``'movie'``,
        ``'movie_features'`` and ``'user_context'``; ``target`` is the
        matching entry of ``y_data``. A tensor ``idx`` is converted to plain
        Python first.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = {'user': self.x_user[idx],
                  'movie': self.x_movie[idx],
                  'movie_features': self.x_feats_tensors[idx],
                  'user_context': self.x_contexts_tensors[idx]}
        return sample, self.y_data[idx]

In [None]:
# Columns routed to the model's movie-feature branch.
# NOTE(review): 'product' is also the column used as the movie embedding
# index, so it reaches the model twice -- confirm that is intended.
features = [
    'product', 'year',
    'Documentary', 'Western', 'Mystery', 'IMAX', 'Drama', 'Biography',
    'Film-Noir', 'Music', 'Short', 'Thriller', 'Sport', 'Fantasy', 'Family',
    'Children', 'Crime', 'Horror', 'Adult', 'Animation', 'Comedy', 'History',
    'Adventure', 'Romance', 'War', 'Musical', 'Sci-Fi', 'Action',
]
# Columns routed to the model's user-context branch.
contexts = ['daytime', 'weekend']

batch_size = 4096

# Both splits come from the same CSV; the dataset class performs the split
# itself, so the file is read once per instance (train first, then test).
train_data, test_data = (
    RecommendationDataset('data/full_dataset.csv', features, contexts, dataset_type=split)
    for split in ('train', 'test')
)

# Only the training loader shuffles.
train_dataloader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

In [None]:
# Seed the Python, NumPy and PyTorch RNGs so weight initialisation and the
# shuffled training batches are reproducible from a fresh kernel. 42 matches
# the random_state used for the train/test split.
pl.seed_everything(42)

# Raw ids index straight into the embedding tables, so each table needs
# max(id) + 1 rows. all_users / all_movies are collected over the whole CSV,
# which also covers ids that only occur in the test split.
model = RecommendationModel(
    lr=0.1,
    users_count=max(train_data.all_users) + 1,
    movies_count=max(train_data.all_movies) + 1,
    embedding_size=50,
    features_count=len(features),
    context_count=len(contexts),
)

trainer = pl.Trainer(max_epochs=100)  # optionally add: accelerator='mps', devices=1
trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader)