In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import imdb
import seaborn as sns
import matplotlib.pyplot as plt
import re
from datetime import datetime
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [3]:
# Accelerator auto-detection (CUDA first, then Apple MPS) is deliberately
# switched off here, so every tensor and module in this notebook lives on the CPU.
device = torch.device('cpu')
print(f'Using {device} device')

Using cpu device


In [4]:
class RecommendationModel(nn.Module):
    """Hybrid recommender mixing user/movie embeddings with side features.

    Three branches are evaluated and concatenated before the output head:

    * user and movie embeddings -> ``um_dense`` -> ``um_dense_2`` (10 units)
    * movie features            -> ``movie_features``             (10 units)
    * user context              -> ``user_context``               (``context_count`` units)

    The concatenation goes through ``final_fc1`` and ``output`` and is squashed
    with a sigmoid, so ``forward`` returns one value in (0, 1) per sample.

    Args:
        users_count: number of rows in the user embedding table.
        movies_count: number of rows in the movie embedding table.
        embedding_size: width of each embedding vector.
        features_count: number of movie-feature columns.
        context_count: number of user-context columns.
    """

    def __init__(self, users_count, movies_count, embedding_size, features_count, context_count):
        super(RecommendationModel, self).__init__()
        self.user_embedding = nn.Embedding(users_count, embedding_size)
        self.movie_embedding = nn.Embedding(movies_count, embedding_size)
        self.um_dense = nn.Linear(2*embedding_size, embedding_size)
        self.um_dense_2 = nn.Linear(embedding_size, 10)

        self.movie_features = nn.Linear(features_count, 10)
        self.user_context = nn.Linear(context_count, context_count)

        # The context branch emits `context_count` units, so the merged width
        # must follow it instead of assuming exactly two context columns.
        self.final_fc1 = nn.Linear(10 + 10 + context_count, 10)
        self.output = nn.Linear(10, 1)

    @staticmethod
    def _to_indices(values, num_embeddings):
        """Return a 1-D int64 index tensor suitable for an ``nn.Embedding`` lookup.

        Accepts a batch of indices with shape ``(batch,)`` or ``(batch, 1)``, or
        a batch of one-hot rows with shape ``(batch, num_embeddings)``.
        """
        if values.dim() == 2:
            if values.size(1) == num_embeddings:
                # One-hot rows: the position of the single 1 is the index.
                return values.argmax(dim=1)
            if values.size(1) == 1:
                return values.squeeze(1).long()
        return values.long()

    def forward(self, x):
        """Score a batch.

        Args:
            x: mapping with the keys ``'user'``, ``'movie'``,
                ``'movie_features'`` and ``'user_context'``. ``'user'`` and
                ``'movie'`` may hold indices or one-hot rows (see
                ``_to_indices``); the two feature tensors have shape
                ``(batch, features_count)`` and ``(batch, context_count)`` and
                may use any numeric dtype.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in (0, 1).
        """
        x_user = self._to_indices(x['user'], self.user_embedding.num_embeddings)
        x_movie = self._to_indices(x['movie'], self.movie_embedding.num_embeddings)
        # nn.Linear rejects integer inputs, so align the dtype with the layer weights.
        x_movie_features = x['movie_features'].to(self.movie_features.weight.dtype)
        x_context_features = x['user_context'].to(self.user_context.weight.dtype)

        emb_user = self.user_embedding(x_user)
        emb_movie = self.movie_embedding(x_movie)
        um_cat = torch.cat((emb_user, emb_movie), 1)
        um_fc1 = F.relu(self.um_dense(um_cat))
        um_fc2 = F.relu(self.um_dense_2(um_fc1))

        feat_fc = F.relu(self.movie_features(x_movie_features))
        con_fc = F.relu(self.user_context(x_context_features))

        # Merge all three branches; previously only the context branch reached
        # the output layer, with a mismatching width.
        final_cat = torch.cat((um_fc2, feat_fc, con_fc), 1)
        final_fc = F.relu(self.final_fc1(final_cat))

        return torch.sigmoid(self.output(final_fc))

class RecommendationDataset(Dataset):
    """Train or test split of the interaction CSV, exposed as tensors.

    The CSV at ``data_path`` must contain the columns ``'user'``, ``'product'``,
    ``'y'`` and every name listed in ``features`` and ``contexts``. Rows are
    split 80/20 with a fixed seed, so the ``'train'`` and ``'test'`` instances
    built from the same file are disjoint and reproducible.

    Args:
        device: torch device (or device string) on which tensors are stored.
        data_path: path or buffer accepted by ``pandas.read_csv``.
        features: column names forming the ``'movie_features'`` input.
        contexts: column names forming the ``'user_context'`` input.
        dataset_type: ``'train'`` or ``'test'``.
        one_hot: when True (default, the historical behaviour) ``'user'`` and
            ``'movie'`` are one-hot vectors over ``all_users`` / ``all_movies``.
            When False they are scalar positions in those lists, which is the
            input format ``nn.Embedding`` expects and avoids materialising
            vectors as long as the whole user / movie vocabulary.

    Raises:
        ValueError: if ``dataset_type`` is neither ``'train'`` nor ``'test'``.
    """

    def __init__(self, device, data_path, features, contexts, dataset_type = 'train', one_hot = True):
        # Fail early with a clear message instead of an UnboundLocalError later on.
        if dataset_type not in ('train', 'test'):
            raise ValueError(f"dataset_type must be 'train' or 'test', got {dataset_type!r}")

        self.device = device
        self.one_hot = one_hot
        full_df = pd.read_csv(data_path)
        self.all_users = list(full_df['user'].unique())
        self.all_movies = list(full_df['product'].unique())

        X_train, X_test, y_train, y_test = train_test_split(full_df.loc[:, full_df.columns != 'y'],
                                                            full_df['y'],
                                                            test_size=.2,
                                                            random_state=42)

        if dataset_type == 'train':
            X_part, y_part = X_train, y_train
        else:
            X_part, y_part = X_test, y_test

        user_ids = X_part['user'].to_numpy()
        movie_ids = X_part['product'].to_numpy()

        # Raw identifiers, kept for callers that read them directly.
        self.x_user_tensors = torch.tensor(user_ids, dtype=torch.int64, device=self.device)
        self.x_movie_tensors = torch.tensor(movie_ids, dtype=torch.int64, device=self.device)

        # Position of every identifier inside all_users / all_movies, resolved
        # once here instead of scanning the full lists on every __getitem__.
        self.x_user_index = torch.tensor(pd.Index(self.all_users).get_indexer(user_ids),
                                         dtype=torch.int64, device=self.device)
        self.x_movie_index = torch.tensor(pd.Index(self.all_movies).get_indexer(movie_ids),
                                          dtype=torch.int64, device=self.device)

        # Stored as float32 because these tensors are consumed by nn.Linear layers.
        self.x_feats_tensors = torch.tensor(X_part[features].to_numpy(), dtype=torch.float32, device=self.device)
        self.x_contexts_tensors = torch.tensor(X_part[contexts].to_numpy(), dtype=torch.float32, device=self.device)
        # NOTE(review): the target stays int64 as before; a BCE-style loss would need
        # a float target - confirm against the training loop.
        self.y_data = torch.tensor(y_part.to_numpy(), dtype=torch.int64, device=self.device)

    def __len__(self):
        """Number of rows in the selected split."""
        return len(self.x_user_tensors)

    def __getitem__(self, idx):
        """Return ``(inputs, target)`` for row ``idx``.

        ``inputs`` is a dict with the keys ``'user'``, ``'movie'``,
        ``'movie_features'`` and ``'user_context'``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        users_vector = self.x_user_index[idx]
        movies_vector = self.x_movie_index[idx]
        if self.one_hot:
            # Vectorised one-hot encoding; yields int64 tensors on self.device.
            users_vector = F.one_hot(users_vector, num_classes=len(self.all_users))
            movies_vector = F.one_hot(movies_vector, num_classes=len(self.all_movies))
        feats_vector = self.x_feats_tensors[idx]
        context_vector = self.x_contexts_tensors[idx]
        y_vector = self.y_data[idx]

        return {'user': users_vector,
                'movie': movies_vector,
                'movie_features': feats_vector,
                'user_context': context_vector}, y_vector

In [5]:
# Columns fed to the model as movie features.
# NOTE(review): 'product' (the movie identifier) is also passed as a numeric
# feature here - confirm this is intended.
features = [
    'product', 'year',
    'Documentary', 'Western', 'Mystery', 'IMAX', 'Drama', 'Biography',
    'Film-Noir', 'Music', 'Short', 'Thriller', 'Sport', 'Fantasy', 'Family',
    'Children', 'Crime', 'Horror', 'Adult', 'Animation', 'Comedy',
    'History', 'Adventure', 'Romance', 'War', 'Musical', 'Sci-Fi', 'Action',
]
# Columns fed to the model as user context.
contexts = ['daytime', 'weekend']

# Training split of the full dataset, served in shuffled batches of 64.
train_data = RecommendationDataset(device,
                                   'data/full_dataset.csv',
                                   features,
                                   contexts,
                                   dataset_type='train')
train_dataloader = torch.utils.data.DataLoader(dataset=train_data, batch_size=64, shuffle=True)

In [6]:
# Pull a single mini-batch out of the loader and stop immediately; X_batch and
# y_batch are kept so that the following cells can inspect them.
for (batch_idx, batch) in enumerate(train_dataloader):
    X_batch, y_batch = batch
    break

0it [00:22, ?it/s]


In [9]:
# List the input groups carried by the collated batch dictionary.
X_batch.keys()

dict_keys(['user', 'movie', 'movie_features', 'user_context'])

In [11]:
# Report the shape of every input tensor in the batch.
for key, tensor in X_batch.items():
    print(key, ': ', tensor.shape)

user :  torch.Size([64, 283228])
movie :  torch.Size([64, 53889])
movie_features :  torch.Size([64, 28])
user_context :  torch.Size([64, 2])


In [15]:
# Display the 'user' entry of the sampled batch.
X_batch['user']

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [None]:
# Size the model from the dataset vocabulary and the selected column lists.
model = RecommendationModel(
    users_count=len(train_data.all_users),
    movies_count=len(train_data.all_movies),
    embedding_size=50,
    features_count=len(features),
    context_count=len(contexts),
)