# Building a recommendation system using deep learning

[Abhishek Thakur](https://www.youtube.com/watch?v=MVB1cbe923A)

In [1]:
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



In [2]:
%mkdir /tmp/movie_recommendation
%cd /tmp/movie_recommendation/

/tmp/movie_recommendation


In [3]:
!kaggle competitions download -c predict-movie-ratings

Downloading train.csv.zip to /tmp/movie_recommendation
 83% 5.00M/6.00M [00:00<00:00, 24.5MB/s]
100% 6.00M/6.00M [00:00<00:00, 23.7MB/s]
Downloading test.csv.zip to /tmp/movie_recommendation
  0% 0.00/1.86M [00:00<?, ?B/s]
100% 1.86M/1.86M [00:00<00:00, 105MB/s]
Downloading sampleSubmission.csv.zip to /tmp/movie_recommendation
  0% 0.00/895k [00:00<?, ?B/s]
100% 895k/895k [00:00<00:00, 122MB/s]
Downloading train_v2.csv.zip to /tmp/movie_recommendation
 83% 5.00M/6.00M [00:00<00:00, 38.8MB/s]
100% 6.00M/6.00M [00:00<00:00, 38.3MB/s]
Downloading test_v2.csv.zip to /tmp/movie_recommendation
  0% 0.00/1.86M [00:00<?, ?B/s]
100% 1.86M/1.86M [00:00<00:00, 118MB/s]


In [4]:
!unzip sampleSubmission.csv.zip
!unzip train_v2.csv.zip
!unzip test_v2.csv.zip

Archive:  sampleSubmission.csv.zip
  inflating: sampleSubmission.csv    
Archive:  train_v2.csv.zip
  inflating: train_v2.csv            
Archive:  test_v2.csv.zip
  inflating: test_v2.csv             


## Import modules

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv('train_v2.csv')

In [7]:
df.user.nunique()

6040

In [8]:
df.movie.nunique()

3676

In [9]:
df.shape

(750156, 4)

There are 6040 unique user with 3676 unique movies, with total rows as 750156 with 4 columns

In [10]:
df.rating.value_counts()

4    261916
3    195864
5    169556
2     80862
1     41958
Name: rating, dtype: int64

## The youtuber repositories

In [11]:
!pip install tez

Collecting tez
  Downloading tez-0.1.8-py3-none-any.whl (15 kB)
Installing collected packages: tez
Successfully installed tez-0.1.8


In [12]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
import torch.nn as nn
import pandas as pd
import numpy as np
import torch
import tez

In [20]:
class MovieDataset:
  def __init__(self, users, movies, ratings):
    self.users = users
    self.movies = movies
    self.ratings = ratings

  def __len__(self):
    return len(self.users)

  def __getitem__(self, item):
    user = self.users[item]
    movie = self.movies[item]
    rating = self.ratings[item]

    return {"user": torch.tensor(user, dtype=torch.long),
            "movie": torch.tensor(movie, dtype=torch.long),
            "rating": torch.tensor(rating, dtype=torch.float),}

In [32]:
class RecSysModel(tez.Model):
  def __init__(self, num_users, num_movies):
    super().__init__()
    self.user_embed = nn.Embedding(num_users, 32)
    self.movie_embed = nn.Embedding(num_movies, 32)
    self.out = nn.Linear(64, 1)
    self.step_scheduler_after = "epoch"

  def fetch_optimizer(self):
    opt = torch.optim.Adam(self.parameters(), lr=1e-3)
    return opt

  def fetch_scheduler(self):
    sch = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3, gamma=0.7)
    return sch
  
  def monitor_metrics(self, output, rating):
    output = output.detach().cpu().numpy()
    rating = rating.detach().cpu().numpy()
    return {
        'rmse': np.sqrt(metrics.mean_squared_error(rating, output))
    }

  def forward(self, users, movies, ratings=None):
    user_embeds = self.user_embed(users)
    movie_embeds = self.movie_embed(movies)
    output = torch.cat([user_embeds, movie_embeds], dim=1)
    output = self.out(output)

    loss = nn.MSELoss()(output, ratings.view(-1,1))
    calc_metrics = self.monitor_metrics(output, ratings.view(-1, 1))
    return output, loss, calc_metrics 

In [33]:
def train():
  df = pd.read_csv('train_v2.csv')
  # ID, user, movie, rating
  lbl_user = preprocessing.LabelEncoder()
  lbl_movie = preprocessing.LabelEncoder()

  df_user = lbl_user.fit_transform(df.user.values)
  df_movie = lbl_movie.fit_transform(df.movie.values)

  df_train, df_valid = model_selection.train_test_split(df, test_size=0.2, random_state=21, 
                                                        stratify=df.rating.values)
  
  train_dataset = MovieDataset(users=df_train.user.values, 
                               movies=df_train.movie.values, 
                               ratings=df_train.rating.values)
  
  valid_dataset = MovieDataset(users=df_valid.user.values, 
                               movies=df_valid.movie.values, 
                               ratings=df_valid.rating.values)
  
  model = RecSysModel(num_users=len(lbl_user.classes_), 
                      num_movies=len(lbl_movie.classes_))
  
  model.fit(
      train_dataset, valid_dataset, 
      train_bs=1024, valid_bs=1024,
      fp16=True
  )

In [34]:
if __name__ == "__main__":
  train()

  cpuset_checked))
  0%|          | 0/587 [00:00<?, ?it/s]


TypeError: ignored

Another failure ...