In [0]:
# Mount Google Drive inside the Colab VM at /content/gdrive so files stored on
# the drive can be read by later cells.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
import warnings
# Suppress every Python warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings("ignore")

***

# START:

In [0]:
import numpy as np
import pandas as pd

from collections import Counter

import torch
import torch.nn as nn

from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

### DEVICE:

In [0]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression: displays the selected device as the cell output.
device

'cuda'

### PATH:

In [0]:
# Directory holding the CSV files. Written as an absolute path under the Drive
# mount point (/content/gdrive) so it does not depend on the current working
# directory. The trailing slash is required: file names are appended with `+`.
path = '/content/gdrive/My Drive/netflix_colab/data/'

***

## FUNCTIONS:

### 1. FN FOR TRAINING & VALIDATION:

In [0]:
def fn_train_eval(model, train_dataset, test_dataset, batch_size, optimizer, criterion, n_epochs, num_workers=2):
    """Train `model` for `n_epochs`, evaluating on `test_dataset` after every epoch.

    Every batch is read as a pair ``(features, ratings)`` where column 0 of
    ``features`` is the movie index and column 1 the user index; the model is
    called as ``model(users, movies)``.

    Args:
        model: module to optimise. Batches are moved to the device its
            parameters live on, so move the model before calling this function.
        train_dataset: dataset for the optimisation passes (reshuffled each epoch).
        test_dataset: dataset for the evaluation passes (read in order).
        batch_size: samples per batch for both loaders.
        optimizer: optimiser already bound to the model's parameters.
        criterion: loss called as ``criterion(predictions, ratings)``. It is
            assumed to return the mean over the batch (as the default
            ``nn.MSELoss()`` does) because epoch losses are averaged per sample.
        n_epochs: number of train + evaluate rounds.
        num_workers: worker processes used by each ``DataLoader`` (default 2).

    Returns:
        ``(listO_train_losses, listO_test_losses, model)``: the per-epoch train
        losses, the per-epoch test losses, and the trained model.

    Raises:
        ValueError: if a loader yields no samples.
    """

    # Follow the model's own device instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    def fn_run_epoch(iterator, training):
        """Run one pass over `iterator`; update the weights only when `training`."""
        model.train(training)  # train(False) is equivalent to eval()
        loss_sum, n_samples = 0.0, 0

        # Gradients are tracked only for training passes.
        with torch.set_grad_enabled(training):
            for batch in iterator:
                movies = batch[0][:, 0].to(run_device)
                users = batch[0][:, 1].to(run_device)
                ratings = batch[1].to(run_device)

                if training:
                    optimizer.zero_grad()               # INITIALIZE

                predictions = model(users, movies)      # PREDICT
                loss = criterion(predictions, ratings)  # COMPUTE LOSS

                if training:
                    loss.backward()                     # BACK PROP
                    optimizer.step()                    # GRADIENT DESCENT

                # Weight each batch by its size so that a short final batch
                # does not distort the epoch average.
                batch_n = ratings.size(0)
                loss_sum += loss.item() * batch_n
                n_samples += batch_n

        if n_samples == 0:
            raise ValueError('cannot average a loss over an empty iterator')

        return loss_sum / n_samples

    # RUN--------------------------------------------------------------------

    train_iterator = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # Evaluation does not depend on sample order, so the test set is not shuffled.
    test_iterator = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    listO_train_losses, listO_test_losses = [], []

    for epoch in range(n_epochs):
        # The phase banners are shown once, before the phase they announce.
        first_epoch = epoch == 0

        if first_epoch:
            print('TRAINING >>')
        train_loss = fn_run_epoch(train_iterator, training=True)

        if first_epoch:
            print('EVALUATING >>')
        test_loss = fn_run_epoch(test_iterator, training=False)

        listO_train_losses.append(train_loss)
        listO_test_losses.append(test_loss)

        print_params = [epoch+1, train_loss, test_loss]
        print('Epoch: {:03}  |  Train Loss: {:.3f}  | test Loss: {:.3f}  |'.format(*print_params))

    print()
    print('RETURNED: listO_train_losses, listO_test_losses, model ')

    return listO_train_losses, listO_test_losses, model

### 2. PLOTTING FN:

In [0]:
def retrn_model_plots(listO_train_losses, listO_test_losses):
    """Draw the per-epoch train and test losses as two lines on one figure.

    Args:
        listO_train_losses: train loss per epoch; its length sets the x axis (1..N).
        listO_test_losses: test loss per epoch, plotted against the same x values.
    """
    import pylab

    # Note: this changes the default figure size for every later plot as well.
    pylab.rcParams["figure.figsize"] = (10, 5)

    epochs = list(range(1, len(listO_train_losses) + 1))

    curves = (
        (listO_train_losses, 'train'),
        (listO_test_losses, 'test'),
    )
    for losses, legend_name in curves:
        pylab.plot(epochs, losses, '-', label=legend_name)

    pylab.legend()
    pylab.xlabel('EPOCHS')
    pylab.ylabel('LOSS')

    pylab.show()

***

### CHECK DATA:

In [0]:
# Load the training CSV for a quick look; the file's first column becomes the index.
df_train = pd.read_csv(path + 'df_final_trainset.csv', index_col = 0)
# Display the first rows as the cell output.
df_train.head()

Unnamed: 0,movie,user,rating
333064,0,0,3
911833,1,1,5
1287538,2,2,3
941269,3,3,3
1694504,4,4,4


In [0]:
# Largest movie id and largest user id present in the training data.
df_train.movie.max(), df_train.user.max()

(4997, 49799)

***

# CREATE - TRAIN & TEST PYTORCH DATASET CLASSES:

In [0]:
class RecommenderData(Dataset):
    """In-memory dataset built from a ratings CSV file.

    The file's first column is used as the index. Of the remaining columns,
    the last one is the target and all the others are the features.
    """

    def __init__(self, path, csv_file_name):
        """Read ``path + csv_file_name`` and convert its contents to tensors.

        Args:
            path: directory prefix; it is concatenated directly with the file
                name, so it must end with a path separator.
            csv_file_name: name of the CSV file to load.
        """
        frame = pd.read_csv(path + csv_file_name, index_col=0)

        feature_values = frame.iloc[:, :-1].values
        target_values = frame.iloc[:, -1].values

        # Number of rows, also returned by __len__.
        self.len = len(frame)

        # Features as int64, targets as float32.
        self.x_data = torch.from_numpy(feature_values).long()
        self.y_data = torch.from_numpy(target_values).float()

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at `index`."""
        features = self.x_data[index]
        target = self.y_data[index]
        return features, target

    def __len__(self):
        """Return the number of rows read from the CSV file."""
        return self.len

In [0]:
# Build the train and test datasets from the CSV files located under `path`.
train_dataset = RecommenderData(path=path, csv_file_name='df_final_trainset.csv')
test_dataset = RecommenderData(path=path, csv_file_name='df_final_testset.csv')

In [0]:
# Feature tensor of the training set: one row per rating, with the movie id and user id columns.
train_dataset.x_data

tensor([[    0,     0],
        [    1,     1],
        [    2,     2],
        ...,
        [ 3652,  8903],
        [   56, 10623],
        [  263, 39396]])

In [0]:
# Target tensor of the training set: the rating of each row, stored as float.
train_dataset.y_data

tensor([3., 5., 3.,  ..., 1., 4., 5.])

In [0]:
# Largest id in feature column 0 and in feature column 1; each embedding table
# must have at least (largest id + 1) rows for the lookups to be valid.
train_dataset.x_data[:, 0].max(), train_dataset.x_data[:, 1].max()

(tensor(4997), tensor(49799))

In [0]:
# Number of rows in the training set and in the test set.
train_dataset.len, test_dataset.len

(2350408, 587329)

***

# MODEL:

In [0]:
class MatrixFactorization_1(torch.nn.Module):
    """Matrix-factorisation model: the score of a (user, item) pair is the dot
    product of the user's embedding and the item's embedding."""

    def __init__(self, n_users, n_items, embedding_dim):
        """Create the two embedding tables.

        Args:
            n_users: number of rows in the user table.
            n_items: number of rows in the item table.
            embedding_dim: number of latent factors shared by both tables.
        """
        super().__init__()

        # Dense (non-sparse gradient) lookup tables of identical width.
        self.user_embeddings = torch.nn.Embedding(
            num_embeddings=n_users, embedding_dim=embedding_dim, sparse=False)
        self.item_embeddings = torch.nn.Embedding(
            num_embeddings=n_items, embedding_dim=embedding_dim, sparse=False)

    def forward(self, user, item):
        """Return the dot product of the looked-up embeddings, reduced over dim 1.

        Args:
            user: tensor of user indices.
            item: tensor of item indices, aligned with `user`.
        """
        # Look up the latent vectors of each user and item.
        user_vec = self.user_embeddings(user)
        item_vec = self.item_embeddings(item)

        # Element-wise product summed over the embedding dimension.
        return torch.sum(user_vec * item_vec, dim=1)

***

# MODEL TRAINING & EVALUATION:

### TRIAL 1: Embedding_dim = 100

In [0]:
n_users, n_items, embedding_dim = 50000, 5000, 100

model = MatrixFactorization_1(n_users, n_items, embedding_dim)

learning_rate = 5e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

model = model.to(device)
criterion = criterion.to(device)

train_dataset = train_dataset
test_dataset = test_dataset
batch_size = 1000
n_epochs = 30

%time listO_train_losses, listO_test_losses, model = fn_train_eval(model, train_dataset, test_dataset, batch_size, optimizer, criterion, n_epochs)

TRAINING >>
EVALUATING >>
Epoch: 001  |  Train Loss: 49.048  | test Loss: 19.628  |
Epoch: 002  |  Train Loss: 5.995  | test Loss: 6.882  |
Epoch: 003  |  Train Loss: 1.814  | test Loss: 5.190  |
Epoch: 004  |  Train Loss: 1.782  | test Loss: 4.398  |
Epoch: 005  |  Train Loss: 2.060  | test Loss: 3.681  |
Epoch: 006  |  Train Loss: 1.721  | test Loss: 3.121  |
Epoch: 007  |  Train Loss: 1.522  | test Loss: 2.798  |
Epoch: 008  |  Train Loss: 1.463  | test Loss: 2.472  |
Epoch: 009  |  Train Loss: 1.354  | test Loss: 2.258  |
Epoch: 010  |  Train Loss: 1.235  | test Loss: 2.050  |
Epoch: 011  |  Train Loss: 1.142  | test Loss: 1.928  |
Epoch: 012  |  Train Loss: 1.059  | test Loss: 1.795  |
Epoch: 013  |  Train Loss: 0.976  | test Loss: 1.723  |
Epoch: 014  |  Train Loss: 0.904  | test Loss: 1.635  |
Epoch: 015  |  Train Loss: 0.843  | test Loss: 1.591  |
Epoch: 016  |  Train Loss: 0.785  | test Loss: 1.542  |
Epoch: 017  |  Train Loss: 0.733  | test Loss: 1.517  |
Epoch: 018  |  Train

***

### TRIAL 2: Embedding_dim = 50

In [0]:
n_users, n_items, embedding_dim = 50000, 5000, 50

model = MatrixFactorization_1(n_users, n_items, embedding_dim)

learning_rate = 5e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

model = model.to(device)
criterion = criterion.to(device)

train_dataset = train_dataset
test_dataset = test_dataset
batch_size = 1000
n_epochs = 30

%time listO_train_losses, listO_test_losses, model = fn_train_eval(model, train_dataset, test_dataset, batch_size, optimizer, criterion, n_epochs)

TRAINING >>
EVALUATING >>
Epoch: 001  |  Train Loss: 27.620  | test Loss: 9.644  |
Epoch: 002  |  Train Loss: 3.283  | test Loss: 2.923  |
Epoch: 003  |  Train Loss: 1.288  | test Loss: 2.268  |
Epoch: 004  |  Train Loss: 1.114  | test Loss: 2.032  |
Epoch: 005  |  Train Loss: 1.120  | test Loss: 1.841  |
Epoch: 006  |  Train Loss: 1.054  | test Loss: 1.692  |
Epoch: 007  |  Train Loss: 0.967  | test Loss: 1.582  |
Epoch: 008  |  Train Loss: 0.911  | test Loss: 1.507  |
Epoch: 009  |  Train Loss: 0.864  | test Loss: 1.447  |
Epoch: 010  |  Train Loss: 0.814  | test Loss: 1.397  |
Epoch: 011  |  Train Loss: 0.769  | test Loss: 1.368  |
Epoch: 012  |  Train Loss: 0.731  | test Loss: 1.325  |
Epoch: 013  |  Train Loss: 0.695  | test Loss: 1.314  |
Epoch: 014  |  Train Loss: 0.663  | test Loss: 1.297  |
Epoch: 015  |  Train Loss: 0.636  | test Loss: 1.287  |
Epoch: 016  |  Train Loss: 0.610  | test Loss: 1.280  |
Epoch: 017  |  Train Loss: 0.588  | test Loss: 1.280  |
Epoch: 018  |  Train 

***

### TRIAL 3: Embedding_dim = 10

In [0]:
n_users, n_items, embedding_dim = 50000, 5000, 10

model = MatrixFactorization_1(n_users, n_items, embedding_dim)

learning_rate = 5e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

model = model.to(device)
criterion = criterion.to(device)

train_dataset = train_dataset
test_dataset = test_dataset
batch_size = 1000
n_epochs = 30

%time listO_train_losses, listO_test_losses, model = fn_train_eval(model, train_dataset, test_dataset, batch_size, optimizer, criterion, n_epochs)

TRAINING >>
EVALUATING >>
Epoch: 001  |  Train Loss: 14.176  | test Loss: 4.548  |
Epoch: 002  |  Train Loss: 1.971  | test Loss: 1.312  |
Epoch: 003  |  Train Loss: 1.050  | test Loss: 1.084  |
Epoch: 004  |  Train Loss: 0.947  | test Loss: 1.017  |
Epoch: 005  |  Train Loss: 0.898  | test Loss: 0.975  |
Epoch: 006  |  Train Loss: 0.860  | test Loss: 0.951  |
Epoch: 007  |  Train Loss: 0.835  | test Loss: 0.944  |
Epoch: 008  |  Train Loss: 0.818  | test Loss: 0.937  |
Epoch: 009  |  Train Loss: 0.803  | test Loss: 0.933  |
Epoch: 010  |  Train Loss: 0.789  | test Loss: 0.930  |
Epoch: 011  |  Train Loss: 0.775  | test Loss: 0.927  |
Epoch: 012  |  Train Loss: 0.761  | test Loss: 0.926  |
Epoch: 013  |  Train Loss: 0.749  | test Loss: 0.921  |
Epoch: 014  |  Train Loss: 0.737  | test Loss: 0.918  |
Epoch: 015  |  Train Loss: 0.726  | test Loss: 0.920  |
Epoch: 016  |  Train Loss: 0.717  | test Loss: 0.919  |
Epoch: 017  |  Train Loss: 0.708  | test Loss: 0.919  |
Epoch: 018  |  Train 

***

### TRIAL 4:  Embedding_dim = 5

In [0]:
n_users, n_items, embedding_dim = 50000, 5000, 5

model = MatrixFactorization_1(n_users, n_items, embedding_dim)

learning_rate = 5e-3
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

model = model.to(device)
criterion = criterion.to(device)

train_dataset = train_dataset
test_dataset = test_dataset
batch_size = 1000
n_epochs = 30

%time listO_train_losses, listO_test_losses, model = fn_train_eval(model, train_dataset, test_dataset, batch_size, optimizer, criterion, n_epochs)

TRAINING >>
EVALUATING >>
Epoch: 001  |  Train Loss: 13.678  | test Loss: 5.323  |
Epoch: 002  |  Train Loss: 2.151  | test Loss: 1.269  |
Epoch: 003  |  Train Loss: 1.058  | test Loss: 1.024  |
Epoch: 004  |  Train Loss: 0.942  | test Loss: 0.960  |
Epoch: 005  |  Train Loss: 0.900  | test Loss: 0.926  |
Epoch: 006  |  Train Loss: 0.869  | test Loss: 0.903  |
Epoch: 007  |  Train Loss: 0.847  | test Loss: 0.891  |
Epoch: 008  |  Train Loss: 0.834  | test Loss: 0.884  |
Epoch: 009  |  Train Loss: 0.825  | test Loss: 0.881  |
Epoch: 010  |  Train Loss: 0.819  | test Loss: 0.878  |
Epoch: 011  |  Train Loss: 0.814  | test Loss: 0.878  |
Epoch: 012  |  Train Loss: 0.809  | test Loss: 0.877  |
Epoch: 013  |  Train Loss: 0.806  | test Loss: 0.875  |
Epoch: 014  |  Train Loss: 0.802  | test Loss: 0.876  |
Epoch: 015  |  Train Loss: 0.798  | test Loss: 0.877  |
Epoch: 016  |  Train Loss: 0.794  | test Loss: 0.877  |
Epoch: 017  |  Train Loss: 0.790  | test Loss: 0.876  |
Epoch: 018  |  Train 