In [1]:
import os

# Switch the working directory to the project root. Presumably this is what
# lets the `jup` package imports and relative paths used later in the notebook
# (e.g. 'runs/mf') resolve -- confirm against the project layout.
# The location can be overridden with the JUP_PROJECT_ROOT environment variable
# so the notebook is not tied to one machine; the original path is the default.
os.chdir(os.environ.get("JUP_PROJECT_ROOT", "/Users/tien/Documents/PythonEnvs/pytorch/"))

# Imports

## General

In [2]:

import io
import math
import copy
import pickle
# import zipfile
#from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
#from collections import defaultdict
#from urllib.error import URLError
#from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F 
#from torch.optim.lr_scheduler import _LRScheduler

from torch.utils.tensorboard import SummaryWriter

from jup.utils import set_random_seed

## For Data

In [3]:
from jup.recsys_models.data.movielens_1m import read_data
from jup.recsys_models.data.movielens_1m import create_dataset
from jup.recsys_models.data.movielens_1m import tabular_preview
from jup.recsys_models.data.movielens_1m import RatingsIterator

## For Learning Rate

In [4]:
from jup.utils import learning_rate

# Data

In [5]:
ratings, movies = read_data()

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
tabular_preview(ratings, movies)

movieId,110,260,480,589,593,608,1196,1198,1210,1270,1580,2028,2571,2762,2858
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
889,4.0,4.0,3.0,5.0,5.0,4.0,4.0,,3.0,4.0,3.0,3.0,5.0,,2.0
1015,4.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0,5.0,4.0
1150,2.0,5.0,,2.0,3.0,5.0,4.0,2.0,3.0,2.0,2.0,2.0,1.0,2.0,4.0
1181,3.0,4.0,2.0,5.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,5.0,4.0,3.0
1449,3.0,3.0,2.0,2.0,5.0,5.0,3.0,4.0,2.0,2.0,4.0,3.0,4.0,4.0,4.0
1680,1.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0,4.0,5.0,3.0,5.0,5.0
1941,5.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,5.0,1.0
1980,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0
2063,5.0,4.0,4.0,2.0,5.0,2.0,4.0,4.0,4.0,4.0,3.0,2.0,5.0,4.0,5.0
2909,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0


In [9]:
# create_dataset returns three items: the (users, movies) counts, the
# (features, target) pair, and a third value that is not needed here.
(n, m), (X, y), _ = create_dataset(ratings)

# One print call, one line per fact.
print(
    f'Embeddings: {n} users, {m} movies',
    f'Dataset shape: {X.shape}',
    f'Target shape: {y.shape}',
    sep='\n',
)

Embeddings: 6040 users, 3706 movies
Dataset shape: (1000209, 2)
Target shape: (1000209,)


In [10]:
def batches(user_rating_matrix, ratings, batch_size=32, shuffle=True):
    """Yield mini-batches from ``RatingsIterator`` as torch tensors.

    Args:
        user_rating_matrix: forwarded to ``RatingsIterator`` as ``user_movie_matrix``.
        ratings: forwarded to ``RatingsIterator`` as ``ratings``.
        batch_size: forwarded to ``RatingsIterator``.
        shuffle: forwarded to ``RatingsIterator``.

    Yields:
        A ``(xb, yb)`` pair where ``xb`` is a ``torch.LongTensor`` and ``yb``
        is a ``torch.FloatTensor`` built from the iterator's batch.

    Note:
        The targets are deliberately NOT reshaped. For matrix factorization,
        reshaping them produced this warning from ``F.mse_loss``:
        "Using a target size (torch.Size([2000, 1])) that is different to the
        input size (torch.Size([2000])). This will likely lead to incorrect
        results due to broadcasting."
    """
    source = RatingsIterator(
        user_movie_matrix=user_rating_matrix,
        ratings=ratings,
        batch_size=batch_size,
        shuffle=shuffle,
    )
    for features, targets in source:
        features_tensor = torch.LongTensor(features)
        targets_tensor = torch.FloatTensor(targets)
        yield features_tensor, targets_tensor

# Build Model

In [11]:
# nn.Module is the base class for all neural-network modules in PyTorch

class MF(nn.Module):
    """
    Matrix-factorization model built from two embedding tables.

    Every user and every movie is represented by a learned vector of
    ``n_factors`` values. The score for a (user, movie) pair is the dot
    product of the two vectors.

    Args:

        n_users:
            Number of unique users in the dataset.

        n_movies:
            Number of unique movies in the dataset.

        n_factors:
            Number of columns in the embeddings matrix.
    """

    def __init__(self, n_users, n_movies, n_factors=50):

        super().__init__()  # required because we subclass nn.Module

        # Embedding: https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
        # A lookup table holding one n_factors-long vector per user.
        self.u = nn.Embedding(n_users, n_factors)

        # A lookup table holding one n_factors-long vector per movie.
        self.m = nn.Embedding(n_movies, n_factors)

        # Initialize both embedding tables from N(mean=0, std=0.1).
        nn.init.normal_(self.u.weight, 0, 0.1)
        nn.init.normal_(self.m.weight, 0, 0.1)

    def forward(self, users, movies):
        """
        Score (user, movie) pairs.

        Args:
            users: integer tensor of user indices.
            movies: integer tensor of movie indices, same shape as ``users``.

        Returns:
            Tensor with the same shape as ``users`` holding one score per pair.
        """
        users_latent = self.u(users)
        movies_latent = self.m(movies)

        # The inputs are aligned pairs: at index i we score users[i] against
        # movies[i]. This is therefore a pair-wise computation, not the full
        # matrix product U . V^T (whose (i, j) entry would be the dot product
        # of user i's and item j's embeddings). See:
        # https://developers.google.com/machine-learning/recommendation/collaborative/matrix
        #
        # The embedding lookup appends the factor axis as the last dimension,
        # so summing the element-wise product over that axis is the dot
        # product and leaves exactly one value per pair. Using dim=-1 (rather
        # than dim=1) gives the same result for the usual 1-D index batch and
        # stays correct for index tensors of other shapes.
        return (users_latent * movies_latent).sum(dim=-1)

In [12]:
# Build the matrix-factorization model, sized from the dataset's user and
# movie counts, with 50 latent factors per embedding.
model = MF(n_users=n, n_movies=m, n_factors=50)

In [13]:
model

MF(
  (u): Embedding(6040, 50)
  (m): Embedding(3706, 50)
)

# Training Model

## Split Dataset

In [14]:
# Seed once, and reuse the same value for the split so it is repeatable.
RANDOM_STATE = 1
set_random_seed(RANDOM_STATE)

# Hold out 20% of the (user, movie) -> rating samples for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Per-phase lookup tables used by the training loop.
datasets = dict(
    train=(X_train, y_train),
    val=(X_valid, y_valid),
)
dataset_sizes = dict(train=len(X_train), val=len(X_valid))

In [15]:
X_train

Unnamed: 0,user_id,movie_id
529184,3270,1925
341591,2011,579
470922,2898,1497
630004,3807,737
131938,853,536
...,...,...
491263,3019,1158
791624,4731,560
470924,2898,964
491755,3024,411


## Train Model

In [16]:
# --- Hyper-parameters -------------------------------------------------------
lr = 1e-3
wd = 1e-5  # weight decay, passed to Adam as `weight_decay`
bs = 2000  # batch size
n_epochs = 100
patience = 10  # stop after this many consecutive epochs without val improvement

# --- Bookkeeping ------------------------------------------------------------
no_improvements = 0
best_loss = np.inf
best_weights = None
history = []
lr_history = []

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model.to(device)

# reduction='sum' so per-batch losses can be added up and divided by the
# dataset size at the end of the epoch to get a per-sample loss.
criterion = nn.MSELoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

# True division is required here: with floor division (`//`) the ceil was a
# no-op and the trailing partial batch was not counted.
# NOTE(review): assumes the batch iterator also yields the final partial batch.
iterations_per_epoch = int(math.ceil(dataset_sizes['train'] / bs))
scheduler = learning_rate.CyclicLR(optimizer, learning_rate.cosine(t_max=iterations_per_epoch * 2, eta_min=lr/10))

writer = SummaryWriter('runs/mf')

# TensorBoard tag per phase, so the two curves are not mixed together.
loss_tags = {'train': 'training_loss', 'val': 'validation_loss'}

# The model graph only needs to be written to TensorBoard once.
add_model_to_tensorboard = False

for epoch in range(n_epochs):
    stats = {'epoch': epoch + 1, 'total': n_epochs}

    for phase in ('train', 'val'):
        training = phase == 'train'

        # Put the module in the matching mode (has no effect on plain
        # embeddings, but keeps the loop correct if layers are added later).
        model.train(training)

        running_loss = 0

        for batch in batches(*datasets[phase], shuffle=training, batch_size=bs):
            x_batch, y_batch = [b.to(device) for b in batch]

            # Clear the gradients accumulated by the previous batch.
            optimizer.zero_grad()

            if not add_model_to_tensorboard:
                writer.add_graph(model, [x_batch[:, 0], x_batch[:, 1]])
                add_model_to_tensorboard = True

            # Compute gradients only during the 'train' phase.
            with torch.set_grad_enabled(training):
                # x_batch[:, 0]: column 0 of every row -> user indices
                # x_batch[:, 1]: column 1 of every row -> movie indices
                #
                # Calls MF.forward on the observed (user, movie) pairs only,
                # not on the full users x movies grid.
                outputs = model(x_batch[:, 0], x_batch[:, 1])
                loss = criterion(outputs, y_batch)

                # Don't update weights or the learning rate in the 'val' phase.
                if training:

                    # loss.backward() computes dloss/dx for every parameter x
                    # with requires_grad=True and accumulates it into x.grad:
                    #   x.grad += dloss/dx
                    # source: https://discuss.pytorch.org/t/what-does-the-backward-function-do/9944
                    loss.backward()

                    # optimizer.step() updates each parameter from its x.grad.
                    # For example, plain SGD performs:
                    #   x += -lr * x.grad
                    # relationship between loss.backward() and optimizer.step():
                    # https://stackoverflow.com/questions/53975717/pytorch-connection-between-loss-backward-and-optimizer-step
                    optimizer.step()

                    # Stepped once per batch, so the learning rate follows the
                    # schedule within an epoch as well as across epochs.
                    # https://discuss.pytorch.org/t/what-does-scheduler-step-do/47764
                    scheduler.step()

                    # Record the learning rate so it can be plotted later.
                    lr_history.extend(scheduler.get_lr())

            running_loss += loss.item()

        # Per-sample loss for this phase.
        epoch_loss = running_loss / dataset_sizes[phase]
        stats[phase] = epoch_loss

        # One scalar per phase per epoch. Logging the cumulative running loss
        # on every batch, under one tag and the same global_step, produced an
        # unreadable curve that also mixed in the validation batches.
        writer.add_scalar(loss_tags[phase], epoch_loss, global_step=epoch)

        # Early stopping: keep a copy of the best weights seen so far.
        if phase == 'val':
            if epoch_loss < best_loss:
                print('loss improvement on epoch: %d' % (epoch + 1))
                best_loss = epoch_loss
                best_weights = copy.deepcopy(model.state_dict())
                no_improvements = 0
            else:
                no_improvements += 1

    history.append(stats)
    print('[{epoch:03d}/{total:03d}] train: {train:.4f} - val: {val:.4f}'.format(**stats))
    if no_improvements >= patience:
        print('early stopping after epoch {epoch:03d}'.format(**stats))
        break

writer.flush()

loss improvement on epoch: 1
[001/100] train: 14.0136 - val: 13.7658
loss improvement on epoch: 2
[002/100] train: 13.0021 - val: 12.5166
loss improvement on epoch: 3
[003/100] train: 6.3815 - val: 2.2942
loss improvement on epoch: 4
[004/100] train: 1.7373 - val: 1.5920
loss improvement on epoch: 5
[005/100] train: 1.1499 - val: 1.0030
loss improvement on epoch: 6
[006/100] train: 0.9253 - val: 0.9489
loss improvement on epoch: 7
[007/100] train: 0.8683 - val: 0.8713
loss improvement on epoch: 8
[008/100] train: 0.8205 - val: 0.8587
loss improvement on epoch: 9
[009/100] train: 0.8095 - val: 0.8366
loss improvement on epoch: 10
[010/100] train: 0.7809 - val: 0.8297
loss improvement on epoch: 11
[011/100] train: 0.7757 - val: 0.8167
loss improvement on epoch: 12
[012/100] train: 0.7482 - val: 0.8101
loss improvement on epoch: 13
[013/100] train: 0.7438 - val: 0.7995
loss improvement on epoch: 14
[014/100] train: 0.7163 - val: 0.7942
loss improvement on epoch: 15
[015/100] train: 0.7129