# Note

https://colab.research.google.com/drive/1It4mPMzbnhgWKLNS8w6ghJaFxn154crl?usp=sharing#scrollTo=7R_8iqrK-Jhp

# Import

In [1]:
import io
import os
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler

In [2]:
# for Tensorboard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/cf_experiment_1')

In [3]:
plt.style.use('ggplot')

In [4]:
def set_random_seed(state=1):
    """Seed the NumPy, PyTorch and PyTorch-CUDA random number generators.

    Args:
        state: Seed value passed unchanged to each seeding function.
    """
    np.random.seed(state)
    torch.manual_seed(state)
    torch.cuda.manual_seed(state)

In [5]:
RANDOM_STATE = 1
set_random_seed(RANDOM_STATE)

# Download Data

In [6]:
def try_download(url, download_path):
    """Download the zip archive at `url` and extract it into `download_path`.

    Args:
        url: Address of the zip archive to fetch.
        download_path: Directory the archive members are extracted into.

    Returns:
        None. If the URL cannot be opened, the error is printed and the
        function returns without extracting anything.

    Raises:
        AssertionError: If the response status is not 200.
        zipfile.BadZipFile: If the downloaded payload is not a zip archive.
    """
    try:
        response = urlopen(url)
    except URLError as e:
        # Report the caught exception; the previous code interpolated an
        # undefined name here and crashed with NameError instead.
        print('Cannot download the data. Error: %s' % e)
        return

    # Release the connection as soon as the payload has been read.
    with response:
        assert response.status == 200
        data = response.read()

    # Unpack straight from memory; the archive is never written to disk.
    with zipfile.ZipFile(io.BytesIO(data)) as arch:
        arch.extractall(download_path)

    print('The archive is extracted into folder: %s' % download_path)

In [7]:
# Fetch the MovieLens 1M archive and unpack it under ./data/movielens
# (relative to the current working directory).
archive_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
download_path = Path.cwd() / 'data' / 'movielens'
try_download(archive_url, download_path)


The archive is extracted into folder: /Users/tien/Documents/PythonEnvs/pytorch/jup/recsys_models/cf/data/movielens


# Read data

In [8]:
def read_data(path):
    """Load the ratings and movies tables found in the directory `path`.

    `.csv` files are read with their own header. The header-less,
    '::'-separated `.dat` files are read only when their stem has a known
    schema below; other `.dat` files (e.g. `users.dat`) are skipped instead
    of being parsed with the movies column names.

    Args:
        path: `pathlib.Path` of the directory holding the dataset files.

    Returns:
        Tuple `(ratings, movies)` of DataFrames.

    Raises:
        KeyError: If the ratings or the movies table is not present in `path`.
    """
    # Column names for the .dat files, keyed by file stem.
    dat_columns = {
        'ratings': ['userId', 'movieId', 'rating', 'timestamp'],
        'movies': ['movieId', 'title', 'genres'],
    }

    files = {}
    # Sorted so the result does not depend on directory listing order.
    for filename in sorted(path.glob('*')):
        stem = filename.stem
        if filename.suffix == '.csv':
            files[stem] = pd.read_csv(filename)
        elif filename.suffix == '.dat' and stem in dat_columns:
            # A multi-character separator requires the python parsing engine.
            files[stem] = pd.read_csv(
                filename, sep='::', names=dat_columns[stem],
                engine='python', encoding='latin-1')

    missing = [name for name in ('ratings', 'movies') if name not in files]
    if missing:
        raise KeyError('missing dataset file(s) in %s: %s' % (path, ', '.join(missing)))
    return files['ratings'], files['movies']

In [9]:
ratings, movies = read_data(download_path /'ml-1m')

# Dataset Preview

In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Tabular Preview

In [12]:
def tabular_preview(ratings, n=15):
    """Cross-tabulate the ratings of the `n` most active users for the `n` most rated movies.

    Args:
        ratings: DataFrame with `userId`, `movieId` and `rating` columns.
        n: Number of top users and of top movies to keep. A non-integer value
            (older call sites passed a DataFrame here) triggers a warning and
            falls back to the default of 15.

    Returns:
        DataFrame indexed by `userId` with one column per `movieId`. Each cell
        is the sum of the matching ratings, NaN where the user has not rated
        the movie.

    Raises:
        ValueError: If `n` is a negative integer.
    """
    if isinstance(n, bool) or not isinstance(n, (int, np.integer)):
        import warnings
        warnings.warn('tabular_preview: `n` must be an integer; falling back to 15')
        n = 15
    if n < 0:
        raise ValueError('n must be non-negative, got %d' % n)

    def most_rated(key):
        """Return the `n` values of column `key` that have the most ratings."""
        counts = ratings.groupby(key)['rating'].count()
        return counts.sort_values(ascending=False).index[:n]

    top_users = most_rated('userId')
    top_movies = most_rated('movieId')

    # A boolean mask keeps the same rows as two inner joins would, without
    # adding the helper count columns.
    keep = ratings['userId'].isin(top_users) & ratings['movieId'].isin(top_movies)
    top = ratings[keep]

    return pd.crosstab(top['userId'], top['movieId'], top['rating'], aggfunc='sum')

In [13]:
# The second parameter of tabular_preview is the top-N size, not the movies table.
tabular_preview(ratings)

movieId,110,260,480,589,593,608,1196,1198,1210,1270,1580,2028,2571,2762,2858
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
889,4.0,4.0,3.0,5.0,5.0,4.0,4.0,,3.0,4.0,3.0,3.0,5.0,,2.0
1015,4.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0,5.0,4.0
1150,2.0,5.0,,2.0,3.0,5.0,4.0,2.0,3.0,2.0,2.0,2.0,1.0,2.0,4.0
1181,3.0,4.0,2.0,5.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,5.0,4.0,3.0
1449,3.0,3.0,2.0,2.0,5.0,5.0,3.0,4.0,2.0,2.0,4.0,3.0,4.0,4.0,4.0
1680,1.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0,4.0,5.0,3.0,5.0,5.0
1941,5.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,5.0,1.0
1980,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0
2063,5.0,4.0,4.0,2.0,5.0,2.0,4.0,4.0,4.0,4.0,3.0,2.0,5.0,4.0,5.0
2909,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0


# Dataset Preparation

In [14]:
def create_dataset(ratings, top=None):
    """Re-index users and movies as contiguous integers and separate features from target.

    Raw `userId` / `movieId` values are mapped to 0-based indices in order of
    first appearance, so they can be used directly as embedding row indices.

    Args:
        ratings: DataFrame with `userId`, `movieId` and `rating` columns.
        top: Accepted for backward compatibility; it has no effect on the result.

    Returns:
        A 3-tuple:
            (n_users, n_movies): number of distinct users and movies;
            (X, y): DataFrame with `user_id` / `movie_id` index columns, and
                the ratings as a float32 Series;
            (user_to_index, movie_to_index): dicts from raw id to index.
    """
    def encode(column):
        """Return (number of distinct ids, id -> index dict, re-indexed column)."""
        uniques = column.unique()
        to_index = {raw_id: index for index, raw_id in enumerate(uniques)}
        return uniques.shape[0], to_index, column.map(to_index)

    n_users, user_to_index, new_users = encode(ratings['userId'])
    n_movies, movie_to_index, new_movies = encode(ratings['movieId'])

    X = pd.DataFrame({'user_id': new_users, 'movie_id': new_movies})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

In [15]:
# Build the model inputs and report their sizes.
(n, m), (X, y), _ = create_dataset(ratings)
print(
    f'Embeddings: {n} users, {m} movies',
    f'Dataset shape: {X.shape}',
    f'Target shape: {y.shape}',
    sep='\n',
)

Embeddings: 6040 users, 3706 movies
Dataset shape: (1000209, 2)
Target shape: (1000209,)


# Iterator

In [16]:
class RatingsIterator:
    """Single-pass iterator over (features, targets) mini-batches as NumPy arrays.

    Args:
        user_movie_matrix: Array-like whose first axis indexes samples.
        ratings: Array-like of targets aligned with `user_movie_matrix`.
        batch_size: Maximum number of samples per batch (must be >= 1).
        shuffle: When True, samples are permuted once at construction using
            NumPy's global random state.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """

    def __init__(self, user_movie_matrix, ratings, batch_size=32, shuffle=True):
        if batch_size < 1:
            raise ValueError('batch_size must be >= 1, got %r' % (batch_size,))

        user_movie_matrix, ratings = np.asarray(user_movie_matrix), np.asarray(ratings)

        if shuffle:
            # One permutation applied to both arrays keeps features and targets aligned.
            index = np.random.permutation(user_movie_matrix.shape[0])

            user_movie_matrix = user_movie_matrix[index]
            ratings = ratings[index]

        self.user_movie_matrix = user_movie_matrix
        self.ratings = ratings

        self.batch_size = batch_size
        self.shuffle = shuffle
        # True division before ceil, so a final partial batch is counted;
        # flooring first would silently drop those samples.
        self.n_batches = math.ceil(user_movie_matrix.shape[0] / batch_size)
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next (features, targets) slice or raise StopIteration."""
        if self._current >= self.n_batches:
            raise StopIteration()

        k = self._current
        self._current += 1
        bs = self.batch_size

        # Slicing past the end is safe: the last batch is simply shorter.
        start_index, end_index = k*bs, (k+1)*bs

        return self.user_movie_matrix[start_index:end_index], self.ratings[start_index:end_index]

In [17]:
def batches(user_rating_matrix, ratings, batch_size=32, shuffle=True):
    """Yield (features, targets) mini-batches as PyTorch tensors.

    Args:
        user_rating_matrix: Array-like of samples, forwarded to `RatingsIterator`.
        ratings: Array-like of targets aligned with `user_rating_matrix`.
        batch_size: Number of samples per batch.
        shuffle: Whether `RatingsIterator` permutes the samples first.

    Yields:
        Tuple `(xb, yb)`: `xb` is a LongTensor holding the batch features and
        `yb` is a FloatTensor of shape `(batch, 1)`.
    """
    batch_source = RatingsIterator(
        user_movie_matrix=user_rating_matrix,
        ratings=ratings,
        batch_size=batch_size,
        shuffle=shuffle,
    )
    for features, targets in batch_source:
        features = torch.LongTensor(features)
        targets = torch.FloatTensor(targets)

        # Targets are reshaped from (batch,) to (batch, 1). Without this,
        # F.mse_loss warns that the target size differs from the input size
        # and broadcasts the two, which gives incorrect results. See
        # https://stackoverflow.com/questions/65219569/pytorch-gives-incorrect-results-due-to-broadcasting
        # `view` returns a tensor with the requested shape over the same data.
        column_shape = (len(targets), 1)
        yield features, targets.view(column_shape)

# Build Model

In [18]:
def get_list(n):
    """Normalize a layer configuration into a list.

    Args:
        n: A single real number (Python or NumPy scalar) or an iterable of numbers.

    Returns:
        `[n]` for a single number, otherwise `list(n)`.

    Raises:
        TypeError: If `n` is neither a number nor an iterable, or is a
            string/bytes value (which would otherwise be split into characters).
    """
    import numbers

    # numbers.Real also covers NumPy scalar types such as np.int64 / np.float32.
    if isinstance(n, numbers.Real):
        return [n]
    if hasattr(n, '__iter__') and not isinstance(n, (str, bytes)):
        return list(n)

    raise TypeError("layers configuration should be a single number or a list of numbers")

In [19]:
# nn.Module is the base class for all the network

class EmbeddingNetwork(nn.Module):
    """
    Dense network on top of user and movie embedding tables.

    A user index and a movie index are each looked up in their own embedding
    table; the two vectors are concatenated, passed through dropout and a
    stack of Linear/ReLU(/Dropout) layers, and reduced to a single
    sigmoid-activated output.

    Args:

        n_users:
            Number of rows in the user embedding table.

        n_movies:
            Number of rows in the movie embedding table.

        n_factors:
            Width of each embedding vector.

        embedding_droppout_prob:
            Dropout probability applied to the concatenated embeddings
            (the parameter name is spelled this way in the signature).

        hidden:
            A single number or an iterable of numbers giving the width of
            each hidden layer.

        dropouts:
            A single rate or an iterable of rates, one per hidden layer from
            the first onward. It may be shorter than `hidden`; layers without
            a rate, or with a rate that is not > 0, get no dropout.

    """

    def __init__(self, n_users, n_movies, n_factors=50, embedding_droppout_prob=0.02,
                 hidden=10, dropouts=0.2):

        super().__init__()
        layer_sizes = get_list(hidden)
        dropout_rates = get_list(dropouts)

        # Width feeding the output layer is the width of the last hidden layer.
        head_width = layer_sizes[-1]

        # Lookup tables: one row of n_factors values per user / per movie.
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_movies, n_factors)

        # Dropout applied to the concatenated embedding vector.
        self.drop = nn.Dropout(embedding_droppout_prob)

        # The hidden stack receives both embeddings side by side, hence 2 * n_factors inputs.
        self.hidden = nn.Sequential(
            *self._hidden_stack(n_factors * 2, layer_sizes, dropout_rates))

        # Output layer producing one value per sample.
        self.fc = nn.Linear(head_width, 1)
        self._init()

    @staticmethod
    def _hidden_stack(n_in, layer_sizes, dropout_rates):
        """
        Build the Linear -> ReLU -> (Dropout) modules of the hidden stack, in order.

        `n_in` is the input width of the first Linear layer; each later layer
        takes the previous layer's width as its input.
        """
        assert len(dropout_rates) <= len(layer_sizes)

        stack = []
        width = n_in
        # zip_longest pads missing dropout rates with None.
        for n_out, rate in zip_longest(layer_sizes, dropout_rates):
            stack.append(nn.Linear(width, n_out))
            stack.append(nn.ReLU())
            if rate is not None and rate > 0.:
                stack.append(nn.Dropout(rate))
            width = n_out
        return stack

    def forward(self, users, movies, minmax=None):
        """
        Compute the network output for index tensors `users` and `movies`.

        When `minmax = (min_rating, max_rating)` is given, the sigmoid output
        in (0, 1) is rescaled to (min_rating - 0.5, max_rating + 0.5);
        otherwise the raw sigmoid output is returned.
        """
        # dim=1 places the user and movie vectors next to each other per sample.
        embedded = torch.cat([self.u(users), self.m(movies)], dim=1)
        activations = self.hidden(self.drop(embedded))
        out = torch.sigmoid(self.fc(activations))

        if minmax is None:
            return out

        min_rating, max_rating = minmax
        return out*(max_rating - min_rating + 1) + min_rating - 0.5

    def _init(self):
        """
        Initialise embeddings uniformly in [-0.05, 0.05] and every Linear
        layer with Xavier-uniform weights and a constant 0.01 bias.
        """
        for table in (self.u, self.m):
            table.weight.data.uniform_(-0.05, 0.05)

        # Hidden layers first, then the output layer.
        for layer in (*self.hidden, self.fc):
            if isinstance(layer, nn.Linear):
                torch.nn.init.xavier_uniform_(layer.weight)
                layer.bias.data.fill_(0.01)
    

In [20]:
EmbeddingNetwork(n, m, n_factors=150, hidden=[100, 200, 300], dropouts=[0.25, 0.5])

EmbeddingNetwork(
  (u): Embedding(6040, 150)
  (m): Embedding(3706, 150)
  (drop): Dropout(p=0.02, inplace=False)
  (hidden): Sequential(
    (0): Linear(in_features=300, out_features=100, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.25, inplace=False)
    (3): Linear(in_features=100, out_features=200, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=200, out_features=300, bias=True)
    (7): ReLU()
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
)

# Build Cyclical Learning Rate

**Cyclical Learning Rate (CLR)**

One of the fastai library features is the cyclical learning rate scheduler. We can implement something similar inheriting the _LRScheduler class from the torch library. Following the original paper's pseudocode, this CLR Keras callback implementation, and making a couple of adjustments to support cosine annealing with restarts, let's create our own CLR scheduler.

The implementation of this idea is quite simple. The base PyTorch scheduler class has a `get_lr()` method that is invoked each time we call the `step()` method. The method should return a list of learning rates, one per base learning rate of the optimizer, computed from the scheduler's current step counter (`last_epoch`). In this notebook `step()` is called once per training batch, so that counter counts batches rather than epochs. In our case the optimizer has a single learning rate, and therefore we return a list with a single value.

The next cell defines a `CyclicLR` class that expects a single callback function. This function should accept the current step index and the base learning rate, and return the new learning rate.

In [21]:
class CyclicLR(_LRScheduler):
    """Scheduler whose learning rates are computed by a user-supplied callback.

    Args:
        optimizer: Optimizer whose learning rate is driven.
        schedule: Callable `(step, base_lr) -> lr`.
        last_epoch: Forwarded unchanged to the base scheduler.
    """

    def __init__(self, optimizer, schedule, last_epoch=-1):
        assert callable(schedule)
        # Stored before the base-class constructor runs, so the callback is
        # already available if that constructor calls get_lr().
        self.schedule = schedule
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return one learning rate per base learning rate for the current step."""
        step = self.last_epoch
        rates = []
        for base_lr in self.base_lrs:
            rates.append(self.schedule(step, base_lr))
        return rates

In [22]:
def triangular(step_size, max_lr, method='triangular', gamma=0.99):
    """Build a triangular cyclical learning-rate schedule.

    The rate rises linearly from `base_lr` to a peak over `step_size` steps
    and falls back over the next `step_size` steps, then the cycle repeats.

    Args:
        step_size: Number of steps in half a cycle.
        max_lr: Peak learning rate of the first cycle.
        method: 'triangular' keeps the peak constant, 'triangular2' halves the
            peak offset on every new cycle, 'exp_range' scales the offset by
            `gamma ** epoch`.
        gamma: Decay base used by 'exp_range'.

    Returns:
        Callable `scheduler(epoch, base_lr) -> lr`. It raises ValueError when
        called if `method` is not one of the three names above.
    """

    def scheduler(epoch, base_lr):
        # 1-based index of the cycle that `epoch` belongs to.
        cycle = math.floor(1 + epoch/(2 * step_size))
        # Distance from the peak of that cycle: 0 at the peak, 1 at its ends.
        position = abs(epoch/step_size - 2*cycle + 1)
        delta = (max_lr - base_lr)*max(0, (1 - position))

        if method == 'triangular2':
            delta = delta / float(2 ** (cycle - 1))
        elif method == 'exp_range':
            delta = delta * (gamma**epoch)
        elif method != 'triangular':
            raise ValueError('unexpected method: %s' % method)

        return base_lr + delta

    return scheduler

In [23]:
def cosine(t_max, eta_min=0):
    """Build a cosine-annealing schedule that restarts every `t_max` steps.

    Within a period the rate follows half a cosine wave, starting at
    `base_lr` and decreasing towards `eta_min`; at the start of the next
    period it jumps back to `base_lr`.

    Args:
        t_max: Length of one annealing period, in steps.
        eta_min: Lower bound the rate decays towards.

    Returns:
        Callable `scheduler(epoch, base_lr) -> lr`.
    """

    def scheduler(epoch, base_lr):
        # Position inside the current period; the modulo produces the restart.
        step_in_period = epoch % t_max
        wave = 1 + math.cos(math.pi*step_in_period/t_max)
        return eta_min + (base_lr - eta_min)*wave/2

    return scheduler

In [24]:
def plot_lr(schedule):
    """Plot the rate `schedule` returns for steps 0..999 with a base rate of 0.001.

    Args:
        schedule: Callable `(step, base_lr) -> lr`.
    """
    steps = list(range(1000))
    rates = [schedule(step, 0.001) for step in steps]
    plt.plot(steps, rates)

# Training

In [25]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)
datasets = {'train': (X_train, y_train), 'val': (X_valid, y_valid)}
dataset_sizes = {phase: len(features) for phase, (features, _) in datasets.items()}

In [26]:
X_train

Unnamed: 0,user_id,movie_id
529184,3270,1925
341591,2011,579
470922,2898,1497
630004,3807,737
131938,853,536
...,...,...
491263,3019,1158
791624,4731,560
470924,2898,964
491755,3024,411


In [27]:
type(X_train)

pandas.core.frame.DataFrame

In [28]:
minmax = float(ratings.rating.min()), float(ratings.rating.max())
minmax

(1.0, 5.0)

In [29]:
model = EmbeddingNetwork(
    n_users=n,
    n_movies=m,
    n_factors=150,
    hidden=[100, 100, 100],
    embedding_droppout_prob=0.05,
    dropouts=[0.5]*3
)

## Training Model

In [36]:
lr = 1e-3
wd = 1e-5  # weight decay coefficient passed to Adam
bs = 2000  # batch size
n_epochs = 100
patience = 10  # epochs without validation improvement tolerated before stopping
no_improvements = 0
best_loss = np.inf
best_weights = None
history = []
lr_history = []

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model.to(device)
# reduction='sum' so per-batch losses can be added up and divided by the
# dataset size once per epoch.
criterion = nn.MSELoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
# True division before ceil: flooring first under-counts a final partial batch.
iterations_per_epoch = math.ceil(dataset_sizes['train'] / bs)
# The scheduler is stepped once per batch, so t_max is expressed in batches:
# one cosine period spans two epochs.
scheduler = CyclicLR(optimizer, cosine(t_max=iterations_per_epoch * 2, eta_min=lr/10))


add_model_to_tensorboard = False

for epoch in range(n_epochs):
    stats = {'epoch': epoch + 1, 'total': n_epochs}

    for phase in ('train', 'val'):
        training = phase == 'train'
        # Toggle train/eval mode so dropout is active only while training;
        # otherwise the validation loss is computed with units randomly dropped.
        model.train(training)

        running_loss = 0.0

        for batch in batches(*datasets[phase], shuffle=training, batch_size=bs):
            x_batch, y_batch = [b.to(device) for b in batch]
            # Column 0 holds the user indices, column 1 the movie indices.
            users, movies = x_batch[:, 0], x_batch[:, 1]

            # Record the model graph once, using the first batch as example input.
            if not add_model_to_tensorboard:
                writer.add_graph(model, [users, movies])
                add_model_to_tensorboard = True

            # Gradients are tracked only during the 'train' phase.
            with torch.set_grad_enabled(training):
                outputs = model(users, movies, minmax)
                loss = criterion(outputs, y_batch)

                # Weights and learning rate are updated only when training.
                if training:
                    # Clear gradients left over from the previous batch;
                    # backward() accumulates into .grad.
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    # Advance the schedule by one batch.
                    scheduler.step()

                    # Record the rate actually set on the optimizer, for plotting later.
                    lr_history.extend(group['lr'] for group in optimizer.param_groups)

            running_loss += loss.item()

        epoch_loss = running_loss / dataset_sizes[phase]
        stats[phase] = epoch_loss

        # One point per epoch and per phase, under separate tags.
        loss_tag = 'training_loss' if training else 'validation_loss'
        writer.add_scalar(loss_tag, epoch_loss, global_step=epoch)

        # early stopping: keep a copy of the best weights seen so far
        if phase == 'val':
            if epoch_loss < best_loss:
                print('loss improvement on epoch: %d' % (epoch + 1))
                best_loss = epoch_loss
                best_weights = copy.deepcopy(model.state_dict())
                no_improvements = 0
            else:
                no_improvements += 1

    history.append(stats)
    print('[{epoch:03d}/{total:03d}] train: {train:.4f} - val: {val:.4f}'.format(**stats))
    if no_improvements >= patience:
        print('early stopping after epoch {epoch:03d}'.format(**stats))
        break

# Leave the model holding the best validation weights, not the last epoch's.
if best_weights is not None:
    model.load_state_dict(best_weights)

writer.flush()


KeyboardInterrupt: 

In [34]:
# Create the target directory first: torch.save does not create missing parents.
model_dir = Path().absolute() / 'model'
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir / 'model'

# Prefer the early-stopping snapshot; fall back to the current weights when
# training never recorded one (e.g. it was interrupted during the first epoch).
weights_to_save = best_weights if best_weights is not None else model.state_dict()
torch.save(weights_to_save, model_path)

# Draft

In [32]:
model_path

PosixPath('/Users/tien/Documents/PythonEnvs/pytorch/jup/recsys_models/cf/model')

In [None]:
# Inspect the first mini-batch only: features, the user column, and the targets.
first_batch = next(batches(user_rating_matrix=X, ratings=y, batch_size=4), None)
if first_batch is not None:
    x_batch, y_batch = first_batch
    print(x_batch)
    print(x_batch[:,0])
    print(x_batch.shape)

    print("*********")
    print(y_batch)
    print(y_batch.shape)

In [None]:
# Tuple unpacking needs an iterable on the right-hand side; a bare int raises
# TypeError. Catch and show it so "Run All" can continue past this cell.
try:
    a, b = 1
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

In [None]:
print(math.ceil(0.4))
    

In [None]:
t = [1,2,3]

In [None]:
hasattr(t, '__iter__')

In [None]:
max(X.movie_id)

In [None]:
max(ratings.userId)

In [None]:
unique_users = ratings.userId.unique()

In [None]:
unique_users

In [None]:
user_to_index = {user_id: index for index, user_id in enumerate(unique_users)}

In [None]:
y

In [None]:
new_users = ratings.userId.map(user_to_index)

In [None]:
new_users

In [None]:
ratings.userId

In [None]:
unique_users

In [None]:
type(unique_users)

In [None]:
unique_users.shape

In [None]:
n_epochs