# Representation learning & recommender systems

In this practical session, we investigate two classical matrix-factorization models and their neural network implementation.


In [None]:
#! pip install torch torchvision pytorch-lightning --upgrade
#! pip install matplotlib --upgrade

In [None]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data used : [smallest movie-lens dataset](https://grouplens.org/datasets/movielens/)

Let's start with a very common dataset describing users, movies & interactions (ratings):

![image reco](media/Facto-mat.png)

# 1)  Load & Prepare Data

To embed the data easily, we need to remap the user and item ids to contiguous integers: users to [0, N_users - 1] and items to [0, N_items - 1].

In [None]:
from random import shuffle

## Load
#ratings = pd.read_csv("data/ratings.csv")
# u.data has no header row: columns are tab-separated and named here explicitly.
ratings = pd.read_csv("data/ml-100k/u.data", sep="\t",dtype=int, names=["userId","movieId", "rating", "timestamp"])
# We use pandas to load the data... And that's it => No pandas requirements for this practical session !

# astype() returns a NEW DataFrame: the result must be assigned back, otherwise the cast is silently lost.
ratings = ratings.astype({'rating': 'float'})
ratings.head(5)


In [None]:

## Prepare Data
# Give every raw id a dense integer code, numbered by order of first appearance in the ratings table.
user_map = dict((user, num) for num, user in enumerate(ratings["userId"].unique()))
item_map = dict((item, num) for num, item in enumerate(ratings["movieId"].unique()))

## Number of users & items (one entry per distinct id)
num_users, num_items = len(user_map), len(item_map)

# Replace the raw ids by their dense codes, in place
ratings["movieId"] = ratings["movieId"].map(item_map)
ratings["userId"] = ratings["userId"].map(user_map)

ratings.head(5)


In [None]:

# Creating Test/Train as before

# Every 5th row (20% of the data) goes to the test set, all the other rows are training candidates
test_indexes = list(range(0, len(ratings), 5))
train_indexes = [index for index in range(len(ratings)) if index % 5 != 0]

# Shuffle the training candidates, then hold out the first 20% of them for validation
shuffle(train_indexes)
num_val = int(len(train_indexes)/100*20)
val_indexes, train_indexes = train_indexes[:num_val], train_indexes[num_val:]

# Positional selection + copy => three independent DataFrames
train_ratings = ratings.iloc[train_indexes].copy()
val_ratings = ratings.iloc[val_indexes].copy()
test_ratings = ratings.iloc[test_indexes].copy()


print(f" #train:{len(train_ratings)}, #val:{len(val_ratings)} ,#test:{len(test_ratings)}" )



In [None]:
# USAGE
# Each row unpacks as (index, userId, movieId, rating, timestamp); only the first 6 rows are shown.
cpt = 0
for cpt, (index, uid, mid, r, ts) in enumerate(train_ratings.itertuples(), start=1):
    print(index,uid, mid,r) # remember that indexes were shuffled
    if cpt > 5:
        break

## Reproduce the baseline model with pytorch's vanilla autograd

Your goal now is to reproduce the following (strong) baseline model from surprise

 $$\hat{r}_{ui} = b_{ui} = \mu + b_u + b_i, \qquad (\mu,b_u,b_i) \in \mathbb R^3$$

[no matrix factorization here, <font color="red">only 3 scalars</font> involved for a prediction $(u,i)$] <BR>
[Even though $\mu$ could be computed directly from the train set, we are going to learn this parameter in the optimization process]

## First, let's define the parameters

You have many parameters, they are all 1-dimensional:
- **mu:** the global mean (1,)
- **bu:** the user means (n_users,)
- **bi:** the item means (n_items,)

In [None]:
# Global mean: a single learnable scalar, initialised at 3.5
mu = torch.tensor([3.5],requires_grad=True) # activate gradient to be able to learn something
# One independent 1-element tensor per user / per item, stored in plain Python lists
# (so they are indexed with square brackets: bu[u], bi[i]); all biases start at 0.1
bu = [torch.tensor([0.1],requires_grad=True) for _ in range(num_users)]
bi = [torch.tensor([0.1],requires_grad=True) for _ in range(num_items)]

# # using directly the embedding module it would give:
# KEEP custom tensor first => Easier index management
# mu = torch.nn.Embedding(1, 1) # only one scalar to learn for the whole set
# bu = torch.nn.Embedding(num_users, 1) # one scalar per user
# bi = torch.nn.Embedding(num_items, 1) # one scalar per item

# # init
# torch.nn.init.normal_(mu.weight,3.5,0.001) # almost cst
# torch.nn.init.normal_(bu.weight,0.1,0.001) 
# torch.nn.init.normal_(bi.weight,0.1,0.001) 

In [None]:
# TODO: check the dimensions of the created structures



Then, we define two functions: 

- `predict(u,i)` : Will return the prediction given the (user,item) pair
- `error(pred,real)` : Will return the squared error of one prediction (averaging it over a dataset gives the MSE)

#### (TODO) Predict Function
This function should implement this: $\hat{r}_{ui} = b_{ui} = \mu + b_u + b_i$

In [None]:
def predict(u,i):
    """Baseline rating estimate ``mu + b_u + b_i`` for one (user, item) pair.

    u, i: integer user / item ids. An id that is not below ``num_users``
    (resp. ``num_items``) contributes a bias of 0 instead of a learned one.
    Returns the sum of the global mean ``mu`` and the two bias terms.
    """
    # Square brackets because bu / bi are plain lists of tensors
    # (an Embedding would be called with parentheses and need squeeze/unsqueeze).
    user_mean = bu[u] if u < num_users else 0
    item_mean = bi[i] if i < num_items else 0

    return mu + user_mean + item_mean


In [None]:
# Sanity check on user 0 and item 0: prints the current value of mu + bu[0] + bi[0]
print(predict(0,0))

### (TODO) error function
We want to use the MSE

In [None]:
def error(pred,real):
    """Squared difference between a prediction and the true value (element-wise for tensors)."""
    # define simple MSE (1 line, few chars)
    ## <CORRECTION>
    residual = pred - real
    return residual ** 2
    ## </CORRECTION>

#### The evaluation loop, without any optimization for now

Bad results expected... Just to check if we can use it

In [None]:
# Pure evaluation: nothing is back-propagated here, so torch.no_grad() avoids building
# an autograd graph for every prediction.
train_e = 0
with torch.no_grad():
    for index, uid, mid, r, ts in train_ratings.itertuples(): # elegant way to browse tuples (from pandas structure)
        result = predict(uid,mid)
        train_e += error(result,r).item()

# define the same command for validation, test [copy/paste/minor changes on var names]
# display the errors    
# The 3 errors are likely to be close with no learning step
## <CORRECTION>
val_e = 0
with torch.no_grad():
    for index, uid, mid, r, ts in val_ratings.itertuples():
        result = predict(uid,mid)
        val_e += error(result,r).item()

test_e = 0
with torch.no_grad():
    for index, uid, mid, r, ts in test_ratings.itertuples():
        result = predict(uid,mid)
        test_e += error(result,r).item()

# Summed squared errors divided by the number of ratings => mean squared error per split
print("final train error : ", train_e/len(train_ratings))
print("final val error : ", val_e/len(val_ratings))
print("final test error : ", test_e/len(test_ratings))
## </CORRECTION>

## Let's optimize the parameters (with SGD)  by slightly modifying the previous loop

### (TODO)


In [None]:
# parameters' values
lr = 0.01
batch_size = 32 # NOTE: not used in this cell, the loop below makes one update per rating
n_epochs = 5

for epoch in range(n_epochs):
    
    # loop on the training samples (cf above)
    #   prediction
    #   error
    #   [OPT] error storage to check convergence
    #   backward (accumulation)
    #   update
    #   zero_grad

    # <CORRECTION>
    train_e = 0
    # sample(frac=1) returns all the training rows in a fresh random order at each epoch
    for index, uid, mid, r, ts in train_ratings.sample(frac=1).itertuples():
        result = predict(uid,mid)
        se = error(result,r)
        train_e += se.item()
        se.backward()

        # The update itself must not be recorded by autograd
        with torch.no_grad():
            mu -= lr*mu.grad
            bu[uid] -= lr*bu[uid].grad
            bi[mid] -= lr*bi[mid].grad

            # Manually zero the gradients after updating weights
            # (backward() accumulates into .grad, it does not overwrite it)
            mu.grad.zero_()
            bu[uid].grad.zero_()
            bi[mid].grad.zero_()


    print(f"epoch {epoch} train error : ", train_e/len(train_ratings))
    # </CORRECTION>

    # Evaluation on the validation set + test set
    # No gradient is needed here: no_grad() avoids building an autograd graph per prediction
    with torch.no_grad():
        val_e = 0
        for index, uid, mid, r, ts in val_ratings.itertuples():
            result = predict(uid,mid)
            val_e += error(result,r).item()

        test_e = 0
        for index, uid, mid, r, ts in test_ratings.itertuples():
            result = predict(uid,mid)
            test_e += error(result,r).item()

    print(f"epoch {epoch} val error : ", val_e/len(val_ratings))
    print(f"epoch {epoch} test error : ", test_e/len(test_ratings))
    print("-----")

# Embedding module

To build a matrix of vectorial representations of dimension $Z$, for instance describing the users, we are going to use a new module called `embedding`:
$$ U = \begin{pmatrix}\mathbf u_1, \ldots, \mathbf u_n\end{pmatrix}, \mathbf u \in \mathbb R^Z $$ 

Call the module with an index to get the corresponding $Z$-dimensional representation:

In [None]:
# Toy sizes for this demo only (they are not the num_users / num_items of the dataset)
latent_size = 10
nb_users = 100 # arbitrary
nb_items = 50
# An Embedding is a learnable lookup table with one row of `latent_size` values per index
users = torch.nn.Embedding(nb_users, latent_size) # random init
items = torch.nn.Embedding(nb_items, latent_size) # random init

# get representation of user 5:
print("User 5:", users(torch.tensor(5))) # WARNING: call for a tensor (not an int)

# get representation of user 5 & 7:
# a tensor holding several indices returns one row per index
print("User 5 & 7:", users(torch.tensor([5,7])))

In [None]:
# Initialize the embedding with smaller values:

# normal_ overwrites the weight matrix in place with samples from a normal law (mean 0, std 0.01)
torch.nn.init.normal_(users.weight,0,0.01) # apply on the weights

# get representation of user 5:
print("User 5:", users(torch.tensor(5))) # WARNING: call for a tensor (not an int)


## Main difficulty = dealing with batch !

Based on a very simple matrix factorization formulation:
$$ \hat r = I_i^T U_u $$
Are you able to compute $\hat r$ for a batch of index `ind`?

In [None]:
ind = torch.tensor([1,2,3])
print(ind, ind.size())

# One row per requested index for the users, same thing for the items
u = users(ind)
print(u.size(), u)
i = items(ind)

# compute i.T u for all indices
# idea to save useless computations:
# 1. pairwise multiplication
# 2. find the good sum to get correct dimensions (and probably correct results)
# <CORRECTION>
r = (u * i).sum(dim=1)
print(r.size(), r)
# </CORRECTION>

### Specific syntax to deal with an undefined number of parameters

widely used in python... And in particular in the next steps

In [None]:
# 1. a function can return many things (packed in a tuple)

def fonction():
    """Return four values at once; Python packs them into a single tuple."""
    return 1, 2, 3, 4
# default behavior => get a tuple, unpacked here into four names:
a, b, c, d = fonction()
print(a, b, c, d)

#2. you can store them in a list (but not the first)
# the starred name collects all the remaining values
_,*res = fonction()

# 3. create a new function that takes arbitrary number of parameters:
def fonction2(*params):
    """Print every positional argument on its own line."""
    for p in params:
        print(p)

fonction2(res)   # ONE argument (the list itself)            => prints [2, 3, 4]
fonction2(*res)  # the star unpacks the list into 3 arguments => prints 2, 3 and 4 on separate lines


##  Classic matrix factorisation (called SVD in RecSys) (with mean)

To see how it works, we propose to implement a simple SVD:
### $$ \min\limits_{U,I,\mu}\sum\limits_{(u,i)} \underbrace{(r_{ui} -  (I_i^TU_u + \mu))^2}_\text{minimization} + \underbrace{\lambda(||U_u||^2+||I_i||^2 + \mu^2) }_\text{regularization} $$

where prediction is done in the following way:
### $$r_{ui} = \mu + U_u.I_i $$

where $\mu$ is the global mean,  $U_u$ a user embedding and $I_i$ an item embedding

### STEPS:
 To implement such model in pytorch, we need to do multiple things:
 
 - (1) model definition
 - (2) loss function
 - (3) evaluation
 - (4) training/eval loop




#### (1) Model definition

A model class typically extends `nn.Module`, the Neural network module. It is a convenient way of encapsulating parameters, with helpers for moving them to GPU, exporting, loading, etc.

One should define two functions: `__init__` and `forward`.

- `__init__` is used to initialize the model parameters
- `forward` is the net transformation from input to output. In fact, when doing `moduleClass(input)` you call this method.

##### (a) Initialization

Our model has different weights:

- the user profiles (also called user embeddings) $U$
- the item profiles (also called item embeddings) $I$
- the mean bias $\mu$


##### (b) input to output operation
Technically, the prediction as defined earlier can be seen as just a dot product between two embeddings $U_u$ and $I_i$ plus the mean rating:

- `torch.sum(embed_u*embed_i,1) + self.mean` is equivalent to $r_{ui} = \mu + U_u.I_i $ 
- the `.squeeze(1)` operation is a shape operation to remove the dimension 1 (indexing starts at 0) akin to reshaping the matrix from `(batch_size,1,latent_size)` to `(batch_size,latent_size)`
- for reference, the inverse operation is `.unsqueeze()`
- we return weights to regularize them


### (TODO) Just to make sure you were following: complete the following `__init__`and  `forward` methods

In [None]:


# The model define as a class, inheriting from nn.Module
class ClassicMF(torch.nn.Module):
    """Matrix factorization with a global mean: prediction = mean + dot(user vector, item vector)."""
    
    #(a) Init
    def __init__(self,nb_users,nb_items,latent_size):
        """Create the mean bias and one latent vector of size ``latent_size`` per user and per item."""
        super().__init__()
        # define the embeddings
        #   note: the general bias is given with specific syntax
        #   note: to define an attribute: self.users = ...
        # initialize with std = 0.01

        # The mean bias: one learnable scalar starting at 3
        self.mean = torch.nn.Parameter(torch.FloatTensor(1,).fill_(3)) # another way to activate grad
        # <CORRECTION>
        # Lookup tables holding the user and item profiles
        self.users = torch.nn.Embedding(nb_users, latent_size)
        self.items = torch.nn.Embedding(nb_items, latent_size)

        # Replace the default init by very small values (mean 0, std 0.01)
        for table in (self.users, self.items):
            torch.nn.init.normal_(table.weight, mean=0, std=0.01)
        # </CORRECTION>
    
    # (b) How we compute the prediction (from input to output)
    def forward(self, user, item): ## method called when doing ClassicMF(user,item)
        """Predict ratings for index tensors ``user`` and ``item``.

        Returns (prediction, user embeddings, item embeddings, mean) so that the
        caller can regularize the weights involved in the prediction.
        """
        # pay attention to the arguments: we have to give indexes
        # from the indexes, compute the output
        # WARNING : return the embeddings on top of the output to compute the regularization term 
        #       => 4 outputs expected (the line is given)

       
        # <CORRECTION>
        # No squeeze needed: the lookup already yields one vector per index
        embed_u = self.users(user)
        embed_i = self.items(item)

        # Dot product = element-wise product summed over the latent (last) dimension
        out = (embed_u * embed_i).sum(-1) + self.mean
        # </CORRECTION>
        return out, embed_u, embed_i, self.mean  # We return prediction + weights to regularize them
       
    

#### (2-4) full train loop

The train loop is organized around the [Dataloader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) class, which combines a dataset and a sampler, and provides single- or multi-process iterators over the dataset.

We just redefine a collate function

> collate_fn (callable, optional) – merges a list of samples to form a mini-batch.


**NOTE:** The dataset argument can be a list instead of a "Dataset" instance (works by duck typing)
    

##### The train loop sequence is the following:
    
[Dataset ==Dataloader==> Batch (not prepared) ==collate_fn==> Batch (prepared) ==Model.forward==> Prediction ==loss_fn==> loss <-> truth]

1] PREDICT
- (a) The dataloader samples training examples from the dataset (which is a list)
- (b) The collate_fn prepares the minibatch of training examples
- (c) The prediction is made by feeding the minibatch in the model
- (d) The loss is computed on the prediction via a loss function

2] OPTIMIZE
- (e) Gradients are computed by automatic backward propagation
- (f) Parameters are updated using computed gradients

In [None]:
# Build the datasets: any object with __getitem__(index) and __len__() works, so plain lists are enough ;)
# Each element is one (userId, movieId, rating) tuple.
prep_train = list(zip(train_ratings.userId, train_ratings.movieId, train_ratings.rating))
prep_val   = list(zip(val_ratings.userId, val_ratings.movieId, val_ratings.rating))
prep_test  = list(zip(test_ratings.userId, test_ratings.movieId, test_ratings.rating))

In [None]:
# zip(*rows) transposes the first 10 (user, item, rating) tuples into three tuples: users, items, ratings
a,b,c = zip(*prep_train[:10])
print(a, b, c)

In [None]:
from torch.utils.data import DataLoader
import torch.nn.functional as F


# HyperParameters
n_epochs = 3 # number of passes over the training set
batch_size = 16 # number of (user, item, rating) samples per mini-batch
num_feat = 25 # size of the latent vectors (given to the model as latent_size)
lr = 0.01 # learning rate (intended for the optimizer: verify it is actually passed where the optimizer is created)
reg = 0.001 # weight of the regularization term in loss_func


#(a) Collate function => Creates tensor batches to feed model during training
# It can be removed if data is already tensors (torch or numpy ;)
def tuple_batch(l):
    '''
    Collate function: turn a list of samples into one mini-batch.

    input l: list of (user, item, rating) tuples
    output: (users, items, ratings) as torch tensors; users and items are
            integer (Long) tensors, ratings is a Float tensor
    '''
    # Transpose the list of samples into one sequence per field
    user_col, item_col, rating_col = zip(*l)

    return (
        torch.LongTensor(user_col),
        torch.LongTensor(item_col),
        torch.FloatTensor(rating_col),
    )
    


#(b) Loss function => Combines MSE and L2
def loss_func(pred,ratings_t,reg,*params): # specific syntax 
    '''
    Squared-error loss combined with a norm penalty on the given tensors.

    pred, ratings_t: predictions and true ratings of the batch
    reg: weight of the penalty
    params: any number of tensors; each one adds the mean of its
            Euclidean norms (taken along the last dimension) to the penalty

    returns (training loss, summed squared error):
      - training loss = summed squared error / batch size + reg * penalty
      - the second value is the SUM over the batch, not the mean
    '''
    squared_error_sum = F.mse_loss(pred, ratings_t, reduction='sum')
    # Accumulate the penalty over every tensor received through *params
    penalty = sum(torch.mean(p.norm(2, -1)) for p in params)
    batch_mean_error = squared_error_sum / pred.size(0)

    return batch_mean_error + reg * penalty, squared_error_sum
    
#
# Training script starts here
#    

# (a) one DataLoader per split: each one draws samples from its list and
#     assembles them into tensor mini-batches with the collate function tuple_batch
dataloader_train = DataLoader(
    prep_train,
    collate_fn=tuple_batch,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
dataloader_val = DataLoader(
    prep_val,
    collate_fn=tuple_batch,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
dataloader_test = DataLoader(
    prep_test,
    collate_fn=tuple_batch,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)


In [None]:
# Define model & optimizer

# num_feat is the size of the latent vectors
model = ClassicMF(num_users,num_items,num_feat)
# Pass the learning rate explicitly: without it, Adam uses its own default value
# and the `lr` hyper-parameter defined with the other hyper-parameters has no effect.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
## INTERMEDIATE BOX for in depth understanding

# inference & parameter retrieving (if your forward is defined as expected)
users_t,items_t,ratings_t = next(iter(dataloader_train)) # retrieve first batch
# check dim
print(users_t.size()) # batch
print(users_t)

# output of the forward step:
# The 4th output is named mean_bias (not mu) so that the global `mu` tensor of the
# baseline model, which predict() reads, is not overwritten when this cell runs.
pred, embed_u, embed_i, mean_bias = model(users_t,items_t)
print(pred.size(), embed_u.size()) # batch
print(pred) # Current predictions for the batch

# alternative advanced syntax
pred, *params = model(users_t,items_t) # params is a list !!
print(len(params)) 
print(params[0].size()) # params[0] corresponds to embed_u

# idea: retrieving the list of parameter... And then transmit the list to loss_func by unpacking it with a star
print(loss_func(pred,ratings_t,reg,*params))    # yhat, y, lambda_reg, all_params
                                                # return mse + regul, mse (sum not the mean)

# we can apply backward on what we want...
                                                


In [None]:
#
# Train loop (epoch)
#   loop over the dataloader
#       forward (+get the parameters)
#       loss
#       backward
#       optim
#   compute mse on validation & test
#   display losses for epoch e
#

##<CORRECTION>
for e in range(n_epochs):
    # Summed squared errors for train/val/test, kept as plain Python floats
    mean_loss = [0.0, 0.0, 0.0] #train/val/test

    ## Training loss (the one we train with)
    model.train() # set the model on train mode (once per epoch is enough)

    for users_t,items_t,ratings_t in dataloader_train:
        model.zero_grad() # reset gradients
        
        #(c) predictions are made by the model
        pred,*params = model(users_t,items_t)
        
        #(d) loss computed on predictions, we added regularization
        loss,mse_loss = loss_func(pred,ratings_t,reg,*params)
        
        loss.backward() #(e) backpropagating to get gradients
        optimizer.step() #(f) updating parameters

        # .item() extracts a Python float: accumulating the tensor itself would keep
        # the autograd graph of every batch alive until the end of the epoch
        mean_loss[0] += mse_loss.item()
    
    model.eval() # Inference mode
    with torch.no_grad():
        ## Validation loss (no training)
        for users_t,items_t,ratings_t in dataloader_val:
            pred,*params = model(users_t,items_t)
            _,mse_loss = loss_func(pred,ratings_t,reg,*params)

            mean_loss[1] += mse_loss.item()

        ## Test loss (no training)
        for users_t,items_t,ratings_t in dataloader_test:
            pred,*params = model(users_t,items_t)
            _,mse_loss = loss_func(pred,ratings_t,reg,*params)

            mean_loss[2] += mse_loss.item()

    # Sums divided by the number of samples of each split => MSE
    print("-"*25)
    print("epoch",e, "mse (train/val/test)", round(mean_loss[0]/len(prep_train),3),"/",  round(mean_loss[1]/len(prep_val),3),"/",  round(mean_loss[2]/len(prep_test),3))
    ##</CORRECTION>
    

## (Your turn from scratch) Koren 2009 model:

Here, this model simply adds a bias for each user and for each item

### $$ \min\limits_{U,I,\mu}\sum\limits_{(u,i)} \underbrace{(r_{ui} -  (I_i^TU_u + \mu+ \mu_i+\mu_u))^2}_\text{minimization} + \underbrace{\lambda(||U_u||^2+||I_i||^2 + \mu^2  + \mu_i^2+\mu_u^2) }_\text{regularization} $$


### $$r_{ui} = \mu + \mu_i + \mu_u + U_u.I_i $$

### TODO:

- (a) complete the model initialization
- (b) complete the forward method

In [None]:

# <CORRECTION>
class KorenMF(torch.nn.Module):
    """Matrix factorization with biases: prediction = gmean + user bias + item bias + dot(user vector, item vector)."""

    def __init__(self,nb_users,nb_items,latent_size):
        """Create the latent vectors, the per-user / per-item biases and the global mean.

        nb_users, nb_items: number of rows of the user / item tables
        latent_size: size of the latent vectors
        """
        super(KorenMF, self).__init__()
        
        # Latent profiles
        self.users = torch.nn.Embedding(nb_users, latent_size)
        self.items = torch.nn.Embedding(nb_items, latent_size)
        # Biases: one scalar per user and per item, stored as embeddings of size 1
        self.umean = torch.nn.Embedding(nb_users, 1)
        self.imean = torch.nn.Embedding(nb_items, 1)
        # Global mean: one learnable scalar starting at 3
        self.gmean = torch.nn.Parameter(torch.FloatTensor(1,).fill_(3))

        # Latent vectors start very small (std 0.01); the biases are drawn with mean 2 and std 1
        torch.nn.init.normal_(self.users.weight,0,0.01)
        torch.nn.init.normal_(self.items.weight,0,0.01)
        torch.nn.init.normal_(self.umean.weight,2,1)
        torch.nn.init.normal_(self.imean.weight,2,1)
        
        
    def forward(self, user,item):
        """Predict ratings for index tensors ``user`` and ``item``.

        Returns (prediction, user embeddings, item embeddings, user biases,
        item biases, global mean) so that the caller can regularize them.
        """
        # No squeeze(1) on the embeddings: it did nothing for latent_size > 1 and
        # removed the latent dimension itself when latent_size == 1.
        embed_u,embed_i = self.users(user) , self.items(item)
        umean, imean = self.umean(user) , self.imean(item)
        # Sum over the LAST dimension (the latent one) so that this works for any
        # latent size and for a single index as well as for a batch of indexes;
        # squeeze(-1) drops the size-1 dimension of the bias embeddings.
        out = torch.sum(embed_u*embed_i,-1) + umean.squeeze(-1) + imean.squeeze(-1) + self.gmean

        return out , embed_u, embed_i, umean , imean , self.gmean
# </CORRECTION>

### (TODO) Here, train loop stays the same, you only have to change the model

In [None]:
from torch.utils.data import DataLoader
import torch.nn.functional as F

n_epochs = 10
batch_size = 16 # NOTE: the dataloaders created earlier keep the batch size they were built with
num_feat = 25
lr = 0.01
reg = 0.001

# note: previous loss function should be robust to the new model thanks to advanced syntax :)

model =  KorenMF(num_users,num_items,num_feat)
# Pass the learning rate explicitly: without it, Adam uses its own default value
# and the `lr` hyper-parameter above has no effect.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# same loop as before
# <CORRECTION>
for e in range(n_epochs):
    # Summed squared errors for train/val/test, kept as plain Python floats
    mean_loss = [0.0, 0.0, 0.0] #train/val/test

    model.train() # train mode (once per epoch is enough)
    for users_t,items_t,ratings_t in dataloader_train:
        model.zero_grad()
        pred,*params = model(users_t,items_t)

        loss,mse_loss = loss_func(pred,ratings_t,reg,*params)
        loss.backward()
        optimizer.step()

        # .item() extracts a Python float: accumulating the tensor itself would keep
        # the autograd graph of every batch alive until the end of the epoch
        mean_loss[0] += mse_loss.item()

    # Evaluation: no gradient needed, so no autograd graph is built at all
    model.eval()
    with torch.no_grad():
        for users_t,items_t,ratings_t in dataloader_val:
            pred,*params = model(users_t,items_t)
            _,mse_loss = loss_func(pred,ratings_t,reg,*params)

            mean_loss[1] += mse_loss.item()

        for users_t,items_t,ratings_t in dataloader_test:
            pred,*params = model(users_t,items_t)
            _,mse_loss = loss_func(pred,ratings_t,reg,*params)

            mean_loss[2] += mse_loss.item()

    # Sums divided by the number of samples of each split => MSE
    print("-"*25)
    print("epoch",e, "mse (train/val/test)", round(mean_loss[0]/len(prep_train),3),"/",  round(mean_loss[1]/len(prep_val),3),"/",  round(mean_loss[2]/len(prep_test),3))
    # </CORRECTION>
    

# [Optional part] How to complete this series of experiments

### Visualization

Use tsne to display embedding
* could be done with sklearn [link](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html)
* often done with tensorboard in deep applications


### Regularization

Exploit side information to regularize the profiles:
* Users from the same age category are supposed to have closer representations, Movies from the same genre, etc...


In [None]:
# load side information
# u.user has no header row: fields are separated by "|" and named here explicitly
uinfo = pd.read_csv("data/ml-100k/u.user", sep="|", names=["userId","age", "genre", "prof","zip"])

# WARNING: we changed the definition of ids => make ids consistent
# Series.map returns a NEW Series: each result must be assigned back, otherwise the remapping is lost
uinfo["userId"] = uinfo["userId"].map(user_map) # using the same dictionary
# Categorical columns: each distinct value gets an integer code, by order of first appearance
genre_map = {g:num for num,g in enumerate(uinfo["genre"].unique())}
uinfo["genre"] = uinfo["genre"].map(genre_map)
prof_map = {p:num for num,p in enumerate(uinfo["prof"].unique())}
uinfo["prof"] = uinfo["prof"].map(prof_map)
# age cat

# Last expression of the cell => the remapped table is displayed
uinfo.head(5)
# age cat

# Construction du sujet à partir de la correction

In [1]:
### <CORRECTION> ###
import re
# Turn this corrected notebook into the student version

fname = "3_1-reco-corr.ipynb" # this file
fout  = fname.replace("-corr","")

# print("Output file: ", fout )

# Context managers close the files even if reading or writing fails.
# The encoding is given explicitly so that the result does not depend on the platform default.
with open(fname, "r", encoding="utf-8") as f:
    txt = f.read()

# Every marked answer span (non-greedy match, newlines included thanks to DOTALL)
# is replaced by a placeholder
with open(fout, "w", encoding="utf-8") as f2:
    f2.write(re.sub("<CORRECTION>.*?(</CORRECTION>)"," TODO ",\
    txt, flags=re.DOTALL))

### </CORRECTION> ###