In [None]:
RUNNING_IN_COLAB = True
CALCULATE_NUMPY_ARRAY = False

if RUNNING_IN_COLAB:
    REPO_URL = 'https://github.com/tomatodelavegas/recommender-system.git'
    BRANCH   = 'main'
    REPO_DIR = 'recommender-system'
    DATA_URL = 'https://drive.google.com/uc?id=1psSrOGsxFrlj0UJH1Gn6Jee4lEVHX69T'

    from pathlib import Path

    %cd /content

    # Download the repository
    if not Path(REPO_DIR).is_dir():
        !git clone --branch {BRANCH} --depth=1 -- {REPO_URL} {REPO_DIR}
    
    %cd {REPO_DIR}

    # Install requirements
    !pip install -r requirements.txt | grep -v 'Requirement already satisfied'
    !pip install gdown | grep -v 'Requirement already satisfied'
    
    import gdown
    if not Path('ml-25m.zip').is_file():
        gdown.download(DATA_URL, 'ml-25m.zip', quiet=False)
    
    if not Path('ml-25m').is_dir():
        !unzip -q -- ml-25m.zip
if not CALCULATE_NUMPY_ARRAY:
  DATA_URL = 'https://drive.google.com/uc?id=1KohLYb76pTLr2hGsPFAUAINr_ZAkPJM_'
  if not Path('user_frames_divided.npy').is_file():
    gdown.download(DATA_URL, 'user_frames_divided.npy', quiet=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import tqdm

In [None]:
from functools import partial
from tqdm import tqdm
# Rebind `tqdm` to a pre-configured progress-bar factory (position=0, leave=True).
# This shadows the `tqdm` module imported in the previous cell: from here on the
# name must be called directly as `tqdm(...)`; `tqdm.tqdm(...)` no longer resolves.
tqdm = partial(tqdm, position=0, leave=True)

# Data
First we load the data

In [None]:
ratings_df = pd.read_csv("ml-25m/ratings.csv")
movies_df = pd.read_csv("ml-25m/movies.csv")

In [None]:
ratings_df.info()

In [None]:
ratings_df.head(10)

In [None]:
movies_df.info()

## Remove Movies without a genre

In [None]:
# Locate the movies whose genre field is the "(no genres listed)" placeholder.
has_no_genre = movies_df['genres'] == '(no genres listed)'
nogenres_indexes = movies_df.index[has_no_genre].tolist()
movieId_todelete = movies_df['movieId'].iloc[nogenres_indexes].to_numpy()

# Keep only the ratings that refer to a movie with at least one genre,
# renumbering the remaining rows from 0.
rated_without_genre = ratings_df['movieId'].isin(movieId_todelete)
ratings_df = ratings_df[~rated_without_genre].reset_index(drop=True)

In [None]:
#movies_df.drop(index=nogenres_indexes,inplace=True)
#movies_df.reset_index(drop=True,inplace=True)

In [None]:
movies_df.info()

In [None]:
# Tally genre occurrences over the last column of movies_df, whose values are
# '|'-separated genre lists.
p_genres = {}
for genre_field in movies_df.iloc[:, -1]:
    for genre in genre_field.split('|'):
        p_genres[genre] = p_genres.get(genre, 0) + 1

In [None]:
p_genres

We can observe that the genres are imbalanced: there are far more Drama movies than movies of the other genres.

In [None]:
user_ids = ratings_df["userId"].unique()
nb_user = len(user_ids)
nb_user

In [None]:
ratings_by_user_dict = dict(tuple(ratings_df.groupby('userId')))

In [None]:
import math
def divide_ratings(ratings_by_user_id=None, frame_size=25, min_ratings=50, progress=None):
    """
    Cut every user's rating history into fixed-size, time-ordered frames.

    Each user's ratings are sorted by "timestamp", the trailing ratings that do
    not fill a whole frame are dropped, and the rest is reshaped to
    (n_frames, frame_size, n_columns, 1).

    parameters
        ratings_by_user_id: mapping user id -> DataFrame of that user's ratings.
            When None, the notebook globals `ratings_by_user_dict` and `user_ids`
            are used (users are visited in `user_ids` order).
        frame_size: number of ratings per frame.
        min_ratings: users with fewer ratings than this are skipped.
        progress: callable wrapping the iterable of user ids (progress bar).
            When None, the notebook-level `tqdm` wrapper is used.
    return: 1-D object array with one (n_frames, frame_size, n_columns, 1) array
        per kept user. The number of frames differs between users, so the result
        is always an object array, never a regular 5-D array.
    raises ValueError: if frame_size is not a positive integer.
    """
    if frame_size < 1:
        raise ValueError("frame_size must be a positive integer")

    if ratings_by_user_id is None:
        ratings_by_user_id = ratings_by_user_dict
        ids = user_ids
    else:
        ids = list(ratings_by_user_id)

    if progress is None:
        # `tqdm` is the pre-configured partial defined near the top of the
        # notebook, so it is called directly (it has no `.tqdm` attribute).
        progress = tqdm

    frames_per_user = []
    for user_id in progress(ids):
        # Sort into a new frame so the cached per-user DataFrames stay untouched.
        ratings_of_user = ratings_by_user_id[user_id].sort_values(by="timestamp", ignore_index=True)

        number_of_ratings = len(ratings_of_user)
        if number_of_ratings < min_ratings:
            continue

        # Integer frame count; the most recent ratings that do not fill a frame are cropped.
        number_of_frames = number_of_ratings // frame_size
        if number_of_frames == 0:
            # Only reachable when min_ratings < frame_size.
            continue

        values = ratings_of_user.to_numpy()[: number_of_frames * frame_size]
        user_frames = values.reshape(number_of_frames, frame_size, -1)
        frames_per_user.append(np.expand_dims(user_frames, axis=-1))

    # np.array() on a ragged list is rejected by recent NumPy versions, so fill
    # an object array element by element.
    ratings_by_user = np.empty(len(frames_per_user), dtype=object)
    for position, user_frames in enumerate(frames_per_user):
        ratings_by_user[position] = user_frames
    return ratings_by_user

# Either rebuild the per-user frames from the ratings or load the cached copy
# fetched by the setup cell.
# NOTE(review): allow_pickle=True unpickles objects stored in the file, which can
# execute arbitrary code; only load this file from a trusted source.
ratings_by_user = (
    divide_ratings()
    if CALCULATE_NUMPY_ARRAY
    else np.load('user_frames_divided.npy', allow_pickle=True)
)
ratings_by_user.shape

In [None]:
len(ratings_by_user)

In [None]:
ratings_by_user[0].shape

In [None]:
ratings_by_user[1].shape

In [None]:
in_height=ratings_by_user[0].shape[1]
in_width=ratings_by_user[0].shape[2]

In [None]:
if CALCULATE_NUMPY_ARRAY:
  np.save("user_frames_divided.npy", ratings_by_user)

In [None]:
# Map the movieId values of movies_df onto a dense 0..N-1 index and back.
column_movieId = movies_df["movieId"]
print("Max ID of movie: ", column_movieId.max())
id_list = column_movieId.to_numpy()

# Sorted unique ids: the position in this list is the dense index.
id_dictionnary = sorted(set(id_list))

index_to_id = list(id_dictionnary)
id_to_index = dict(zip(index_to_id, range(len(index_to_id))))

# Model
Convolutional Tensor-Train LSTM Recommendation Net  
The model was inspired by the Convolutional Click Prediction Model (CCPM).  
We want to base each recommendation on the previous recommendations we made, i.e. we want to add sequential information.  
Therefore, we replace the second convolution with a Convolutional Tensor-Train LSTM.

In [None]:
from utils.convlstmnet import ConvLSTMNet
from torch.nn import Conv2d, AdaptiveMaxPool2d, Linear, Tanh, ReLU, MaxPool2d, Flatten

Hyperparameters for the CTLRN

In [None]:
# Size of the prediction/target vector: one slot per row of movies_df.
output_size=len(movies_df) # Number of movies
# Channels of the frames fed to CTLRNet's first convolution.
inputs_channels=1# To define ==> 2
# Channels produced by the first convolution and consumed by ConvLSTMNet.
lstm_input_channels=3
# Cell type plus the order/steps/rank values forwarded to ConvLSTMNet as cell_params.
cell = "convttlstm"
order = 3
steps = 3
rank = 8
# Kernel size passed to ConvLSTMNet.
kernel_size = 5
# Learning rate of the Adam optimizer.
lr=1e-3
# Forwarded to ConvLSTMNet's output_sigmoid argument.
output_sigmoid = True

In [None]:
# Half of each frame dimension, used as convolution padding.
# NOTE(review): the names look swapped - padding_size_w is derived from in_height
# and padding_size_h from in_width. CTLRNet's conv1 reads padding_size_h for both
# padding dimensions and padding_size_w is not read in the visible cells; confirm
# the intended padding.
padding_size_w=in_height//2
padding_size_h=in_width//2

In [None]:
# ConvLSTMNet instance that CTLRNet uses as its recurrent stage.
ctln_model = ConvLSTMNet(
    # model input
    input_channels=lstm_input_channels,
    # architecture
    layers_per_block=(3, 3, 3, 3),
    hidden_channels=(32, 48, 48, 32),
    skip_stride=2,
    # convolutional tensor-train cell
    cell=cell,
    cell_params=dict(order=order, steps=steps, rank=rank),
    # convolution settings
    kernel_size=kernel_size,
    bias=True,
    # output function
    output_sigmoid=output_sigmoid,
)

In [None]:
class CTLRNet(nn.Module):
    """
    Convolution -> ConvLSTMNet -> linear head producing one score per movie.

    The module reads the notebook globals `lstm_input_channels`,
    `padding_size_h`, `in_height`, `in_width` and `ctln_model` when it is
    constructed, so those cells must have been run first.

    parameters
        inputs_channels: number of channels of the input frames.
        output_size: number of scores produced (one per movie).
    """

    def __init__(self, inputs_channels, output_size):
        super().__init__()

        self.inputs_channels = inputs_channels
        self.output_size = output_size

        self.conv1 = Conv2d(in_channels=inputs_channels, out_channels=lstm_input_channels,
                            padding=(padding_size_h, padding_size_h), kernel_size=(3, 3))
        # Brings the padded convolution output back to the original frame size.
        self.pool1 = AdaptiveMaxPool2d(output_size=(in_height, in_width))
        self.tanh = Tanh()
        # Shared module-level ConvLSTMNet instance (not a fresh copy per CTLRNet).
        self.convttlstm = ctln_model
        self.pool2 = MaxPool2d(2)
        self.flatten = Flatten()
        # NOTE(review): 72 is hard-coded and must equal the flattened size that
        # leaves pool2 - TODO derive it from the frame size and channel count.
        self.linear = Linear(in_features=72, out_features=output_size)
        self.relu = ReLU()

    def forward(self, inputs):
        """
        parameters inputs: frames of one user, laid out as (nb_frames, c, h, w)
        return: non-negative scores, first dimension squeezed back after the LSTM
        """
        x = self.conv1(inputs)
        x = self.pool1(x)
        x = self.tanh(x)

        # Add a leading dimension of size 1 for ConvLSTMNet and remove it afterwards.
        x = torch.unsqueeze(x, dim=0)
        # NOTE(review): inputs.shape[1] is the channel dimension of a
        # (nb_frames, c, h, w) input; presumably the frame count inputs.shape[0]
        # was intended - confirm against ConvLSTMNet before changing.
        x = self.convttlstm(x, input_frames=inputs.shape[1], future_frames=1, output_frames=1)
        x = torch.squeeze(x, dim=0)

        x = self.pool2(x)
        x = self.flatten(x)
        x = self.linear(x)
        # Return the ReLU output: it used to be computed into an unused local
        # while the raw linear output was returned.
        return self.relu(x)
        

In [None]:
model = CTLRNet(inputs_channels, output_size)

In [None]:
quick_test_data = torch.from_numpy(ratings_by_user[0]).permute([0,3,1,2])

In [None]:
quick_test_data.shape

In [None]:
model(quick_test_data.float()).shape

## Data Split

In [None]:
# Shuffle the users in place, then cut once: the first 70% train, the rest test.
np.random.shuffle(ratings_by_user)

data_samples = ratings_by_user.shape[0]

train_size = math.ceil(data_samples * 0.70)
test_size = data_samples - train_size

# Both slices share the same cut point, so they are disjoint and cover every user.
# (The previous slices gave training only the last `test_size` users and left the
# middle of the array unused.)
train_data = ratings_by_user[:train_size]
test_data = ratings_by_user[train_size:]

assert len(train_data) == train_size and len(test_data) == test_size

In [None]:
def split_x_y(data, p=0.25, movie_index=None, num_movies=None):
    """
    Split one user's frames into model input and a dense rating target.

    The first frames become the input X; the remaining (later) frames are
    turned into a vector holding, for every movie rated in them, its rating
    (0 elsewhere). X and the target frames never overlap, and each side always
    gets at least one frame.

    parameters
        data: (nb_frames, c, h, w0); column 1 of each row is the movie id and
            column 2 the rating. Tensor or array.
        p: fraction of the frames reserved for the target.
        movie_index: mapping movie id -> position in the target vector.
            Defaults to the notebook global `id_to_index`.
        num_movies: length of the target vector. Defaults to the notebook
            global `output_size`.
    return: (nb_input_frames, c, h, w0), (1, nb_movies) float64 tensor
    raises ValueError: if data has fewer than 2 frames.
    raises KeyError: if a target movie id is missing from movie_index.
    """
    if movie_index is None:
        movie_index = id_to_index
    if num_movies is None:
        num_movies = output_size

    data_frames = data.shape[0]
    if data_frames < 2:
        raise ValueError("split_x_y needs at least 2 frames to build an input and a target")

    # Clamp so that neither side is empty (ceil alone takes every frame for 2-3 frames).
    X_size = min(max(math.ceil(data_frames * (1 - p)), 1), data_frames - 1)

    X = data[:X_size]
    # Target frames start where the input frames stop.
    Y_data = data[X_size:]

    # Convert the target frames to NumPy once, on the host, instead of reading
    # tensor elements one by one.
    if isinstance(Y_data, torch.Tensor):
        Y_rows = Y_data.detach().cpu().numpy()
    else:
        Y_rows = np.asarray(Y_data)
    # First channel of every target frame, flattened to one row per rating.
    Y_rows = Y_rows[:, 0].reshape(-1, Y_rows.shape[-1])

    y = np.zeros(num_movies)
    # Plain loop keeps "last rating wins" if a movie appears more than once.
    for movie_id, rating in zip(Y_rows[:, 1].tolist(), Y_rows[:, 2].tolist()):
        y[movie_index[int(movie_id)]] = rating

    return X, torch.from_numpy(y.reshape(1, -1)).double()
            

In [None]:
import torch.nn.functional as F

def compute_loss(y_pred, y_true):
    """Return the sum of the mean absolute error and the mean squared error."""
    absolute_term = F.l1_loss(y_pred, y_true, reduction="mean")
    squared_term = F.mse_loss(y_pred, y_true, reduction="mean")
    return absolute_term + squared_term

optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [None]:
# Pick the compute device: CUDA when available, CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

# More than one visible GPU enables data-parallel execution.
multi_gpu = use_cuda and torch.cuda.device_count() > 1
if not use_cuda:
    num_gpus = 0
elif multi_gpu:
    num_gpus = torch.cuda.device_count()
else:
    num_gpus = 1

# Place the model on the chosen device and wrap it when several GPUs are used.
model.to(device)
if multi_gpu:
    model = nn.DataParallel(model)

In [None]:
# Training loop: one optimisation step per user, `num_epochs` passes over train_data.
num_epochs = 5

# Mean training loss of every epoch. Created once, before the epoch loop, so it
# is not reset at the start of each epoch.
history = []
for epoch in range(0, num_epochs):
    epoch_losses = []
    loop = tqdm(enumerate(train_data), total=len(train_data))
    for batch_idx, frames in loop:
        # Build the input/target pair on the CPU and move the results to the
        # device afterwards: split_x_y fills a NumPy target from the frames.
        frames = torch.from_numpy(frames).permute([0, 3, 1, 2])
        X, y = split_x_y(frames)
        X = X.float().to(device)
        y = y.to(device)

        optimizer.zero_grad()

        pred = model(X).double()
        loss = compute_loss(pred, y)
        loss.backward()
        optimizer.step()

        # .item() yields a plain Python float; .numpy() is not available on a
        # tensor that requires grad.
        epoch_losses.append(loss.item())

        loop.set_description(f"Epoch [{epoch}/{num_epochs}]")
        loop.set_postfix(loss=loss.item())

    history.append(sum(epoch_losses) / max(len(epoch_losses), 1))

# `plt` was used without being imported anywhere in the notebook.
import matplotlib.pyplot as plt

plt.plot(history)
plt.xlabel("epoch")
plt.ylabel("mean training loss");