In [84]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import tqdm

# Data
First we load the data

In [85]:
ratings_df = pd.read_csv("ml-25m/ratings.csv")
movies_df = pd.read_csv("ml-25m/movies.csv")

In [86]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [87]:
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


In [88]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


## Remove Movies without a genre

In [89]:
nogenres_indexes = movies_df.index[movies_df['genres'] == '(no genres listed)'].tolist()
movieId_todelete = movies_df.iloc[nogenres_indexes]['movieId'].to_numpy()
ratings_df.drop(ratings_df[ratings_df['movieId'].isin(movieId_todelete)].index, inplace=True)
ratings_df.reset_index(drop=True, inplace=True)

In [90]:
#movies_df.drop(index=nogenres_indexes,inplace=True)
#movies_df.reset_index(drop=True,inplace=True)

In [91]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [92]:
p_genres = {}
for e in movies_df.itertuples():
    genres_in_row = e[-1].split('|')
    for genre in genres_in_row:
        if genre in p_genres:
            p_genres[genre] += 1
        else:
            p_genres[genre] = 1

In [93]:
p_genres

{'Adventure': 4145,
 'Animation': 2929,
 'Children': 2935,
 'Comedy': 16870,
 'Fantasy': 2731,
 'Romance': 7719,
 'Drama': 25606,
 'Action': 7348,
 'Crime': 5319,
 'Thriller': 8654,
 'Horror': 5989,
 'Mystery': 2925,
 'Sci-Fi': 3595,
 'IMAX': 195,
 'Documentary': 5605,
 'War': 1874,
 'Musical': 1054,
 'Western': 1399,
 'Film-Noir': 353,
 '(no genres listed)': 5062}

Nous pouvons observer qu'il existe un imbalance en terme de genres de film avec énormement plus de films de Drama que d'autres.

In [94]:
user_ids = ratings_df["userId"].unique()
nb_user = len(user_ids)
nb_user

162541

In [95]:
ratings_by_user_dict = dict(tuple(ratings_df.groupby('userId')))

In [96]:
import math

ratings_by_user = []
for user_id in tqdm.tqdm(user_ids):
    
    ratings_of_user = ratings_by_user_dict[user_id]
    ratings_of_user.sort_values(by="timestamp", inplace=True, ignore_index=True)
    
    number_of_ratings = len(ratings_of_user)
    ratings_of_user = ratings_of_user.to_numpy()
    
    # We only keep users with more than 50 ratings
    if number_of_ratings >= 50:
        
        ## We crop to have 25 ratings/frame
        if number_of_ratings % 25 != 0:
            round_number = number_of_ratings % 25
            ratings_of_user = ratings_of_user[:-round_number]
            number_of_ratings = ratings_of_user.shape[0]
        

        dividing_size = number_of_ratings / 25
        
        user_frames = np.split(ratings_of_user, dividing_size)
        
        ratings_by_user.append(np.expand_dims(np.array(user_frames), axis=-1))

ratings_by_user = np.array(ratings_by_user)
    
ratings_by_user.shape

100%|██████████| 162541/162541 [05:27<00:00, 495.65it/s]


(102460,)

In [97]:
len(ratings_by_user)

102460

In [98]:
ratings_by_user[0].shape

(2, 25, 4, 1)

In [99]:
ratings_by_user[1].shape

(7, 25, 4, 1)

In [100]:
in_height=ratings_by_user[0].shape[1]
in_width=ratings_by_user[0].shape[2]

In [101]:
np.save("user_frames_divided.npy", ratings_by_user)

In [157]:
column_movieId = movies_df["movieId"]
print("Max ID of movie: ", column_movieId.max())
id_list = column_movieId.to_numpy()

id_dictionnary = sorted(set(id_list))

id_to_index =  {u:i for i, u in enumerate(id_dictionnary)}
index_to_id = list(id_dictionnary)

Max ID of movie:  209171


# Model
Convolutional Tensor-Train LSTM Recommendation Net  
The model was inspired by Covolutional Click Prediction Model (CCPM)  
 We want to vase our recommendation on previous recommendation we made. We want to add the sequential information  
Therefore, we replace the second convolution by a Convolutional Tensor Train LSTM

In [103]:
from utils.convlstmnet import ConvLSTMNet
from torch.nn import Conv2d, AdaptiveMaxPool2d, Linear, Tanh, ReLU, MaxPool2d, Flatten

Hyperparameters for the CTLRN

In [119]:
output_size=len(movies_df) # Number of movies
inputs_channels=1# To define ==> 2
lstm_input_channels=3
cell = "convttlstm"
order = 3
steps = 3
rank = 8
kernel_size = 5
lr=1e-3
output_sigmoid = True

In [105]:
padding_size_w=in_height//2
padding_size_h=in_width//2

In [106]:
ctln_model = ConvLSTMNet(
        # input to the model
        input_channels = lstm_input_channels, 
        # architecture of the model
        layers_per_block = (3, 3, 3, 3), 
        hidden_channels = (32, 48, 48, 32), 
        skip_stride = 2,
        # parameters of convolutional tensor-train layers
        cell = cell, cell_params = {"order": order,
        "steps": steps, "rank": rank},
        # parameters of convolutional operations
        kernel_size = kernel_size, bias = True,
        # output function and output format
        output_sigmoid = output_sigmoid)

In [107]:
class CTLRNet(nn.Module):
    def __init__(self, inputs_channels, output_size):
        super(CTLRNet, self).__init__()
        
        self.inputs_channels = inputs_channels
        self.output_size = output_size
        
        self.conv1 = Conv2d(in_channels=inputs_channels, out_channels=lstm_input_channels,
                       padding=(padding_size_h,padding_size_h), kernel_size=(3,3))
        self.pool1 = AdaptiveMaxPool2d(output_size=(in_height, in_width))
        self.tanh = Tanh()
        self.convttlstm = ctln_model
        self.pool2 = MaxPool2d(2) # Padding ?
        self.flatten = Flatten()
        self.linear = Linear(in_features=72, out_features=output_size)
        self.relu = ReLU()
    
    def forward(self, inputs):
        x = self.conv1(inputs)
        x = self.pool1(x)
        x = self.tanh(x)
        
        ## To study exactly
        ## uwu ?
        x = torch.unsqueeze(x, dim=0)
        x = self.convttlstm(x, input_frames = inputs.shape[1], future_frames = 1, output_frames = 1)
        x = torch.squeeze(x, dim=0)
        
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.linear(x)
        y = self.relu(x)
        return x
        

In [108]:
model = CTLRNet(inputs_channels, output_size)

In [109]:
quick_test_data = torch.from_numpy(ratings_by_user[0]).permute([0,3,1,2])

In [110]:
quick_test_data.shape

torch.Size([2, 1, 25, 4])

In [111]:
model(quick_test_data.float()).shape

torch.Size([1, 62423])

## Data Split

In [112]:
np.random.shuffle(ratings_by_user)

data_samples = ratings_by_user.shape[0]

train_size = math.ceil(data_samples * 0.70)
test_size = data_samples - train_size

train_data = ratings_by_user[train_size:]
test_data = ratings_by_user[:test_size]

In [158]:
def split_x_y(data, p=0.25):
    """
    Split and construct a frame into an input and an output
    parameters data: (nb_frames, c, h, w0)
    return: (nb_frames, c, h, w0), (nb_movies)
    """
    data = torch.from_numpy(data).permute([0,3,1,2])
    data_frames = data.shape[0]
    X_size = math.ceil(data_frames * (1-p))
    Y_data_size = data_frames - X_size
    
    
    X = data[:X_size]
    Y_data = data[Y_data_size:]
    
    y = np.zeros(output_size)
    
    #print()
    
    #y[Y_data[...,1]] = Y_data[...,2]
    
    for frame in Y_data:
        for row in frame[0]:
            y[id_to_index[int(row[1])]] = row[2]
    
    return X, torch.from_numpy(y.reshape(1,-1)).double()
            

In [115]:
y

array([0., 0., 0., ..., 0., 0., 0.])

In [135]:
import torch.nn.functional as F

def compute_loss(y_pred, y_true):
    return F.l1_loss(y_pred, y_true, reduction = "mean") + F.mse_loss(y_pred, y_true, reduction = "mean")

optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [161]:
# Training loop
num_epochs = 5

loss = 0
for epoch in tqdm.tqdm(range(0, num_epochs)):
    history = []
    for frames in tqdm.tqdm(train_data):
        X, y = split_x_y(frames)
        
        optimizer.zero_grad()
        
        pred = model(X.float()).double()
        
        loss = compute_loss(pred, y)
        loss.backward()
        optimizer.step()

history.append(loss.numpy().mean())
plt.plot(history)


  0%|          | 0/5 [00:00<?, ?it/s][A

  0%|          | 0/30738 [00:00<?, ?it/s][A[A

  0%|          | 1/30738 [00:00<7:55:28,  1.08it/s][A[A

  0%|          | 2/30738 [00:01<8:05:32,  1.06it/s][A[A

  0%|          | 3/30738 [00:02<7:34:39,  1.13it/s][A[A

  0%|          | 4/30738 [00:03<7:14:15,  1.18it/s][A[A

  0%|          | 5/30738 [00:04<7:02:47,  1.21it/s][A[A

  0%|          | 6/30738 [00:04<6:56:08,  1.23it/s][A[A

  0%|          | 7/30738 [00:05<6:46:27,  1.26it/s][A[A

  0%|          | 8/30738 [00:06<7:02:12,  1.21it/s][A[A

  0%|          | 9/30738 [00:07<7:10:54,  1.19it/s][A[A

  0%|          | 10/30738 [00:08<7:09:06,  1.19it/s][A[A

  0%|          | 11/30738 [00:09<7:01:28,  1.22it/s][A[A

  0%|          | 12/30738 [00:09<7:05:24,  1.20it/s][A[A

  0%|          | 13/30738 [00:10<7:14:26,  1.18it/s][A[A

  0%|          | 14/30738 [00:11<7:27:29,  1.14it/s][A[A

  0%|          | 15/30738 [00:12<7:14:11,  1.18it/s][A[A

  0%|          | 

KeyboardInterrupt: 