# 0. Configuration

In [14]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [18]:
pip install torch

You should consider upgrading via the '/Users/vydolga/.pyenv/versions/3.9.16/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [19]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import random
import numpy as np
import pandas as pd
import datetime as dt
from itertools import permutations

import torch
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


RANDOM_STATE = 42

## 1.1. Helper functions to avoid copy paste

In [20]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. RankNet

In [22]:
class RankNet(torch.nn.Module):
    def __init__(self, input_features_len, hidden_dim = 10):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_features_len, self.hidden_dim), # y = ax + b,
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_dim, 1),
        )
        
        self.out_activation = torch.nn.Sigmoid() # this transformation equiavalent to RankNet formula

    def forward(self, input_1, input_2):
        logits_1 = self.predict(input_1)
        logits_2 = self.predict(input_2)
        
        logits_diff = logits_1 - logits_2
        out = self.out_activation(logits_diff)

        return out
    
    def predict(self, inp):
        logits = self.model(inp)
        return logits

In [23]:
model = RankNet(input_features_len = 8)
model

RankNet(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=1, bias=True)
  )
  (out_activation): Sigmoid()
)

In [24]:
input_1, input_2 = torch.rand(4, 8), torch.rand(4, 8)
input_2

tensor([[0.1310, 0.8451, 0.0799, 0.3155, 0.0753, 0.2602, 0.3703, 0.0384],
        [0.6173, 0.2288, 0.8854, 0.8426, 0.4506, 0.1639, 0.2717, 0.8266],
        [0.4756, 0.0495, 0.0161, 0.7735, 0.8990, 0.1443, 0.0507, 0.2109],
        [0.0964, 0.2692, 0.0692, 0.5755, 0.0064, 0.8514, 0.1110, 0.5629]])

In [25]:
input_1

tensor([[0.1980, 0.3280, 0.8186, 0.8735, 0.7454, 0.1899, 0.2215, 0.9735],
        [0.5374, 0.0418, 0.4625, 0.2301, 0.6139, 0.8527, 0.0850, 0.6337],
        [0.3207, 0.2482, 0.7852, 0.5514, 0.9819, 0.6047, 0.5959, 0.2735],
        [0.7932, 0.0390, 0.5031, 0.3495, 0.9068, 0.8789, 0.7720, 0.1024]])

$$P_{ij} \equiv P(U_{i}>U_{j}) \equiv \frac{1}{1 + \exp^{-\sigma(s_{i} - s{j})}}$$


In [26]:
preds = torch.sort(model(input_1, input_2), descending = True, dim = 0)
preds[0]

tensor([[0.5027],
        [0.4932],
        [0.4852],
        [0.4694]], grad_fn=<SortBackward0>)

## 2.2. ListNet

In [27]:
movies_to_rank = {'The Godfather', 'Avatar', 'Ozark'}
permutations_list = list(permutations(movies_to_rank))

for i in permutations_list:
    print(i)

('Ozark', 'The Godfather', 'Avatar')
('Ozark', 'Avatar', 'The Godfather')
('The Godfather', 'Ozark', 'Avatar')
('The Godfather', 'Avatar', 'Ozark')
('Avatar', 'Ozark', 'The Godfather')
('Avatar', 'The Godfather', 'Ozark')


In [28]:
pi = random.choice(permutations_list)
print(pi)

('Avatar', 'Ozark', 'The Godfather')


In [29]:
np.random.seed(RANDOM_STATE)
scores_dict = {x: np.random.randn(1)[0] for x in movies_to_rank}  
print(scores_dict)

# unpack pi and assign movies to scores
score_movie_pos_1, score_movie_pos_2, score_movie_pos_3 = scores_dict[pi[0]], scores_dict[pi[1]], scores_dict[pi[2]]


{'Ozark': 0.4967141530112327, 'The Godfather': -0.13826430117118466, 'Avatar': 0.6476885381006925}


In [30]:
first_term = np.exp(score_movie_pos_1) / (np.exp(score_movie_pos_1) + np.exp(score_movie_pos_2)\
                                         + np.exp(score_movie_pos_3))

second_term = np.exp(score_movie_pos_2) / (np.exp(score_movie_pos_2) + np.exp(score_movie_pos_3))

third_term = np.exp(score_movie_pos_3) / np.exp(score_movie_pos_3)

print(f'First term is: {first_term}')
print(f'Second term is: {second_term}')
print(f'Third term is: {third_term}')

First term is: 0.4318619033836114
Second term is: 0.6536174529063914
Third term is: 1.0


$P_{s}(<The Godfather, Avatar, Ozark>) = \prod^3_{j = 1} \frac {\phi(s_{\pi(j)})} {\sum^3_{k = j} \phi(s_{\pi(k)})}$ which is equal to

In [31]:
permutation_proba = first_term * second_term * third_term

print(f'Permutation probability is: {permutation_proba}')


Permutation probability is: 0.2822724772969022


## 2.3. CatBoost Ranker

### 2.3.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [32]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [33]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [34]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [35]:
movies_metadata.rename(columns = {'id': 'movieId'}, inplace = True)

In [36]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['movieId'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


### 2.3.2 Data Preparation

In [37]:
TEST_SIZE = .25

In [38]:
# convert timestamp to date
interactions_filtered['dttm'] = interactions_filtered['timestamp']\
                                .apply(lambda x: pd.to_datetime(dt.datetime.fromtimestamp(x).strftime('%Y-%m-%d')))

Let's calculate some basic features, but keep in mind that our data of historical ratings depends on time.
We need to avoid data leak -- use future values in past data

In [39]:
ITEM_FEATURES_LIST = ['revenue', 'budget', 'runtime']

In [40]:
# calculate avg ratings by users and items daily
daily_users_feature = interactions_filtered.groupby(['userId', 'dttm']).agg({'rating': 'mean',
                                              'movieId': 'count'})\
                                  .reset_index().sort_values(['userId', 'dttm'])\
                                  .rename(columns = {'rating': 'user_mean_rating',
                                                     'movieId': 'user_watch_count'})


daily_users_feature['dttm'] = daily_users_feature['dttm'].apply(lambda x: x + dt.timedelta(days = 1))
daily_users_feature.loc[daily_users_feature['userId'] == 3]

Unnamed: 0,userId,dttm,user_mean_rating,user_watch_count
2,3,2011-03-01,3.541667,24
3,3,2011-03-02,3.5,1


In [41]:
interactions_filtered.loc[interactions_filtered['userId'] == 3]

Unnamed: 0,userId,movieId,rating,timestamp,dttm
97,3,110,4.0,1298922049,2011-02-28
98,3,247,3.5,1298861637,2011-02-28
99,3,267,3.0,1298861761,2011-02-28
100,3,296,4.5,1298862418,2011-02-28
101,3,318,5.0,1298862121,2011-02-28
104,3,377,2.5,1298923242,2011-02-28
105,3,527,3.0,1298862528,2011-02-28
106,3,588,3.0,1298922100,2011-02-28
107,3,592,3.0,1298923247,2011-02-28
108,3,593,3.0,1298921840,2011-02-28


In [42]:
cumulative_total_cnt = daily_users_feature.set_index('dttm').groupby(['userId'])['user_watch_count']\
                        .rolling(window = 3, min_periods = 1).sum()\
                        .reset_index()[['userId', 'dttm', 'user_watch_count']]\
                        .rename(columns = {'user_watch_count': 'user_total_watch_count_last_3_days'})

In [43]:
# merge item features
main_df = pd.merge(
    interactions_filtered, movies_metadata[['movieId']+ ITEM_FEATURES_LIST],
    how = 'left', on = 'movieId'
                   ).drop_duplicates().reset_index(drop = True)
assert main_df.shape[0] == interactions_filtered.shape[0]

In [44]:
main_df = main_df.sort_values('dttm').reset_index(drop = True)
daily_users_feature = daily_users_feature.sort_values('dttm').reset_index(drop = True)
cumulative_total_cnt = cumulative_total_cnt.sort_values('dttm').reset_index(drop = True)

In [45]:
# merge user features with watch count
main_df = pd.merge_asof(
    main_df, daily_users_feature,
    on = 'dttm', by = 'userId',
    direction = 'backward',
    allow_exact_matches = True
    )
assert main_df.shape[0] == interactions_filtered.shape[0]

In [46]:
main_df = pd.merge_asof(
    main_df, cumulative_total_cnt,
    on = 'dttm', by = 'userId',
    direction = 'backward',
    allow_exact_matches = True
    )
assert main_df.shape[0] == interactions_filtered.shape[0]

In [47]:
# tmp  = main_df.loc[main_df['userId'] == 671][['userId', 'dttm']]
# pd.merge_asof(
#     tmp.sort_values('dttm'), cumulative_total_cnt.sort_values('dttm'),
#     on = 'dttm', by = 'userId', direction = 'backward',
#     allow_exact_matches = True).sort_values('dttm')

In [48]:
# anyway we left some NaN
main_df.isnull().sum() / len(main_df) 

userId                                0.000000
movieId                               0.000000
rating                                0.000000
timestamp                             0.000000
dttm                                  0.000000
revenue                               0.000000
budget                                0.000000
runtime                               0.000089
user_mean_rating                      0.612327
user_watch_count                      0.612327
user_total_watch_count_last_3_days    0.612327
dtype: float64

In [49]:
FINAL_FEATURES_LIST = ['revenue', 'budget', 'runtime', 'user_mean_rating',
                       'user_watch_count', 'user_total_watch_count_last_3_days']

In [50]:
ID_COLS = ['userId', 'movieId']

In [51]:
TARGET = 'rating'

In [52]:
X = main_df[ID_COLS + FINAL_FEATURES_LIST]
y = main_df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = TEST_SIZE,
    random_state = RANDOM_STATE)

print(f'Shape of train set X, y: {X_train.shape}, {len(y_train)}')
print(f'Shape of train set X, y: {X_test.shape}, {len(y_test)}')

Shape of train set X, y: (33741, 8), 33741
Shape of train set X, y: (11248, 8), 11248


### 2.3.3. Train Model

In [53]:
# init model
model = CatBoostRegressor(
    loss_function = 'MAE',
    iterations = 2000,
    learning_rate = .1,
    depth = 6,
    verbose = False
)

In [54]:
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds = 20 # to avoid overfitting,
)

<catboost.core.CatBoostRegressor at 0x15aabb310>

In [55]:
model.best_score_

{'learn': {'MAE': 0.6618374739119923},
 'validation': {'MAE': 0.721013774188702}}

# TODO
- Add baseline comparison from the model (well, we discussed what is baseline for MAE metric -- now, you have to define, how you are going to calculate it)

In [58]:
baseline = [y_test.median() for i in range(len(y_test))]
pred = model.predict(X_test)

mae_baseline = mean_absolute_error(y_test, baseline)
mae_model = mean_absolute_error(y_test, pred)

print(f'Baseline MAE: {mae_baseline}')
print(f'Model MAE: {mae_model}')


Baseline MAE: 0.8380156472261735
Model MAE: 0.7210147741887016
