# 0. Configuration

In [41]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [42]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import numpy as np
import pandas as pd
import scipy.sparse as sp

from itertools import islice, cycle, product

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')


## 1. 1. Helper functions to avoid copy paste

In [43]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [44]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [45]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [46]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [47]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['id'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


## 2.2 Data preparation using LightFM Dataset

To use implicit kNN method `fit` we need a sparse matrix in COOrdinate format. To achieve that we will use `scipy.sparse.coo_matrix` from scipy;


In [48]:
def get_coo_matrix(
        df: pd.DataFrame, 
        user_col: str,
        item_col: str, 
        users_mapping: dict, 
        movies_mapping: dict,
        weight_col: str = None
        ):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)
    interaction_matrix = sp.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(movies_mapping.get)
        )
    ))
    return interaction_matrix


In [49]:
# define users mapping
users_inv_mapping = dict(enumerate(interactions_filtered['userId'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
len(users_mapping)


671

In [50]:
# define movies mapping
movies_inv_mapping = dict(enumerate(interactions_filtered['movieId'].unique()))
movies_mapping = {v: k for k, v in movies_inv_mapping.items()}
len(movies_mapping)


2830

In [51]:
# defining train set on the whole interactions dataset (as HW you will have to split into test and train for evaluation)
train_mat = get_coo_matrix(
    interactions_filtered,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()


In [52]:
from implicit import evaluation
train, test = implicit.evaluation.train_test_split(train_mat)


In [53]:
train

<671x2830 sparse matrix of type '<class 'numpy.float32'>'
	with 35976 stored elements in Compressed Sparse Row format>

## 2.3. Model Training & Evaluation

In [`implicit`](https://pypi.org/project/implicit/), there are various models and can be groupped into:
- Item-to-Item: KNN based on various similarities - CosineRecommender, BM25Recommender, TFIDFRecommender
- implicit ALS;
- Logistic Matrix Factorization;
- Bayesian Personalized Ranking (BPR)


### 2.3.1. Train Model

In [54]:
from implicit.nearest_neighbours import (
    CosineRecommender,
    BM25Recommender,
    TFIDFRecommender
    )


Note that in item-to-item models we need to provide matrix in the form of item-user by transposing initial COO matrix user-item


In [55]:
# fit the model
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat.T)


100%|██████████| 671/671 [00:00<00:00, 55806.51it/s]


### 2.3.2. Evaluate the Model

In [56]:
# let's make sense-check
top_N = 10
user_id = interactions_filtered['userId'].iloc[0]
row_id = users_mapping[user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 1, row number in matrix - 0


In [57]:
# create mapper for movieId and title names
movie_name_mapper = dict(zip(movies_metadata['id'], movies_metadata['original_title']))

In [58]:
recs = cosine_model.recommend(
    row_id,
    train_mat,
    N = top_N,
    filter_already_liked_items = True
    )
recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recs['inv_movie_id'] = recs['col_id'].astype(int)
recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
recs['title'] = recs['movieId'].map(movie_name_mapper)
recs


Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,653.0,0.861587,653,74458,Mere Brother Ki Dulhan
1,129.0,0.844531,129,1994,The Most Dangerous Game
2,606.0,0.654064,606,8011,Highlander III: The Sorcerer
3,294.0,0.625141,294,70,Million Dollar Baby
4,337.0,0.593856,337,170,28 Days Later
5,648.0,0.577499,648,68954,Longitude
6,579.0,0.571681,579,5956,Joshua
7,399.0,0.561442,399,1088,Whale Rider
8,278.0,0.561442,278,1584,School of Rock
9,150.0,0.557086,150,2100,The Last Castle


# TODO
- Make global train/ global test split -- train the model appropiately and predict on test set;
- Wrap up in function recommendations - lfm_recommend();
- Calculate `NDCG@10` on test set

In [59]:
interactions_filtered["date"] = pd.to_datetime(interactions_filtered["timestamp"], unit="s")
interactions_filtered["date"] = interactions_filtered["date"].dt.strftime("%d-%m-%Y")
interactions_filtered.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,date
10,1,1371,2.5,1260759135,14-12-2009
11,1,1405,1.0,1260759203,14-12-2009
13,1,2105,4.0,1260759139,14-12-2009
15,1,2193,2.0,1260759198,14-12-2009
16,1,2294,2.0,1260759108,14-12-2009


In [60]:
def split(df: pd.DataFrame, train_size: float):
    '''
    df: pd.DataFrame that is needed to split;
    train_size: size of the train sample (from 0 to 1);
    column: column the sorting will be based on
    '''
    df = df.sort_values(by='date')
    bd = int(df.shape[0]*train_size)
    change = int(bd - 1)
    while df["date"].iloc[change] ==  df["date"].iloc[bd]:
        change-=1


        global_train = df.iloc[:change+1]
        global_test = df.iloc[change+1:]

    return global_train, global_test

In [71]:
global_train, global_test = split(interactions_filtered, 0.75)

23-03-2000 23-03-2005


Training

In [62]:
train_mat_global = get_coo_matrix(
    global_train,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()

In [63]:
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat_global.T)

100%|██████████| 671/671 [00:00<00:00, 71811.84it/s]


In [64]:
top_N = 10
user_id = global_train['userId'].iloc[0]
row_id = users_mapping[user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 224, row number in matrix - 223


In [65]:
test_mat_global = get_coo_matrix(
    global_test,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()

In [74]:
recomend = cosine_model.recommend(
    row_id,
    train_mat_global,
    N = top_N,
    filter_already_liked_items = True
    )
recomend = pd.DataFrame(recomend).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recomend['inv_movie_id'] = recomend['col_id'].astype(int)
recomend['movieId'] = recomend['inv_movie_id'].map(movies_inv_mapping.get)
recomend['title'] = recomend['movieId'].map(movie_name_mapper)
recomend

Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,129.0,0.832814,129,1994,The Most Dangerous Game
1,653.0,0.802223,653,74458,Mere Brother Ki Dulhan
2,267.0,0.604371,267,4973,Sous le Sable
3,337.0,0.593856,337,170,28 Days Later
4,648.0,0.577499,648,68954,Longitude
5,399.0,0.561442,399,1088,Whale Rider
6,150.0,0.557086,150,2100,The Last Castle
7,368.0,0.546672,368,549,Basquiat
8,384.0,0.546606,384,804,Roman Holiday
9,160.0,0.529829,160,2144,One Night at McCool's


In [82]:
def lfm_recommend(file,
                  row_id,
                #   test,
                  N,
                  filter_already_liked_items) -> pd.DataFrame:
    
    recomend = cosine_model.recommend(
        row_id,
        file,
        N = N,
        filter_already_liked_items = True
        )
    recomend = pd.DataFrame(recomend).T.rename(columns = {0: 'col_id', 1: 'similarity'})
    recomend['inv_movie_id'] = recomend['col_id'].astype(int)
    recomend['movieId'] = recomend['inv_movie_id'].map(movies_inv_mapping.get)
    recomend['title'] = recomend['movieId'].map(movie_name_mapper)
    return recomend

In [87]:
a = lfm_recommend(train_mat_global, 6, 10, True)
a


Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,129.0,0.832814,129,1994,The Most Dangerous Game
1,653.0,0.802223,653,74458,Mere Brother Ki Dulhan
2,267.0,0.604371,267,4973,Sous le Sable
3,337.0,0.593856,337,170,28 Days Later
4,648.0,0.577499,648,68954,Longitude
5,399.0,0.561442,399,1088,Whale Rider
6,150.0,0.557086,150,2100,The Last Castle
7,368.0,0.546672,368,549,Basquiat
8,384.0,0.546606,384,804,Roman Holiday
9,160.0,0.529829,160,2144,One Night at McCool's


NDCG

In [88]:
def ndcg_at_k(scores, k):
    best_scores = sorted(scores, reverse=True)[:k]
    best_dcg = 0
    for i in range(len(best_scores)):
        best_dcg += (2 ** best_scores[i] - 1) / np.log2(i + 2)
        
    true_scores = scores[:k]
    true_dcg = 0
    for i in range(len(true_scores)):
        true_dcg += (2 ** true_scores[i] - 1) / np.log2(i + 2)
        
    if best_dcg == 0:
        ndcg = 0
    else:
        ndcg = true_dcg / best_dcg
        
    return ndcg

print(ndcg_at_k(recomend.similarity, 10))

1.0
