# Imports

In [1]:
# %%capture
!pip install uszipcode
!pip install surprise
!pip install catboost

Collecting uszipcode
  Downloading uszipcode-1.0.1-py2.py3-none-any.whl (35 kB)
Collecting pathlib-mate (from uszipcode)
  Downloading pathlib_mate-1.2.1-py2.py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.5/121.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting atomicwrites (from uszipcode)
  Downloading atomicwrites-1.4.1.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fuzzywuzzy (from uszipcode)
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Collecting haversine>=2.5.0 (from uszipcode)
  Downloading haversine-2.8.0-py2.py3-none-any.whl (7.7 kB)
Collecting sqlalchemy-mate>=1.4.28.3 (from uszipcode)
  Downloading sqlalchemy_mate-1.4.28.4-py2.py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting SQLAlchemy>=1.4.0 (from uszipcode)
  Downloading SQLAlchemy-1.4.50-cp310-cp310-manylinux

In [2]:
# %%capture
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

--2023-12-03 15:33:34--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-12-03 15:33:37 (2.43 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base  

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVDpp, SVD, NMF
from surprise.model_selection import cross_validate
import plotly.express as px
from catboost import CatBoostClassifier, Pool
from itertools import product
from uszipcode import SearchEngine
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np



# Load data

In [4]:
with open('ml-100k/u.info', 'r') as f:
    print(f.read())

ratings = pd.read_csv('ml-100k/u.data', sep='\t', header=None)
ratings.columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings

943 users
1682 items
100000 ratings



Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [5]:
genres = pd.read_csv('ml-100k/u.genre', sep="|", header=None)
genres

Unnamed: 0,0,1
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [6]:
items = pd.read_csv('ml-100k/u.item', sep='|', header=None, encoding='latin-1')
items.columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'url'] + genres.iloc[:, 0].tolist()
items

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
occupations = pd.read_csv("ml-100k/u.occupation", sep="|", header=None)
occupations

Unnamed: 0,0
0,administrator
1,artist
2,doctor
3,educator
4,engineer
5,entertainment
6,executive
7,healthcare
8,homemaker
9,lawyer


In [8]:
users = pd.read_csv("ml-100k/u.user", sep="|", header=None)
users.columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [9]:
# Load movie ratings
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u1.base', sep='\t', names=ratings_columns, encoding='latin-1')

# Load movie titles and genres
movies_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies_ = pd.read_csv('ml-100k/u.item', sep='|', names=movies_columns, encoding='latin-1')#, usecols=['movie_id', 'title'])


In [10]:
# Extracting movie genres
genre_columns = movies_columns[5:]
movies_genres = pd.read_csv('ml-100k/u.item', sep='|', names=movies_columns, encoding='latin-1', usecols=genre_columns)

# Convert genres to strings for TF-IDF
movies_genres['genres'] = movies_genres.apply(lambda row: ' '.join([genre_columns[i-5] for i, val in enumerate(row) if val == 1]), axis=1)
movies = movies_[['movie_id', 'title']].copy()
movies['genres'] = movies_genres['genres']


# EDA

First, let's look at the distribution of genres among the movies. It's clear that it is dominated by "Drama" and "Comedy" classes.

In [47]:
genres_count = movies_.iloc[:, 5:].sum(0)
px.bar(x=genres_count.index, y=genres_count.values, title="Genre counts").show()

Regarding the distribution of ratings, people are much more inclined to give ratings 3 and 4 to the movies. If using a classification model, I'd rather assign different weights to the classes

In [49]:
ratings_counts = ratings.groupby('rating').count().user_id
px.bar(x=ratings_counts.index, y=ratings_counts.values, title="Rating counts").show()

This is the distribution of age of the users. Mostly young people rate the movies (or watch movies in the first place)

In [50]:
age_counts = users.groupby('age').count().user_id
px.bar(x=age_counts.index, y=age_counts.values, title="User Ages").show()

Both number of ratings per user and per movie show a heavy tailed distribution with most of the values being close to 0. This may cause problems in predicting ratings for infrequent movies or users. Almost a cold-start problem.

In [45]:
px.histogram(ratings.groupby('user_id').count()['movie_id'].values, title="Number of ratings per user").show()


In [46]:
px.histogram(ratings.groupby('movie_id').count()['user_id'].values, title="Number of ratings per movie").show()

# Metrics code

I use three metrics:
* Hit Rate, which is the proportion of users for whom the recommendation list contains at least one liked movie.
* Average Reciprocal Hit Rank (ARHR), which is the average of the reciprocal ranks of the hit items.
* Normalized Discounted Cumulative Gain (NDCG), which provides a normalized score reflecting the quality of the recommendation list ordering.


In [14]:
def hit_rate(recommended_movies, liked_movies):
    hits = sum([1 for recs, likes in zip(recommended_movies, liked_movies) if len(set(recs) & set(likes)) > 0])
    return hits / len(recommended_movies)

def arhr(recommended_movies, liked_movies):
    reciprocal_ranks = []
    for recs, likes in zip(recommended_movies, liked_movies):
        hit_ranks = [1 / (recs.index(like) + 1) for like in likes if like in recs]
        if hit_ranks:
            reciprocal_ranks.append(sum(hit_ranks) / len(hit_ranks))
    return sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0

def ndcg(recommended_movies, liked_movies):
    import numpy as np
    DCGs = []
    for recs, likes in zip(recommended_movies, liked_movies):
        DCG = sum([1 / np.log2(recs.index(like) + 2) for like in likes if like in recs])
        IDCG = sum([1 / np.log2(i + 2) for i in range(len(likes))])
        DCGs.append(DCG / IDCG if IDCG > 0 else 0)
    return np.mean(DCGs)


# Collaborative Filtering

In [16]:
# Collaborative Filtering using SVD
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)
svd = SVD(n_epochs=30)
_ = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9483  0.9534  0.9520  0.9492  0.9509  0.9508  0.0018  
MAE (testset)     0.7476  0.7482  0.7502  0.7497  0.7489  0.7489  0.0009  
Fit time          1.28    2.56    2.62    4.72    2.71    2.78    1.10    
Test time         0.21    0.15    0.20    0.64    0.13    0.27    0.19    


# Hybrid Approach

In [15]:
# Compute TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [17]:
def hybrid_rating(row, top_user_ratings):
    user_id, movie_id = row.user_id, row.movie_id
    # Predict rating for the specific movie using collaborative filtering
    estimated_rating = svd.predict(user_id, movie_id).est

    # Get the indices of the top movies and the specific movie in the similarity matrix
    top_movie_indices = (top_user_ratings[row.user_id] - 1).tolist()
    movie_idx = movie_id - 1

    # Use NumPy to vectorize the calculation of similarities
    sim_scores = cosine_sim[movie_idx, top_movie_indices]

    # Calculate the adjusted rating
    adjusted_ratings = (estimated_rating + sim_scores) / 2

    # Calculate the final adjusted rating as the mean of adjusted ratings
    final_rating = np.mean(adjusted_ratings) if adjusted_ratings.size > 0 else estimated_rating

    return final_rating


# Context Model
That will be a context-awara recommendation model. For that I need to combine all the features for users and movies, so I recreate the data.

In [18]:
def load_ratings(path):
    ratings = pd.read_csv(path, sep='\t', header=None)
    ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = ratings.join(users.set_index('user_id'), on='user_id', lsuffix='l', rsuffix='r')
    ratings = ratings.join(items.set_index('movie_id'), on='movie_id', lsuffix='l', rsuffix='r')
    ratings = ratings.drop(columns=['movie_title', 'release_date', 'video_release_date', 'url'])

    return ratings

In [19]:
ratings_train = load_ratings('ml-100k/u1.base')
ratings_test = load_ratings('ml-100k/u1.test')
ratings_train

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,zip_code,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,5,874965758,24,M,technician,85711,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,3,876893171,24,M,technician,85711,0,1,...,0,0,0,0,0,0,0,1,0,0
2,1,3,4,878542960,24,M,technician,85711,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,4,3,876893119,24,M,technician,85711,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,5,3,889751712,24,M,technician,85711,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,943,1067,2,875501756,22,M,student,77841,0,0,...,0,0,0,0,0,0,0,0,0,0
79996,943,1074,4,888640250,22,M,student,77841,0,0,...,0,0,0,0,0,0,0,0,0,0
79997,943,1188,3,888640250,22,M,student,77841,0,1,...,0,0,0,0,0,0,0,0,0,1
79998,943,1228,3,888640275,22,M,student,77841,0,1,...,0,0,0,0,0,0,0,0,0,0


Convert zip_code information into the state.

In [20]:
def zipcode_to_state(df):
    search = SearchEngine()

    # Preload zip code information
    zip_codes = df['zip_code'].unique()
    zip_info = {zc: search.by_zipcode(zc).state for zc in zip_codes if search.by_zipcode(zc) is not None}

    # Create a DataFrame for zip code to state mapping
    zip_info_df = pd.DataFrame(list(zip_info.items()), columns=['zip_code', 'state'])

    # Merge the original DataFrame with the zip info DataFrame
    df = pd.merge(df, zip_info_df, on='zip_code', how='left')

    # Replace NaN states with 'Unknown'
    df['state'] = df['state'].fillna('Unknown')

    # Optionally, drop the zip code column if no longer needed
    df = df.drop(columns=['zip_code'])

    return df

In [21]:
cat_features = [0, 1, 3, 4, 5, 6]

pp_train = ratings_train.iloc[:, :8].drop(columns=['timestamp'])
pp_train['genre'] = ratings_train.iloc[:, 8:].values.argmax(1)
pp_train = zipcode_to_state(pp_train)

pool_train = Pool(pp_train.drop(columns=['rating']), label=pp_train['rating'], cat_features=cat_features)

pp_test = ratings_test.iloc[:, :8].drop(columns=['timestamp'])
pp_test['genre'] = ratings_test.iloc[:, 8:].values.argmax(1)
pp_test = zipcode_to_state(pp_test)
pool_test = Pool(pp_test.drop(columns=['rating']), label=pp_test['rating'], cat_features=cat_features)

Download /root/.uszipcode/simple_db.sqlite from https://github.com/MacHu-GWU/uszipcode-project/releases/download/1.0.1.db/simple_db.sqlite ...
  1.00 MB downloaded ...
  2.00 MB downloaded ...
  3.00 MB downloaded ...
  4.00 MB downloaded ...
  5.00 MB downloaded ...
  6.00 MB downloaded ...
  7.00 MB downloaded ...
  8.00 MB downloaded ...
  9.00 MB downloaded ...
  10.00 MB downloaded ...
  11.00 MB downloaded ...
  Complete!


In [22]:
classes = np.unique(pp_train.rating)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=pp_train.rating)
class_weights = dict(zip(classes, weights))

In [34]:

cb_reg = CatBoostClassifier(iterations=13000,
                            task_type='GPU',
                            random_seed=13,
                            verbose=200,
                            eval_metric="AUC",
                            early_stopping_rounds=2000,
                            class_weights=class_weights
                            )

cb_reg.fit(pool_train, eval_set=pool_test)
pred = cb_reg.predict(pool_test)

Learning rate set to 0.053364


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7039005	best: 0.7039005 (0)	total: 51.7ms	remaining: 11m 12s
200:	test: 0.7727705	best: 0.7727958 (198)	total: 6.15s	remaining: 6m 31s
400:	test: 0.7767747	best: 0.7767747 (400)	total: 12.3s	remaining: 6m 27s
600:	test: 0.7791045	best: 0.7791045 (600)	total: 16.9s	remaining: 5m 48s
800:	test: 0.7802334	best: 0.7802798 (792)	total: 21.4s	remaining: 5m 26s
1000:	test: 0.7811499	best: 0.7811628 (991)	total: 28.2s	remaining: 5m 38s
1200:	test: 0.7817143	best: 0.7817206 (1199)	total: 31.6s	remaining: 5m 10s
1400:	test: 0.7823300	best: 0.7823614 (1388)	total: 35.9s	remaining: 4m 56s
1600:	test: 0.7825227	best: 0.7825227 (1600)	total: 42.9s	remaining: 5m 5s
1800:	test: 0.7825079	best: 0.7825701 (1720)	total: 46.5s	remaining: 4m 49s
2000:	test: 0.7827378	best: 0.7828177 (1991)	total: 49.9s	remaining: 4m 34s
2200:	test: 0.7826568	best: 0.7828177 (1991)	total: 57s	remaining: 4m 39s
2400:	test: 0.7826144	best: 0.7828177 (1991)	total: 1m 1s	remaining: 4m 29s
2600:	test: 0.7825219	best: 

Classification performance is far from perfect. Most probably the existing features do not carry enough information to make good predictions.

In [35]:
print(classification_report(y_true=pp_test.rating, y_pred=pred))

              precision    recall  f1-score   support

           1       0.28      0.55      0.37      1391
           2       0.24      0.30      0.26      2192
           3       0.39      0.26      0.31      5182
           4       0.46      0.32      0.38      6778
           5       0.44      0.63      0.52      4457

    accuracy                           0.39     20000
   macro avg       0.36      0.41      0.37     20000
weighted avg       0.40      0.39      0.38     20000



Here is the code for calculation of the ranking metrics.

In [36]:

def generate_recommendations(df, top_n=10):
    # Sort the predictions for each user and select the top N movies
    top_recommendations = df.sort_values(['user_id', 'predicted_rating'], ascending=[True, False]).groupby('user_id').head(top_n)

    # Create a DataFrame of recommendations
    recommendations = top_recommendations.groupby('user_id')['movie_id'].apply(list).reset_index(name='recommended_movies')

    return recommendations


In [37]:
def calc_metrics(true_df, pred_df, top_n=10):
    recommendations = generate_recommendations(pred_df, top_n=top_n)
    # Filter the ratings DataFrame to include only high ratings (e.g., ratings of 4 or higher)
    high_ratings = true_df[true_df['rating'] >= 4]

    # Create a list of liked movies for each user based on high ratings
    liked_movies_per_user = high_ratings.groupby('user_id')['movie_id'].apply(list)

    # Filter recommendations and liked_movies lists to include only users present in both DataFrames
    common_users = set(recommendations['user_id']) & set(liked_movies_per_user.index)
    filtered_recommendations = recommendations[recommendations['user_id'].isin(common_users)]['recommended_movies'].tolist()
    filtered_liked_movies = liked_movies_per_user[liked_movies_per_user.index.isin(common_users)].tolist()

    # Calculate metrics
    hit_rate_score = hit_rate(filtered_recommendations, filtered_liked_movies)
    arhr_score = arhr(filtered_recommendations, filtered_liked_movies)
    ndcg_score = ndcg(filtered_recommendations, filtered_liked_movies)

    return hit_rate_score, arhr_score, ndcg_score


All user-movies pairs except the ones that are present in the train dataset

In [38]:
def get_pred_dataset(exclude_df):
    unique_users = users['user_id'].unique()
    unique_movies = items['movie_id'].unique()

    # Create all possible combinations of users and movies
    all_combinations = pd.DataFrame(product(unique_users, unique_movies), columns=['user_id', 'movie_id'])

    # Exclude combinations that are already in the original DataFrame
    combined_df = pd.merge(all_combinations, exclude_df, on=['user_id', 'movie_id'], how='left', indicator=True)
    all_pairs = combined_df[combined_df['_merge'] == 'left_only'].drop(columns=['_merge'])

    # Reset index if desired
    all_pairs.reset_index(drop=True, inplace=True)

    # Build the dataframe with all the features
    all_pairs = all_pairs.join(users.set_index('user_id'), on='user_id', lsuffix='l', rsuffix='r')
    all_pairs = all_pairs.join(items.set_index('movie_id'), on='movie_id', lsuffix='l', rsuffix='r')
    all_pairs = all_pairs.drop(columns=['movie_title', 'release_date', 'video_release_date', 'url'])

    all_pairs_ = all_pairs.iloc[:, :6]
    all_pairs_['genre'] = all_pairs.iloc[:, 6:].values.argmax(1)
    all_pairs = zipcode_to_state(all_pairs_)
    return all_pairs

Create a dataset which consists of all user-movie pairs that are not present in the train dataset, i.e. the ones that users didn't rate yet.

In [39]:
pred_df = get_pred_dataset(pp_train[['user_id', 'movie_id']])

Random Model Performance as a reference.

In [53]:
pred = cb_reg.predict(pred_df)
pred = np.random.permutation(pred)

inp = pred_df[['user_id', 'movie_id']].copy()
inp['predicted_rating'] = pred

hit_rate_score, arhr_score, ndcg_score = calc_metrics(true_df=pp_test, pred_df=inp)

print(f'Hit Rate: {hit_rate_score}')
print(f'ARHR: {arhr_score}')
print(f'NDCG: {ndcg_score}')

Hit Rate: 0.32894736842105265
ARHR: 0.3600443121693123
NDCG: 0.027972200882412274


CatBoost Performance

In [51]:
pred = cb_reg.predict(pred_df)

inp = pred_df[['user_id', 'movie_id']].copy()
inp['predicted_rating'] = pred

hit_rate_score, arhr_score, ndcg_score = calc_metrics(true_df=pp_test, pred_df=inp)

print(f'Hit Rate: {hit_rate_score}')
print(f'ARHR: {arhr_score}')
print(f'NDCG: {ndcg_score}')

Hit Rate: 0.5131578947368421
ARHR: 0.33590565531372407
NDCG: 0.0609500795007173


Collaborative Filtering Performance

In [30]:
def get_svd_rating(row):
    return svd.predict(row.user_id, row.movie_id).est

pred = pred_df[['user_id', 'movie_id']].apply(get_svd_rating, axis=1)

inp = pred_df[['user_id', 'movie_id']].copy()
inp['predicted_rating'] = pred

hit_rate_score, arhr_score, ndcg_score = calc_metrics(true_df=pp_test, pred_df=inp)

print(f'Hit Rate: {hit_rate_score}')
print(f'ARHR: {arhr_score}')
print(f'NDCG: {ndcg_score}')

Hit Rate: 0.5307017543859649
ARHR: 0.34462735737086386
NDCG: 0.06555602033913271


Hybrid Approach Performance

In [31]:
def get_user_ratings(df):
    top_user_ratings_dict = {}
    for user_id in users.user_id.unique():
        top_user_ratings_dict[user_id] = df[(df['user_id'] == 1) & (df['rating'] >= 4)].nlargest(5, 'rating')['movie_id'].values

    return top_user_ratings_dict

In [32]:
user_ratings = get_user_ratings(ratings_train)
pred = pred_df[['user_id', 'movie_id']].apply(lambda row: hybrid_rating(row, user_ratings), axis=1)

inp = pred_df[['user_id', 'movie_id']].copy()
inp['predicted_rating'] = pred

hit_rate_score, arhr_score, ndcg_score = calc_metrics(true_df=pp_test, pred_df=inp)

print(f'Hit Rate: {hit_rate_score}')
print(f'ARHR: {arhr_score}')
print(f'NDCG: {ndcg_score}')

Hit Rate: 0.4473684210526316
ARHR: 0.30816426107480044
NDCG: 0.04722512800731783
