In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import scipy
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import surprise as sp
import time
import random

I need to bring back some of the code from the EDA notebook, because several variables defined there are needed for the preprocessing and modelling steps.

In [2]:
# Load the cleaned user profiles and the anime catalogue from their CSV files.
user_df, anime_df = (
    pd.read_csv(csv_path) for csv_path in ('users_cleaned.csv', 'anime_cleaned.csv')
)

In [3]:
# Only these four columns are used by the rest of the notebook, so skip parsing
# the others: the file holds tens of millions of rows and memory is the bottleneck.
score_df = pd.read_csv('animelists_cleaned.csv',
                       usecols=['username', 'anime_id', 'my_score', 'my_status'])

In [4]:
# Keep only the columns needed for modelling, in a fixed order.
score_columns = ['username', 'anime_id', 'my_score', 'my_status']
score_df = score_df.loc[:, score_columns]

In [5]:
# Summary statistics of the scores, rendered with two decimals for display.
score_summary = score_df['my_score'].describe()
score_summary.apply('{:.2f}'.format).reset_index()

Unnamed: 0,index,my_score
0,count,31284030.0
1,mean,4.65
2,std,3.93
3,min,0.0
4,25%,0.0
5,50%,6.0
6,75%,8.0
7,max,10.0


In [6]:
# Number of list entries per user, most active users first.
# Naming the index and the value column explicitly (instead of renaming the
# auto-generated "index"/"username" columns) gives the same
# ['username', 'animes_rated'] frame on every pandas version; pandas >= 2.0
# names the value_counts() output differently, which broke the old rename.
user_scores = (
    score_df['username']
    .value_counts()
    .rename_axis('username')
    .reset_index(name='animes_rated')
)

Before beginning the preprocessing and modelling, I need to define a function for my evaluation metrics. I am going with Precision@k and Recall@k, as these are very common metrics for recommender systems and they focus on how relevant the top-ranked recommendations are.

In [7]:
# Draw a 1% sample of the users with a fixed seed, then preview it.
sample_users = user_df.sample(frac=0.01, random_state=2)
sample_users.head(5)

Unnamed: 0,username,user_id,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,access_rank,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes
48731,Saperella,98375,21,162,0,4,28,34.55,Male,"York, UK",1989-08-31 00:00:00,,2008-09-25 00:00:00,2010-07-22 05:41:00,7.94,4.0,2131
35191,Inugirlz,207889,52,908,227,164,300,301.467361,Female,Where Maple Leaves Reside,1994-04-04 00:00:00,,2009-07-14 00:00:00,2018-05-19 14:23:41,7.16,58.0,18796
77699,Luu-Senpai,5393511,10,663,0,0,19,177.401111,Male,Buenos Aires,1991-05-23 00:00:00,,2016-05-07 00:00:00,2018-04-27 04:38:21,7.72,0.0,11129
95207,OzzyOtaku,607875,16,342,0,76,69,91.565278,Male,TEXAS,1989-01-24 00:00:00,,2011-09-19 00:00:00,2018-05-20 18:51:57,7.77,18.0,5987
85049,Ahlala,485718,1,67,29,8,32,37.024306,Female,Philippines,1991-04-10 00:00:00,,2011-05-12 00:00:00,2017-07-27 01:10:00,8.31,27.0,2251


In [8]:
# Attach the per-user rating counts to the sampled users (users present in both frames only).
user_score_sampled = user_scores.merge(sample_users, on='username', how='inner')

In [9]:
# How many sampled users share each "animes_rated" count, ordered by that count.
# The index and value column are named explicitly so the result has the columns
# ['animes_rated', 'group_size'] on every pandas version (pandas >= 2.0 changed
# the column names produced by value_counts().reset_index()).
aggregated_user_ratings = (
    user_score_sampled['animes_rated']
    .value_counts()
    .rename_axis('animes_rated')
    .reset_index(name='group_size')
    .sort_values(by=['animes_rated'])
)

In [10]:
# Number of users that have each anime in their list, most popular first.
# Explicit axis/column naming keeps the ['anime_id', 'number_of_users'] layout
# independent of the pandas version (the old "index" rename fails on pandas >= 2.0).
rating_anime = (
    score_df['anime_id']
    .value_counts()
    .rename_axis('anime_id')
    .reset_index(name='number_of_users')
)
rating_anime.head()

Unnamed: 0,anime_id,number_of_users
0,1535,81332
1,1575,67721
2,226,64486
3,5114,63649
4,4224,63278


In [11]:
# How many anime share each "number_of_users" count, ordered by that count.
# Explicit axis/column naming yields ['number_of_users', 'group_size'] on every
# pandas version (the old "index" rename fails on pandas >= 2.0).
aggregated_anime_ratings = (
    rating_anime['number_of_users']
    .value_counts()
    .rename_axis('number_of_users')
    .reset_index(name='group_size')
    .sort_values(by=['number_of_users'])
)
aggregated_anime_ratings.head(30)

Unnamed: 0,number_of_users,group_size
331,1,3
27,2,11
25,3,11
23,4,12
6,5,16
18,6,13
14,7,14
7,8,16
0,9,22
2,10,21


Before beginning the preprocessing and modelling, I need to define a function for my evaluation metrics. I am going with Precision@k and Recall@k, as these are very common metrics for recommender systems and they focus on how relevant the top-ranked recommendations are.



In [12]:
def precision_recall(predictions, k=10, threshold=7):
    """Return per-user Precision@k and Recall@k for a list of predictions.

    Each prediction is unpacked as a 5-tuple ``(uid, iid, true_r, est, details)``;
    only the user id, true rating and estimated rating are used.

    An item is *relevant* when its true rating is >= ``threshold``. It is
    *recommended* when it is among the user's ``k`` highest estimates and its
    estimate is >= ``threshold``.

    Returns:
        ``(precisions, recalls)`` - two dicts keyed by user id. Precision is
        hits / recommended and recall is hits / relevant. When the denominator
        is zero the value is set to 1.
    """
    ratings_by_user = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        ratings_by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in ratings_by_user.items():
        # Rank this user's items from highest to lowest estimated rating.
        pairs.sort(key=lambda pair: pair[0], reverse=True)
        top_k = pairs[:k]

        relevant_total = sum(1 for _, actual in pairs if actual >= threshold)
        recommended = sum(1 for predicted, _ in top_k if predicted >= threshold)
        hits = sum(
            1 for predicted, actual in top_k
            if actual >= threshold and predicted >= threshold
        )

        # An undefined ratio (zero denominator) is reported as 1.
        precisions[uid] = 1 if recommended == 0 else hits / recommended
        recalls[uid] = 1 if relevant_total == 0 else hits / relevant_total

    return precisions, recalls

I ran into a few issues. The first is the "cold start" problem: when a user (or an anime) has little or no rating history, there is nothing for the model to learn from. So I only make recommendations for users who have rated at least 10 anime, and I only keep anime that have been rated by at least 10 users.

In [13]:
# Cold-start filter: keep sampled users with at least 10 list entries and
# anime that appear in at least 10 users' lists.
has_enough_ratings = user_score_sampled['animes_rated'].ge(10)
has_enough_users = rating_anime['number_of_users'].ge(10)
user_rate_10 = user_score_sampled.loc[has_enough_ratings]
anime_rate_10 = rating_anime.loc[has_enough_users]

In [14]:
# Restrict the scores to the retained users, then to the retained anime.
score_df_cold_start = (
    score_df
    .merge(user_rate_10, on='username', how='inner')
    .merge(anime_rate_10, on='anime_id', how='inner')
)


Another problem I kept facing was that my dataframes were initially too large. I had to re-clean them and work with a much smaller sample.

In [15]:
# Compare the size of the full score table with the filtered sample
# (non-null usernames are counted in both).
initial_rows = score_df['username'].count()
sampled_rows = score_df_cold_start['username'].count()
print('The initial dataframe has {0} registers and the sampled one has {1} rows.'.format(
    initial_rows, sampled_rows))

The initial dataframe has 31283787 registers and the sampled one has 324206 rows.


# Training and PreProcessing

In [16]:
# Seed shared by the split and by the seeded algorithms below.
random_state = 42

# NOTE(review): describe() above shows the 25th percentile of my_score is 0.
# If 0 means "not scored" in this dataset, those rows should probably be
# dropped before building the Dataset - confirm.
rating_columns = ['username', 'anime_id', 'my_score']
reader = sp.Reader(rating_scale=(0, 10))
data = sp.Dataset.load_from_df(score_df_cold_start[rating_columns], reader)

# Hold out 25% of the ratings for evaluation.
trainset, testset = sp.model_selection.train_test_split(
    data, test_size=0.25, random_state=random_state)

# Filled with one result tuple per model by the training loop.
analysis = defaultdict(list)

# Model

Initially, I couldn't include any of the KNN models: the sheer size of the data, combined with my earlier code, caused performance problems that crashed the kernel. Reducing the size of the dataset by re-cleaning it and adjusting my earlier code allowed me to add these models back in.

In [17]:
# Candidate algorithms, keyed by display name. The same seed keyword is passed
# to every constructor that was given one originally.
seeded = {'random_state': random_state}
model = {
    'SVD': sp.SVD(**seeded),
    'SlopeOne': sp.SlopeOne(),
    'NMF': sp.NMF(**seeded),
    'NormalPredictor': sp.NormalPredictor(),
    'KNNBaseline': sp.KNNBaseline(**seeded),
    'KNNBasic': sp.KNNBasic(**seeded),
    'KNNWithMeans': sp.KNNWithMeans(**seeded),
    'KNNWithZScore': sp.KNNWithZScore(**seeded),
    'BaselineOnly': sp.BaselineOnly(),
    'CoClustering': sp.CoClustering(**seeded),
}


In [18]:
# Fit every candidate on the training split and score it on the held-out split.
# Errors are handled per model, so one failing algorithm no longer aborts the
# remaining ones. Results are keyed by model name, so re-creating the estimator
# objects and re-running this cell overwrites entries instead of adding duplicates.
for key, value in model.items():
    print(f"Training {value}")
    start = time.time()
    try:
        value.fit(trainset)
        predictions = value.test(testset)

        rmse = sp.accuracy.rmse(predictions)
        # Only precision is reported; the recall dict is not needed here.
        precisions, _ = precision_recall(predictions, k=10, threshold=7)
        precision_avg = sum(precisions.values()) / len(precisions)
    except Exception as e:
        print(f"Error while training {value}")
        print(e)
        continue

    # Elapsed time covers fitting, predicting and computing the metrics.
    analysis[key] = (key, rmse, precision_avg, time.time() - start)

print(analysis)


Training <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fc5e8031a80>
RMSE: 3.5223
Training <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7fc5e8033790>
RMSE: 3.2537
Training <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7fc5e8033160>
RMSE: 3.3655
Training <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x7fc5e8033400>
RMSE: 5.1279
Training <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7fc5e80314e0>
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.2144
Training <surprise.prediction_algorithms.knns.KNNBasic object at 0x7fc5e8032950>
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.3635
Training <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7fc5e8033b20>
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 3.2437
Training <surprise.prediction_algorithms.knn

In [19]:
# One row per entry of `analysis`; the dict keys become an "index" column.
result_columns = ['Algorithm', 'RMSE', 'Precision@10', 'Time to run (in seconds)']
analysis_df = pd.DataFrame.from_dict(analysis, orient='index', columns=result_columns)
analysis_df = analysis_df.reset_index()


In [20]:
# Keep the reporting columns, rank by Precision@10 (best first) and add the
# inverse RMSE as an extra column.
report_columns = ['Algorithm', 'RMSE', 'Precision@10', 'Time to run (in seconds)']
analysis_df = (
    analysis_df[report_columns]
    .sort_values(by=['Precision@10'], ascending=False)
    .assign(**{'RMSE^-1': lambda frame: frame['RMSE'] ** -1})
)
analysis_df.head(n=15)

Unnamed: 0,Algorithm,RMSE,Precision@10,Time to run (in seconds),RMSE^-1
8,BaselineOnly,3.243375,0.916755,0.808258,0.308321
1,SlopeOne,3.253656,0.895721,31.088714,0.307347
6,KNNWithMeans,3.243719,0.893348,10.369137,0.308288
4,KNNBaseline,3.214395,0.883658,12.280892,0.311101
9,CoClustering,3.291171,0.865126,6.19262,0.303843
7,KNNWithZScore,3.24702,0.845872,11.15021,0.307975
5,KNNBasic,3.363484,0.825126,9.670397,0.297311
2,NMF,3.365521,0.796139,5.216902,0.297131
0,SVD,3.52228,0.765022,4.707509,0.283907
3,NormalPredictor,5.127876,0.548498,0.607707,0.195013


Based on the results above, the best-performing model by Precision@10 is BaselineOnly, so I will be using this model. It is also the simplest and one of the cheapest of the algorithms compared.

In [21]:
# Parameter grids for tuning BaselineOnly, one per bias-estimation method.
als_param_grid = {'bsl_options': {'method': ['als'],
                              'reg_i': [5, 10, 15],
                              'reg_u': [10, 15, 20],
                              'n_epochs': [5, 10, 15, 20]
                              }
              }

sgd_param_grid = {'bsl_options': {'method': ['sgd'],
                              'reg': [0.01, 0.02, 0.03],
                              'n_epochs': [5, 10, 15, 20],
                              'learning_rate' : [0.001, 0.005, 0.01]
                              }
              }

# A plain `cv=3` shuffles the data with no fixed seed, so the best score changes
# from run to run. One seeded 3-fold splitter makes both searches reproducible
# and evaluates ALS and SGD on the same folds.
cv_folds = sp.model_selection.KFold(n_splits=3, random_state=random_state, shuffle=True)

als_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly, als_param_grid, measures=['rmse'], cv=cv_folds, joblib_verbose=0)

sgd_gs = sp.model_selection.GridSearchCV(sp.BaselineOnly, sgd_param_grid, measures=['rmse'], cv=cv_folds, joblib_verbose=0)

In [22]:
# Run the ALS grid search on the full dataset.
als_gs.fit(data)

# Print the best RMSE first, then the combo of parameters that achieved it.
for als_report in (als_gs.best_score, als_gs.best_params):
    print(als_report['rmse'])

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [23]:
# Run the SGD grid search on the full dataset.
sgd_gs.fit(data)

# Print the best RMSE first, then the combo of parameters that achieved it.
for sgd_report in (sgd_gs.best_score, sgd_gs.best_params):
    print(sgd_report['rmse'])

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimati

The default parameters gave a better RMSE than any combination in either grid, so I will keep the default settings.

In [24]:
# Refit the chosen model with default options on a trainset built from all of `data`.
# NOTE: this rebinds `trainset`, replacing the 75% split created earlier, so
# re-running the model-comparison cell after this one would use this trainset instead.
trainset = data.build_full_trainset()
algo = sp.BaselineOnly()
# Left as the last expression so the fitted estimator is shown as the cell output.
algo.fit(trainset)


Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7fc5e80300a0>

In [25]:
# Build the prediction candidates from the trainset and score them all.
# Presumably the anti-testset is every (user, anime) pair with no rating in
# `trainset`, which can be very large - TODO confirm size/memory impact.
# NOTE: this rebinds `testset`, replacing the held-out split created earlier.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [26]:
# Tabulate the predictions and discard the 'rui' column, which is not used
# when ranking recommendations.
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
last_predictions = pd.DataFrame(predictions, columns=prediction_columns).drop(columns='rui')

Now that I have generated a list of all possible recommendations, I'm going to create a function that shows the best recommendations for a given user.

In [27]:
def item_values(df, uid, n=10, catalog=None):
    """Return the ``n`` highest-estimated items for one user, with title and genre.

    Args:
        df: Predictions frame with at least the columns 'uid', 'iid' and 'est'.
        uid: User id to look up in the 'uid' column.
        n: Maximum number of rows to return.
        catalog: Frame with the columns 'anime_id', 'title' and 'genre' used to
            describe each item. Defaults to the notebook-level ``anime_df``;
            passing it explicitly removes the hidden dependency on that global.

    Returns:
        A frame with the columns ['uid', 'est', 'title', 'genre'], ordered from
        highest to lowest estimate. It is empty when ``uid`` has no rows in
        ``df``. Items missing from the catalogue keep their row, with NaN title
        and genre (left join).
    """
    if catalog is None:
        catalog = anime_df

    top_n = df.loc[df['uid'] == uid, ['uid', 'iid', 'est']].nlargest(n, 'est')
    described = top_n.merge(catalog, left_on='iid', right_on='anime_id', how='left')
    return described[['uid', 'est', 'title', 'genre']]

In [28]:
# Show the top recommendations (default n=10) for one example user.
item_values(last_predictions, 'SocialAwkward')

Unnamed: 0,uid,est,title,genre
0,SocialAwkward,8.130836,One Punch Man,"Action, Sci-Fi, Comedy, Parody, Super Power, S..."
1,SocialAwkward,7.945342,Dragon Ball Z,"Action, Adventure, Comedy, Fantasy, Martial Ar..."
2,SocialAwkward,7.909949,Kimi no Na wa.,"Supernatural, Drama, Romance, School"
3,SocialAwkward,7.862435,Shelter,"Sci-Fi, Music"
4,SocialAwkward,7.759719,Final Fantasy VII: Advent Children,"Action, Super Power, Fantasy"
5,SocialAwkward,7.715446,Fullmetal Alchemist,"Action, Adventure, Comedy, Drama, Fantasy, Mag..."
6,SocialAwkward,7.712281,One Piece 3D2Y: Ace no shi wo Koete! Luffy Nak...,"Adventure, Comedy, Fantasy, Shounen"
7,SocialAwkward,7.667276,Fate/Zero 2nd Season,"Action, Supernatural, Magic, Fantasy"
8,SocialAwkward,7.666911,Fate/stay night: Unlimited Blade Works 2nd Season,"Action, Supernatural, Magic, Fantasy"
9,SocialAwkward,7.663673,Fate/stay night: Unlimited Blade Works - Prologue,"Action, Supernatural, Magic, Fantasy"
