# To do

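1. Put the data file `rating_complete.csv` into the `data/` directory (the notebook reads it from `../../data/`, relative to this notebook).
2. Open and run this notebook from top to bottom. Make sure all required libraries are installed and that the local `models` package is importable.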

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
import time
import pickle
from copy import deepcopy
from sklearn.model_selection import train_test_split

In [2]:
from models.explicit_mf_with_bias import SGDExplicitBiasMF

In [3]:
# Directory holding the input CSV; the same string is later passed to the
# model as `model_saving_path`, so it is kept as a plain str.
data_path = "../../data/"
ratings_csv = data_path + "rating_complete.csv"

# Columns: user_id, anime_id, rating.
df = pd.read_csv(ratings_csv)
df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [4]:
# Summarise the raw ratings frame: row count, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57633278 entries, 0 to 57633277
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 1.3 GB


In [5]:
# Per-user summary: how many titles each user rated, plus the mean / std /
# min / max of the ratings they gave. The result has two-level column labels.
# NOTE(review): the saved output shows this cell was interrupted; the groupby
# runs over the full ~57.6M-row frame.
per_user_aggs = {
    "anime_id": "count",
    "rating": ["mean", "std", "min", "max"],
}
df_count = df.groupby("user_id").agg(per_user_aggs)

KeyboardInterrupt: 

In [None]:
# Show the per-user rating statistics held in `df_count`.
df_count

Unnamed: 0_level_0,anime_id,rating,rating,rating,rating
Unnamed: 0_level_1,count,mean,std,min,max
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,35,7.400000,1.575548,4,10
1,103,8.058252,1.186734,5,10
2,51,8.333333,0.909212,6,10
3,315,7.603175,0.820511,3,10
4,118,7.652542,1.179433,5,10
...,...,...,...,...,...
353400,67,8.507463,0.990456,5,10
353401,62,7.532258,1.715099,2,10
353402,19,8.000000,1.452966,6,10
353403,67,8.805970,0.908525,7,10


In [None]:
# Distinct raw user ids in ascending order; the 0-based position of an id in
# this Series is its dense encoded id (decoder: code -> raw id).
user_decoder = (
    df["user_id"]
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

# Inverse lookup as a plain dict (encoder: raw id -> code).
user_encoder = {raw_id: code for code, raw_id in user_decoder.items()}

In [None]:
# Distinct raw anime ids in ascending order; the 0-based position of an id in
# this Series is its dense encoded id (decoder: code -> raw id).
anime_decoder = (
    df["anime_id"]
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

# Inverse lookup as a plain dict (encoder: raw id -> code).
anime_encoder = {raw_id: code for code, raw_id in anime_decoder.items()}

In [None]:
# Replace raw ids with the dense 0-based codes from `anime_encoder` /
# `user_encoder`. Series.map with a dict performs the lookup in bulk, whereas
# .apply(encoder.get) makes one Python-level call per row (~57.6M rows here).
# Both encoders are built from the ids in `df`, so every id has a code and the
# mapped columns stay integer-typed.
# assign() returns a new frame: `df` keeps the raw ids and `rating` is carried
# over unchanged (the former self-assignment of `rating` was a no-op).
df_encode = df.assign(
    anime_id=df["anime_id"].map(anime_encoder),
    user_id=df["user_id"].map(user_encoder),
)
df_encode

Unnamed: 0,user_id,anime_id,rating
0,0,402,9
1,0,907,5
2,0,2740,7
3,0,534,7
4,0,2539,9
...,...,...,...
57633273,310058,468,8
57633274,310058,890,4
57633275,310058,201,8
57633276,310058,218,7


In [None]:
# Matrix dimensions: number of distinct encoded users and anime titles.
N_USER = df_encode["user_id"].nunique()
N_ANIME = df_encode["anime_id"].nunique()

In [None]:

# Fixed seed so the train / eval / test partition is identical on every
# Restart & Run All (train_test_split shuffles randomly when unseeded).
SPLIT_SEED = 42


def _ratings_to_csr(frame):
    """Build the (N_USER, N_ANIME) sparse user-by-anime rating matrix for `frame`.

    `frame` must carry encoded `user_id` / `anime_id` columns and a `rating`
    column; each row becomes one stored entry at (user_id, anime_id).
    """
    return csr_matrix(
        (frame.rating, (frame.user_id, frame.anime_id)),
        shape=(N_USER, N_ANIME),
    )


# Hold out 4% of the ratings, then halve the hold-out into test and eval
# (about 2% of all ratings each); the remaining 96% is the training set.
df_enc_train, df_enc_test_eval = train_test_split(
    df_encode, test_size=0.04, random_state=SPLIT_SEED
)
df_enc_test, df_enc_eval = train_test_split(
    df_enc_test_eval, test_size=0.5, random_state=SPLIT_SEED
)

train_set = _ratings_to_csr(df_enc_train)
eval_set = _ratings_to_csr(df_enc_eval)
test_set = _ratings_to_csr(df_enc_test)

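Reference: [ExplicitMF](https://www.ethanrosenthal.com/2016/01/09/explicit-matrix-factorization-sgd-als/) — Ethan Rosenthal's blog post on explicit matrix factorization with SGD and ALS, background for the model trained below.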

In [None]:
# --- Disabled experiment: data-driven bias initialisation -------------------
# When uncommented, this computes the global mean training rating, then for
# each user / anime half of the deviation of its mean rating from that global
# mean. `ubi` / `ibi` align those values to the encoded ids 0..N_USER-1 and
# 0..N_ANIME-1, filling ids that have no training ratings with 0.
# NOTE(review): nothing visible in this notebook consumes `ubi` / `ibi`.
# global_bias = df_enc_train.rating.mean()
# user_bias_init = (df_enc_train.groupby('user_id').rating.mean() - global_bias) / 2
# user_bias_init = user_bias_init.sort_index()
# item_bias_init = (df_enc_train.groupby('anime_id').rating.mean() - global_bias) / 2
# item_bias_init = item_bias_init.sort_index()
# ubi = pd.Series(np.arange(N_USER)).apply(user_bias_init.get).fillna(0)
# ibi = pd.Series(np.arange(N_ANIME)).apply(item_bias_init.get).fillna(0)
# ubi.values
# ibi.values

In [None]:
# --- Disabled variant: fit on `test_set` instead of `train_set` --------------
# `test_set` holds only about 2% of the ratings, so this is presumably a quick
# smoke test of the training loop — TODO confirm before re-enabling.
# It also uses different settings from the active run: early_stopping_rounds=2,
# user_fact_reg=0.001, user_bias_reg=0.01 and max_iter=20.
# sgd_mf_model = SGDExplicitBiasMF(ratings=test_set, 
#     ratings_eval = eval_set, n_factors = 128, 
#     early_stopping_rounds=2, verbose=True, 
#     item_fact_reg=0.005, item_bias_reg=0.005, 
#     user_fact_reg=0.001, user_bias_reg=0.01, 
#     model_saving_path=data_path)
# sgd_mf_model.train(max_iter=20, learning_rate=0.005)


In [None]:
# Settings for the biased explicit-feedback MF model, gathered in one place so
# they are easy to tune. The same regularisation strength (0.005) is used for
# user/item factors and biases.
# NOTE(review): parameter meanings are inferred from their names; confirm
# against models.explicit_mf_with_bias.SGDExplicitBiasMF.
mf_params = dict(
    n_factors=128,
    early_stopping_rounds=10,
    verbose=True,
    item_fact_reg=0.005,
    item_bias_reg=0.005,
    user_fact_reg=0.005,
    user_bias_reg=0.005,
    model_saving_path=data_path,
)

# Fit on the training matrix; the eval matrix is handed over as `ratings_eval`
# (presumably what early stopping monitors — confirm).
sgd_mf_model = SGDExplicitBiasMF(
    ratings=train_set,
    ratings_eval=eval_set,
    **mf_params,
)
sgd_mf_model.train(max_iter=200, learning_rate=0.005)
