In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.sparse import csr_matrix
import time
import pickle

In [2]:
df = pd.read_csv("anime_recommendation/rating_complete.csv")
df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57633278 entries, 0 to 57633277
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 1.3 GB


In [4]:
df_count = df.groupby("user_id").agg({"anime_id":"count", "rating":["mean", "std", "min", "max"]})

In [5]:
df_count

Unnamed: 0_level_0,anime_id,rating,rating,rating,rating
Unnamed: 0_level_1,count,mean,std,min,max
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,35,7.400000,1.575548,4,10
1,103,8.058252,1.186734,5,10
2,51,8.333333,0.909212,6,10
3,315,7.603175,0.820511,3,10
4,118,7.652542,1.179433,5,10
...,...,...,...,...,...
353400,67,8.507463,0.990456,5,10
353401,62,7.532258,1.715099,2,10
353402,19,8.000000,1.452966,6,10
353403,67,8.805970,0.908525,7,10


In [6]:
user_decoder = df.user_id.drop_duplicates().sort_values().reset_index(drop=True)
user_encoder = pd.Series(data=user_decoder.index, index=user_decoder.values)
user_encoder = user_encoder.to_dict()

In [7]:
anime_decoder = df.anime_id.drop_duplicates().sort_values().reset_index(drop=True)
anime_encoder = pd.Series(data=anime_decoder.index, index=anime_decoder.values)
anime_encoder = anime_encoder.to_dict()

In [8]:
df_encode = df.copy()
df_encode['anime_id'] = df.anime_id.apply(anime_encoder.get)
df_encode['user_id'] = df.user_id.apply(user_encoder.get)
df_encode['rating'] = df_encode['rating']
df_encode

Unnamed: 0,user_id,anime_id,rating
0,0,402,9
1,0,907,5
2,0,2740,7
3,0,534,7
4,0,2539,9
...,...,...,...
57633273,310058,468,8
57633274,310058,890,4
57633275,310058,201,8
57633276,310058,218,7


In [9]:
N_USER, N_ANIME = df_encode.user_id.nunique(), df_encode.anime_id.nunique()

In [10]:
from sklearn.model_selection import train_test_split

df_enc_train, df_enc_test_eval = train_test_split(df_encode, test_size=0.04)
df_enc_test, df_enc_eval = train_test_split(df_enc_test_eval, test_size=0.5)
train_set = csr_matrix((df_enc_train.rating, (df_enc_train.user_id, df_enc_train.anime_id)), shape=(N_USER, N_ANIME))
eval_set = csr_matrix((df_enc_eval.rating, (df_enc_eval.user_id, df_enc_eval.anime_id)), shape=(N_USER, N_ANIME))
test_set = csr_matrix((df_enc_test.rating, (df_enc_test.user_id, df_enc_test.anime_id)), shape=(N_USER, N_ANIME))

Link: [ExplicitMF](https://www.ethanrosenthal.com/2016/01/09/explicit-matrix-factorization-sgd-als/)

In [11]:
class SGDExplicitBiasMF:
    def __init__(self, 
                 ratings,
                 ratings_eval,
                 n_factors=64,
                 early_stopping_rounds=10,
                 item_fact_reg=0.0, 
                 user_fact_reg=0.0,
                 item_bias_reg=0.0,
                 user_bias_reg=0.0,
                 verbose=False,
                 model_saving_path="."):
        """
        Train a matrix factorization model to predict empty 
        entries in a matrix. The terminology assumes a 
        ratings matrix which is ~ user x item
        
        Params
        ======
        ratings : (ndarray)
            User x Item matrix with corresponding ratings
        
        n_factors : (int)
            Number of latent factors to use in matrix 
            factorization model
        
        item_fact_reg : (float)
            Regularization term for item latent factors
        
        user_fact_reg : (float)
            Regularization term for user latent factors
            
        item_bias_reg : (float)
            Regularization term for item biases
        
        user_bias_reg : (float)
            Regularization term for user biases
        
        verbose : (bool)
            Whether or not to printout training progress
        """
        
        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.ratings_eval = ratings_eval
        self.n_factors = n_factors
        self.early_stopping_rounds = early_stopping_rounds
        self.item_fact_reg = item_fact_reg
        self.user_fact_reg = user_fact_reg
        self.item_bias_reg = item_bias_reg
        self.user_bias_reg = user_bias_reg
        self.sample_row, self.sample_col = self.ratings.nonzero()
        self.n_samples = len(self.sample_row)
        self._v = verbose
        self._manual_init_bias = False
        self.model_saving_path = model_saving_path

    def init_bias(self, user_bias_init, item_bias_init):
        self.global_bias = np.mean(self.ratings[self.ratings != 0])
        self.user_bias = user_bias_init
        self.item_bias = item_bias_init
        self._manual_init_bias = True

    def train(self, max_iter=200, learning_rate=0.005):
        """ Train model for n_iter iterations from scratch."""
        # initialize latent vectors        
        self.user_vecs = np.random.normal(scale=1./self.n_factors,\
                                          size=(self.n_users, self.n_factors))
        self.item_vecs = np.random.normal(scale=1./self.n_factors,
                                          size=(self.n_items, self.n_factors))
        self.learning_rate = learning_rate
        if not self._manual_init_bias:
            self.user_bias = np.zeros(self.n_users)
            self.item_bias = np.zeros(self.n_items)
            self.global_bias = np.mean(self.ratings[self.ratings != 0])

        self.min_mse_eval = np.Inf
        self.list_mse_eval = []

        self.partial_train(n_iter=max_iter, save_interim = True)
    
    
    def partial_train(self, n_iter, save_interim = True):
        """ 
        Train model for n_iter iterations. Can be 
        called multiple times for further training.
        """
        iter_cnt = 1
        while iter_cnt <= n_iter:
            
            self.training_indices = np.arange(self.n_samples)
            np.random.shuffle(self.training_indices)
            self.iter_idx = iter_cnt
            self.sgd()
            eval_mse = self.evaluate(self.ratings_eval)
            self.min_mse_eval = min(eval_mse, self.min_mse_eval)
            self.list_mse_eval.append(eval_mse)
            if self._v:
                print(f"Iteration {iter_cnt}. Latest MSE: {eval_mse:.4f}. Min MSE: {self.min_mse_eval:.4f}.")
            
            now = int(time.time())
            with open(f"{self.model_saving_path}/model_sgd_mf_v3_{now}.pkl", "wb") as f:
                pickle.dump(self, file=f)

            if min(self.list_mse_eval[-self.early_stopping_rounds:]) > self.min_mse_eval: 
                print("Early stopping due to non-improvement on the test set")
                break
            iter_cnt += 1

    def sgd(self):
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            prediction = self.predict(u, i)
            e = (self.ratings[u,i] - prediction) # error
            
            # Update biases
            self.user_bias[u] += self.learning_rate * \
                                (e - self.user_bias_reg * self.user_bias[u])
            self.item_bias[i] += self.learning_rate * \
                                (e - self.item_bias_reg * self.item_bias[i])
            
            #Update latent factors
            self.user_vecs[u, :] += self.learning_rate * \
                                    (e * self.item_vecs[i, :] - \
                                     self.user_fact_reg * self.user_vecs[u,:])
            self.item_vecs[i, :] += self.learning_rate * \
                                    (e * self.user_vecs[u, :] - \
                                     self.item_fact_reg * self.item_vecs[i,:])
            # if idx % 1000 == 0:
            #     print(idx)
            
    def predict(self, u, i):
        """ Single user and item prediction."""
        prediction = self.global_bias + self.user_bias[u] + self.item_bias[i]
        prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
        return prediction
    
    def predict_all(self):
        """ Predict ratings for every user and item."""
        predictions = np.zeros((self.user_vecs.shape[0], 
                                self.item_vecs.shape[0]))
        for u in range(self.user_vecs.shape[0]):
            for i in range(self.item_vecs.shape[0]):
                predictions[u, i] = self.predict(u, i)
        return predictions

    def evaluate(self, test_sparse_matrix):
        nz_row, nz_col = test_sparse_matrix.nonzero()
        n_idx = len(nz_row)
        rating_pred = np.zeros(n_idx)
        rating_true = np.zeros(n_idx)
        for idx in np.arange(n_idx):
            irow, icol = nz_row[idx], nz_col[idx]
            rating_pred[idx] = self.predict(irow, icol)
            rating_true[idx] = test_sparse_matrix[irow, icol]
        mse = mean_squared_error(rating_true, rating_pred)
        return mse
        
    


In [12]:
# global_bias = df_enc_train.rating.mean()
# user_bias_init = (df_enc_train.groupby('user_id').rating.mean() - global_bias) / 2
# user_bias_init = user_bias_init.sort_index()
# item_bias_init = (df_enc_train.groupby('anime_id').rating.mean() - global_bias) / 2
# item_bias_init = item_bias_init.sort_index()
# ubi = pd.Series(np.arange(N_USER)).apply(user_bias_init.get).fillna(0)
# ibi = pd.Series(np.arange(N_ANIME)).apply(item_bias_init.get).fillna(0)
# ubi.values
# ibi.values

In [13]:
sgd_mf_model = SGDExplicitBiasMF(ratings=test_set, 
    ratings_eval = eval_set, n_factors = 128, 
    early_stopping_rounds=2, verbose=True, 
    item_fact_reg=0.005, item_bias_reg=0.005, 
    user_fact_reg=0.001, user_bias_reg=0.01)
sgd_mf_model.train(max_iter=20, learning_rate=0.005)


Iteration 1. Latest MSE: 2.2290. Min MSE: 2.2290.
Iteration 2. Latest MSE: 2.0991. Min MSE: 2.0991.
Iteration 3. Latest MSE: 2.0186. Min MSE: 2.0186.
Iteration 4. Latest MSE: 1.9613. Min MSE: 1.9613.


KeyboardInterrupt: 

In [None]:
# sgd_mf_model = SGDExplicitBiasMF(ratings=train_set, 
#     ratings_eval = eval_set, n_factors = 128, 
#     early_stopping_rounds=2, verbose=True, 
#     item_fact_reg=0.01, item_bias_reg=0.01, 
#     user_fact_reg=0.02, user_bias_reg=0.02)
# sgd_mf_model.train(max_iter=20, learning_rate=0.01)
