In [1]:
from __future__ import division
from __future__ import print_function 
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (this is why cells below show several outputs, e.g. both a shape and a head()).
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
import numpy as np 

from sklearn.preprocessing import LabelEncoder
from yarst.model_selection import TrainTestSplit
from yarst.model import StochasticGradientDescent,AlternatingLeastSquare,CollaborativeFiltering
from yarst.model import Recommender
from yarst.metrics import mae,mse
# Seed NumPy's global random generator so repeated runs of the notebook
# give the same results.
seed_set = 1234
np.random.seed(seed_set)


## Import Data

In [2]:
# Load the ratings table and preview its first five rows
df_rating = pd.read_csv('data.csv')
df_rating.head()

Unnamed: 0,user,movie_title,rating,genres,timestamp
0,1,Dangerous Minds (1995),2.5,Drama,1260759144
1,7,Dangerous Minds (1995),3.0,Drama,851868750
2,31,Dangerous Minds (1995),4.0,Drama,1273541953
3,32,Dangerous Minds (1995),4.0,Drama,834828440
4,36,Dangerous Minds (1995),3.0,Drama,847057202


In [3]:
# Process data: encode raw user ids and movie titles as contiguous integer ids.
# Each column gets its own fitted encoder so either id can later be mapped
# back to its original value with the matching encoder's inverse_transform.

user_encoder = LabelEncoder()
df_rating['user_id'] = user_encoder.fit_transform(df_rating['user'].values)

movie_encoder = LabelEncoder()
# Fit movie_encoder (not user_encoder) on the titles: refitting user_encoder here
# would overwrite the user mapping and leave movie_encoder unfitted.
df_rating['movie_id'] = movie_encoder.fit_transform(df_rating['movie_title'].values)

# Keep a single row per (user, movie) pair
df_rating = df_rating.drop_duplicates(['user_id','movie_id'])
df_rating.shape
df_rating.head(5)

(100003, 7)

Unnamed: 0,user,movie_title,rating,genres,timestamp,user_id,movie_id
0,1,Dangerous Minds (1995),2.5,Drama,1260759144,0,1976
1,7,Dangerous Minds (1995),3.0,Drama,851868750,6,1976
2,31,Dangerous Minds (1995),4.0,Drama,1273541953,30,1976
3,32,Dangerous Minds (1995),4.0,Drama,834828440,31,1976
4,36,Dangerous Minds (1995),3.0,Drama,847057202,35,1976


In [4]:
# Split the ratings into a training and a test frame.
# TrainTestSplit options used below:
#   split_count: int. Number of user-item ratings per user to move from the
#                training set to the test set.
#   threshold:   int. Minimum number of items required for the test hold-out
#                (presumably per user; TODO confirm against the yarst docs).
#   fraction:    float. Fraction of users to split off some of their interactions
#                into the test set. If None, all users are considered.

x_col = {
    'user': 'user_id',
    'item': 'movie_id',
    'rate': 'rating',
}
df_train, df_test = TrainTestSplit(
    df_rating, x_col,
    split_count=100, threshold=200, fraction=None,
    verbose=1)

# Drop repeated (user, movie) pairs from each split
df_train = df_train.drop_duplicates(['user_id', 'movie_id'])
df_test = df_test.drop_duplicates(['user_id', 'movie_id'])


557 unique users in train
Train sparsity: 1.82%
137 unique users in test
Test sparsity: 2.43%


## Build Model(SGD)

In [5]:
# Create the SGD model; `components` is the number of hidden factors
sgd = StochasticGradientDescent(components=100)

# Hand the training frame to the model and name the user / item / rating columns
x_col_dict = {'user': 'user_id', 'item': 'movie_id'}
y_col_dict = {'rate': 'rating'}
sgd.fit(df_train, x_col_dict, y_col_dict)

<yarst.model.stochastic_gradient_descent.StochasticGradientDescent instance at 0x7f1d69f277e8>

In [6]:
# Wrap the SGD model in a Recommender and train it.
# Options passed to fit():
#   epochs                       - number of training epochs
#   learning_rate                - learning rate
#   shuffle                      - whether to shuffle the data order
#   reg_user, reg_item           - regularization of the user / item terms
#   reg_bias_user, reg_bias_item - regularization of the user / item bias terms

recommender_sgd = Recommender(sgd)

recommender_sgd.fit(
    df_train,
    epochs=10,
    learning_rate=0.01,
    shuffle=True,
    reg_user=0.01,
    reg_item=0.01,
    reg_bias_user=0.01,
    reg_bias_item=0.01,
    verbose=1)

model built via StochasticGradientDescent
epochs:10,learning_rate:0.01,shuffle:True
reg_user:0.01,reg_item:0.01,reg_bias_user:0.01,reg_bias_item:0.01


100%|██████████| 10/10 [00:19<00:00,  1.97s/it]


<yarst.model.recommender.Recommender instance at 0x7f1d69f0dd88>

In [7]:
# Score the held-out ratings in df_test with MAE and MSE
recommender_sgd.evaluate(df_test, metrics=[mae(), mse()])

{'mean_absolute_error': 0.78931707770322834,
 'mean_squared_error': 0.98205753617791436}

In [8]:
# Alternative: train in two stages with a decreasing learning rate
# (3 epochs at 0.01, then 7 epochs at 0.001).
# NOTE(review): `sgd` is the same model object already trained by the previous
# recommender; whether this Recommender starts from fresh factors is not visible
# here - confirm before comparing the two sets of scores.

recommender_sgd = Recommender(sgd)

recommender_sgd.fit(df_train, epochs=3, learning_rate=0.01, verbose=0)
recommender_sgd.fit(df_train, epochs=7, learning_rate=0.001, verbose=0)

model built via StochasticGradientDescent


<yarst.model.recommender.Recommender instance at 0x7f1d69f27f38>

<yarst.model.recommender.Recommender instance at 0x7f1d69f27f38>

In [9]:
# Same total number of epochs (3 + 7 = 10) as the single-rate run; the recorded
# MAE / MSE are slightly better (0.761 / 0.924 vs 0.789 / 0.982).
recommender_sgd.evaluate(df_test, metrics=[mae(), mse()])

{'mean_absolute_error': 0.76145355673237169,
 'mean_squared_error': 0.92423536815545637}

In [10]:
# Once the model is trained, predict ratings for a new set of (user, movie) rows

pred = recommender_sgd.predict(df_test)
pred.head()

Unnamed: 0,user_id,movie_id,rating,prediction
0,22,5,3.0,3.704309
1,63,5,4.5,3.485385
2,114,6,5.0,4.217211
3,48,7,1.5,2.848711
4,51,7,4.5,4.169334


## Build Model(ALS)

In [11]:
# Create the ALS model; `components` is the number of hidden factors
als = AlternatingLeastSquare(components=100)

# Hand the training frame (df_train) to the model and name the
# user / item / rating columns
x_col_dict = {'user': 'user_id', 'item': 'movie_id'}
y_col_dict = {'rate': 'rating'}
als.fit(df_train, x_col_dict, y_col_dict)

<yarst.model.alternating_least_square.AlternatingLeastSquare instance at 0x7f1d69ecdb90>

In [12]:
# Wrap the ALS model in a Recommender and train it.
# Options passed to fit():
#   epochs             - number of training epochs
#   reg_user, reg_item - regularization of the user / item terms

recommender_als = Recommender(als)
recommender_als.fit(
    df_train,
    epochs=2,
    reg_user=0.01,
    reg_item=0.01,
    verbose=1)

  0%|          | 0/2 [00:00<?, ?it/s]

model built via AlternatingLeastSquare
epochs:2,reg_user:0.01,reg_item:0.01


100%|██████████| 2/2 [00:12<00:00,  6.11s/it]


<yarst.model.recommender.Recommender instance at 0x7f1d69efc950>

In [13]:
# Score the held-out ratings in df_test with MAE and MSE.
# NOTE(review): the recorded run gives MAE ~3.44 / MSE ~13.2, far worse than the
# SGD recommender (MAE ~0.76-0.79) - the ALS settings used here probably need
# tuning; investigate before relying on this model.
recommender_als.evaluate(df_test, metrics=[mae(), mse()])

{'mean_absolute_error': 3.4419453256951886,
 'mean_squared_error': 13.201477968175322}

In [14]:
# Predict ratings for the test rows with the ALS recommender and preview them
pred = recommender_als.predict(df_test)
pred.head()

Unnamed: 0,user_id,movie_id,rating,prediction
0,22,5,3.0,0.29157
1,63,5,4.5,0.128462
2,114,6,5.0,0.084008
3,48,7,1.5,0.150978
4,51,7,4.5,0.682083


## Build Model(CF)

In [15]:
# Create the collaborative-filtering model with the attribute and cold-start
# options switched off, then hand it the training frame and column mapping.
cf = CollaborativeFiltering(use_attribute=False, cold_start=False)
x_col_dict = {'user': 'user_id', 'item': 'movie_id'}
y_col_dict = {'rate': 'rating'}
cf.fit(df_train, x_col_dict, y_col_dict)

<yarst.model.collaborative_filtering.CollaborativeFiltering instance at 0x7f1d69f2a1b8>

### CollaborativeFiltering params

##### similarity_type: string
- rate: use rating only to calculate the similarity
- attribute: use attribute only to calculate the similarity
- hyrbid: use rate and attribute together to calculate the similarity

##### rate_similarity_metric: string, 
    how to calculate the similarity of rate,
    ref: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
    
##### rate_similarity_max_from_train: boolean, 
    whether to take the maximum rate similarity from the training data, or to use 1 as the maximum
##### rate_similarity_scale: string, 
    how to scale the similarity
- nochange: no change
- rescale: rescale similarity to [0,1]
- abs: absolute similarity   
    
##### use_bias: boolean, 
    use bias or not
##### bias_type: string
- user: subtract the user-based bias from the rating
- item: subtract the item-based bias from the rating
- hyrbid: subtract both the item-based and the user-based bias from the rating
- zscore: subtract the user-based bias from the rating, then divide by the user-based standard deviation

In [16]:
# Wrap the CF model in a Recommender and train it using rating similarity only:
# cosine metric, maximum taken from the training data, similarities rescaled
# to [0,1], and the user-based bias subtracted from the ratings.
recommender_cf = Recommender(cf)
recommender_cf.fit(
    df_train,
    similarity_type='rate',
    rate_similarity_metric='cosine',
    rate_similarity_max_from_train=True,
    rate_similarity_scale='rescale',
    use_bias=True,
    bias_type='user')

model built via CollaborativeFiltering
similarity_type:rate
rate_similarity_metric:cosine,
rate_similarity_max_from_train:True
rate_similarity_scale:rescale
attr_similarity_metric:cosine
attr_similarity_max_from_train:False
attr_similarity_scale:nochange
similarity_weight:0.5
use_bias:True,bias_type:user
num_threads:1


<yarst.model.recommender.Recommender instance at 0x7f1d69ef4098>

In [17]:
# Score the held-out ratings in df_test with MAE and MSE.
# predict_method: string, how the prediction is computed
#   - topk_wtd: search the k nearest neighbours; prediction is weighted by similarity
#   - topk_ave: search the k nearest neighbours; prediction is their simple average
#   - topk_avedev: user's mean + simple average of the neighbours' centered ratings
#   - topk_avezscore: user's mean + user's std * simple average of the neighbours' z-score ratings
#   - baseline_withbias: ratings are adjusted according to bias_type; prediction is the weighted average over all users
#   - hyrbid: no bias for the top-k neighbours
#   - baseline: prediction is the mean over all users
# k: int, number of nearest neighbours used by the top-k methods

recommender_cf.evaluate(
    df_test,
    metrics=[mae(), mse()],
    k=100,
    predict_method='baseline')

predict_method:baseline
k:100


{'mean_absolute_error': 1.0400748809444669,
 'mean_squared_error': 1.746657716375553}

In [18]:
# Predict ratings for the test rows and preview them.
# No predict_method / k is passed here; the recorded output shows this run used
# predict_method=topk_wtd and k=50, which differs from the settings of the
# evaluate call above (baseline, k=100).
pred = recommender_cf.predict(df_test)
pred.head()

predict_method:topk_wtd
k:50


Unnamed: 0,user_id,movie_id,rating,prediction
0,22,5,3.0,2.981053
1,63,5,4.5,2.941319
2,114,6,5.0,5.0
3,48,7,1.5,3.751776
4,51,7,4.5,3.806451
