In [1]:
import numpy as np
import pandas as pd
import os
from scipy import sparse
from scipy.sparse import csr_matrix
import random
from datetime import datetime

In [2]:
# Load the ratings file into a DataFrame (path is relative to the working directory).
ratings_csv = 'data10m.csv'
data = pd.read_csv(ratings_csv)

In [3]:
# Report the (rows, columns) shape of the loaded frame.
input_shape = data.shape
print("Input Data Shape: ", input_shape)

Input Data Shape:  (10000054, 4)


In [4]:
# Preview the first rows to check which columns were loaded.
print("First Five Line: ")
preview = data.head()  # head() defaults to the first 5 rows
print(preview)

First Five Line: 
   movie  user  rating                           date
0    122     1     5.0  1970-01-01 00:00:00.838985046
1    185     1     5.0  1970-01-01 00:00:00.838983525
2    231     1     5.0  1970-01-01 00:00:00.838983392
3    292     1     5.0  1970-01-01 00:00:00.838983421
4    316     1     5.0  1970-01-01 00:00:00.838983392


In [5]:
# Training set: the first 80% of the rows, taken in file order.
# NOTE(review): this is a positional split. The preview of `data` suggests the
# file is ordered by user, in which case users in the last 20% of rows may
# never appear in the training slice -- confirm this is intended.
train_rows = int(data.shape[0] * 0.80)
train_df = data.iloc[:train_rows]
print("Train set: ", len(train_df))

Train set:  8000043


In [6]:
# Test set: every row from the 80% cut-off to the end of the frame.
split_at = int(len(data) * 0.80)
test_df = data.iloc[split_at:]
print("Test set: ", len(test_df))

Test set:  2000011


In [7]:
# Count the distinct users and movies present in the training slice.
n_train_users = np.unique(train_df['user']).size
n_train_movies = np.unique(train_df['movie']).size
print("Total No of Users   :", n_train_users)
print("Total No of movies  :", n_train_movies)

Total No of Users   : 56101
Total No of movies  : 10633


In [8]:
from surprise import Reader, Dataset

In [9]:
from surprise.model_selection import cross_validate
from surprise import accuracy

In [10]:
# Wrap the training frame in a Surprise dataset and build the full trainset from it.
# NOTE(review): the scale is declared as 1-5, but half-star ratings (e.g. 4.5)
# occur in the data; if ratings below 1 exist the lower bound should match -- confirm.
reader = Reader(rating_scale=(1, 5))
rating_columns = ['user', 'movie', 'rating']
train_data = Dataset.load_from_df(train_df[rating_columns], reader)
trainset = train_data.build_full_trainset()

In [11]:
# Build the hold-out set as a list of (user, movie, rating) tuples and show the first three.
test_users = test_df['user'].values
test_movies = test_df['movie'].values
test_ratings = test_df['rating'].values
testset = [(uid, iid, rating) for uid, iid, rating in zip(test_users, test_movies, test_ratings)]
testset[:3]

[(57375, 7361, 4.0), (57375, 7379, 3.0), (57375, 7386, 4.5)]

In [12]:
# Test-set metrics, appended in the order in which models are evaluated.
RMSE = []
MAE = []


def evaluate_train_test(algo, train, test, evaluate_on_train=False):
    """Fit ``algo`` on ``train``, score it on ``test`` and record the metrics.

    Parameters
    ----------
    algo : object exposing ``fit(train)`` and ``test(testset)``.
    train : training set passed to ``algo.fit``; it must also provide
        ``build_testset()`` when ``evaluate_on_train`` is true.
    test : test set passed to ``algo.test``.
    evaluate_on_train : bool, default False
        Also report RMSE / MAE on the training data. Off by default because
        this step was found to take a long time.

    Returns
    -------
    tuple
        ``(rmse_test, mae_test)`` as returned by ``accuracy.rmse`` /
        ``accuracy.mae``. The same values are appended to the module-level
        ``RMSE`` and ``MAE`` lists, so calling this twice for the same model
        adds two entries to each list.
    """
    print('Training the model..')
    start = datetime.now()
    algo.fit(train)
    print('Done. Time taken : {}\n'.format(datetime.now() - start))

    if evaluate_on_train:
        print('Evaluating the model with TRAIN data...')
        start = datetime.now()
        prediction_train = algo.test(train.build_testset())
        accuracy.rmse(prediction_train)
        accuracy.mae(prediction_train)
        print('Done. Time taken : {}\n'.format(datetime.now() - start))

    print('\nEvaluating for test data...')
    start = datetime.now()
    prediction = algo.test(test)
    rmse_test = accuracy.rmse(prediction)
    mae_test = accuracy.mae(prediction)
    RMSE.append(rmse_test)
    MAE.append(mae_test)
    print('Done. Time taken : {}\n'.format(datetime.now() - start))
    return rmse_test, mae_test

In [13]:
# Registry of [estimator, label] pairs; the model cells below append to it.
algo = list()

In [14]:
# Baseline model (user and item biases), with biases estimated by SGD.
from surprise import BaselineOnly

bsl_options = dict(method='sgd', learning_rate=.001)
bsl = BaselineOnly(bsl_options=bsl_options)
algo.append([bsl, "Baseline Model with User and Item Biases"])

In [15]:
# KNN with user-user similarities (k=20, pearson_baseline similarity).
from surprise import KNNBaseline

sim_options_u = dict(
    user_based=True,
    name='pearson_baseline',
    shrinkage=100,
    min_support=2,
)
bsl_options = dict(method='sgd')
knn_bsl_u = KNNBaseline(k=20, sim_options=sim_options_u, bsl_options=bsl_options)
algo.append([knn_bsl_u, "KNN Basline with User User Similarity"])

In [16]:
# KNN with item-item similarities (k=20, pearson_baseline similarity).
sim_options_i = dict(
    user_based=False,
    name='pearson_baseline',
    shrinkage=100,
    min_support=2,
)
bsl_options = dict(method='sgd')
knn_bsl_m = KNNBaseline(k=20, sim_options=sim_options_i, bsl_options=bsl_options)
algo.append([knn_bsl_m, "KNN Basline with Item Item Similarity"])

In [17]:
# SVD matrix factorisation on user-item interactions (100 factors, with biases, fixed seed).
from surprise import SVD

svd_settings = dict(n_factors=100, biased=True, random_state=15, verbose=True)
svd = SVD(**svd_settings)
algo.append([svd, "SVD model"])

In [18]:
# Display the registered [estimator, label] pairs.
algo

[[<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1a5812ae808>,
  'Baseline Model with User and Item Biases'],
 [<surprise.prediction_algorithms.knns.KNNBaseline at 0x1a5812b3948>,
  'KNN Basline with User User Similarity'],
 [<surprise.prediction_algorithms.knns.KNNBaseline at 0x1a5812b6248>,
  'KNN Basline with Item Item Similarity'],
 [<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a5812b66c8>,
  'SVD model']]

In [20]:
# Train and score every registered [estimator, label] pair, printing a banner per model.
separator = "=========================================================="
for estimator, label in algo:
    print(separator)
    print("Model: " + label)
    evaluate_train_test(estimator, trainset, testset)

Model: Baseline Model with User and Item Biases
Training the model..
Estimating biases using sgd...
Done. Time taken : 0:00:06.469934


Evaluating for test data...
RMSE: 0.9955
MAE:  0.7868
Done. Time taken : 0:00:09.358666

Model: KNN Basline with User User Similarity
Training the model..
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. Time taken : 0:03:39.333393


Evaluating for test data...
RMSE: 0.9903
MAE:  0.7838
Done. Time taken : 0:00:07.261428

Model: KNN Basline with Item Item Similarity
Training the model..
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. Time taken : 0:01:43.733350


Evaluating for test data...
RMSE: 0.9903
MAE:  0.7838
Done. Time taken : 0:00:07.576685

Model: SVD model
Training the model..
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Process

In [21]:
# Make predictions with each model on a small sample of rows.
# The 10 rows are drawn from the *training* frame, so the estimates below are
# in-sample illustrations rather than hold-out results. The sample is seeded
# so the comparison table is reproducible across runs.
# The names `test` and `train` are kept because the following cell reads them.
test = train_df.sample(10, random_state=15)
reader = Reader(rating_scale=(1,5))
test_data = Dataset.load_from_df(test[['user', 'movie', 'rating']], reader)
# Built as a trainset only so that build_testset() can be called on it later.
train = test_data.build_full_trainset()

In [22]:
# Score the sampled rows with each fitted model and collect the estimates side by side.
# The sample's tuples get their own name so the hold-out `testset` built earlier
# is not overwritten; otherwise re-running the evaluation cell would silently
# score only these sampled rows.
sample_testset = train.build_testset()
pred = test[['user', 'movie', 'rating']]
# Output column name -> fitted estimator; insertion order fixes the column order.
fitted_models = {
    'bsl': bsl,
    'knn_bsl_u': knn_bsl_u,
    'knn_bsl_i': knn_bsl_m,
    'svd': svd,
}
for column, model in fitted_models.items():
    predictions = model.test(sample_testset)
    model_pred = pd.DataFrame(
        [[p.uid, p.iid, p.est] for p in predictions],
        columns=['user', 'movie', column],
    )
    # Left join keeps exactly the sampled rows, adding one estimate column per model.
    pred = pd.merge(pred, model_pred, how='left', on=['user', 'movie'])

In [23]:
# Display the sampled rows with their actual rating and each model's estimate.
pred

Unnamed: 0,user,movie,rating,bsl,knn_bsl_u,knn_bsl_i,svd
0,1883,2301,2,3.303331,2.417889,2.684213,2.265347
1,1567,1299,4,4.19361,4.018281,4.084649,3.976425
2,592,3258,3,3.179302,2.945151,3.122728,3.344425
3,4009,3174,4,3.034792,3.678274,3.454418,3.405793
4,690,954,4,4.837191,4.212733,4.235366,4.303308
5,5,41,4,3.470849,3.619949,4.000806,3.822184
6,615,2571,4,4.502376,4.274269,4.462415,4.625705
7,4152,3363,5,3.998745,4.608908,4.527895,4.568134
8,2330,2160,1,3.170795,1.730769,1.799452,1.597867
9,2432,1721,5,3.631089,4.494586,4.313615,4.41639


In [24]:
# Display labels for the results table, one per evaluated model.
models = [
    'BaseLineOnly',
    'User-User CF',
    'Item-Item CF',
    'SVD',
]

In [25]:
# Assemble the per-model metrics into one table.
# The intermediate dict deliberately does not reuse the name `data`, which holds
# the ratings DataFrame loaded at the top of the notebook; rebinding it would
# break any re-run of the earlier cells.
if not (len(models) == len(RMSE) == len(MAE)):
    # RMSE / MAE grow on every evaluation call, so re-running the evaluation
    # loop without resetting them leaves extra entries behind.
    raise ValueError(
        "Expected one RMSE/MAE entry per model, got {} models, {} RMSE and {} MAE "
        "values; reset RMSE and MAE and re-run the evaluation loop once.".format(
            len(models), len(RMSE), len(MAE)
        )
    )
results = {'Model': models, 'RMSE': RMSE, 'MAE': MAE}
df = pd.DataFrame(results)

In [26]:
# Display the per-model RMSE / MAE summary table.
df

Unnamed: 0,Model,RMSE,MAE
0,BaseLineOnly,0.995483,0.786751
1,User-User CF,0.990302,0.783754
2,Item-Item CF,0.99032,0.783772
3,SVD,0.989015,0.787322
