In [1]:
import numpy as np
import pandas as pd
import os
from scipy import sparse
from scipy.sparse import csr_matrix
import random
from datetime import datetime

In [2]:
# Read the '::'-separated ratings file; it has no header row, so column names
# are supplied here. A multi-character separator requires the python engine.
df = pd.read_csv(
    './ratings.dat',
    names=['user', 'movie', 'rating', 'date'],
    sep='::',
    engine='python',
)
print('Rows:', df.shape[0], '; Columns:', df.shape[1], '\n')
# Put the movie column first; `date` is left as the raw integer read from the file.
df = df[['movie', 'user', 'rating', 'date']]
print(df.head(5))

Rows: 1000209 ; Columns: 4 

   movie  user  rating       date
0   1193     1       5  978300760
1    661     1       3  978302109
2    914     1       3  978301968
3   3408     1       4  978300275
4   2355     1       5  978824291


In [3]:
# Write the re-ordered ratings frame to CSV, leaving out the index column.
df.to_csv(path_or_buf='data1m.csv', index=False)

In [4]:
# Load the CSV written by the previous cell back into a fresh frame.
data = pd.read_csv(filepath_or_buffer="data1m.csv")

In [5]:
# Report (rows, columns) of the reloaded frame.
print("Input Data Shape: ", data.shape)

Input Data Shape:  (1000209, 4)


In [6]:
# Preview the first five rows; the newline separator reproduces two separate prints.
print("First Five Line: ", data.head(5), sep="\n")

First Five Line: 
   movie  user  rating       date
0   1193     1       5  978300760
1    661     1       3  978302109
2    914     1       3  978301968
3   3408     1       4  978300275
4   2355     1       5  978824291


In [7]:
# Training split: the leading 80% of rows, taken in stored order (no shuffling).
# NOTE(review): a positional cut inherits whatever ordering the file has; if the
# rows are grouped by user, later users would be absent from training - confirm
# this is the intended evaluation setup.
train_cutoff = int(data.shape[0]*0.80)
train_df = data.iloc[:train_cutoff]
print("Train set: ", len(train_df))

Train set:  800167


In [8]:
# Test split: every row from the 80% mark to the end, in stored order.
test_start = int(data.shape[0]*0.80)
test_df = data.iloc[test_start:]
print("Test set: ", len(test_df))

Test set:  200042


In [9]:
# Count the distinct user ids and movie ids present in the training split.
n_train_users = np.unique(train_df["user"]).size
n_train_movies = np.unique(train_df["movie"]).size
print("Total No of Users   :", n_train_users)
print("Total No of movies  :", n_train_movies)

Total No of Users   : 4795
Total No of movies  : 3685


In [10]:
from surprise import Reader, Dataset

In [11]:
from surprise.model_selection import cross_validate
from surprise import accuracy

In [12]:
# Wrap the training ratings (columns passed in user, movie, rating order) in a
# Surprise dataset declared on a 1-5 rating scale, then build the full trainset.
reader = Reader(rating_scale=(1, 5))
train_ratings = train_df.loc[:, ['user', 'movie', 'rating']]
train_data = Dataset.load_from_df(train_ratings, reader)
trainset = train_data.build_full_trainset()

In [13]:
# Hold-out set as a list of (user, movie, rating) tuples built from the test split.
test_columns = [test_df[name].values for name in ('user', 'movie', 'rating')]
testset = list(zip(*test_columns))
testset[:3]

[(4795, 3262, 3), (4795, 2624, 4), (4795, 2628, 3)]

In [14]:
def evaluate_train_test(algo, train, test):
    """Fit ``algo`` on ``train`` and report RMSE/MAE on the train and test data.

    Progress messages and wall-clock timings are printed for each phase.

    Parameters
    ----------
    algo
        Model exposing ``fit(trainset)`` and ``test(testset)``.
    train
        Training set handed to ``algo.fit``; it must also provide
        ``build_testset()``, which is used to score the model on the data it
        was fitted on.
    test
        Hold-out data passed unchanged to ``algo.test``.

    Returns
    -------
    dict
        ``{'train': {'rmse': ..., 'mae': ...}, 'test': {'rmse': ..., 'mae': ...}}``
        holding the values returned by ``accuracy.rmse`` / ``accuracy.mae``.
        Returning them (instead of only printing) lets callers collect and
        compare results across models.
    """
    print('Training the model..')
    start = datetime.now()
    algo.fit(train)
    print('Done. Time taken : {}\n'.format(datetime.now() - start))

    print('Evaluating the model with TRAIN data...')
    start = datetime.now()
    # Score the model on the very ratings it was trained on.
    prediction_train = algo.test(train.build_testset())
    rmse_train = accuracy.rmse(prediction_train)
    mae_train = accuracy.mae(prediction_train)
    print('Done. Time taken : {}\n'.format(datetime.now() - start))

    print('\nEvaluating for test data...')
    start = datetime.now()
    prediction = algo.test(test)
    rmse_test = accuracy.rmse(prediction)
    mae_test = accuracy.mae(prediction)
    print('Done. Time taken : {}\n'.format(datetime.now() - start))

    return {
        'train': {'rmse': rmse_train, 'mae': mae_train},
        'test': {'rmse': rmse_test, 'mae': mae_test},
    }

In [31]:
# Start with an empty registry; later cells append [model, label] pairs to it.
algo = list()

In [32]:
# Baseline predictor (user and item biases), estimated with SGD at a 0.001 learning rate.
from surprise import BaselineOnly
bsl_options = dict(method='sgd', learning_rate=.001)
bsl = BaselineOnly(bsl_options=bsl_options)
algo.append([bsl, "Baseline Model with User and Item Biases"])

In [33]:
# KNN-with-baseline using user-user similarities (pearson_baseline, k=20).
from surprise import KNNBaseline
sim_options_u = dict(
    user_based=True,
    name='pearson_baseline',
    shrinkage=100,
    min_support=2,
)
# Rebinds the shared `bsl_options` name: this model uses SGD with no explicit learning rate.
bsl_options = dict(method='sgd')
knn_bsl_u = KNNBaseline(k=20, sim_options=sim_options_u, bsl_options=bsl_options)
algo.append([knn_bsl_u, "KNN Basline with User User Similarity"])

In [34]:
# KNN-with-baseline using item-item similarities (pearson_baseline, k=20).
sim_options_i = dict(
    user_based=False,
    name='pearson_baseline',
    shrinkage=100,
    min_support=2,
)
bsl_options = dict(method='sgd')
knn_bsl_m = KNNBaseline(k=20, sim_options=sim_options_i, bsl_options=bsl_options)
algo.append([knn_bsl_m, "KNN Basline with Item Item Similarity"])

In [35]:
# SVD matrix-factorisation model: 150 factors, bias terms enabled, fixed seed, verbose training.
from surprise import SVD
svd_params = dict(n_factors=150, biased=True, random_state=15, verbose=True)
svd = SVD(**svd_params)
algo.append([svd, "SVD model"])

In [36]:
# Display the [model, label] pairs queued for evaluation.
algo

[[<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1ad96716fc8>,
  'Baseline Model with User and Item Biases'],
 [<surprise.prediction_algorithms.knns.KNNBaseline at 0x1ad96713508>,
  'KNN Basline with User User Similarity'],
 [<surprise.prediction_algorithms.knns.KNNBaseline at 0x1ad96713c48>,
  'KNN Basline with Item Item Similarity'],
 [<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ad96713208>,
  'SVD model']]

In [37]:
# Fit and score each registered model in turn, with a banner line per model.
for model, label in algo:
    print("==========================================================")
    print("Model: {}".format(label))
    evaluate_train_test(model, trainset, testset)

Model: Baseline Model with User and Item Biases
Training the model..
Estimating biases using sgd...
Done. Time taken : 0:00:01.499896

Evaluating the model with TRAIN data...
RMSE: 0.9013
MAE:  0.7136
Done. Time taken : 0:00:05.506837


Evaluating for test data...
RMSE: 0.9506
MAE:  0.7218
Done. Time taken : 0:00:00

Model: KNN Basline with User User Similarity
Training the model..
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. Time taken : 0:01:17.256624

Evaluating the model with TRAIN data...
RMSE: 0.4849
MAE:  0.3739
Done. Time taken : 0:06:49.202330


Evaluating for test data...
RMSE: 0.4143
MAE:  0.3351
Done. Time taken : 0:00:00.003990

Model: KNN Basline with Item Item Similarity
Training the model..
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. Time taken : 0:00:48.424549

Evaluating the model with TRAIN data...
RMSE: 0.5065
M

In [38]:
# Draw 10 rows from the training split for a side-by-side comparison of model predictions.
# The sample is seeded so the comparison table is the same on every run.
test = train_df.sample(10, random_state=15)
reader = Reader(rating_scale=(1,5))
test_data = Dataset.load_from_df(test[['user', 'movie', 'rating']], reader)
# `build_full_trainset` is only used to get an object that can produce a
# testset for these 10 rows; nothing is fitted on it in this cell.
train = test_data.build_full_trainset()

In [41]:
# Score the 10-row sample with every fitted model and place each model's
# estimate next to the true rating.
# The sample's testset gets its own name on purpose: binding it to the global
# `testset` would replace the hold-out set, and re-running the evaluation loop
# afterwards would silently score the models on these 10 training rows.
sample_testset = train.build_testset()
pred = test[['user', 'movie', 'rating']]
l = [bsl, knn_bsl_u, knn_bsl_m, svd]
# One output column per model, in the same order as `l`.
model_columns = ['bsl', 'knn_bsl_u', 'knn_bsl_i', 'svd']
for column, model in zip(model_columns, l):
    predictions = model.test(sample_testset)
    model_pred = pd.DataFrame(
        [[p.uid, p.iid, p.est] for p in predictions],
        columns=['user', 'movie', column],
    )
    # Left-merge keeps the sample's row order and attaches this model's estimate.
    pred = pd.merge(pred, model_pred, how='left', on=['user', 'movie'])

In [42]:
# Display the sampled ratings alongside each model's estimate.
pred

Unnamed: 0,user,movie,rating,bsl,knn_bsl_u,knn_bsl_i,svd
0,3994,3408,5,4.1525,4.546173,4.619635,4.375931
1,2688,2890,3,3.62522,3.262196,3.260935,3.203131
2,42,1275,4,4.011893,4.126568,4.368369,4.364913
3,4064,593,5,3.836693,4.35205,4.207762,4.272844
4,889,3363,4,3.38254,3.48386,3.701131,3.79396
5,4520,2427,4,3.6349,3.793198,3.725985,3.715754
6,3436,2527,4,3.659523,3.838034,3.984053,3.565259
7,855,339,3,3.369182,2.982202,3.12798,2.820548
8,1204,2828,2,2.43169,2.092141,2.280917,2.163028
9,3389,2421,1,2.110623,1.527616,1.390509,1.56095
