In [3]:
import numpy as np
import pandas as pd
import os
from scipy import sparse
from scipy.sparse import csr_matrix
import random
from datetime import datetime

In [55]:
# Load the raw ratings file: '::'-separated fields with no header row, so the
# column names are supplied here and the Python parsing engine is requested.
raw_columns = ['user', 'movie', 'rating', 'date']
df = pd.read_csv('./ratings.dat', names=raw_columns, delimiter='::', engine='python')
n_rows, n_cols = df.shape
print('Rows:', n_rows, '; Columns:', n_cols, '\n')
# `date` is kept as the raw integer read from the file (presumably Unix epoch
# seconds -- confirm before converting it with pd.to_datetime).
# Reorder the columns so that `movie` comes before `user`.
df = df.loc[:, ['movie', 'user', 'rating', 'date']]
print(df.head())

Rows: 1000209 ; Columns: 4 

   movie  user  rating       date
0   1193     1       5  978300760
1    661     1       3  978302109
2    914     1       3  978301968
3   3408     1       4  978300275
4   2355     1       5  978824291


In [56]:
# Persist the reordered ratings as a plain CSV, leaving out the index column.
df.to_csv("data1m.csv", index=False)

In [57]:
# Read the ratings back from the CSV file written by the previous cell.
data = pd.read_csv(filepath_or_buffer='data1m.csv')

In [58]:
# Report the (rows, columns) shape of the reloaded ratings.
print("Input Data Shape: ", data.shape)

Input Data Shape:  (1000209, 4)


In [59]:
# Preview the reloaded ratings; head() returns the first five rows by default.
print("First Five Line: ")
print(data.head())

First Five Line: 
   movie  user  rating       date
0   1193     1       5  978300760
1    661     1       3  978302109
2    914     1       3  978301968
3   3408     1       4  978300275
4   2355     1       5  978824291


In [9]:
# Create the training set: the first 80% of the rows, taken in file order.
# NOTE(review): this is a positional split, not a shuffled one. The rows shown
# by head() all belong to user 1 and the held-out rows start at user 4795, so
# the file looks ordered by user -- most held-out users would then be unseen
# during training. Confirm that this is intended.
train_end = int(len(data) * 0.80)
train_df = data.iloc[:train_end]
print("Train set: ", len(train_df))

Train set:  800167


In [10]:
# Hold out the remaining rows (from the 80% mark onwards) as the test set.
test_start = int(len(data) * 0.80)
test_df = data.iloc[test_start:]
print("Test set: ", len(test_df))

Test set:  200042


In [11]:
# Count the distinct users and movies that appear in the training rows.
n_train_users = np.unique(train_df['user']).size
n_train_movies = np.unique(train_df['movie']).size
print("Total No of Users   :", n_train_users)
print("Total No of movies  :", n_train_movies)

Total No of Users   : 4795
Total No of movies  : 3685


In [12]:
from surprise import Reader, Dataset

In [13]:
from surprise.model_selection import cross_validate
from surprise import accuracy

In [23]:
# Wrap the training rows in a Surprise dataset (ratings on a 1-5 scale) and
# build the full trainset from it; `trainset` is what the models are fitted on.
reader = Reader(rating_scale=(1, 5))
rating_columns = ['user', 'movie', 'rating']
train_data = Dataset.load_from_df(train_df[rating_columns], reader)
trainset = train_data.build_full_trainset()

In [15]:
# Held-out ratings as a list of (user, movie, rating) tuples; this list is
# later handed to each model's test() method by the evaluation loop.
held_out_columns = (test_df[name].values for name in ('user', 'movie', 'rating'))
testset = list(zip(*held_out_columns))
testset[:3]

[(4795, 3262, 3), (4795, 2624, 4), (4795, 2628, 3)]

In [16]:
def evaluate_train_test(algo, train, test):
    """Fit ``algo`` on ``train`` and report RMSE/MAE on the train and test data.

    Progress messages and elapsed times are printed for the three phases
    (fit, train-set evaluation, test-set evaluation). The error metrics are
    computed with ``accuracy.rmse`` / ``accuracy.mae``.

    Parameters
    ----------
    algo
        Model exposing ``fit(train)`` and ``test(testset)``.
    train
        Training data handed to ``algo.fit``; it must expose
        ``build_testset()``, whose result is scored as the "train" predictions.
    test
        Held-out data handed unchanged to ``algo.test``.

    Returns
    -------
    dict
        ``{'train': {'rmse': ..., 'mae': ...}, 'test': {'rmse': ..., 'mae': ...}}``
        so that callers can compare models programmatically instead of reading
        the printed output. Callers that ignore the return value are unaffected.
    """
    print('Training the model..')
    start = datetime.now()
    algo.fit(train)
    print('Done. Time taken : {}\n'.format(datetime.now() - start))

    print('Evaluating the model with TRAIN data...')
    start = datetime.now()
    # Score the model on the very ratings it was fitted on.
    prediction_train = algo.test(train.build_testset())
    rmse_train = accuracy.rmse(prediction_train)
    mae_train = accuracy.mae(prediction_train)
    print('Done. Time taken : {}\n'.format(datetime.now() - start))

    print('\nEvaluating for test data...')
    start = datetime.now()
    prediction = algo.test(test)
    rmse_test = accuracy.rmse(prediction)
    mae_test = accuracy.mae(prediction)
    print('Done. Time taken : {}\n'.format(datetime.now() - start))

    # Previously these four values were computed and then thrown away.
    return {
        'train': {'rmse': rmse_train, 'mae': mae_train},
        'test': {'rmse': rmse_test, 'mae': mae_test},
    }

In [17]:
# Registry of [model, description] pairs: the model cells append to it and the
# evaluation loop iterates over it.
algo = list()

In [18]:
# Baseline model (user and item biases), with the biases estimated by SGD.
from surprise import BaselineOnly
bsl_options = dict(method='sgd', learning_rate=.001)
bsl = BaselineOnly(bsl_options=bsl_options)
algo.append([bsl, "Baseline Model with User and Item Biases"])

In [19]:
# KNN baseline using user-user similarities (user_based=True).
from surprise import KNNBaseline
sim_options = dict(
    user_based=True,
    name='pearson_baseline',
    shrinkage=100,
    min_support=2,
)
bsl_options = dict(method='sgd')
knn_bsl_u = KNNBaseline(k=40, sim_options=sim_options, bsl_options=bsl_options)
algo.append([knn_bsl_u, "KNN Basline with User User Similarity"])

In [20]:
# Display the [model, description] pairs registered so far.
algo

[[<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x20e1a366f88>,
  'Baseline Model with User and Item Biases'],
 [<surprise.prediction_algorithms.knns.KNNBaseline at 0x20e1a36a648>,
  'KNN Basline with User User Similarity']]

In [21]:
# KNN baseline using item-item similarities (user_based=False); otherwise the
# same similarity and baseline options as the user-user model.
sim_options = dict(
    user_based=False,
    name='pearson_baseline',
    shrinkage=100,
    min_support=2,
)
bsl_options = dict(method='sgd')
knn_bsl_m = KNNBaseline(k=40, sim_options=sim_options, bsl_options=bsl_options)
algo.append([knn_bsl_m, "KNN Basline with Item Item Similarity"])

In [27]:
# SVD: matrix-factorisation model of user-item interactions, with bias terms
# enabled and a fixed random_state.
from surprise import SVD
svd_params = dict(n_factors=100, biased=True, random_state=15, verbose=True)
svd = SVD(**svd_params)
algo.append([svd, "SVD model"])

In [28]:
# Fit and score every registered model in turn, printing a banner per model.
banner = "=========================================================="
for model, label in algo:
    print(banner)
    print("Model: " + label)
    evaluate_train_test(model, trainset, testset)

Model: Baseline Model with User and Item Biases
Training the model..
Estimating biases using sgd...
Done. Time taken : 0:00:01.681527

Evaluating the model with TRAIN data...
RMSE: 0.9013
MAE:  0.7136
Done. Time taken : 0:00:09.275603


Evaluating for test data...
RMSE: 0.9955
MAE:  0.7868
Done. Time taken : 0:00:03.019585

Model: KNN Basline with User User Similarity
Training the model..
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. Time taken : 0:01:18.571339

Evaluating the model with TRAIN data...
RMSE: 0.5522
MAE:  0.4267
Done. Time taken : 0:08:07.364467


Evaluating for test data...
RMSE: 0.9903
MAE:  0.7838
Done. Time taken : 0:00:03.103025

Model: KNN Basline with Item Item Similarity
Training the model..
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. Time taken : 0:00:47.924994

Evaluating the model with TRAIN data...
RMSE: 0

In [36]:
# Make predictions for each model on a small sample of known ratings.
# The sample is seeded so the same ten rows (and therefore the same table of
# predictions further down) are produced on every run; it was unseeded before.
# NOTE: despite the names, `test` is a sample of *training* rows and `train`
# merely wraps those same ten rows so that build_testset() can be called on it
# in the next cell.
test = train_df.sample(10, random_state=15)
reader = Reader(rating_scale=(1, 5))
test_data = Dataset.load_from_df(test[['user', 'movie', 'rating']], reader)
train = test_data.build_full_trainset()

In [52]:
# Score the sampled ratings with every fitted model and collect the estimates
# side by side, one column per model.
# A dedicated name is used for the sampled test set: this cell used to rebind
# `testset`, silently replacing the held-out list that the evaluation loop
# reads, so re-running that loop afterwards scored only these ten rows.
sample_testset = train.build_testset()
fitted_models = [
    ('bsl', bsl),
    ('knn_bsl_u', knn_bsl_u),
    ('knn_bsl_m', knn_bsl_m),
    ('svd', svd),
]
pred = test[['user', 'movie', 'rating']]
for model_name, model in fitted_models:
    # Each estimate column is named after its model when it is created, so no
    # positional renaming of the merged frame is needed afterwards.
    model_pred = pd.DataFrame(
        [(p.uid, p.iid, p.est) for p in model.test(sample_testset)],
        columns=['user', 'movie', model_name],
    )
    pred = pred.merge(model_pred, how='left', on=['user', 'movie'])

In [54]:
# Display the sampled ratings next to each model's estimate.
pred

Unnamed: 0,user,movie,rating,bsl,knn_bsl_u,knn_bsl_m,svd
0,1001,3745,4,3.499386,3.793326,3.737759,3.858508
1,4384,440,5,4.272058,4.405127,4.290638,4.214437
2,4064,1019,3,3.22762,3.362304,3.444051,3.512397
3,3461,1036,5,4.551968,5.0,4.980463,5.0
4,1943,180,4,3.639637,3.460256,3.552031,3.187237
5,3444,471,3,3.290491,3.525312,3.36733,3.249894
6,4569,2826,3,3.417219,3.414261,3.406698,3.339284
7,922,11,4,3.746115,3.827676,3.813146,4.280857
8,3648,849,1,2.203562,2.097621,1.972498,2.023431
9,1878,546,2,2.354225,2.033213,1.983627,1.682131
