In [None]:
import os

from surprise import Dataset, Reader

# Location of the MovieLens-100k ratings file (tab-separated columns:
# user, item, rating, timestamp). The location can be overridden with the
# ML100K_DATA_PATH environment variable so the notebook is not tied to a
# single machine; the original location is kept as the fallback.
file_path= os.path.expanduser(os.environ.get(
    'ML100K_DATA_PATH',
    '/Users/towsifraiyan/Desktop/CWRU/Courses/CSDS451/Assignments/A2_Programming/Final/CSDS435/dataset/ml-100k/u.data',
))
# Fail early with a message that says how to fix the problem.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"MovieLens ratings file not found at {file_path!r}; "
        "set ML100K_DATA_PATH to the location of u.data"
    )
reader= Reader(line_format='user item rating timestamp', sep='\t')  #column layout of the file
data= Dataset.load_from_file(file_path, reader=reader)  #parse the ratings file


In [None]:
import numpy as np

# Load the built-in MovieLens-100k dataset (this rebinds `data`).
data= Dataset.load_builtin('ml-100k')

# Keep a handle on the raw rating records and report how many there are.
raw_ratings= data.raw_ratings
rating_count= len(raw_ratings)
print(f"number of ratings: {rating_count}")

# Show one record so the field layout is visible.
first_rating= raw_ratings[0]
print(f"sample rating: {first_rating}")


number of ratings: 100000
sample rating: ('196', '242', 3.0, '881250949')


In [None]:
# Collect the rating value (third field of every raw record) as floats.
ratings = []
for record in raw_ratings:
    ratings.append(float(record[2]))

# Range of the rating values.
lowest, highest = np.min(ratings), np.max(ratings)
print(f"Minimum Rating: {lowest}")
print(f"Maximum Rating: {highest}")

# Mean rating, shown with two decimals.
average = np.mean(ratings)
print(f"Average Rating: {average:.2f}")

Minimum Rating: 1.0
Maximum Rating: 5.0
Average Rating: 3.53


In [None]:
# Gather the distinct user ids (field 0) and item ids (field 1) in one pass.
users = set()
items = set()
for record in raw_ratings:
    users.add(record[0])
    items.add(record[1])

user_count, item_count = len(users), len(items)
print(f"number of users: {user_count}")
print(f"number of items: {item_count}")


number of users: 943
number of items: 1682


In [None]:
# Print a header naming the fields, then the first five raw records.
print("sample: --- user_id, item_id, rating, timestamp ---")
preview = raw_ratings[:5]
for record in preview:
    print(record)

sample: --- user_id, item_id, rating, timestamp ---
('196', '242', 3.0, '881250949')
('186', '302', 3.0, '891717742')
('22', '377', 1.0, '878887116')
('244', '51', 2.0, '880606923')
('166', '346', 1.0, '886397596')


In [6]:
from surprise import SVD, NormalPredictor
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.model_selection import GridSearchCV, cross_validate
from surprise.accuracy import rmse, mae 
import random 
import time

    
#for fine tuning
def tuneEva(algo_class, param_grid, data, algoName, cv=5, n_jobs=1):
    """Grid-search an algorithm's hyperparameters and report the best RMSE.

    Args:
        algo_class: Algorithm class (not an instance) passed to ``GridSearchCV``.
        param_grid: Dict mapping parameter names to the candidate values to try.
        data: Dataset the grid search is run on.
        algoName: Label used only in the printed messages.
        cv: Cross-validation setting forwarded to ``GridSearchCV``. Defaults
            to 5, the value that used to be hard-coded.
        n_jobs: Number of parallel jobs forwarded to ``GridSearchCV``. The
            default of 1 keeps the previous serial behaviour.

    Returns:
        ``gs.best_estimator['rmse']``: the estimator selected on RMSE. No
        refit is requested from the search, so callers fit the returned
        estimator themselves before predicting.
    """
    print(f"Tuning Algo({algoName}):")
    # Both measures are evaluated, but selection and reporting use RMSE only.
    gs= GridSearchCV(algo_class, param_grid, measures=['rmse', 'mae'], cv=cv, n_jobs=n_jobs)
    gs.fit(data)
    print(f"Best RMSE score: {gs.best_score['rmse']}")
    print(f"Best parameters: {gs.best_params['rmse']}")
    return gs.best_estimator['rmse']


#load the movie lens 100k dataset
#data= Dataset.load_builtin('ml-100k') 
# Search space for the KNN model.
param_grid_knn= dict(
    k=[10, 20, 40, 50],                        # neighbors
    min_k=[1, 3, 5],                           # minimum neighbors
    sim_options=dict(
        name=['cosine', 'pearson', 'msd'],     # similarity measures
        user_based=[True, False],              # try both values of the flag
    ),
)


# Search space for the SVD model.
param_grid_svd= dict(
    n_factors=[20, 50, 100],        # latent factors
    n_epochs=[10, 20, 50],          # iterations
    lr_all=[0.005, 0.01, 0.02],     # learning rate
    reg_all=[0.01, 0.02, 0.05],     # regularization
    biased=[True, False],           # bias
)
# Further candidates that are currently left out of the SVD search:
#   'init_std_dev': [0.05, 0.1, 0.2]        standard deviation
#   'lr_bu': [0.001, 0.005, 0.01, 0.02]     lr for user bias
#   'lr_bi': [0.001, 0.005, 0.001, 0.02]    lr for item bias
#   'reg_bu': [0.01, 0.02, 0.04]            regularization for user bias
#   'reg_bi': [0.01, 0.02, 0.04]            for item bias
#   'reg_pu': [0.01, 0.02, 0.04]            for user factors
#   'reg_qi': [0.01, 0.02, 0.04]            for item factors


# Seed both random number generators before tuning so that repeated runs of
# this cell report the same scores and pick the same parameters.
random.seed(42)
np.random.seed(42)

#best model for kNN
best_knn= tuneEva(KNNBasic, param_grid_knn, data, 'knn')
#best model for SVD
best_svd= tuneEva(SVD, param_grid_svd, data, 'svd')


# Models compared below: a random baseline plus the two tuned estimators.
algos= [
    ('random', NormalPredictor()),
    ('knn', best_knn),
    ('svd', best_svd),
]

# Run 5-fold cross validation for every model. verbose=True makes
# cross_validate print its own per-fold report, so only the spread of the
# fit times is printed explicitly here.
cv_measures = ['RMSE', 'MAE']
for label, model in algos:
    print("\n")
    print(f"{label} with 5 Fold Cross Validation:")
    results = cross_validate(model, data, measures=cv_measures, cv=5, verbose=True)
    fit_time_std = np.std(results['fit_time'])
    print(f"Std Fit Time: {fit_time_std:.5f}")


Tuning Algo(knn):
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity

In [None]:
from surprise.model_selection import train_test_split


#rmse between two algos 
def RMSE_algos(pred1, pred2):
    """Root-mean-square difference between two sequences of predictions.

    Predictions are paired by position and compared through their ``est``
    attribute.

    Args:
        pred1: Sequence of objects exposing an ``est`` value.
        pred2: Sequence of the same length as ``pred1``.

    Returns:
        Square root of the mean squared difference of the paired ``est`` values.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(pred1) != len(pred2):
        raise ValueError("Different length, should be same!")

    squared_gaps = []
    for first, second in zip(pred1, pred2):
        gap = first.est - second.est
        squared_gaps.append(gap ** 2)
    return np.sqrt(np.mean(squared_gaps))

# Fix the seeds so the 80/20 split and the models fitted on it are repeatable:
# random_state pins the split, and the numpy seed covers the estimators that
# draw random numbers while fitting or predicting.
np.random.seed(42)
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

#algorithms (note: `algos` is rebound here from a list of pairs to a dict)
algos= {
    'random': NormalPredictor(),
    'knn': best_knn,
    'svd': best_svd
}

# Fit every model on the same training split and keep its predictions for the
# shared test split, keyed by algorithm name.
predictions= {}
for algoName, algo in algos.items():
    algo.fit(train_set)
    predictions[algoName]= algo.test(test_set)

#rmse table
algo_lists= list(algos.keys())
n_algos= len(algo_lists)
rmse_table= np.zeros((n_algos, n_algos))

# The pairwise RMSE is symmetric, so each unordered pair is computed once and
# mirrored across the diagonal. The diagonal (self comparison) keeps the 0
# that np.zeros already put there.
for i in range(n_algos):
    for j in range(i + 1, n_algos):
        pair_rmse= RMSE_algos(predictions[algo_lists[i]], predictions[algo_lists[j]])
        rmse_table[i, j]= pair_rmse
        rmse_table[j, i]= pair_rmse


# Print the pairwise RMSE matrix as a fixed-width text table.
print('\n')
print("Pair wise RMSE table (Algorithms: random, knn and svd):")

# Header row: an empty 10-character corner cell followed by the column labels.
header_cells = [f"{algo:<10}" for algo in algo_lists]
print(" "*10 + " ".join(header_cells))

# One line per algorithm: its label, then its row of the table.
for label, scores in zip(algo_lists, rmse_table):
    cells = [f"{score:<10.4f}" for score in scores]
    print(f"{label:<10}" + " ".join(cells))


Computing the msd similarity matrix...
Done computing similarity matrix.


Pair wise RMSE table (Algorithms: random, knn and svd):
          random     knn        svd       
random    0.0000     1.1563     1.2055    
knn       1.1563     0.0000     0.3745    
svd       1.2055     0.3745     0.0000    
