In [1]:
from surprise import Dataset, Reader

file_path= '/Users/towsifraiyan/Desktop/CWRU/Courses/CSDS451/Assignments/A2_Programming/Final/CSDS435/dataset/ml-100k/u.data'
reader= Reader(line_format='user item rating timestamp', sep='\t')#format dataset
data= Dataset.load_from_file(file_path, reader=reader)  #Loading


In [2]:
from surprise import SVD, NormalPredictor
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.model_selection import GridSearchCV
from surprise.accuracy import rmse, mae
import numpy as np
import random
import time

#to split dataset into train and test sets
def dataSplits(data, n_splits=5):
    r_ratings= data.raw_ratings   #access to raw ratings
    random.shuffle(r_ratings)     #shuffle ratings
    set_size= len(r_ratings)//n_splits
    subset= [r_ratings[i* set_size:(i+1)* set_size] for i in range(n_splits)]

    splits= []
    for i in range(n_splits):
        test_set= subset[i]
        train_set= [rating for j in range(n_splits) if j != i for rating in subset[j]]
        splits.append((train_set, test_set))
    return splits

#random masking cross validation
def maskEva(algo, data, algoName, n_splits=5):
    print(f"{algoName} (random masking  with 5 folds):")
    splits= dataSplits(data, n_splits)
    results= {'RMSE': [], 'MAE': [], 'Fit Time': [], 'Test Time': []}

    for i, (trainset_r, test_set) in enumerate(splits):
        train_set= data.construct_trainset(trainset_r)  #train set
        test_set= [(uid, iid, r) for (uid, iid, r, _) in test_set]  #test set
        start_train= time.time()
        algo.fit(train_set)
        train_time= time.time()- start_train

        start_test= time.time()
        predictions= algo.test(test_set)
        test_time= time.time()- start_test

        #result: RMSE and MAE
        rmse_val= rmse(predictions, verbose=False)
        mae_val= mae(predictions, verbose=False)

        #store results
        results['RMSE'].append(rmse_val)
        results['MAE'].append(mae_val)
        results['Fit Time'].append(train_time)
        results['Test Time'].append(test_time)

        print(f"Fold {i + 1}: RMSE= {rmse_val:.4f}, MAE= {mae_val:.4f}, Fit Time= {train_time:.4f}s, Test Time= {test_time:.4f}s")

    #show result
    print(f"\n{algoName} results (5 fold cross validation):")
    print(f"{'Metric':<15}{'Fold 1':<10}{'Fold 2':<10}{'Fold 3':<10}{'Fold 4':<10}{'Fold 5':<10}{'Mean':<10}{'Std':<10}")
    for metric, values in results.items():
        mean= np.mean(values)
        std= np.std(values)
        print(f"{metric:<15}{values[0]:<10.4f}{values[1]:<10.4f}{values[2]:<10.4f}{values[3]:<10.4f}{values[4]:<10.4f}{mean:<10.4f}{std:<10.4f}")
    print()
    
#for fine tuning
def tuneEva(algo_class, param_grid, data, algoName):
    print(f"Tuning Algo({algoName}):")
    gs= GridSearchCV(algo_class, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    print(f"Best RMSE score: {gs.best_score['rmse']}")
    print(f"Best parameters: {gs.best_params['rmse']}")
    return gs.best_estimator['rmse']


#load the movie lens 100k dataset
data= Dataset.load_builtin('ml-100k')
#KNN's hyperparameters
param_grid_knn= {
    'k': [10, 20, 40, 50],                      #neighbors
    'min_k': [1, 3, 5],                         #minimum neighbors
    'sim_options': {
        'name': ['cosine', 'pearson', 'msd'],   #similarity measures
        'user_based': [True, False],
    }
}


#SVD's hyperparameters
param_grid_svd= {
    'n_factors': [20, 50, 100],        #latent factors
    'n_epochs': [10, 20, 50],          #iterations
    'lr_all': [0.005, 0.01, 0.02],     #learning rate
    'reg_all': [0.01, 0.02, 0.05],     #regularization
    #'init_std_dev': [0.05, 0.1, 0.2], #standard deviation
    'biased': [True, False],           #bias
    
    # 'lr_bu': [0.001, 0.005, 0.01, 0.02],    #lr for user bias
    # 'lr_bi': [0.001, 0.005, 0.001, 0.02],   #lr for item bias
    # 'reg_bu': [0.01, 0.02, 0.04],           #regularization for user bias
    # 'reg_bi': [0.01, 0.02, 0.04],           #for item bias
    # 'reg_pu': [0.01, 0.02, 0.04],           #for user factors
    # 'reg_qi': [0.01, 0.02, 0.04],           #for item factors
}


#best model for kNN
best_knn= tuneEva(KNNBasic, param_grid_knn, data, 'knn')
#best model for SVD
best_svd= tuneEva(SVD, param_grid_svd, data, 'svd')


#models to evaluate
algos= [
    ('random', NormalPredictor()),  #for Random
    ('knn', best_knn),
    ('svd', best_svd),
]

#evaluate with random masking
for algoName, algo in algos:
    maskEva(algo, data, algoName)


Tuning Algo(knn):
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity

In [4]:
from surprise.model_selection import train_test_split


#RMSE between two algos
def RMSE_algos(pred1, pred2):
    if len(pred1) != len(pred2):
        raise ValueError("Different length, should be same!")
    squared_errors= [(p1.est - p2.est) ** 2 for p1, p2 in zip(pred1, pred2)]
    return np.sqrt(np.mean(squared_errors))

train_set, test_set = train_test_split(data, test_size=0.2)

#algorithms:
algos= {
    'random': NormalPredictor(),
    'knn': best_knn,
    'svd': best_svd
}

predictions= {}
for algoName, algo in algos.items():
    algo.fit(train_set)
    predictions[algoName]= algo.test(test_set)

#RMSE table
algo_lists= list(algos.keys())
n_algos= len(algo_lists)
rmse_table= np.zeros((n_algos, n_algos))

for i, algo1 in enumerate(algo_lists):
    for j, algo2 in enumerate(algo_lists):
        if i != j:  #skipping RMSE for self comparison
            rmse_table[i, j]= RMSE_algos(predictions[algo1], predictions[algo2])
        else:
            rmse_table[i, j]= 0  #no need self comparison

print('\n')
#RMSE comparison table with the algos
print("Pair wise RMSE table (Algorithms: random, knn and svd):")
print(" "*10 + " ".join(f"{algo:<10}" for algo in algo_lists))
for i, algo1 in enumerate(algo_lists):
    row = f"{algo1:<10}" + " ".join(f"{rmse_table[i, j]:<10.4f}" for j in range(n_algos))
    print(row)


Computing the msd similarity matrix...
Done computing similarity matrix.


Pair wise RMSE table (Algorithms: random, knn and svd):
          random     knn        svd       
random    0.0000     1.1548     1.2042    
knn       1.1548     0.0000     0.3739    
svd       1.2042     0.3739     0.0000    
