# Tools Installation

In [1]:
!pip install scikit-surprise



In [2]:
import pandas as pd
import re
import numpy as np
import random
import scipy
import scipy.io
import scipy.sparse as sp

In [3]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import SlopeOne
from surprise import CoClustering
from surprise import BaselineOnly
from surprise import NMF
from surprise.model_selection import GridSearchCV

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold as skFold

from implementations import *
from als import *

# Work On Data

In [6]:
# Raw training file as provided for the project.
data = pd.read_csv("Datasets/data_train.csv")

# One seed shared by every stochastic step of the notebook
# (train/validation split, model initialisation, ridge train/test split).
seed = 211
#seed = 988  (alternative seed that was tried)

# Seed the global generators by *calling* the seeding functions.
# Assigning to `random.seed` would replace the function with an int and seed nothing.
random.seed(seed)
# Surprise falls back on NumPy's global generator when no `random_state` is given
# (e.g. the fold splits inside GridSearchCV), so seed it as well.
np.random.seed(seed)

This is where we split the data into a training set and a validation set.

In [7]:
# Reshape the raw file into one row per rating (userId / movieId / rating columns,
# as shown by the previews printed below).
cleanedFrame = split3columns(data)

# 80% of the rows, drawn reproducibly, form the training set;
# every row whose index was not drawn goes to the validation set.
trainingset = cleanedFrame.sample(frac=0.8, random_state=seed)
testset = cleanedFrame.drop(index=trainingset.index)

# Quick look at both splits.
print(trainingset.head(5), testset.head(5), sep="\n")

         userId  movieId  rating
158127     2595      136       1
1130029    8494      945       4
1035571    8775      819       4
631701     7191      542       5
427139     2098      352       4
    userId  movieId  rating
0       44        1       4
11     165        1       3
12     182        1       3
15     333        1       3
16     355        1       2


In [8]:
# The reader tells Surprise that ratings live on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))

# Wrap both pandas splits as Surprise datasets.
# Surprise expects the columns in the order user, item, rating.
rating_columns = ['userId', 'movieId', 'rating']
trainset = Dataset.load_from_df(trainingset.loc[:, rating_columns], reader)
validationset = Dataset.load_from_df(testset.loc[:, rating_columns], reader)

## Build Trainset and Validation Set

We need to keep track of the validation labels for the ridge regression performed later. This is what the function `getValidationLabels` does.

In [9]:
# Surprise algorithms are fitted on a Trainset object built from all training ratings.
training_data = trainset.build_full_trainset()

# The validation ratings are turned into the list of (user, item, rating)
# tuples that `algo.test` consumes.
validation_trainset = validationset.build_full_trainset()
validation_test = validation_trainset.build_testset()

In [10]:
# True ratings of the validation set; they are the regression target of the blending step
# and the reference for the RMSE of the hand-made mean predictors.
# NOTE(review): presumably returned in the same order as `validation_test` — confirm in implementations.py.
yvalid = getValidationLabels(validation_test)

# Grid Searches - Wal

To perform the grid searches, we use the `GridSearchCV` class provided by Surprise, as it is already adapted to each of its algorithms.

Unfortunately, we did not have time to perform a grid search for ALS.
Moreover, each grid search is fairly small. It would have been nice to use more computing power (Google Cloud, for example) to explore larger grids.

## SVD

In [None]:
# Search space for the biased SVD model.
regression_rates = np.logspace(-6, 2, num=5)  # candidate values for `reg_all`
lr_space = np.logspace(-6, 2, num=5)          # candidate values for `lr_all`
# Kept short because we already have an idea of a good number of factors
# (a wider range would need more computing power).
factors = [20, 200, 400, 1000]
# Authors' assumption: accuracy evolves roughly linearly with the number of epochs,
# so 3 epochs are enough to rank configurations quickly.
n_epoch_array = [3]
biased_array = [True]

param_grid = dict(
    n_factors=factors,
    n_epochs=n_epoch_array,
    reg_all=regression_rates,
    lr_all=lr_space,
    biased=biased_array,
)

# 2-fold cross-validated search on RMSE, using every available core.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=2,
    n_jobs=-1,
    joblib_verbose=20,
)

gs.fit(trainset)

# Best RMSE reached over the grid ...
print(gs.best_score['rmse'])

# ... and the parameter combination that produced it.
print(gs.best_params['rmse'])

In [None]:
# Biased SVD with hand-picked hyper-parameters.
#
# All hyper-parameters are passed to the constructor. Surprise expands `reg_all`
# and `lr_all` into the per-parameter rates (reg_bu, lr_pu, ...) inside `__init__`,
# so assigning `algo.reg_all` / `algo.lr_all` after construction is silently ignored
# and the model would train with the library defaults. The previously used
# `algo.biaised` attribute was also a misspelling of `biased` and had no effect.
# NOTE: because the regularisation and learning rate are now really applied,
# the RMSE can differ from the one obtained with earlier runs of this cell.
algo = SVD(
    n_factors=400,
    n_epochs=500,
    biased=True,
    reg_all=0.1,
    lr_all=0.01,
    random_state=seed,
    verbose=True,
)

algo.fit(training_data)

print("Now testing...")

# Keep the raw Surprise predictions: the stacking cell extracts the estimates later.
pred_svd = algo.test(validation_test)
accuracy.rmse(pred_svd)

## SVD Without Baselines

In [None]:
# Search space for SVD without bias terms.
regression_rates = np.logspace(-6, 2, num=10)  # candidate values for `reg_all`
# A first search over factors = [1, 2, 4, 10, 50] already selected the number of factors,
# so this deeper search only explores the regularisation rate.
factors = [1]
n_epoch_array = [500]
biased_array = [False]

param_grid = dict(
    n_factors=factors,
    n_epochs=n_epoch_array,
    reg_all=regression_rates,
    biased=biased_array,
)

# 2-fold cross-validated search on RMSE, using every available core.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=2,
    n_jobs=-1,
    joblib_verbose=10,
)

gs.fit(trainset)

# Best RMSE reached over the grid ...
print(gs.best_score['rmse'])

# ... and the parameter combination that produced it.
print(gs.best_params['rmse'])

In [None]:
# SVD *without* baselines (pure matrix factorisation, no user/item bias terms).
#
# `biased=False` must be given to the constructor: the previously used
# `algo.biaised = False` was a misspelled attribute, so the model was in fact
# trained WITH biases. Likewise `reg_all` / `lr_all` are expanded into the
# per-parameter rates inside `__init__`, so setting them as attributes afterwards
# is ignored by `fit`.
# NOTE: this changes the trained model compared with earlier runs of this cell,
# so the corresponding column of any previously saved prediction file is stale.
algo = SVD(
    n_factors=1,
    n_epochs=500,
    biased=False,
    reg_all=0.001,
    lr_all=0.01,
    random_state=seed,
    verbose=False,
)

algo.fit(training_data)
print("Now testing...")
# `pred` holds the predictions of this unbiased SVD; it is stacked later under that name.
pred = algo.test(validation_test)
accuracy.rmse(pred)

## KNN Items

In [None]:
# Candidate neighbourhood sizes.
neighbors = [10, 20, 50, 220, 300]

# Inside a Surprise parameter grid, every option of a nested dict is itself a list.
sim_options = dict(
    name=['pearson_baseline'],
    user_based=[False],  # item-item similarities
)
bsl_options = dict(
    method=['als'],
    n_epochs=[50],
)

param_grid = dict(
    k=neighbors,
    sim_options=sim_options,
    bsl_options=bsl_options,
)

# 2-fold cross-validated search on RMSE, using every available core.
gs_knn = GridSearchCV(
    KNNBaseline,
    param_grid,
    measures=['rmse'],
    cv=2,
    n_jobs=-1,
    joblib_verbose=10,
)

gs_knn.fit(trainset)

# Best RMSE reached over the grid ...
print(gs_knn.best_score['rmse'])

# ... and the parameter combination that produced it.
print(gs_knn.best_params['rmse'])

In [None]:
# Item-based KNN with baseline correction.
sim_options = dict(
    name='pearson_baseline',
    user_based=False,  # similarities are computed between items
)
bsl_options = dict(
    method='als',
    n_epochs=50,
)

# Same configuration as before, handed over at construction time.
algo = KNNBaseline(k=220, sim_options=sim_options, bsl_options=bsl_options)

algo.fit(training_data)

print("Now testing...")
pred_knn_items = algo.test(validation_test)
accuracy.rmse(pred_knn_items)

## KNN Users

In [39]:
# User-based KNN with baseline correction.
sim_options = dict(
    name='pearson_baseline',
    user_based=True,  # similarities are computed between users
)
bsl_options = dict(
    method='als',
    n_epochs=50,
)

# Same configuration as before, handed over at construction time.
algo = KNNBaseline(k=220, sim_options=sim_options, bsl_options=bsl_options)

algo.fit(training_data)
pred_knn_users = algo.test(validation_test)
accuracy.rmse(pred_knn_users)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9942


0.9941877291405572

## SVD++

In [None]:
# SVD++ with a very small latent space; the seed makes the factor initialisation reproducible.
algo = SVDpp(n_factors=2, n_epochs=50, random_state=seed, verbose=True)

algo.fit(training_data)
pred_svdpp = algo.test(validation_test)
accuracy.rmse(pred_svdpp)

## Slope One

In [None]:
# Slope One is used with its default settings; `fit` returns the fitted algorithm itself.
algo = SlopeOne().fit(training_data)

print("Now testing...")
pred_slope = algo.test(validation_test)
accuracy.rmse(pred_slope)

## Baselines

In [40]:
# Baseline-only predictor with default options; `fit` returns the fitted algorithm itself.
algo = BaselineOnly().fit(training_data)

print("Now testing...")
# Kept as raw Surprise predictions: the ALS cell reuses them before they are
# reduced to estimates in the stacking section.
pred_baselines = algo.test(validation_test)
accuracy.rmse(pred_baselines)

Estimating biases using als...
Now testing...
RMSE: 1.0011


1.0010826951956127

## Global Mean

In [None]:
# Global-mean predictor, sized to the validation set (second argument = number of validation rows).
# NOTE(review): `data` is the full raw file, i.e. it still contains the rows that were
# held out for validation. If `globalMean` averages the ratings of its first argument,
# this leaks validation labels into the predictor — consider passing the training split
# instead, after checking which input format `globalMean` expects in implementations.py.
pred_global = globalMean(data, len(validation_test))

## User Mean & Item Mean

In [None]:
# Work on a shallow copy so the list handed to Surprise stays untouched.
copie_validation = list(validation_test)

# Same (user, item, rating) triples, as a DataFrame with the column names used by the training frame.
validation_frame = pd.DataFrame(
    copie_validation,
    columns=['userId', 'movieId', 'rating'],
)

In [None]:
# Mean-based predictors evaluated on the validation rows.
# NOTE(review): presumably each helper computes per-user / per-item mean ratings from
# `trainingset` and looks them up for every row of `validation_frame` — confirm in implementations.py.
pred_user = userMean(trainingset, validation_frame)
pred_items = itemMean(trainingset, validation_frame)

In [None]:
# The printed "accuracy" is the RMSE against the validation labels (lower is better).
user_rmse = np.sqrt(np.mean(np.square(pred_user - yvalid)))
item_rmse = np.sqrt(np.mean(np.square(pred_items - yvalid)))

print("Users accuracy ", user_rmse)
print("Items accuracy ", item_rmse)

## ALS

ALS needs the data in a different format, the one expected by `load_data` in `helpers`. This is what `getFrame` does here: it reformats our training and validation frames so that, once written to CSV, `load_data` can read them as it should.

In [None]:
# Convert both splits to the layout consumed by the ALS pipeline.
# NOTE(review): the exact output format of `getFrame` is defined in implementations.py;
# presumably it matches the original submission-style CSV read by `helpers.load_data` — confirm.
result_training = getFrame(trainingset)
result_test = getFrame(testset)

In [None]:
# Persist both reformatted splits (without the pandas index) so `load_data` can read them back.
for frame, csv_name in (
    (result_training, "training_als.csv"),
    (result_test, "testing_als.csv"),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# NOTE(review): `preprocess_data` is imported but not used in this cell.
from helpers import load_data, preprocess_data

# Files written by the previous cell.
path_trainingset = "training_als.csv" #So that we can use functions from exercise 10
path_testset = "testing_als.csv"

# NOTE(review): presumably `load_data` returns a (sparse) ratings matrix — confirm in helpers.py.
ratings = load_data(path_trainingset)
test = load_data(path_testset)

# Only the last expression of a cell is displayed: `test.shape` is evaluated and discarded,
# `ratings.shape` is what the cell shows.
test.shape
ratings.shape

In [None]:
# Run the project's ALS factorisation on the training ratings, with the validation ratings as second argument.
# NOTE(review): presumably `user` and `item` are the latent feature matrices — confirm in als.py.
user, item = ALS(ratings, test)

# Full prediction matrix reconstructed from the two factors.
predictions = item.T.dot(user)
# `pred_baselines` must still be the raw list returned by `algo.test` at this point: the
# stacking cell further down overwrites it with an array of estimates, so this cell has to
# run before that one.
# NOTE(review): presumably it is only used to put the ALS predictions in the same
# (user, item) order as the other models — confirm in implementations.py.
example_validation_frame = predictionFrameOrdered(pred_baselines, predictions)
# One ALS estimate per validation row.
pred_als = example_validation_frame['rating'].values

# Blending

In order to blend, we first stack all predictions in a matrix, so that each column contains the predictions of one model.
We then run a ridge regression on this matrix using scikit-learn.

## Stack Matrix

In [None]:
def _estimated_ratings(predictions):
    """Return the estimated rating of every Surprise prediction as a 1-D float array.

    `predictions` is normally the list of `Prediction` tuples returned by `algo.test`.
    A 1-D array (i.e. estimates that were already extracted) is passed through, so this
    cell can be re-run without failing.
    """
    if isinstance(predictions, np.ndarray) and predictions.ndim == 1:
        return predictions.astype(float)
    # `est` is the fourth field of a Surprise Prediction (uid, iid, r_ui, est, details).
    return np.array([prediction.est for prediction in predictions], dtype=float)


# Reduce the raw prediction lists of every Surprise model to their estimates.
# The former `pred_co` (co-clustering) entry was removed: no cell of this notebook
# computes it, so it raised a NameError on a fresh run, and it is not part of the
# stacked matrix built below.
pred_svd = _estimated_ratings(pred_svd)
pred_svdpp = _estimated_ratings(pred_svdpp)
pred_slope = _estimated_ratings(pred_slope)
pred_knn_items = _estimated_ratings(pred_knn_items)
pred_knn_users = _estimated_ratings(pred_knn_users)
pred_baselines = _estimated_ratings(pred_baselines)
# `pred` holds the predictions of the SVD trained without baselines.
pred = _estimated_ratings(pred)

In [None]:
# One entry per model; the order fixes the column order of the stacked matrix.
model_predictions = [
    pred_svd,        # SVD
    pred_svdpp,      # SVD++
    pred_slope,      # Slope One
    pred_knn_items,  # item-based KNN
    pred_knn_users,  # user-based KNN
    pred_baselines,  # baseline only
    pred_global,     # global mean
    pred_user,       # user mean
    pred_items,      # item mean
    pred,            # SVD without baselines
    pred_als,        # ALS
]

# Matrix with one row per validation rating and one column per model.
stacked_predictions = np.column_stack(model_predictions)

In [None]:
# Optional: persist the stacked predictions so the blending part can be re-run without
# retraining every model. Left commented out to avoid overwriting the saved file by accident.
# The DataFrame (not the NumPy matrix, which has no `to_csv`) is the object to write.
#stacked_pred = pd.DataFrame(stacked_predictions, columns=('Model1', 'Model2','Model3','Model4','Model5','Model6','Model7','Model8', 'Model9', 'Model10', 'Model11'))

#stacked_pred.to_csv("all_models_validation.csv", index = False)

In [5]:
# Reload the validation predictions of all models from the saved file
# instead of retraining everything.
path = "SafeguardModels/all_models_validation.csv"
stacked_pred = pd.read_csv(path)

# First five rows (default of `head`).
stacked_pred.head()

Unnamed: 0,Model1,Model2,Model3,Model4,Model5,Model6,Model7,Model8,Model9,Model10,Model11
0,3.876532,3.785445,3.498297,3.657951,3.631313,3.488513,3.857281,3.917874,3.378182,3.475041,3.679562
1,2.822582,3.737054,3.546125,3.361019,3.482206,3.530627,3.857281,3.917874,3.344086,3.65831,3.40758
2,3.427123,3.949398,4.034495,4.119559,4.019512,4.041308,3.857281,3.917874,3.859935,3.895567,3.889519
3,3.27174,4.087095,4.156967,3.945576,4.016178,4.082634,3.857281,3.917874,3.880123,4.154114,3.796
4,3.398989,3.879055,3.642492,3.69177,3.664836,3.690239,3.857281,3.917874,3.517572,3.742409,3.931653


In [6]:
# Models left out of the blend; every remaining column feeds the regression.
discarded_models = ['Model6', 'Model10', 'Model7', 'Model4', 'Model3', 'Model9', 'Model8']
best_models = stacked_pred.drop(discarded_models, axis=1)

# Plain NumPy matrix for scikit-learn.
stacked_pred_matrix = best_models.values

best_models.head()

Unnamed: 0,Model11
0,3.679562
1,3.40758
2,3.889519
3,3.796
4,3.931653


## Feature Expansion

We obtained a better accuracy with feature expansion. Here too, it would have been nice to perform a grid search on the degree of the expansion; we did not do so for lack of time.

In [52]:
from sklearn.preprocessing import PolynomialFeatures

In [53]:
# Degree-2 polynomial expansion of the selected model predictions
# (squares and pairwise products in addition to the original columns).
poly = PolynomialFeatures(degree=2, interaction_only=False)

# Always expand from `best_models` rather than from `stacked_pred_matrix` itself:
# re-running the cell then yields the same matrix instead of expanding the
# already-expanded features a second time.
stacked_pred_matrix = poly.fit_transform(best_models.values)
print(stacked_pred_matrix.shape)

(235390, 3)


# Ridge Regression

In [54]:
# Split the stacked validation predictions (features) and their true ratings (targets):
# 70% to fit the blending regression, 30% held out to score it. Seeded for reproducibility.
x_tr, x_te, y_tr, y_te = train_test_split(stacked_pred_matrix, yvalid, test_size=0.3, random_state=seed)

In [55]:
# 3-fold cross-validation over 20 log-spaced regularisation strengths.
cv_ridge = skFold(n_splits=3)
alphas_array = np.logspace(-8, 3, num=20)

# No intercept is fitted by the regression itself.
# NOTE(review): presumably because the polynomial expansion already provides a constant column — confirm.
clf = RidgeCV(
    alphas=alphas_array,
    fit_intercept=False,
    scoring="neg_mean_squared_error",
    cv=cv_ridge,
)
clf.fit(x_tr, y_tr)

# Blending weights, one per expanded feature.
weights_opt = clf.coef_

print("best weight for regression", weights_opt)
print("Best lambda for regression", clf.alpha_)

# Blend the held-out predictions and force them back into the 1-5 rating range.
targets = np.clip(clf.predict(x_te), 1, 5)

best weight for regression [0.73725389 0.70358694 0.03262412]
Best lambda for regression 0.3359818286283788


In [56]:
# Final score: RMSE between the held-out ratings and the blended predictions
# rounded to the nearest integer rating (lower is better).
final_accuracy = np.sqrt(np.mean(np.square(y_te - np.around(targets))))
print(final_accuracy)

1.0286478102832755


1.0239841 -> AICROWD: 1.020 -> Feature exp: 2 Model1 Model2 Model5 Model11