### Libraries to import

In [51]:
import pandas as pd
import surprise as sp

### Constants

In [52]:
# Multiply a byte count by this factor to express it in megabytes (1 MB = 10**6 bytes).
# Despite the "DIV" suffix, the value is used as a multiplier.
BYTES_TO_MB_DIV = 1e-6

### Function to check the memory usage of the dataframe

In [53]:
def df_mem_usage(df):
    """Print the memory footprint of ``df`` in megabytes.

    The per-column byte counts reported by ``df.memory_usage()`` are summed,
    scaled by ``BYTES_TO_MB_DIV`` and rounded to three decimals. A blank line
    is printed first so the message stands apart from earlier output.

    Parameters
    ----------
    df : object exposing ``memory_usage()`` (e.g. a pandas DataFrame)

    Returns
    -------
    None
        The result is only printed.
    """
    total_bytes = df.memory_usage().sum()
    megabytes = round(total_bytes * BYTES_TO_MB_DIV, 3)

    print()
    print("Memory usage is ", megabytes, " MB", sep="")

### Reading the dataset

In [54]:
%%time

# read_csv treats the first line of the file as the header row, so the columns
# are selected by the first three whitespace-separated tokens of that line
# (presumably the MatrixMarket banner -- confirm against the data file).
cols = ['%%MatrixMarket', 'matrix', 'coordinate']

# Narrow integer dtypes keep the in-memory frame small.
dtypes = {
    '%%MatrixMarket': 'int32',
    'matrix': 'int16',
    'coordinate': 'int8'
}

# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True and parses the file the same way.
# skiprows=range(1, 3) drops the two lines that follow the header line.
df = pd.read_csv(
    'data/netflix_mm',
    sep=r'\s+',
    usecols=cols,
    dtype=dtypes,
    skiprows=range(1, 3),
)

# Replace the banner-derived labels with meaningful column names.
df.columns = ['userID', 'itemID', 'rating']

print(df.head())
df_mem_usage(df)

   userID  itemID  rating
0       1       1       3
1       2       1       5
2       3       1       4
3       5       1       3
4       6       1       3

Memory usage is 693.505 MB
CPU times: user 22.7 s, sys: 2.51 s, total: 25.2 s
Wall time: 26.4 s


In [55]:
# Keep only the first 10,000 rows so the steps below run quickly.
df = df.iloc[:10000]

# Metrics

In [56]:
def rmse(y_pred, y_true):
    """Return the root-mean-squared error between two 1-D numeric sequences.

    Parameters
    ----------
    y_pred : iterable of numbers
        Predicted values.
    y_true : iterable of numbers
        Ground-truth values, aligned element-wise with ``y_pred``.

    Returns
    -------
    float
        ``sqrt(mean((y_pred - y_true) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs have different lengths or are empty.
    """
    # Imported locally: the notebook's import cell provides neither ``sqrt``
    # nor ``mean_squared_error``, so the previous body raised NameError.
    import math

    preds = list(y_pred)
    truths = list(y_true)

    if len(preds) != len(truths):
        raise ValueError(
            "y_pred and y_true must have the same length "
            "(got {} and {})".format(len(preds), len(truths))
        )
    if not preds:
        raise ValueError("rmse is undefined for empty input")

    squared_errors = ((p - t) ** 2 for p, t in zip(preds, truths))
    return math.sqrt(math.fsum(squared_errors) / len(preds))

# SVD

**Documentation of scikit-surprise:**

- https://surprise.readthedocs.io/en/stable/matrix_factorization.html
- https://datascience.stackexchange.com/questions/6814/how-to-split-train-test-in-recommender-systems

### Declare a reader object to parse the ratings

In [57]:
# Ratings are declared to lie on a 1-to-5 scale.
rating_bounds = (1, 5)
reader = sp.Reader(rating_scale=rating_bounds)

### Load the dataframe in scikit-surprise's format using the Reader

In [58]:
%%time

# Pass the columns in the order user, item, rating.
ratings_frame = df.loc[:, ['userID', 'itemID', 'rating']]
data = sp.Dataset.load_from_df(ratings_frame, reader)

CPU times: user 20 ms, sys: 7.31 ms, total: 27.3 ms
Wall time: 27.4 ms


### Divide the data into a training and test set

In [59]:
# Hold out 25% of the ratings for testing. A fixed random_state pins the
# shuffle so the split, and every metric computed on it, is the same on
# each run of the notebook.
trainset, testset = sp.model_selection.train_test_split(
    data, test_size=.25, random_state=42
)

## K-fold cross validation

### Parameter grid

In [60]:
# Candidate values explored by the grid search: every learning rate is
# paired with every regularisation strength (6 x 2 combinations).
param_grid = dict(
    lr_all=[0.001, 0.0025, 0.005, 0.01, 0.1, 0.5],
    reg_all=[0.4, 0.6],
)

### Run the Grid search

In [61]:
%%time

# Score every combination in param_grid for SVD by RMSE using 3-fold
# cross-validation over `data`.
grid_search_cls = sp.model_selection.GridSearchCV
gs = grid_search_cls(
    sp.SVD,
    param_grid,
    measures=['rmse'],
    cv=3,
)
gs.fit(data)

CPU times: user 20.6 s, sys: 296 ms, total: 20.9 s
Wall time: 21.3 s


### Get the results of the GridSearch

In [62]:
# Tabulate the cross-validation results and persist them for later inspection.
results_df = pd.DataFrame(gs.cv_results)
results_df.to_csv('data/cv_results.csv', index=False)

### Get the best RMSE and set of parameters from the GridSearch

In [63]:
# Show the best cross-validated RMSE, then the parameters that achieved it.
best_rmse = gs.best_score['rmse']
best_rmse_params = gs.best_params['rmse']
print(best_rmse, best_rmse_params, sep="\n")

1.2325818707138645
{'lr_all': 0.01, 'reg_all': 0.4}


### Build the model from the parameters of the GridSearch

In [64]:
# Take the estimator the grid search associates with the best RMSE.
best_estimators = gs.best_estimator
algo = best_estimators['rmse']

### Train the model on the training set and evaluate on the test set

In [66]:
# Fit on the training split, then predict the held-out ratings.
# NOTE(review): the grid search was fitted on the full `data`, which also
# contains these test ratings, so the RMSE below may be slightly optimistic
# -- confirm whether that is intended.
algo.fit(trainset)
predictions = algo.test(testset)

# verbose=False stops accuracy.rmse from printing its own "RMSE: ..." line,
# which previously duplicated the message printed below.
test_rmse = sp.accuracy.rmse(predictions, verbose=False)
print("RMSE of SVD is: {}".format(test_rmse))

RMSE: 1.2217
RMSE of SVD is: 1.2216734596328098
