### Libraries to install

In [None]:
from math import sqrt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### Constants

In [2]:
BYTES_TO_MB_DIV = 0.000001

### Function to check the memory usage of the dataframe

In [3]:
def df_mem_usage(df):
    print()
    mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
    print("Memory usage is " + str(mem) + " MB")

### Reading the dataset

In [4]:
%%time

cols = ['%%MatrixMarket','matrix','coordinate']

dtypes = {
    '%%MatrixMarket':'int32', 
    'matrix':'int16', 
    'coordinate':'int8'
}

df_read = pd.read_csv('data/netflix_mm', delim_whitespace=True, usecols=cols, dtype=dtypes, skiprows=range(1, 3))
df_read.columns = ['user_id', 'movie_id', 'rating']

print(df_read.head())
df_mem_usage(df_read)

   user_id  movie_id  rating
0        1         1       3
1        2         1       5
2        3         1       4
3        5         1       3
4        6         1       3

Memory usage is 693.505 MB
CPU times: user 36.7 s, sys: 1.34 s, total: 38 s
Wall time: 38.3 s


### Only keep the first 100,000 rows to make the code run faster

In [5]:
df = df_read.head(5000000)

# Metrics

In [6]:
def rmse(y_pred, y_true):
    
    return sqrt(mean_squared_error(y_pred, y_true))

# pcLasso

### Pivot the table

In [7]:
y = df['rating'].values

In [None]:
%%time

df = df.pivot_table(index=df.index, columns='movie_id', values='rating', fill_value=0, dropna=False)

In [165]:
X = df.values

In [145]:
df_mem_usage(df)


Memory usage is 1936.0 MB


### Train-test-split

In [166]:
X_training, X_test, y_training, y_test = train_test_split(X, y, test_size=0.20)
X_train, X_val, y_train, y_val = train_test_split(X_training, y_training, test_size=0.5)

In [167]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

print(X_val.shape)
print(y_val.shape)

(400000, 241)
(400000,)
(200000, 241)
(200000,)
(400000, 241)
(400000,)


### PCA

In [180]:
def apply_pca(X_train, X_val, X_test, num_components=None):
    
    pca = PCA(n_components=num_components)
    X_train = pca.fit_transform(X_train)
    pca = PCA(n_components=num_components)
    X_test = pca.fit_transform(X_test)
    pca = PCA(n_components=num_components)
    X_val = pca.fit_transform(X_val)
    
    return X_train, X_val, X_test

In [184]:
%%time

X_train, X_val, X_test = apply_pca(X_train, X_val, X_test, 30)

CPU times: user 3.84 s, sys: 356 ms, total: 4.2 s
Wall time: 1.27 s


In [185]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

print(X_val.shape)
print(y_val.shape)

(400000, 30)
(400000,)
(200000, 30)
(200000,)
(400000, 30)
(400000,)


### Lasso

In [186]:
def flooring(y_pred):
    
    y_pred[y_pred < 0] = 1.
    y_pred[y_pred > 5] = 5.

    y_pred = np.round(y_pred)
    
    return y_pred

### Find the best value of lambda on the validation set

In [187]:
%%time

lambda_values = [0.00001, 0.0001, 0.001, 0.005, 0.01, 0.05,  0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 5, 10, 20, 30, 40, 50, 100, 200, 500, 1000, 100000]
rmse_list = []

for lambda_val in lambda_values:
    lasso = Lasso(lambda_val)
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_val)
    y_pred = flooring(y_pred)
    rmse_lasso = rmse(y_pred, y_val)
    rmse_list.append(rmse_lasso)
    print(("Lasso RMSE with Lambda={} is {}").format(lambda_val, rmse_lasso))

Lasso RMSE with Lambda=1e-05 is 1.074804633410184
Lasso RMSE with Lambda=0.0001 is 1.074804633410184
Lasso RMSE with Lambda=0.001 is 1.0721613684515965
Lasso RMSE with Lambda=0.005 is 1.0632391546590072
Lasso RMSE with Lambda=0.01 is 1.0690848890523146
Lasso RMSE with Lambda=0.05 is 1.0891476024855402
Lasso RMSE with Lambda=0.1 is 1.1663908007181811
Lasso RMSE with Lambda=0.2 is 1.1345659963175345
Lasso RMSE with Lambda=0.3 is 1.1345659963175345
Lasso RMSE with Lambda=0.4 is 1.1345659963175345
Lasso RMSE with Lambda=0.5 is 1.1345659963175345
Lasso RMSE with Lambda=1 is 1.1345659963175345
Lasso RMSE with Lambda=2 is 1.1345659963175345
Lasso RMSE with Lambda=5 is 1.1345659963175345
Lasso RMSE with Lambda=10 is 1.1345659963175345
Lasso RMSE with Lambda=20 is 1.1345659963175345
Lasso RMSE with Lambda=30 is 1.1345659963175345
Lasso RMSE with Lambda=40 is 1.1345659963175345
Lasso RMSE with Lambda=50 is 1.1345659963175345
Lasso RMSE with Lambda=100 is 1.1345659963175345
Lasso RMSE with Lambda

In [188]:
best_lambda = lambda_values[rmse_list.index(min(rmse_list))]
print(('Best lambda for Lasso is: {}').format(best_lambda))

Best lambda for Lasso is: 0.005


### Evalue results on the test test

In [189]:
lasso = Lasso(best_lambda)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
y_pred = flooring(y_pred)
rmse_lasso = rmse(y_pred, y_test)
print(("Lasso RMSE with Lambda={} is {}").format(lambda_val, rmse_lasso))

Lasso RMSE with Lambda=100000 is 1.0361056895896288
