### Libraries to install

In [58]:
from math import sqrt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### Constants

In [59]:
# Scale factor turning a byte count into megabytes (1 MB = 10**6 bytes).
# Despite the "_DIV" suffix, byte counts are multiplied by this value.
BYTES_TO_MB_DIV = 1e-6

### Function to check the memory usage of the dataframe

In [60]:
def df_mem_usage(df):
    """Print the total memory used by ``df`` in megabytes.

    The per-column byte counts reported by ``df.memory_usage()`` are summed,
    scaled by ``BYTES_TO_MB_DIV`` and rounded to three decimals. A blank line
    is printed before the report. Nothing is returned.
    """
    print()
    total_bytes = df.memory_usage().sum()
    megabytes = round(total_bytes * BYTES_TO_MB_DIV, 3)
    print("Memory usage is " + str(megabytes) + " MB")

### Reading the dataset

In [61]:
%%time

# The first line of the file is consumed as the CSV header, so its first three
# whitespace-separated tokens act as temporary column names (they look like
# the Matrix Market banner -- presumably; confirm against the data file).
cols = ['%%MatrixMarket', 'matrix', 'coordinate']

# Narrow integer dtypes to keep the loaded frame small.
dtypes = {
    '%%MatrixMarket': 'int32',
    'matrix': 'int16',
    'coordinate': 'int8',
}

# skiprows=range(1, 3) drops file rows 1 and 2, i.e. the two lines right after
# the header line. sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True and splits on any run of whitespace.
df = pd.read_csv(
    'data/netflix_mm',
    sep=r'\s+',
    usecols=cols,
    dtype=dtypes,
    skiprows=range(1, 3),
)
df.columns = ['userID', 'itemID', 'rating']

print(df.head())
df_mem_usage(df)

   userID  itemID  rating
0       1       1       3
1       2       1       5
2       3       1       4
3       5       1       3
4       6       1       3

Memory usage is 693.505 MB
CPU times: user 21.7 s, sys: 2.44 s, total: 24.2 s
Wall time: 24.7 s


### Only keep the first 10,000 rows to make the code run faster

In [78]:
# Keep only the first 10000 rows so the remaining cells run quickly.
df = df.iloc[:10000]

In [63]:
# Show the first five rows of the truncated frame as a sanity check.
print(df.head(n=5))

   userID  itemID  rating
0       1       1       3
1       2       1       5
2       3       1       4
3       5       1       3
4       6       1       3


# Metrics

In [64]:
def rmse(y_pred, y_true):
    """Return the root-mean-squared error between ``y_pred`` and ``y_true``.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    The squared error is symmetric in its two arguments, so the order in
    which predictions and targets are supplied does not affect the result.
    """
    mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
    return sqrt(mse)

# pcLasso

### Pre-processing

In [None]:
%%time

# One-hot encode the item identifier into sparse indicator columns.
# The frame's columns were renamed to userID / itemID / rating when it was
# loaded, so the column to expand is 'itemID' ('movie_id' does not exist and
# would raise a KeyError).
df = pd.get_dummies(df, columns=['itemID'], sparse=True)
df_mem_usage(df)

In [None]:
# Number of columns after one-hot encoding (second entry of the frame's shape).
print(df.shape[1])

In [None]:
# Report the encoded frame's memory footprint (same call as at the end of the
# encoding cell).
df_mem_usage(df)

In [None]:
# Density of the one-hot item columns: the share of entries that are actually
# stored rather than represented by the sparse fill value. The two dense
# columns are dropped first because the .sparse accessor only works on sparse
# data; they are named 'userID' and 'rating' ('user_id' does not exist and
# would raise a KeyError).
print(df.drop(columns=['userID', 'rating']).sparse.density)

### Train-test-split

In [None]:
# Feature matrix: every column except the target, as a plain NumPy array.
X = df.drop(columns=['rating']).values
# Target vector: the rating column.
y = df['rating'].values

In [None]:
# Fixed seed so both splits (and every score computed from them) are
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Hold out 10% of all rows as the test set ...
X_training, X_test, y_training, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)
# ... then hold out 10% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_training, y_training, test_size=0.1, random_state=SPLIT_SEED
)

In [None]:
# Shapes of the train, test and validation splits (features, then targets),
# one per line.
print(
    *(part.shape for part in (X_train, y_train, X_test, y_test, X_val, y_val)),
    sep="\n",
)

### PCA

In [None]:
def apply_pca(X_train, X_val, X_test, n_components=5):
    """Project the three feature matrices onto the training set's principal components.

    The PCA is fitted on ``X_train`` only; ``X_val`` and ``X_test`` are then
    transformed with that same fitted model. Fitting a separate PCA per split
    would place each split in a different basis, so a model trained on the
    training projection could not be meaningfully evaluated on the others,
    and it would also leak held-out data into the preprocessing step.

    Parameters
    ----------
    X_train, X_val, X_test : array-like
        Feature matrices accepted by scikit-learn's ``PCA``; all three must
        have the same number of columns.
    n_components : int, default 5
        Number of principal components to keep.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test)`` after projection, in that order.
    """
    pca = PCA(n_components=n_components)
    X_train = pca.fit_transform(X_train)
    # Reuse the components learned from the training data for held-out splits.
    X_val = pca.transform(X_val)
    X_test = pca.transform(X_test)

    return X_train, X_val, X_test

In [None]:
%%time

# Replace each feature matrix with the result of apply_pca. The names are
# rebound, so re-running this cell feeds already-projected data back in.
X_train, X_val, X_test = apply_pca(X_train, X_val, X_test)

In [None]:
# Shapes of the train, test and validation splits (features, then targets),
# one per line, after the PCA step.
print(
    *(part.shape for part in (X_train, y_train, X_test, y_test, X_val, y_val)),
    sep="\n",
)

### Lasso

In [None]:
def flooring(y_pred):
    """Snap raw predictions onto the integer rating scale 1..5.

    Values are first clipped to the closed interval [1, 5] and then rounded
    to the nearest integer value. Clipping at 1 (rather than only replacing
    negatives) matters: a prediction such as 0.2 would otherwise survive the
    bounds check and round to 0, which is not a valid rating.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Raw model predictions.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape holding values in {1, 2, 3, 4, 5}.
        The input array is left unmodified.
    """
    # np.clip returns a fresh array, so the caller's data is not overwritten.
    bounded = np.clip(y_pred, 1, 5)

    return np.round(bounded)

In [None]:
%%time

# Regularisation strengths to try, from very weak to very strong.
lambda_values = [
    0.00001, 0.0001, 0.001, 0.005, 0.01, 0.05,
    0.1, 0.2, 0.3, 0.4, 0.5,
    1, 2, 5, 10, 20, 30, 40, 50,
]

# For each strength: fit on the training split, predict the validation split,
# snap the predictions with flooring() and report the validation RMSE.
for lambda_val in lambda_values:
    lasso = Lasso(alpha=lambda_val).fit(X_train, y_train)
    y_pred = flooring(lasso.predict(X_val))
    rmse_lasso = rmse(y_pred, y_val)
    print("Lasso RMSE with Lambda={} is {}".format(lambda_val, rmse_lasso))