In [18]:
# Useful starting lines
%matplotlib inline

import numpy as np
import pandas as pd
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
import re
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the Data
Note that `ratings` is a sparse matrix that in the shape of (num_items, num_users)

In [19]:
from helpers import load_data, preprocess_data
path_dataset = "/content/drive/My Drive/ML_project_recommanders/Datasets/data_train.csv"
path_testset = "/content/drive/My Drive/ML_project_recommanders/Datasets/sample_submission.csv"
data = pd.read_csv(path_dataset)
testset = pd.read_csv(path_testset)
ratings = load_data(path_dataset)
ratings.shape

number of items: 10000, number of users: 1000


(10000, 1000)

In [0]:
cleanedFrame = pd.DataFrame({
    'userId': testset["Id"].apply(lambda x: int(re.search('r(.*)_' , x).group(1))),
    'movieId': testset["Id"].apply(lambda x: int(re.search('c(.*)' , x).group(1))),
    'rating' : testset["Prediction"]
})

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def init_MF(train, num_features):
    """init the parameter for matrix factorization."""
        
    num_item, num_user = train.get_shape()

    user_features = np.random.rand(num_features, num_user)
    item_features = np.random.rand(num_features, num_item)

    # start by item features.
    item_nnz = train.getnnz(axis=1)
    item_sum = train.sum(axis=1)

    for ind in range(num_item):
        item_features[0, ind] = item_sum[ind, 0] / item_nnz[ind]
    return user_features, item_features

In [0]:
def compute_error(data, user_features, item_features, nz):
    """compute the loss (MSE) of the prediction of nonzero elements."""
    mse = 0
    for row, col in nz:
        item_info = item_features[:, row]
        user_info = user_features[:, col]
        mse += (data[row, col] - user_info.T.dot(item_info)) ** 2
    return np.sqrt(1.0 * mse / len(nz))

### Learn the Matrix Factorization using Alternating Least Squares

In [0]:
def update_user_feature(
        train, item_features, lambda_user,
        nnz_items_per_user, nz_user_itemindices):
    """update user feature matrix."""
    """the best lambda is assumed to be nnz_items_per_user[user] * lambda_user"""
    num_user = nnz_items_per_user.shape[0]
    num_feature = item_features.shape[0]
    lambda_I = lambda_user * sp.eye(num_feature)
    updated_user_features = np.zeros((num_feature, num_user))

    for user, items in nz_user_itemindices:
        # extract the columns corresponding to the prediction for given item
        M = item_features[:, items]
        
        # update column row of user features
        V = M @ train[items, user]
        A = M @ M.T + nnz_items_per_user[user] * lambda_I
        X = np.linalg.solve(A, V)
        updated_user_features[:, user] = np.copy(X.T)
    return updated_user_features

def update_item_feature(
        train, user_features, lambda_item,
        nnz_users_per_item, nz_item_userindices):
    """update item feature matrix."""
    """the best lambda is assumed to be nnz_items_per_item[item] * lambda_item"""
    num_item = nnz_users_per_item.shape[0]
    num_feature = user_features.shape[0]
    lambda_I = lambda_item * sp.eye(num_feature)
    updated_item_features = np.zeros((num_feature, num_item))

    for item, users in nz_item_userindices:
        # extract the columns corresponding to the prediction for given user
        M = user_features[:, users]
        V = M @ train[item, users].T
        A = M @ M.T + nnz_users_per_item[item] * lambda_I
        X = np.linalg.solve(A, V)
        updated_item_features[:, item] = np.copy(X.T)
    return updated_item_features

In [0]:
from helpers import build_index_groups

def ALS(train):
    """Alternating Least Squares (ALS) algorithm."""
    # define parameters
    num_features = 20   # K in the lecture notes
    lambda_user = 0.080
    lambda_item = 0.080
    stop_criterion = 1e-4
    change = 1
    error_list = [0, 0]
    
    # set seed
    np.random.seed(988)

    # init ALS
    user_features, item_features = init_MF(train, num_features)
    
    # get the number of non-zero ratings for each user and item
    nnz_items_per_user, nnz_users_per_item = train.getnnz(axis=0), train.getnnz(axis=1)
    
    # group the indices by row or column index
    nz_train, nz_item_userindices, nz_user_itemindices = build_index_groups(train)

    # run ALS
    print("\nstart the ALS algorithm...")
    while change > stop_criterion:
        # update user feature & item feature
        user_features = update_user_feature(
            train, item_features, lambda_user,
            nnz_items_per_user, nz_user_itemindices)
        item_features = update_item_feature(
            train, user_features, lambda_item,
            nnz_users_per_item, nz_item_userindices)

        error = compute_error(train, user_features, item_features, nz_train)
        print("RMSE on training set: {}.".format(error))
        error_list.append(error)
        change = np.fabs(error_list[-1] - error_list[-2])

    return user_features, item_features


In [87]:
user, item = ALS(ratings)


start the ALS algorithm...
RMSE on training set: 1.048153205690884.
RMSE on training set: 1.0140501779440705.
RMSE on training set: 0.9768601371898461.
RMSE on training set: 0.950302100631196.
RMSE on training set: 0.9349440324499751.
RMSE on training set: 0.925982707644209.
RMSE on training set: 0.9202205977082001.
RMSE on training set: 0.9161983467037051.
RMSE on training set: 0.9132391529908328.
RMSE on training set: 0.9109832260336552.
RMSE on training set: 0.9092165084864706.
RMSE on training set: 0.9078025825487783.
RMSE on training set: 0.9066505202210827.
RMSE on training set: 0.9056976118169011.
RMSE on training set: 0.9048993572124057.
RMSE on training set: 0.9042233795795612.
RMSE on training set: 0.9036455923644455.
RMSE on training set: 0.9031477173080162.
RMSE on training set: 0.9027156385258893.
RMSE on training set: 0.9023382865648899.
RMSE on training set: 0.9020068652047664.
RMSE on training set: 0.9017143041164499.
RMSE on training set: 0.9014548632276745.
RMSE on t

start the ALS algorithm...
RMSE on training set: 1.0496386046676427.
RMSE on training set: 1.0158838751562376.
RMSE on training set: 0.9793900114896968.
RMSE on training set: 0.9527565415537781.
RMSE on training set: 0.9370515375355365.
RMSE on training set: 0.9278394882010745.
RMSE on training set: 0.921947350210296.
RMSE on training set: 0.9178593822935838.
RMSE on training set: 0.9148642455681787.
RMSE on training set: 0.9125867882359413.
RMSE on training set: 0.9108062433489581.
RMSE on training set: 0.9093830258572756.
RMSE on training set: 0.9082245530088282.
RMSE on training set: 0.9072671643444371.
RMSE on training set: 0.90646576980994.
RMSE on training set: 0.9057876053586278.
RMSE on training set: 0.9052083203544751.
RMSE on training set: 0.9047094518898342.
RMSE on training set: 0.9042767528585806.
RMSE on training set: 0.9038990594770204.
RMSE on training set: 0.903567507054051.
RMSE on training set: 0.90327497506618.
RMSE on training set: 0.9030156862321221.
RMSE on training set: 0.9027849111476423.
RMSE on training set: 0.9025787468184442.
RMSE on training set: 0.9023939480356864.
RMSE on training set: 0.9022277973264918.
RMSE on training set: 0.9020780036058513.
RMSE on training set: 0.9019426225438488.
RMSE on training set: 0.9018199935867253.
RMSE on training set: 0.9017086898837755.
RMSE on training set: 0.9016074782884868.
RMSE on training set: 0.9015152872617141.

In [0]:
predictions = item.T.dot(user)

In [89]:
predictions_frame = pd.DataFrame(predictions)

predictions_frame.head

cleanedFrame.head

<bound method NDFrame.head of          userId  movieId  rating
0            37        1       3
1            73        1       3
2           156        1       3
3           160        1       3
4           248        1       3
...         ...      ...     ...
1176947    9974     1000       3
1176948    9977     1000       3
1176949    9978     1000       3
1176950    9982     1000       3
1176951    9996     1000       3

[1176952 rows x 3 columns]>

In [90]:
pd.set_option('display.float_format','{:.4f}'.format)

final_predictions = cleanedFrame.copy()

final_predictions["rating"] = pd.to_numeric(final_predictions["rating"])
final_predictions["rating"] = final_predictions["rating"].astype(float)

for index, row in final_predictions.iterrows():
    final_predictions.at[int(index), 'rating'] = predictions[int(row['userId']) - 1][int(row['movieId']) - 1]

final_predictions.head

<bound method NDFrame.head of          userId  movieId  rating
0            37        1  3.3401
1            73        1  3.1245
2           156        1  3.7599
3           160        1  3.3494
4           248        1  3.3018
...         ...      ...     ...
1176947    9974     1000  3.3619
1176948    9977     1000  3.6442
1176949    9978     1000  2.7988
1176950    9982     1000  3.1351
1176951    9996     1000  3.7129

[1176952 rows x 3 columns]>

In [91]:
d = []

for index, row in final_predictions.iterrows():
    pred = np.round(final_predictions.at[index, 'rating'])

    d.append(("r"+str(int(row['userId']))+"_c"+str(int(row['movieId'])), int(pred)))

result_als = pd.DataFrame(d, columns = ('Id', 'Prediction'))
print(result_als.head)

<bound method NDFrame.head of                   Id  Prediction
0             r37_c1           3
1             r73_c1           3
2            r156_c1           4
3            r160_c1           3
4            r248_c1           3
...              ...         ...
1176947  r9974_c1000           3
1176948  r9977_c1000           4
1176949  r9978_c1000           3
1176950  r9982_c1000           3
1176951  r9996_c1000           4

[1176952 rows x 2 columns]>


In [0]:
result_als['Prediction'] = result_als["Prediction"].clip(1, 5)

In [0]:
result_als.to_csv("result_als_update.csv", index = False)

In [94]:
(result_als['Prediction'] < 1).any()

False