In [None]:
# Useful starting lines
%matplotlib inline

import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the Data
Note that `ratings` is a sparse matrix with shape (num_items, num_users).

In [None]:
from helpers import load_data, preprocess_data

# Location of the training ratings, relative to the notebook's working directory.
path_dataset = "../data/data_train.csv"
# `ratings` is the sparse (num_items, num_users) matrix that later cells filter and split.
ratings = load_data(path_dataset)

### Plot the number of ratings per movie and user

In [None]:
from plots import plot_raw_data

# Keep both count vectors: the split step below filters users/items with them.
num_items_per_user, num_users_per_item = plot_raw_data(ratings)

# Report the sparsest user and the sparsest item.
fewest_items = min(num_items_per_user)
fewest_users = min(num_users_per_item)
print("min # of items per user = {}, min # of users per item = {}.".format(
    fewest_items, fewest_users))

### Split the data into a train and test set

In [None]:
def split_data(ratings, num_items_per_user, num_users_per_item,
               min_num_ratings, p_test=0.1):
    """Filter the ratings and randomly split them into train and test sets.

    Args:
        ratings: sparse rating matrix indexed as [item, user]; it must support
            fancy indexing and item assignment (e.g. a LIL matrix).
        num_items_per_user: number of ratings of each user, one entry per
            column of ``ratings``.
        num_users_per_item: number of ratings of each item, one entry per
            row of ``ratings``.
        min_num_ratings:
            all users and items we keep must have at least min_num_ratings
            per user and per item.
        p_test: fraction of the kept non-zero ratings that goes to the test set.

    Returns:
        (valid_ratings, train, test): the filtered matrix and two copies of it
        in which the test entries (for ``train``) or the train entries
        (for ``test``) have been set to zero.

    Note:
        Re-seeds NumPy's global random generator with 998 so the split is
        reproducible.
    """
    # set seed
    np.random.seed(998)

    # Accept any array-like for the counts and flatten it, so that np.where
    # yields positions along the user / item axis.
    items_per_user = np.asarray(num_items_per_user).ravel()
    users_per_item = np.asarray(num_users_per_item).ravel()

    # select user and item based on the condition.
    valid_users = np.where(items_per_user >= min_num_ratings)[0]
    valid_items = np.where(users_per_item >= min_num_ratings)[0]
    valid_ratings = ratings[valid_items, :][:, valid_users]

    # Shuffle the coordinates of the observed ratings. The list-of-tuples
    # shuffle is kept as is so the seeded split stays the same.
    xs, ys = valid_ratings.nonzero()
    indices = list(zip(xs, ys))
    np.random.shuffle(indices)

    cut = int(p_test * len(indices))
    train = valid_ratings.copy()
    test = valid_ratings.copy()

    # With nothing observed there is nothing to move; unpacking zip(*[])
    # would raise, so the assignments are skipped entirely.
    if indices:
        shuffled = np.array(indices, dtype=np.intp)
        rows, cols = shuffled[:, 0], shuffled[:, 1]
        if cut > 0:
            # the first `cut` shuffled ratings are held out from train ...
            train[rows[:cut], cols[:cut]] = 0
        if cut < len(indices):
            # ... and every remaining rating is removed from test.
            test[rows[cut:], cols[cut:]] = 0

    print("Total number of nonzero elements in origial data:{v}".format(v=ratings.nnz))
    print("Total number of nonzero elements in train data:{v}".format(v=train.nnz))
    print("Total number of nonzero elements in test data:{v}".format(v=test.nnz))
    return valid_ratings, train, test

In [None]:
from plots import plot_train_test_data

# min_num_ratings=0 keeps every user and item; 10% of the ratings are held out.
valid_ratings, train, test = split_data(
    ratings,
    num_items_per_user,
    num_users_per_item,
    min_num_ratings=0,
    p_test=0.1,
)
plot_train_test_data(train, test)

### Learn the Matrix Factorization using Alternating Least Squares

In [None]:
def init_MF(train, num_features):
    """Draw random starting factors for the matrix factorization.

    Args:
        train: rating matrix of shape (num_items, num_users); only its shape
            is used.
        num_features: number of latent features.

    Returns:
        (user_features, item_features) with shapes (num_features, num_users)
        and (num_features, num_items). Entries are uniform in [0, 1) scaled
        by sqrt(5 / num_features).
    """
    n_items, n_users = train.shape
    scale = np.sqrt(5 / num_features)

    # The item factors (W) are drawn before the user factors (Z); the order
    # matters for reproducing a seeded run.
    W = scale * np.random.random((num_features, n_items))
    Z = scale * np.random.random((num_features, n_users))
    return Z, W

In [None]:
def rmse(data, user_features, item_features):
    """Root-mean-square error of the factorization on the observed ratings.

    Only the non-zero entries of ``data`` count as observed.

    Args:
        data: rating matrix of shape (num_items, num_users), sparse or dense.
        user_features: array of shape (num_features, num_users).
        item_features: array of shape (num_features, num_items).

    Returns:
        The RMSE between ``data`` and ``item_features.T @ user_features`` on
        the observed entries, or NaN when ``data`` has no non-zero entry.
    """
    if sp.issparse(data):
        coo = data.tocoo()
        # Like .nonzero(): ignore explicitly stored zeros.
        observed = coo.data != 0
        rows, cols = coo.row[observed], coo.col[observed]
        values = coo.data[observed]
    else:
        dense = np.asarray(data)
        rows, cols = dense.nonzero()
        values = dense[rows, cols]

    WZ = item_features.T @ user_features
    # Diagnostic: range of the raw predictions.
    print(WZ.min(), WZ.max())

    if values.size == 0:
        # No observed rating: the error is undefined rather than a crash.
        return np.nan

    # One vectorized gather replaces a Python loop over every rating.
    residuals = values - WZ[rows, cols]
    return np.sqrt(np.mean(np.square(residuals)))

In [None]:
from tqdm import tqdm_notebook as tqdm
from random import sample

def update_user_feature(ratings, user_features, item_features, lambda_user):
    """Update the user feature matrix with the item features held fixed.

    Each user's column is replaced by the ridge-regression solution
    ``(X^T X + lambda_user * I)^-1 X^T y``, where ``X`` holds the feature
    vectors of the items that user rated and ``y`` the matching ratings.

    Args:
        ratings: rating matrix of shape (num_items, num_users); non-zero
            entries are the observed ratings.
        user_features: array of shape (num_features, num_users); updated
            in place.
        item_features: array of shape (num_features, num_items).
        lambda_user: regularization weight.

    Returns:
        ``user_features`` (the same array that was passed in).

    Raises:
        numpy.linalg.LinAlgError: if a user's regularized system is singular
            (possible with ``lambda_user == 0``).
    """
    num_user = ratings.shape[1]
    num_features = item_features.shape[0]

    # CSC keeps every column (user) contiguous, so one conversion replaces a
    # sparse column slice per user.
    by_user = sp.csc_matrix(ratings)
    penalty = lambda_user * np.eye(num_features)

    for u in tqdm(range(num_user), desc="update user"):
        lo, hi = by_user.indptr[u], by_user.indptr[u + 1]
        y = by_user.data[lo:hi]
        rated = y != 0  # explicitly stored zeros are not ratings
        items = by_user.indices[lo:hi][rated]
        y = y[rated]
        X = item_features[:, items].T

        # solve() avoids forming the explicit inverse.
        user_features[:, u] = np.linalg.solve(X.T.dot(X) + penalty, X.T.dot(y))
    return user_features

def update_item_feature(ratings, user_features, item_features, lambda_item):
    """Update the item feature matrix with the user features held fixed.

    Each item's column is replaced by the ridge-regression solution
    ``(X^T X + lambda_item * I)^-1 X^T y``, where ``X`` holds the feature
    vectors of the users who rated that item and ``y`` the matching ratings.

    Args:
        ratings: rating matrix of shape (num_items, num_users); non-zero
            entries are the observed ratings.
        user_features: array of shape (num_features, num_users).
        item_features: array of shape (num_features, num_items); updated
            in place.
        lambda_item: regularization weight.

    Returns:
        ``item_features`` (the same array that was passed in).

    Raises:
        numpy.linalg.LinAlgError: if an item's regularized system is singular
            (possible with ``lambda_item == 0``).
    """
    num_item = ratings.shape[0]
    num_features = user_features.shape[0]

    # CSR keeps every row (item) contiguous, so one conversion replaces a
    # transpose plus a sparse slice per item.
    by_item = sp.csr_matrix(ratings)
    penalty = lambda_item * np.eye(num_features)

    for i in tqdm(range(num_item), desc="update item"):
        lo, hi = by_item.indptr[i], by_item.indptr[i + 1]
        y = by_item.data[lo:hi]
        rated = y != 0  # explicitly stored zeros are not ratings
        users = by_item.indices[lo:hi][rated]
        y = y[rated]
        X = user_features[:, users].T

        # solve() avoids forming the explicit inverse.
        item_features[:, i] = np.linalg.solve(X.T.dot(X) + penalty, X.T.dot(y))
    return item_features

In [None]:
from helpers import build_index_groups
from helpers import predict


def ALS(train, test, num_features, lambda_user, lambda_item, max_iter=2, seed=988):
    """Alternating Least Squares (ALS) algorithm.

    Starting from random factors, alternately refits the item features and
    the user features for ``max_iter`` rounds. Prints the train and test RMSE
    after each round and plots both error curves at the end.

    Args:
        train: training rating matrix of shape (num_items, num_users).
        test: test rating matrix of the same shape.
        num_features: number of latent features (K in the lecture notes).
        lambda_user: regularization weight for the user update.
        lambda_item: regularization weight for the item update.
        max_iter: number of alternating rounds to run.
        seed: value used to re-seed NumPy's global random generator before
            the factors are initialized.

    Returns:
        The dense prediction matrix ``item_features.T @ user_features`` of
        shape (num_items, num_users).
    """
    # set seed
    np.random.seed(seed)

    # init ALS
    user_features, item_features = init_MF(train, num_features)

    tr_error = rmse(train, user_features, item_features)
    te_error = rmse(test, user_features, item_features)
    print("initial train rmse : ", tr_error, "\ninitial test rmse : ", te_error)

    train_error_list = [tr_error]
    test_error_list = [te_error]

    for _ in range(max_iter):
        # Items are refit first, then users against the fresh item features.
        item_features = update_item_feature(train, user_features, item_features, lambda_item)
        user_features = update_user_feature(train, user_features, item_features, lambda_user)

        tr_error = rmse(train, user_features, item_features)
        te_error = rmse(test, user_features, item_features)
        train_error_list.append(tr_error)
        test_error_list.append(te_error)
        print("train rmse : ", tr_error, "\ntest rmse : ", te_error)

    # Index 0 of each curve is the error of the random initialization.
    plt.plot(train_error_list, label="train")
    plt.plot(test_error_list, label="test")
    plt.xlabel("ALS iteration")
    plt.ylabel("RMSE")
    plt.legend()
    plt.show()
    WZ = item_features.T @ user_features
    return WZ

In [None]:
# The non-zero entries of the sample submission mark the (item, user) pairs to predict.
pred = load_data("../data/sampleSubmission.csv")
nz = pred.nonzero()

WZ = ALS(train, test, num_features=2, lambda_user=1, lambda_item=3)

# Clamp the raw predictions to [1, 5] in place (presumably the valid rating scale).
np.clip(WZ, 1, 5, out=WZ)

# NOTE(review): this assumes WZ and pred share the same (item, user) indexing, which
# holds only while the split keeps every user and item (min_num_ratings=0) - confirm.
pred[nz] = WZ[nz]
predict(pred)