In [1]:
import numpy as np
from sklearn.model_selection import KFold
import random    
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

# Hyperparameters. The training helpers read these as module-level globals.
learning_rate = 0.0005  # step size multiplying each factor update in calculate_predictions
num_of_iterations = 75  # upper bound on full passes over the rating matrix
reguralization_factor = 0.05  # weight of the penalty term in partial_der_reg (spelling kept: the name is referenced elsewhere)
num_of_factors = 10  # number of latent factors: columns of U and rows of M

In [2]:
# Load the MovieLens 1M ratings: '::'-separated records of
# (user_id, movie_id, rating, timestamp).
# NOTE(review): assumes the unmodified GroupLens ratings.dat, which has no
# header row. With header=0 plus `names`, pandas treats the first line as a
# header and discards it, silently dropping the first rating; header=None
# keeps every record. The multi-character separator requires engine='python'.
ratings_table = pd.read_csv(
    filepath_or_buffer='./ml-1m/ratings.dat',
    sep='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
)

In [3]:
from cmath import isnan

def create_matrices(ratings):
    """Build the rating matrix, random factor matrices and id -> index lookups.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain 'user_id', 'movie_id' and 'rating' columns, with at most
        one row per (user_id, movie_id) pair; ``DataFrame.pivot`` raises
        ``ValueError`` on duplicate pairs.

    Returns
    -------
    list
        ``[X, U, M, mapping_user, mapping_movie]`` where

        * ``X`` is the (users x movies) float matrix of ratings, NaN where a
          user has not rated a movie;
        * ``U`` is (users x num_of_factors) and ``M`` is
          (num_of_factors x movies), both drawn uniformly from
          [-0.01, 0.01) using the global NumPy random state;
        * ``mapping_user`` / ``mapping_movie`` map an id to its row / column
          position in ``X`` (and hence in ``U`` / ``M``).
    """
    rating_grid = ratings.pivot(index='user_id', columns='movie_id', values='rating')

    # The lookups must follow the pivot's own label order. Building them from
    # the order in which ids first appear in `ratings` would point at the
    # wrong rows/columns of X whenever the ids are not already sorted.
    user_ids = rating_grid.index.tolist()
    movie_ids = rating_grid.columns.tolist()
    mapping_user = {user_id: row for row, user_id in enumerate(user_ids)}
    mapping_movie = {movie_id: col for col, movie_id in enumerate(movie_ids)}

    X = rating_grid.to_numpy(dtype=float)
    U = np.random.uniform(-0.01, 0.01, (len(user_ids), num_of_factors))
    M = np.random.uniform(-0.01, 0.01, (num_of_factors, len(movie_ids)))

    return [X, U, M, mapping_user, mapping_movie]


In [4]:
def partial_der_reg(error, element, other=1.0):
    """Return the regularised update direction for a latent factor.

    Computes ``2 * error * other - reguralization_factor * element``.

    Parameters
    ----------
    error : float
        Residual ``x_ij - u_i . m_j`` of the rating being fitted.
    element : float or numpy.ndarray
        The factor(s) being updated; the regularisation term shrinks these.
    other : float or numpy.ndarray, optional
        The matching factor(s) from the opposite matrix, which scale the
        residual. The default of 1.0 reproduces the former two-argument
        result ``2 * error - reguralization_factor * element``.

    Returns
    -------
    float or numpy.ndarray
        The step direction, to be multiplied by the learning rate.
    """
    return 2 * error * other - reguralization_factor * element


def calculate_predictions(X, U, M):
    """Fit U and M to the observed entries of X by stochastic gradient descent.

    For every observed rating ``x_ij`` the residual
    ``e = x_ij - U[i, :] . M[:, j]`` is computed and both factor vectors are
    updated simultaneously from their pre-update values:

        U[i, k] += learning_rate * (2 * e * M[k, j] - reguralization_factor * U[i, k])
        M[k, j] += learning_rate * (2 * e * U[i, k] - reguralization_factor * M[k, j])

    Parameters
    ----------
    X : array-like, shape (users, movies)
        Ratings, with NaN marking unobserved cells (these are skipped).
    U : numpy.ndarray, shape (users, factors)
        User factors; must be a float array. Updated in place.
    M : numpy.ndarray, shape (factors, movies)
        Movie factors; must be a float array. Updated in place.

    Returns
    -------
    list
        ``[U, M]`` -- the same array objects that were passed in.

    Notes
    -----
    Runs at most ``num_of_iterations`` passes and stops early only if the
    training RMSE of a pass is exactly equal to that of the previous pass.
    """
    ratings = np.asarray(X, dtype=float)

    # The set of observed cells never changes, so locate it once instead of
    # re-testing every cell for NaN on every pass. argwhere yields row-major
    # order, i.e. the same visiting order as nested i/j loops.
    observed_cells = np.argwhere(~np.isnan(ratings))
    if len(observed_cells) == 0:
        return [U, M]

    previous_rmse = 100000

    for _ in range(num_of_iterations):
        squared_error_sum = 0.0

        for i, j in observed_cells:
            error_xij = ratings[i, j] - np.dot(U[i, :], M[:, j])
            squared_error_sum += error_xij ** 2

            # Snapshot both vectors so each update sees the other's old value.
            user_factors = U[i, :].copy()
            movie_factors = M[:, j].copy()

            U[i, :] = user_factors + learning_rate * \
                partial_der_reg(error_xij, user_factors, movie_factors)
            M[:, j] = movie_factors + learning_rate * \
                partial_der_reg(error_xij, movie_factors, user_factors)

        # Root of the *mean* squared error over the observed ratings.
        rmse = np.sqrt(squared_error_sum / len(observed_cells))
        if rmse == previous_rmse:
            break

        previous_rmse = rmse

    return [U, M]


In [5]:
def calculate_rmse(predictions, actual):
    """Root-mean-square error over the observed (non-NaN) entries of `actual`.

    Parameters
    ----------
    predictions : array-like, shape (rows, cols)
        Predicted ratings; must have the same shape as `actual`.
    actual : array-like, shape (rows, cols)
        Reference ratings, with NaN marking cells to ignore.

    Returns
    -------
    float
        ``sqrt(mean((predictions - actual) ** 2))`` taken over the non-NaN
        cells of `actual`, or NaN when there are no such cells.
    """
    predicted = np.asarray(predictions, dtype=float)
    observed = np.asarray(actual, dtype=float)

    rated = ~np.isnan(observed)
    if not rated.any():
        # Nothing to score; avoid a division by zero.
        return float('nan')

    residuals = predicted[rated] - observed[rated]
    # Divide by the count *inside* the square root: sqrt(SSE / n).
    return np.sqrt(np.mean(residuals ** 2))

In [6]:
def calculate_rmse_test(predictions, actual, user_train_mapping, movie_train_mapping, user_test_mapping, movie_test_mapping):
    """RMSE of train-indexed predictions against a differently indexed test matrix.

    Only (user, movie) pairs whose ids occur in both the train and the test
    mappings, and whose test rating is not NaN, are scored.

    Parameters
    ----------
    predictions : array-like, shape (train users, train movies)
        Predicted ratings, addressed through the train mappings.
    actual : array-like, shape (test users, test movies)
        Held-out ratings (NaN where absent), addressed through the test mappings.
    user_train_mapping, movie_train_mapping : dict
        id -> row / column position in `predictions`.
    user_test_mapping, movie_test_mapping : dict
        id -> row / column position in `actual`.

    Returns
    -------
    float
        ``sqrt(mean(error ** 2))`` over the scored pairs, or NaN when no pair
        can be scored.
    """
    predicted = np.asarray(predictions, dtype=float)
    held_out = np.asarray(actual, dtype=float)

    # Column positions of every movie known to both splits, paired up once.
    shared_movies = [
        (movie_train_mapping[m_id], m_index)
        for m_id, m_index in movie_test_mapping.items()
        if m_id in movie_train_mapping
    ]
    if not shared_movies:
        return float('nan')
    train_cols = np.array([pair[0] for pair in shared_movies], dtype=int)
    test_cols = np.array([pair[1] for pair in shared_movies], dtype=int)

    squared_error_sum = 0.0
    n_scored = 0
    for u_id, u_index in user_test_mapping.items():
        if u_id not in user_train_mapping:
            continue

        truth = held_out[u_index, test_cols]
        rated = ~np.isnan(truth)
        if not rated.any():
            continue

        residuals = predicted[user_train_mapping[u_id], train_cols[rated]] - truth[rated]
        squared_error_sum += float(np.sum(residuals ** 2))
        n_scored += int(rated.sum())

    if n_scored == 0:
        # No overlapping rated pair; avoid a division by zero.
        return float('nan')

    # Divide by the count *inside* the square root: sqrt(SSE / n).
    return np.sqrt(squared_error_sum / n_scored)

In [7]:
def post_process_data(Users, Movies):
    """Multiply the factor matrices and clamp the result to the range [1, 5].

    Parameters
    ----------
    Users : array-like, shape (users, factors)
    Movies : array-like, shape (factors, movies)

    Returns
    -------
    numpy.ndarray
        ``Users @ Movies`` with values above 5 set to 5 and values below 1
        set to 1.
    """
    raw_scores = np.matmul(Users, Movies)
    return np.clip(raw_scores, 1, 5)

In [8]:
def k_fold_matrix_factorization(data, learning_rate, iterations, regularization, num_factors):
    """Train and score the factorisation on 5 shuffled folds of `data`.

    For each fold the training rows are factorised, predictions are clamped
    by ``post_process_data`` and RMSE is computed on both the training
    matrix and the held-out matrix. The factors of the fold with the lowest
    held-out RMSE are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table; rows are split positionally with ``DataFrame.iloc``.
    learning_rate, iterations, regularization, num_factors
        Accepted for interface compatibility. NOTE(review): this function
        body never reads them, and they are not forwarded to the helpers it
        calls.

    Returns
    -------
    list
        ``[U_best, M_best, rmse_train_total, rmse_test_total,
        user_train_mapping_best, movie_train_mapping_best]``. The two RMSE
        entries are per-fold lists. The four "best" entries are ``None`` if
        no fold produced a comparable (non-NaN) held-out RMSE.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    rmse_train_total = []
    rmse_test_total = []

    # Start from +inf and explicit None placeholders so the return statement
    # can never hit an unbound name (e.g. when every fold's test RMSE is NaN).
    best_test_rmse = float('inf')
    U_best = M_best = None
    user_train_mapping_best = movie_train_mapping_best = None

    for train_index, test_index in kf.split(data):
        train_data = data.iloc[train_index]
        test_data = data.iloc[test_index]

        X_train, U_train, M_train, user_train_mapping, movie_train_mapping = create_matrices(train_data)

        # Only the test rating matrix and its id lookups are needed; the
        # factor matrices created for the test split are discarded.
        X_test, _, _, user_test_mapping, movie_test_mapping = create_matrices(test_data)

        U_predicted, M_predicted = calculate_predictions(
            X_train, U_train, M_train)

        P_predicted = post_process_data(U_predicted, M_predicted)

        rmse_train = calculate_rmse(P_predicted, X_train)
        rmse_test = calculate_rmse_test(P_predicted, X_test, user_train_mapping, movie_train_mapping, user_test_mapping, movie_test_mapping)

        if rmse_test < best_test_rmse:
            best_test_rmse = rmse_test
            U_best, M_best = U_predicted, M_predicted
            user_train_mapping_best, movie_train_mapping_best = user_train_mapping, movie_train_mapping

        rmse_train_total.append(rmse_train)
        rmse_test_total.append(rmse_test)

    return [U_best, M_best, rmse_train_total, rmse_test_total, user_train_mapping_best, movie_train_mapping_best]

In [9]:
# Run the cross-validated factorisation on the loaded ratings, passing the
# notebook-level hyperparameters, and keep the best factors, the per-fold
# RMSE lists and the id lookups that belong to the best factors.
(
    U,
    M,
    rmse_train,
    rmse_test,
    user_train_mapping_best,
    movie_train_mapping_best,
) = k_fold_matrix_factorization(
    ratings_table,
    learning_rate,
    num_of_iterations,
    reguralization_factor,
    num_of_factors,
)
