In [None]:
import numpy as np
from sklearn.model_selection import KFold
import random    
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

# Hyperparameters for the matrix-factorisation cells below. They are read as
# module-level globals by the functions that follow.
num_of_factors = 10            # width of U and height of M in create_matrices
num_of_iterations = 75         # upper bound on passes in calculate_predictions
learning_rate = 5e-4           # step size applied to every factor update
reguralization_factor = 5e-2   # weight of the penalty term in partial_der_reg

In [None]:
# Load the raw ratings file.
# header=None: with explicit `names`, header=0 would tell pandas that the first
# line is a header to be replaced, silently discarding the first rating.
# NOTE(review): this assumes ratings.dat has no header line (the standard
# MovieLens 1M layout) -- confirm against the actual file.
ratings_table = pd.read_csv(
    filepath_or_buffer='./ml-1m/ratings.dat',
    sep='::',  # multi-character separator, hence the python parsing engine
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python',
)

In [None]:
from cmath import isnan

def create_matrices(ratings, num_factors=None):
    """Build the rating matrix and randomly initialised factor matrices.

    Args:
        ratings: DataFrame with 'user_id', 'movie_id' and 'rating' columns.
            Each (user_id, movie_id) pair must be unique, otherwise
            ``DataFrame.pivot`` raises ``ValueError``.
        num_factors: Number of latent factors. Defaults to the module-level
            ``num_of_factors`` when omitted.

    Returns:
        ``[X, U, M]`` where ``X`` is the users x movies rating array (NaN where
        a user has no rating for a movie), ``U`` has shape
        (users, num_factors) and ``M`` has shape (num_factors, movies). ``U``
        and ``M`` are drawn uniformly from [-0.01, 0.01).
    """
    if num_factors is None:
        num_factors = num_of_factors

    X = ratings.pivot(index='user_id', columns='movie_id',
                      values='rating').to_numpy()

    # Read both dimensions from the shape so a frame with no rows does not
    # fail on X[0].
    num_users, num_movies = X.shape

    U = np.random.uniform(-0.01, 0.01, (num_users, num_factors))
    M = np.random.uniform(-0.01, 0.01, (num_factors, num_movies))

    return [X, U, M]


In [None]:
def partial_der_reg(error, element, own_element=None, regularization=None):
    """Return the regularised update direction for a latent factor.

    For a prediction ``u . m`` with residual ``error = x - u . m``, the
    descent direction for a factor ``p`` whose counterpart in the product is
    ``q`` is ``2 * error * q - regularization * p``.

    Args:
        error: Residual of the rating being fitted.
        element: The counterpart factor ``q`` (scalar or array).
        own_element: The factor ``p`` being updated (scalar or array). When
            omitted, the historical two-argument expression
            ``2 * error - regularization * element`` is returned so existing
            two-argument callers keep their result; that form is not a
            gradient of the squared error.
        regularization: Penalty weight. Defaults to the module-level
            ``reguralization_factor``.

    Returns:
        The value to be scaled by the learning rate and added to the factor.
    """
    reg = reguralization_factor if regularization is None else regularization
    if own_element is None:
        return 2*error - reg*element
    return 2*error*element - reg*own_element


def calculate_predictions(X, U, M, lr=None, iterations=None,
                          regularization=None, tolerance=1e-6):
    """Fit ``U`` and ``M`` to the observed entries of ``X`` with SGD.

    ``U`` and ``M`` are updated in place and also returned.

    Args:
        X: users x movies rating array; NaN marks a missing rating.
        U: (users, factors) float array of user factors.
        M: (factors, movies) float array of movie factors.
        lr: Learning rate. Defaults to the module-level ``learning_rate``.
        iterations: Maximum number of passes over the observed ratings.
            Defaults to the module-level ``num_of_iterations``.
        regularization: Penalty weight. Defaults to the module-level
            ``reguralization_factor``.
        tolerance: Training stops early once the RMSE of a pass changes by at
            most this much from the previous pass. Pass 0.0 to stop only on an
            exactly repeated value.

    Returns:
        ``[U, M]`` -- the same (mutated) arrays that were passed in.
    """
    lr = learning_rate if lr is None else lr
    iterations = num_of_iterations if iterations is None else iterations
    regularization = (reguralization_factor if regularization is None
                      else regularization)

    ratings = np.asarray(X, dtype=float)

    # The set of observed cells never changes, so locate them once instead of
    # scanning the whole matrix on every pass. np.nonzero yields row-major
    # order, i.e. the same visiting order as a nested i/j loop.
    rows, cols = np.nonzero(~np.isnan(ratings))
    if len(rows) == 0:
        # Nothing to fit; also avoids dividing by zero in the RMSE below.
        return [U, M]

    previous_rmse = None

    for _ in range(iterations):
        squared_error_sum = 0.0

        for i, j in zip(rows, cols):
            # Snapshot both vectors so each update uses the pre-step values
            # of the other.
            user_factors = U[i, :].copy()
            movie_factors = M[:, j].copy()

            error_xij = ratings[i, j] - np.dot(user_factors, movie_factors)
            squared_error_sum += error_xij ** 2

            U[i, :] = user_factors + lr * partial_der_reg(
                error_xij, movie_factors, user_factors, regularization)
            M[:, j] = movie_factors + lr * partial_der_reg(
                error_xij, user_factors, movie_factors, regularization)

        # Root of the *mean* squared error (divide before taking the root).
        rmse = np.sqrt(squared_error_sum / len(rows))
        if previous_rmse is not None and abs(previous_rmse - rmse) <= tolerance:
            break

        previous_rmse = rmse

    return [U, M]


In [None]:
# Rating matrix and freshly initialised factors for the complete ratings table.
X, U, M = create_matrices(ratings=ratings_table)

In [None]:
def calculate_rmse(predictions, actual):
    """Root-mean-square error between predictions and the known ratings.

    Only cells where both ``actual`` and ``predictions`` hold a number are
    scored; NaN in ``actual`` marks a rating that does not exist and must not
    contribute to the error.

    Args:
        predictions: 2-D array-like of predicted ratings.
        actual: 2-D array-like of true ratings with NaN for missing entries.
            Must have the same shape as ``predictions``.

    Returns:
        ``sqrt(mean(error ** 2))`` over the scored cells, or NaN when there is
        no cell to score.

    Raises:
        ValueError: If the two inputs do not have the same shape, since the
            cells could then not be compared position by position.
    """
    predicted = np.asarray(predictions, dtype=float)
    observed = np.asarray(actual, dtype=float)

    if predicted.shape != observed.shape:
        raise ValueError(
            "predictions and actual must have the same shape, got "
            f"{predicted.shape} and {observed.shape}")

    scored = ~np.isnan(observed) & ~np.isnan(predicted)
    if not scored.any():
        return float('nan')

    residuals = predicted[scored] - observed[scored]
    # Mean first, then root.
    return np.sqrt(np.mean(residuals ** 2))

In [None]:
def post_process_data(Users, Movies):
    """Multiply the factor matrices and clamp the result to the range 1..5.

    Args:
        Users: Left operand of the matrix product (user factors).
        Movies: Right operand of the matrix product (movie factors).

    Returns:
        The product ``Users x Movies`` with every value above 5 replaced by 5
        and every value below 1 replaced by 1.
    """
    scores = np.matmul(Users, Movies)
    # Clamp in place on the freshly created product array.
    np.clip(scores, 1, 5, out=scores)
    return scores

In [None]:
def k_fold_matrix_factorization(data, learning_rate, iterations, regularization, num_factors):
    """Run 5-fold cross-validation of the matrix factorisation.

    For every fold the factors are trained on the training ratings, the
    clamped prediction matrix is built, and the RMSE is measured on both the
    training ratings and the held-out ratings.

    Args:
        data: DataFrame with 'user_id', 'movie_id' and 'rating' columns; rows
            are split into folds by position.
        learning_rate, iterations, regularization, num_factors: Accepted for
            interface compatibility.
            NOTE(review): this function does not forward them;
            create_matrices and calculate_predictions are called without
            them, so training uses whatever those functions default to.

    Returns:
        ``[U_best, M_best, rmse_train_total, rmse_test_total]`` where the
        factors come from the fold with the lowest test RMSE and the two lists
        hold one RMSE per fold.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    rmse_train_total = []
    rmse_test_total = []
    best_test_rmse = None
    U_best, M_best = None, None

    for train_index, test_index in kf.split(data):
        train_data = data.iloc[train_index]
        test_data = data.iloc[test_index]

        X_train, U_train, M_train = create_matrices(train_data)

        # The prediction matrix is laid out like the training pivot. Pivoting
        # the test fold on its own gives different rows and columns, so
        # re-label it onto the training layout. Ratings whose user or movie
        # never appears in the training fold are dropped, because no
        # prediction exists for them.
        train_pivot = train_data.pivot(
            index='user_id', columns='movie_id', values='rating')
        X_test = (
            test_data
            .pivot(index='user_id', columns='movie_id', values='rating')
            .reindex(index=train_pivot.index, columns=train_pivot.columns)
            .to_numpy()
        )

        U_predicted, M_predicted = calculate_predictions(
            X_train, U_train, M_train)

        P_predicted = post_process_data(U_predicted, M_predicted)

        rmse_train = calculate_rmse(P_predicted, X_train)
        rmse_test = calculate_rmse(P_predicted, X_test)

        # Always keep the first fold so the return value is defined, and let
        # any finite score replace a NaN one.
        if (U_best is None or np.isnan(best_test_rmse)
                or rmse_test < best_test_rmse):
            best_test_rmse = rmse_test
            U_best, M_best = U_predicted, M_predicted

        rmse_train_total.append(rmse_train)
        rmse_test_total.append(rmse_test)

    return [U_best, M_best, rmse_train_total, rmse_test_total]


In [None]:
# Cross-validate on the full ratings table; U and M are rebound to the factors
# returned by the function.
U, M, rmse_train, rmse_test = k_fold_matrix_factorization(
    data=ratings_table,
    learning_rate=learning_rate,
    iterations=num_of_iterations,
    regularization=reguralization_factor,
    num_factors=num_of_factors,
)