In [212]:
import numpy as np
import sklearn as sk
import random    
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

## Task 1

In [None]:

ratings_table = pd.read_csv(filepath_or_buffer='./ml-1m/ratings.dat',
                            sep='::', delimiter=None, header=0, engine='python')

# 1.
mean_rating_global = ratings_table['Rating'].mean()
# 2.
mean_rating_per_movie = ratings_table.groupby('MovieID')['Rating'].mean()
# .3
mean_rating_per_user = ratings_table.groupby('UserID')['Rating'].mean()


def generate_X_set(columns, *features):
    matrix = np.vstack(features[:2]).T
    X_set = pd.DataFrame(data=matrix, columns=columns)
    return X_set
    
X = generate_X_set(['UserID', 'MovieID'],*[ratings_table['UserID'], ratings_table['MovieID']])

X_per_movie = generate_X_set(['MovieID', 'UserID'],*[ratings_table['UserID'], ratings_table['MovieID'], mean_rating_per_movie])


In [209]:
X_global = X.copy()

X_global['global_average'] = mean_rating_global

X_per_user = pd.merge(left=X, right=mean_rating_per_user, how='left', 
              left_on='UserID', right_index=True)

X_per_movie = pd.merge(left=X, right=mean_rating_per_movie, how='left', 
              left_on='MovieID', right_index=True)

X_per_user_and_movie = pd.merge(left=X_per_user, right=mean_rating_per_movie, how='left', 
              left_on='MovieID', right_index=True)

y = ratings_table['Rating']

In [210]:
from re import T
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def k_fold_split_training(X, y, intercept=False): 
    
    kf = KFold(n_splits=5, shuffle=True, random_state=32)
    rmse_test = []
    rmse_train = []
    
    mae_test = []
    mae_train = []
    
    for train_index, test_index in kf.split(X):
    
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        linear_reg = LinearRegression(fit_intercept=intercept)

        linear_reg.fit(X_train, y_train)
        
        y_predict_test = linear_reg.predict(X_test)
        y_predict_train = linear_reg.predict(X_train)
        
        rmse_train.append(np.sqrt(mean_squared_error(y_train, y_predict_train)))
        rmse_test.append(np.sqrt(mean_squared_error(y_test, y_predict_test)))
        mae_train.append(mean_absolute_error(y_train, y_predict_train))
        mae_test.append(mean_absolute_error(y_test, y_predict_test))

    return np.sum(rmse_test)/5, np.sum(rmse_train)/5, np.sum(mae_test)/5, np.sum(mae_train)/5

In [215]:
total_rmses_train = []
total_rmses_test = []
total_maes_train = []
total_maes_test = []

rmse_test, rmse_train, mae_test, mae_train = k_fold_split_training(X_global, y)
total_rmses_train.append(rmse_train)
total_rmses_test.append(rmse_test)
total_maes_train.append(mae_train)
total_maes_test.append(mae_test)
rmse_test, rmse_train, mae_test, mae_train = k_fold_split_training(X_per_user, y)
total_rmses_train.append(rmse_train)
total_rmses_test.append(rmse_test)
total_maes_train.append(mae_train)
total_maes_test.append(mae_test)
rmse_test, rmse_train, mae_test, mae_train = k_fold_split_training(X_per_movie, y)
total_rmses_train.append(rmse_train)
total_rmses_test.append(rmse_test)
total_maes_train.append(mae_train)
total_maes_test.append(mae_test)
rmse_test, rmse_train, mae_test, mae_train = k_fold_split_training(X_per_user_and_movie, y)
total_rmses_train.append(rmse_train)
total_rmses_test.append(rmse_test)
total_maes_train.append(mae_train)
total_maes_test.append(mae_test)
rmse_test, rmse_train, mae_test, mae_train = k_fold_split_training(X_per_user_and_movie, y, True)
total_rmses_train.append(rmse_train)
total_rmses_test.append(rmse_test)
total_maes_train.append(mae_train)
total_maes_test.append(mae_test)

print('rmse train: ', total_rmses_train)
print('rmse test: ', total_rmses_test)
print('mae train: ', total_maes_train)
print('mae test: ', total_maes_test)

rmse train:  [1.1147380346457683, 1.0263643938220377, 0.9747097226241838, 0.9442214366365042, 0.9155427386251869]
rmse test:  [1.1147405104341572, 1.026366945489675, 0.9747135005426525, 0.9442245824478661, 0.9155464240466344]
mae train:  [0.9301582475623554, 0.8213175634092712, 0.7787995924883884, 0.7565995537623753, 0.7258099996638087]
mae test:  [0.9301598218103528, 0.8213185423861319, 0.7788031490903922, 0.7566016666222036, 0.7258125623984603]


## Task 2

In [200]:
def create_u_v(m):
    u = np.full((m.shape[0],2), 1)
    v = np.full((2,m.shape[1]), 1)
    u = u.astype(np.float32)
    v = v.astype(np.float32)
    return u,v

In [206]:
def normalize_data(m): 
    Row_df = m.pivot(index = 'user_id', columns ='movie_id', values = 'rating')
    u_mean = Row_df.mean(axis=1)
    Row_df_array = Row_df.to_numpy()
    u_mean = u_mean.to_numpy()
    #creating a normal matrix to compare to our uv matrix
    normal = Row_df_array - u_mean.reshape(-1,1)
    N = normal
    return N,Row_df

In [201]:
def update_v(u,v,N):
    sums = 0
    u_ik = u[:,:]
    v_ks = v[:,s]
    u_ik_del = np.delete(u_ik, r, 1)
    v_ks_del = np.delete(v_ks, r, 0)
    u_ir = u[:,r]
    u_ir_squared = u_ir ** 2
    u_ik_v_ks = np.dot(u_ik_del, v_ks_del)
    m_is = N[:,s]
    error = m_is - u_ik_v_ks
    uir_dot_er = u_ir * error
    sumsv = np.nansum(uir_dot_er)
    u_ir_ssum = np.nansum(u_ir_squared * (~np.isnan(m_is)))
    newval_v =  sumsv / u_ir_ssum
    v[r,s] = v[r,s] + ((newval_v - v[r,s]))
    return u,v

In [202]:
def mae(dif):
    dif_abs= (np.absolute(dif))
        #converting all nan values to a zero value.
    dif_abs_0s = np.nan_to_num(dif_abs)
    dif_abs_sum = np.sum(dif_abs_0s,axis=0)
    sum_dif = dif_abs_sum.sum()
    non_0_count = np.count_nonzero(dif_abs_0s)
    MAE=sum_dif/non_0_count
    return MAE

In [203]:
def rmse(dif):
    dif_sqr = dif ** 2
    dif_sqr_0s = np.nan_to_num(dif_sqr)
    dif_sqr_total= np.sum( dif_sqr_0s ,axis=0)
    sumz = dif_sqr_total.sum()
    non_0_count_sqr = np.count_nonzero( dif_sqr_0s )
    RMSE = sumz/ non_0_count_sqr
    return RMSE

In [207]:
# UV Decomposition - Training

#input the path of ratings.dat file
RT = pd.read_csv('./ml-1m/ratings.dat', engine='python', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'])

#create a kfold function to divide the data into 5 random sets for cross validation
KF = KFold(n_splits=5, shuffle=True, random_state=9)
c = 2
i = 5

#start the iteration for each of the 5 folds
for train_index, test_index in KF.split(RT):
    RT_train, RT_test = RT.loc[train_index], RT.loc[test_index]
    #create a dataframe to store all ratings as values for each movie in a coloumn with every user id as index of the rows.
    normal,Row_df = normalize_data(RT_train)
    N = normal
    Row_df_array = Row_df.to_numpy()
    #creating uv matrix components with u having n X d and v having d X m ( where n = number of users, m = number of movies and d = 2)
    u,v = create_u_v(normal)
    uv = np.dot(u,v)
    print("Index:", train_index)
  # updating u using the formula x =(Σj vsj (mrj−Σk̸=surkvkj))/Σjv^2sj
    for iterations in range(i):
        for r in range(6040):
            for s in range(c):
                u,v = update_u(u,v,N)
        #update v using the formula y = (Σiuir(mis−Σk̸=ruikvks))/Σiu^2ir
        for r in range(c):
            for s in range(Row_df_array.shape[1]):
                u,v = update_v(u,v,N)
        uv = np.dot(u,v)
        dif = uv-normal
        print("Iteration Number: ",iterations )
        MAE = mae(dif)
        print('MAE',MAE)
        #calculating RMSE
        RMSE = rmse(dif)
        print('RMSE=',RMSE)

  u_mean = Row_df.mean(axis=1)


TypeError: unsupported operand type(s) for -: 'str' and 'float'

In [None]:
# UV Decomposition - Test

#input the path of ratings.dat file
RT = pd.read_csv('ratings.dat', engine='python', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'])

#create a kfold function to divide the data into 5 random sets for cross validation
KF = KFold(n_splits=5, shuffle=True, random_state=9)
c = 2
i = 5

#start the iteration for each of the 5 folds
for train_index, test_index in KF.split(RT):
    RT_train, RT_test = RT.loc[train_index], RT.loc[test_index]
    #create a dataframe to store all ratings as values for each movie in a coloumn with every user id as index of the rows.
    normal,Row_df = normalize_data(RT_test)
    N = normal
    Row_df_array = Row_df.to_numpy()
    #creating uv matrix components with u having n X d and v having d X m ( where n = number of users, m = number of movies and d = 2)
    u,v = create_u_v(normal)
    uv = np.dot(u,v)
    print("Index:", test_index)
  # updating u using the formula x =(Σj vsj (mrj−Σk̸=surkvkj))/Σjv^2sj
    for iterations in range(i):
        for r in range(1510):
            for s in range(c):
                u,v = update_u(u,v,N)
        #update v using the formula y = (Σiuir(mis−Σk̸=ruikvks))/Σiu^2ir
        for r in range(c):
            for s in range(Row_df_array.shape[1]):
                u,v = update_v(u,v,N)
        uv = np.dot(u,v)
        dif = uv-normal
        print("Iteration Number: ",iterations )
        MAE = mae(dif)
        print('MAE',MAE)
        #calculating RMSE
        RMSE = rmse(dif)
        print('RMSE=',RMSE)

## Task 3

In [None]:
learning_rate = 0.0005
num_of_iterations = 75
reguralization_factor = 0.05
num_of_factors = 10

In [2]:
ratings_table = pd.read_csv(filepath_or_buffer='./ml-1m/ratings.dat',
                            sep='::', delimiter=None, header=0, names=['user_id', 'movie_id', 'rating', 'timestamp'] ,engine='python')

In [3]:
from cmath import isnan

def create_matrices(ratings):
    
    user_ids = ratings['user_id'].unique().tolist()
    movie_ids = ratings['movie_id'].unique().tolist()

    num_users = len(user_ids)
    num_movies = len(movie_ids)

    idx_user = range(num_users)
    idx_movie = range(num_movies)

    mapping_user = dict(zip(user_ids, idx_user))
    mapping_movie = dict(zip(movie_ids, idx_movie))
    
    X = np.array(ratings.pivot(index='user_id',columns='movie_id', values='rating'))
    U = np.random.uniform(-0.01, 0.01, (num_users, num_of_factors))
    M = np.random.uniform(-0.01, 0.01, (num_of_factors, num_movies))

    return [X, U, M, mapping_user, mapping_movie]


In [4]:
def partial_der_reg(error, element):
    deriv_reg = 2*error - reguralization_factor*element
    return deriv_reg


def calculate_predictions(X, U, M):
    previous_rmse = 100000

    for _ in range(num_of_iterations):
        total_errors = []
        for i, xi in enumerate(X):
            for j, xj in enumerate(X[i]):

                if isnan(X[i, j]):
                    continue

                pred_xij = np.dot(U[i, :], M[:, j])

                error_xij = X[i, j] - pred_xij

                total_errors.append(error_xij)

                for k in range(num_of_factors):
                    Uik = U[i, k] + learning_rate * \
                        (partial_der_reg(error_xij, M[k, j]))

                    Mkj = M[k, j] + learning_rate * \
                        (partial_der_reg(error_xij, U[i, k]))

                    U[i, k] = Uik
                    M[k, j] = Mkj
    
        rmse = np.sqrt(np.sum(np.array(total_errors)**2))/len(total_errors)
        if rmse == previous_rmse:
            break

        previous_rmse = rmse

    return [U, M]


In [5]:
def calculate_rmse(predictions, actual):
    total_errors = []
    for i in range(len(actual)):
        for j in range(len(actual[i])):
            if not isnan(actual[i][j]):
                error = predictions[i][j] - actual[i][j]
                total_errors.append(error)
                
    return np.sqrt(np.sum(np.array(total_errors)**2))/len(total_errors)

In [6]:
def calculate_rmse_test(predictions, actual, user_train_mapping, movie_train_mapping, user_test_mapping, movie_test_mapping):
    total_errors = []
    for u_id, u_index in user_test_mapping.items():
      for m_id, m_index in movie_test_mapping.items():
            if u_id in user_train_mapping and m_id in movie_train_mapping:
                if not isnan(actual[u_index][m_index]):
                    error = predictions[user_train_mapping[u_id]][movie_train_mapping[m_id]] - actual[u_index][m_index]
                    total_errors.append(error)

    return np.sqrt(np.sum(np.array(total_errors)**2))/len(total_errors)

In [7]:
def post_process_data(Users, Movies):
    predictions = np.matmul(Users, Movies)
    
    predictions[predictions > 5] = 5
    predictions[predictions < 1] = 1
     
    return predictions

In [8]:
def k_fold_matrix_factorization(data, learning_rate, iterations, regularization, num_factors):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    rmse_train_total = []
    rmse_test_total = []
    previous_test_rmse = 10000

    for train_index, test_index in kf.split(data):
        train_data = data.iloc[train_index]
        test_data = data.iloc[test_index]
        
        X_train, U_train, M_train, user_train_mapping, movie_train_mapping = create_matrices(train_data)
        
        X_test, U_test, M_test, user_test_mapping, movie_test_mapping = create_matrices(test_data)

        U_predicted, M_predicted = calculate_predictions(
            X_train, U_train, M_train)

        P_predicted = post_process_data(U_predicted, M_predicted)

        rmse_train = calculate_rmse(P_predicted, X_train)
        rmse_test = calculate_rmse_test(P_predicted, X_test, user_train_mapping, movie_train_mapping, user_test_mapping, movie_test_mapping)

        if rmse_test < previous_test_rmse:
            previous_test_rmse = rmse_test
            U_best, M_best = U_predicted, M_predicted
            user_train_mapping_best, movie_train_mapping_best = user_train_mapping, movie_train_mapping #maximos

        rmse_train_total.append(rmse_train)
        rmse_test_total.append(rmse_test)

    return [U_best, M_best, rmse_train_total, rmse_test_total, user_train_mapping_best, movie_train_mapping_best]

In [9]:
U, M, rmse_train, rmse_test, user_train_mapping_best, movie_train_mapping_best = k_fold_matrix_factorization(ratings_table, learning_rate, num_of_iterations, reguralization_factor, num_of_factors)
