In [108]:
import numpy as np
import pandas as pd

import sklearn as sk

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import KFold

In [217]:

# Load the three data tables (the '../ml-1m/' path suggests the MovieLens 1M dump).
# Fields are separated by the multi-character token '::', hence the Python parsing
# engine; the first line of each file is consumed as the header row.
ratings_table = pd.read_csv('../ml-1m/ratings.dat', sep='::', header=0, engine='python')

movies_table = pd.read_csv('../ml-1m/movies.dat', sep='::', header=0, engine='python')

users_table = pd.read_csv('../ml-1m/users.dat', sep='::', header=0, engine='python')


# 1. Mean of every rating in the table (a single scalar).
mean_rating_global = ratings_table['Rating'].mean()
# 2. Mean rating of each movie, as a Series indexed by MovieID.
mean_rating_per_movie = ratings_table['Rating'].groupby(ratings_table['MovieID']).mean()
# 3. Mean rating given by each user, as a Series indexed by UserID.
mean_rating_per_user = ratings_table['Rating'].groupby(ratings_table['UserID']).mean()


def generate_X_set(columns, *features):
    """Assemble equally long 1-D feature vectors into a labelled DataFrame.

    Parameters
    ----------
    columns : sequence of str
        Column labels of the result. Its length decides how many of the
        leading ``features`` are used (previously this was hard-coded to 2).
    *features : array-like
        One 1-D vector per column, in the same order as ``columns``. Any
        vectors beyond ``len(columns)`` are ignored, as before.

    Returns
    -------
    pandas.DataFrame
        One row per observation, one column per entry of ``columns``.

    Raises
    ------
    ValueError
        If fewer feature vectors than column labels are supplied.
    """
    n_columns = len(columns)
    if len(features) < n_columns:
        raise ValueError(
            "expected at least %d feature vectors for columns %r, got %d"
            % (n_columns, list(columns), len(features))
        )

    # Each feature becomes a row of the stacked array; transpose so that
    # features end up as columns and observations as rows.
    matrix = np.vstack(features[:n_columns]).T
    return pd.DataFrame(data=matrix, columns=columns)
    
# Base design matrix: one row per rating, holding the rater and the rated movie.
X = generate_X_set(['UserID', 'MovieID'], ratings_table['UserID'], ratings_table['MovieID'])

# X_per_user = generate_X_set(['UserID', 'MovieID'],*[ratings_table['UserID'], ratings_table['MovieID'], mean_rating_per_user])

# Same two columns in (MovieID, UserID) order. The features are now passed in the
# same order as their labels; previously the UserID values were stored under the
# 'MovieID' label and vice versa. The per-movie means that used to be passed as a
# third feature were never used by generate_X_set and have been dropped.
# NOTE(review): X_per_movie is reassigned by a later cell before it is used.
X_per_movie = generate_X_set(['MovieID', 'UserID'], ratings_table['MovieID'], ratings_table['UserID'])


In [226]:
# def add_features_in_X_set(X_set, features, left_on):
#     X_set = pd.merge(left=X_set, right=mean_rating_per_user, how='left', 
#               left_on=left_on, right_index=True)
# Baseline features: the (UserID, MovieID) pairs plus a constant column holding
# the global mean rating.
X_global = X.assign(global_average=mean_rating_global)

# Attach each user's mean rating by looking UserID up in the Series index.
# The added column keeps the Series name, 'Rating'.
X_per_user = X.merge(mean_rating_per_user, how='left',
                     left_on='UserID', right_index=True)

# Attach each movie's mean rating by looking MovieID up in the Series index.
X_per_movie = X.merge(mean_rating_per_movie, how='left',
                      left_on='MovieID', right_index=True)

# Both means together. The two 'Rating' columns collide, so pandas' default
# merge suffixes rename them: 'Rating_x' is the user mean, 'Rating_y' the movie mean.
X_per_user_and_movie = X_per_user.merge(mean_rating_per_movie, how='left',
                                        left_on='MovieID', right_index=True)

# Regression target: the observed rating of every (user, movie) row.
y = ratings_table['Rating']

In [228]:
from re import T
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error



def k_fold_split_training(X, y, intercept=False):
    """Fit and score a linear regression on each of 5 shuffled folds.

    For every fold a fresh ``LinearRegression`` is fitted on the training rows
    and evaluated on the held-out rows; MSE, RMSE, MAE and R^2 are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per observation (indexed positionally via ``.iloc``).
    y : array-like
        Targets aligned row-by-row with ``X``. May be a pandas Series with any
        index or a NumPy array; rows are always selected by position.
    intercept : bool, default False
        Forwarded to ``LinearRegression(fit_intercept=...)``.

    Returns
    -------
    None
        Results are only printed.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=32)

    # KFold.split yields *positional* row numbers. Indexing a Series with
    # ``y[train_index]`` is label-based and is only correct when the index
    # happens to be 0..n-1, so select from a plain array instead.
    y_values = np.asarray(y)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y_values[train_index], y_values[test_index]

        # TODO (from the original author): initialise the model with gamma or
        # other hyperparameters.
        linear_reg = LinearRegression(fit_intercept=intercept)
        linear_reg.fit(X_train, y_train)

        y_predict = linear_reg.predict(X_test)

        # Compute the MSE once and reuse it for the RMSE.
        mse = mean_squared_error(y_test, y_predict)

        # The mean squared error
        print("Mean squared error: %.3f" % mse)
        # The root mean squared error
        print("Root mean squared error: %.3f" % np.sqrt(mse))
        # The mean absolute error
        print("Mean absolute error: %.3f" % mean_absolute_error(y_test, y_predict))
        # The coefficient of determination: 1 is perfect prediction
        print("Coefficient of determination: %.3f" % r2_score(y_test, y_predict))

# Cross-validate every feature table in turn. Each entry pairs a table with the
# fit_intercept setting; only the last run fits an intercept.
for features, use_intercept in (
    (X_global, False),
    (X_per_user, False),
    (X_per_movie, False),
    (X_per_user_and_movie, False),
    (X_per_user_and_movie, True),
):
    k_fold_split_training(features, y, use_intercept)


Mean squared error: 1.242
Root mean squared error: 1.114
Mean absolute error: 0.930
Coefficient of determination: 0.004
Mean squared error: 1.243
Root mean squared error: 1.115
Mean absolute error: 0.931
Coefficient of determination: 0.005
Mean squared error: 1.243
Root mean squared error: 1.115
Mean absolute error: 0.930
Coefficient of determination: 0.004
Mean squared error: 1.243
Root mean squared error: 1.115
Mean absolute error: 0.930
Coefficient of determination: 0.004
Mean squared error: 1.242
Root mean squared error: 1.114
Mean absolute error: 0.931
Coefficient of determination: 0.004
Mean squared error: 1.053
Root mean squared error: 1.026
Mean absolute error: 0.821
Coefficient of determination: 0.156
Mean squared error: 1.053
Root mean squared error: 1.026
Mean absolute error: 0.821
Coefficient of determination: 0.157
Mean squared error: 1.053
Root mean squared error: 1.026
Mean absolute error: 0.821
Coefficient of determination: 0.156
Mean squared error: 1.054
Root mean squa

In [None]:
#     X_total_train_sets = np.array([])
#     X_total_test_sets = np.array([])
#     Y_total_train_sets = np.array([])
#     Y_total_test_sets = np.array([])
   

# X_total_train_sets, X_total_test_sets, Y_total_train_sets, Y_total_test_sets = k_fold_split_training(X2_features,y)
# #, X_total_test_sets, Y__total_train_sets, Y__total_test_sets
# # len(X_total_test_sets)

# for X_train, y_train in zip(X_total_train_sets, Y_total_train_sets):
    
#     linear_reg = LinearRegression()

#     linear_reg.fit(X_train, y_train)
    
#     # The coefficients
#     print("Coefficients: \n", linear_reg.coef_)
    
#     for X_test, y_test in zip(X_total_test_sets, Y_total_test_sets):
        
        
#         y_predict = linear_reg.predict(X_test)
        
#         print(True in np.isnan(y_predict))
#         # The mean squared error
#         print("Mean squared error: %.3f" % mean_squared_error(y_test, y_predict))
#         print("Mean squared root error: %.3f" % np.sqrt(mean_squared_error(y_test, y_predict)))
#         # The mean absolute error
#         print("Mean absolute error: %.3f" % mean_absolute_error(y_test, y_predict))
#         # The coefficient of determination: 1 is perfect prediction
#         print("Coefficient of determination: %.3f" % r2_score(y_test, y_predict))


In [None]:
# Single hold-out evaluation of one linear model.
# X_train / X_test / y_train / y_test used to be read here without ever being
# defined at notebook scope (they only exist as locals of k_fold_split_training),
# so this cell raised NameError on a fresh kernel. Build the split explicitly from
# the first fold of the same shuffled 5-fold configuration used for cross-validation.
# NOTE(review): X_per_user_and_movie is assumed to be the intended feature table - confirm.
holdout_splitter = KFold(n_splits=5, shuffle=True, random_state=32)
train_index, test_index = next(holdout_splitter.split(X_per_user_and_movie))

X_train = X_per_user_and_movie.iloc[train_index]
X_test = X_per_user_and_movie.iloc[test_index]
# Read the target from ratings_table directly so the cell does not depend on
# whatever the name `y` is currently bound to.
y_train = ratings_table['Rating'].iloc[train_index]
y_test = ratings_table['Rating'].iloc[test_index]

linear_reg = LinearRegression()

# train the model with the training data
linear_reg.fit(X_train, y_train)


y_predict = linear_reg.predict(X_test)


# The coefficients
print("Coefficients: \n", linear_reg.coef_)
# The mean squared error
print("Mean squared error: %.3f" % mean_squared_error(y_test, y_predict))
# The root mean squared error (label aligned with k_fold_split_training)
print("Root mean squared error: %.3f" % np.sqrt(mean_squared_error(y_test, y_predict)))
# The mean absolute error
print("Mean absolute error: %.3f" % mean_absolute_error(y_test, y_predict))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.3f" % r2_score(y_test, y_predict))


In [None]:
# movies_users_merged = pd.merge(left=ratings_table['UserID'], right=ratings_table['MovieID'], on='UserID')

def pivot_table(data, values, index, column, fill_value=None):
    """Pivot ``data`` into an ``index`` x ``column`` table of ``values``.

    Thin wrapper around the pandas pivot-table routine that exposes the column
    keys under the singular name ``column``. Cells with no observation are
    replaced by ``fill_value`` when one is given.
    """
    pivoted = data.pivot_table(
        values=values,
        index=index,
        columns=column,
        fill_value=fill_value,
    )
    return pivoted


# Movie-by-user rating matrix: rows are MovieIDs, columns are UserIDs, and every
# (movie, user) pair without a rating is filled with the global mean rating.
df_users_movies = pivot_table(
    ratings_table,
    ['Rating'],
    ['MovieID'],
    ['UserID'],
    mean_rating_global,
)

print(df_users_movies)
# NOTE(review): this rebinds `y`, which an earlier cell set to the per-rating
# target Series; re-running the cross-validation cell after this one would
# pick up this matrix instead.
y = df_users_movies.to_numpy()