## Matrix Factorization

In [2]:
# import later used packages
from surprise import SVD
import surprise
import pandas as pd
import numpy as np

# custom functions see .py files
from Rec_split import rec_split
from Kendall_distance import kendall_distance_with_penalty

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this hides the warning everywhere, not only where it is known to be harmless.
pd.set_option('mode.chained_assignment', None)

### Data Loading and Preprocessing

In [3]:
# Load the ratings table and discard the columns that the model does not use
df = pd.read_csv('data/ml_1M_full.csv').drop(columns=['Gender', 'Age', 'Occupation', 'Genre'])

# Split with the project helper (70% train / 15% validation shares are passed explicitly);
# the three returned frames are bound to train_df, val_df and test
train_df, val_df, test = rec_split(df, 'User', 'Timestamp', train_share=0.7, val_share=0.15)

# Surprise expects exactly (user, item, rating) columns plus a reader that knows the rating scale
reader = surprise.Reader(rating_scale=(1, 5))
rating_columns = ['User', 'Movie', 'Rating']

# Convert the train and validation frames into Surprise trainsets
train = surprise.Dataset.load_from_df(train_df[rating_columns], reader).build_full_trainset()
val = surprise.Dataset.load_from_df(val_df[rating_columns], reader).build_full_trainset()

### Hyperparameter Tuning

In [4]:
# Search grid for the tuning loop: every combination of the three lists is trained once
epochs = [
    50,
    100,
    250,
    500,
    1000,
]  # number of training epochs
learning_rate = [
    0.0001,
    0.001,
    0.1,
    1,
]  # learning rate applied to all model parameters
emb_dim = [
    16,
    32,
    64,
    128,
    256,
]  # number of latent factors

In [14]:
# Grid search over epochs x learning rate x embedding dimension.
# Every combination is trained on `train` and scored by RMSE on the validation ratings.

# the validation testset does not depend on the hyperparameters, so build it only once
val_testset = val.build_testset()

# one record per evaluated combination; the results frame is rebuilt from this list
records = []
results = pd.DataFrame()

# NOTE(review): SVD is created without a random_state, so repeated runs are
# presumably not bit-identical -- confirm whether a fixed seed is wanted.
for epoch in epochs:
    for lr in learning_rate:
        for K in emb_dim:
            # train the model for each hyperparameter combination
            mf = SVD(n_factors=K,
                     n_epochs=epoch,
                     lr_all=lr)
            mf.fit(train)

            # evaluate hyperparameter on validation set (rmse() also prints the value)
            preds = mf.test(val_testset)
            val_rmse = surprise.accuracy.rmse(preds)

            # save results of a specific hyperparameter combination
            records.append({'val RMSE': val_rmse,
                            'embedding dimention': K,
                            'learning rate': lr,
                            'epochs': epoch})

            # write a checkpoint after every combination so that an interrupted
            # search keeps everything evaluated so far
            results = pd.DataFrame(records)
            results.to_csv('results/MF_hyperparameter.csv')

RMSE: 0.9577
RMSE: 0.9584
RMSE: 0.9596
RMSE: 0.9621
RMSE: 0.9682
RMSE: 0.9073
RMSE: 0.9072
RMSE: 0.9035
RMSE: 0.9040
RMSE: 0.9062
RMSE: 0.9748
RMSE: 1.0131
RMSE: 1.0560
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 0.9344
RMSE: 0.9350
RMSE: 0.9364
RMSE: 0.9393
RMSE: 0.9447
RMSE: 0.8858
RMSE: 0.8841
RMSE: 0.8865
RMSE: 0.8929
RMSE: 0.9001
RMSE: 0.9741
RMSE: 1.0168
RMSE: 1.0570
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 0.9181
RMSE: 0.9187
RMSE: 0.9199
RMSE: 0.9204
RMSE: 0.9240
RMSE: 0.8824
RMSE: 0.9000
RMSE: 0.9186
RMSE: 0.9235
RMSE: 0.9119
RMSE: 0.9751
RMSE: 1.0157
RMSE: 1.0569
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 0.9099
RMSE: 0.9071
RMSE: 0.9046
RMSE: 0.9048
RMSE: 0.9067
RMSE: 0.8964
RMSE: 0.9256
RMSE: 0.9482
RMSE: 0.9380
RMSE: 0.9104
RMSE: 0.9738
RMSE: 1.0173
RMSE: 1.0564
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814
RMSE: 1.8814

In [8]:
# Reload the saved grid-search results (the unnamed first CSV column is the old index)
# and order them so that the lowest validation RMSE comes first
results = (
    pd.read_csv('results/MF_hyperparameter.csv')
    .drop(columns='Unnamed: 0')
    .sort_values('val RMSE')
)
results

Unnamed: 0,val RMSE,embedding dimention,learning rate,epochs
45,0.882416,16,0.0010,250
26,0.884062,32,0.0010,100
25,0.885840,16,0.0010,100
81,0.885891,32,0.0001,1000
27,0.886497,64,0.0010,100
...,...,...,...,...
15,1.881353,16,1.0000,50
39,1.881353,256,1.0000,100
73,1.881353,128,0.1000,500
59,1.881353,256,1.0000,250


### Model Evaluation

In [9]:
# Hyperparameters used for the final model.
# These are typed in by hand, so they must be updated manually if the search is re-run.
epochs_opt = 250     # training epochs
lr_opt = 0.001       # learning rate
emb_dim_opt = 16     # number of latent factors

In [10]:
# Final model: same SVD estimator as in the search, configured with the chosen hyperparameters
mf_opt = SVD(n_factors=emb_dim_opt, n_epochs=epochs_opt, lr_all=lr_opt)

# fit on the training trainset only (fit() is the last expression, so its return value is displayed)
mf_opt.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x237ff434050>

In [11]:
# build prediction dataset: all (user, movie) pairs that have no rating in the training set
predset = train.build_anti_testset()

# generate predictions
predictions = mf_opt.test(predset)

# convert predictions to a pandas dataframe
# Only the three columns that are kept are extracted; the true rating and the
# 'was_impossible' flag used to be materialised for every pair and then dropped again.
predictions = pd.DataFrame(
    [(pred.uid, pred.iid, pred.est) for pred in predictions],
    columns=['User', 'Movie', 'Prediction'],
)

In [31]:
# Spot check: test rows of user 1 ...
a = test.loc[test['User'] == 1]

# ... and the rating that user gave to movie 2687
b = a.loc[a['Movie'] == 2687, 'Rating']

b

0    3
Name: Rating, dtype: int64

In [37]:
# Evaluation of the top-k recommendations against the held-out test ratings.
#
# Metrics, averaged over all training users:
#   * Weighted Hit Rate @k       - sum of rating-dependent gains of the recommended movies
#                                  that have a test rating, divided by k
#   * User Satisfaction @k       - 1 if at least one recommended movie has a test rating of 4 or 5
#   * User Satisfaction (2) @k   - 1 if at least one recommended movie has a test rating of 5
#   * Kendall distance with penalty (p=0.05 and p=0.2) over the user's whole prediction list
#
# The per-user work (row lookup, sorting, Kendall distance) is done once and reused for every k.
# Previously each user was found again for every k by scanning the full prediction and test
# frames, and the same predictions were re-sorted for each k.

# Extracting unique users from the training data
users = train_df.User.unique()

# Cut-offs of the top-k recommendation lists that are evaluated
k_values = [1, 5, 10, 20, 50]
max_k = max(k_values)

# Contribution of one recommended movie to the Weighted Hit Rate, keyed by its test rating
whr_gain = {1: -5, 2: -2, 3: 2, 4: 6, 5: 12}

# Row positions of every user in both frames, computed once
prediction_rows = predictions.groupby('User', sort=False).indices
test_rows = test.groupby('User', sort=False).indices


def _rows_of_user(frame, row_positions, user):
    """Return the rows of `frame` that belong to `user`, in their original order.

    `row_positions` maps a user id to the positional row indices of that user in
    `frame`. A user without any row yields an empty frame with the same columns.
    """
    positions = row_positions.get(user)
    if positions is None:
        return frame.iloc[0:0]
    return frame.iloc[positions]


# List to store Kendall Distance sums for each user
kendal_sum = []
kendal_sum_2 = []

# Per-user data reused for every k (same order as `users`)
top_movies_allu = []     # the max_k best predicted movies, best first
test_ratings_allu = []   # movie -> test rating, only for movies rated exactly once in the test set

# First pass: one lookup, one sort and one Kendall computation per user
for user in users:
    ratings = _rows_of_user(test, test_rows, user)  # test ratings of the current user
    predictions_user = _rows_of_user(predictions, prediction_rows, user)  # predictions of the current user

    # The top-k list for any k <= max_k is a prefix of this list
    ranked = predictions_user.sort_values('Prediction', ascending=False)
    top_movies_allu.append(list(ranked['Movie'].head(max_k)))

    # A recommendation only counts as a hit if the movie has exactly one test rating
    rated_once = ratings.drop_duplicates('Movie', keep=False)
    test_ratings_allu.append(dict(zip(rated_once['Movie'], rated_once['Rating'])))

    # Calculating Kendall Distance with penalty
    # only once as it uses the whole sequence of predictions and is therefore independent of k
    kendal_sum.append(
        kendall_distance_with_penalty(predictions_user, ratings, 'Movie', 'Movie', 'Prediction', 'Rating', p=0.05))
    kendal_sum_2.append(
        kendall_distance_with_penalty(predictions_user, ratings, 'Movie', 'Movie', 'Prediction', 'Rating', p=0.2))

# One record per k; the result frames are built from these lists in a single step
whr_records = []
sat_records = []
sat_2_records = []

# Second pass: looping through different values of k
for k in k_values:
    whrs = []  # Weighted Hit Rate of each user
    sat_us = []  # User Satisfaction of each user
    sat_us_2 = []  # User Satisfaction (stricter threshold) of each user
    recommendations_allu = []  # top-k recommendations of each user

    for top_movies, test_ratings in zip(top_movies_allu, test_ratings_allu):
        recommendations = top_movies[:k]
        whr = 0
        sat = 0
        sat_2 = 0

        for rec in recommendations:
            if rec not in test_ratings:
                continue  # no (unique) test rating for this movie -> neither gain nor penalty
            rat = test_ratings[rec]
            whr += whr_gain.get(rat, 0)
            if rat == 4 or rat == 5:
                sat = 1
            if rat == 5:
                sat_2 = 1

        whrs.append(whr / k)  # always divided by k, even if fewer than k movies were available
        sat_us.append(sat)
        sat_us_2.append(sat_2)
        recommendations_allu.append(recommendations)

    # Averages over all users for the current k
    whr_records.append({'Average Weigthed Hit Rate': np.mean(whrs), 'k': k})
    sat_records.append({'Average User Satisfaction': np.mean(sat_us), 'k': k})
    sat_2_records.append({'Average User Satisfaction': np.mean(sat_us_2), 'k': k})

    # Saving the recommendations for current k to a CSV file
    # NOTE(review): despite the column labels, 'Element' holds the position of the user in
    # `users` and 'Occurrence Count' holds that user's list of recommended movies; no counts
    # are computed here. Left unchanged because consumers of this file are not visible --
    # confirm whether a value count per movie was intended.
    recommendations_series = pd.Series(recommendations_allu)
    recommendations_k = pd.DataFrame({'Element': recommendations_series.index,
                                      'Occurrence Count': recommendations_series.values})
    recommendations_k.to_csv(f'results/Recommendation_distribution@{k}.csv')

# Result frames: one row per k
awhrs = pd.DataFrame(whr_records)
asats = pd.DataFrame(sat_records)
asats_2 = pd.DataFrame(sat_2_records)

# Calculating average Kendall Distance
kendal = pd.DataFrame({'Kendall Distance': np.mean(kendal_sum), 'p': 0.05}, index=[0])
kendal_2 = pd.DataFrame({'Kendall Distance': np.mean(kendal_sum_2), 'p': 0.2}, index=[0])
# Concatenating both Kendall Distance DataFrames
kendal = pd.concat([kendal, kendal_2], ignore_index=True)

In [38]:
# Persist every metric frame under results/ (one CSV per metric)
for metric_frame, csv_path in (
    (awhrs, 'results/MF_awhrs.csv'),
    (asats, 'results/MF_asats.csv'),
    (asats_2, 'results/MF_asats2.csv'),
    (kendal, 'results/MF_Kendall.csv'),
):
    metric_frame.to_csv(csv_path)