In [1]:
# Import the packages used throughout the notebook
import pandas as pd
from itertools import product
import numpy as np

# Import project-local helper functions (see Rec_split.py and Kendall_distance.py)
from Rec_split import rec_split
from Kendall_distance import kendall_distance_with_penalty

# Seed NumPy's global generator so the random "predictions" drawn later are reproducible.
np.random.seed(seed=123)
# Disable pandas' chained-assignment check (no SettingWithCopyWarning is emitted).
pd.options.mode.chained_assignment = None

### Load and Preprocess Data

In [2]:
# Read the full ratings table from disk.
ratings_csv = 'data/ml_1M_full.csv'
data = pd.read_csv(ratings_csv)

# Split into train / validation / test with the project helper.
# NOTE(review): presumably a per-user, time-ordered split (70% / 15% / remainder) - confirm in Rec_split.py.
split_frames = rec_split(data, 'User', 'Timestamp', train_share=0.7, val_share=0.15)
train, val, test = split_frames

In [3]:
# Build the candidate (User, Movie) pairs that will receive a prediction, then
# attach the observed test ratings to them.
movie_columns = ['Movie', 'Genre', 'Release_Year']
user_columns = ['User', 'Gender', 'Age', 'Occupation']
pair_keys = ['User', 'Movie']


def _drop_seen_pairs(candidates, seen):
    """Return the rows of `candidates` whose (User, Movie) pair does not occur in `seen`.

    Only the key columns of `seen` take part in the join, so no `_x` / `_y`
    suffixed columns are created and nothing has to be dropped or renamed
    afterwards, whatever other columns `seen` carries.
    """
    seen_pairs = seen[pair_keys].drop_duplicates()
    flagged = candidates.merge(seen_pairs, on=pair_keys, how='left', indicator=True)
    return flagged.loc[flagged['_merge'] == 'left_only'].drop(columns='_merge')


movie_df = train[movie_columns].drop_duplicates()
user_df = train[user_columns].drop_duplicates()

# Pair every movie with every user. A cross join yields the same rows, row order
# and column order as a Python-level itertools.product over the two frames, but
# stays inside pandas and keeps the original column dtypes.
prediction_df = movie_df.merge(user_df, how='cross')

# Remove pairs already in train, then pairs already in val.
# The filters are chained: the validation filter runs on the train-filtered
# frame, so the train filter is not thrown away.
prediction_df = _drop_seen_pairs(prediction_df, train)
prediction_df = _drop_seen_pairs(prediction_df, val)

# Attach the observed test ratings. The left join keeps every candidate pair
# (Rating is NaN where the user did not rate the movie in test) and discards
# test rows whose pair is not a candidate, e.g. movies that occur only in test.
test = prediction_df.merge(test[pair_keys + ['Rating']], on=pair_keys, how='left')

In [4]:
# Generate the random 'predictions': five independent runs, one column per run.
# Each column holds integers drawn uniformly from 1..5 (upper bound 6 is exclusive).
# The columns are filled in this fixed order so the seeded draws are reproducible.
for prediction_column in ['Prediction', 'Prediction_2', 'Prediction_3', 'Prediction_4', 'Prediction_5']:
    test[prediction_column] = np.random.randint(low=1, high=6, size=len(test))

### Generate metrics for random recommendations

In [6]:
# Evaluate the random recommender: weighted hit rate and two user-satisfaction
# rates at several cut-offs k, each averaged over the five prediction runs.
users = test.User.unique()

prediction_columns = ['Prediction', 'Prediction_2', 'Prediction_3', 'Prediction_4', 'Prediction_5']
n_runs = len(prediction_columns)

# Contribution of a recommended item's true rating to the weighted hit rate.
# Ratings missing from this table (NaN, i.e. unrated items) contribute 0.
rating_weights = {1: -5, 2: -2, 3: 2, 4: 6, 5: 12}

# Group the rows by user once instead of masking the whole frame for every
# (k, user, prediction) combination; each group keeps the frame's row order.
rows_by_user = test.groupby('User', sort=False)

awhr_rows = []
asat_rows = []
asat_2_rows = []

for k in [1, 5, 10, 20, 50]:
    whrs = []
    sat_us = []
    sat_us_2 = []
    recommendations_allu = []

    for user in users:
        predictions_user = rows_by_user.get_group(user)
        whr = 0
        sat = 0      # number of runs with at least one recommendation rated 4 or 5
        sat_2 = 0    # number of runs with at least one recommendation rated 5

        for pred in prediction_columns:
            recommendations = predictions_user.sort_values(pred, ascending=False).head(k)
            rec_ratings = recommendations['Rating'].tolist()

            # Calculate weighted hit rate and user satisfaction for this run
            whr += sum(rating_weights.get(rating, 0) for rating in rec_ratings)
            # Count the run (not merely flag it) so that dividing by n_runs below
            # gives the share of runs in which the user was satisfied.
            sat += any(rating in (4, 5) for rating in rec_ratings)
            sat_2 += any(rating == 5 for rating in rec_ratings)

        # Average over the runs; the hit rate is additionally normalised by k.
        whrs.append(whr / n_runs / k)
        sat_us.append(sat / n_runs)
        sat_us_2.append(sat_2 / n_runs)

        # Store recommendations for the user (those of the last prediction run)
        recommendations_allu.append(list(recommendations['Movie']))

    awhr_rows.append({'Average Weigthed Hit Rate': np.mean(whrs), 'k': k})
    asat_rows.append({'Average User Satisfaction': np.mean(sat_us), 'k': k})
    asat_2_rows.append({'Average User Satisfaction': np.mean(sat_us_2), 'k': k})

    # Store the recommendation lists for current k.
    # NOTE(review): 'Element' holds the user's position in `users` and
    # 'Occurrence Count' holds that user's list of recommended movies; the
    # labels are kept unchanged because consumers of the CSV may rely on them.
    recommendation_lists = pd.Series(recommendations_allu)
    recommendations_k = pd.DataFrame({'Element': recommendation_lists.index, 'Occurrence Count': recommendation_lists.values})
    recommendations_k.to_csv(f'results/Recommendation_distribution@{k}.csv')

# One row per k, built in a single step rather than concatenated inside the loop.
awhrs = pd.DataFrame(awhr_rows)
asats = pd.DataFrame(asat_rows)
asats_2 = pd.DataFrame(asat_2_rows)

In [7]:
# Write each metric table to its own CSV file under results/.
metric_outputs = [
    (awhrs, 'results/random_awhrs.csv'),
    (asats, 'results/random_asats.csv'),
    (asats_2, 'results/random_asats2.csv'),
]
for metric_table, csv_path in metric_outputs:
    metric_table.to_csv(csv_path)

In [None]:
# Compute the Kendall distance with penalty p between the true ratings and each
# random prediction run, per user, for p = 0.05 and p = 0.2.
prediction_columns = ['Prediction', 'Prediction_2', 'Prediction_3', 'Prediction_4', 'Prediction_5']

# Select every user's rated rows once, instead of re-filtering the whole frame
# four times for each (prediction, user) combination.
rated_rows = test[test['Rating'].notna()]
rated_by_user = dict(iter(rated_rows.groupby('User', sort=False)))
no_rated_rows = rated_rows.iloc[0:0]  # users without any rated row still get an (empty) frame
rated_frames = [rated_by_user.get(user, no_rated_rows) for user in test.User.unique()]

kendal_pred = []
kendal_pred_2 = []

for pred in prediction_columns:
    kendal_list = []
    kendal_list_2 = []

    for rated_user in rated_frames:
        # The helper is opaque from here, so give it fresh copies on every call:
        # nothing it does to its arguments can carry over to a later call.
        kendal_u = kendall_distance_with_penalty(rated_user.copy(), rated_user.copy(), 'Movie', 'Movie', 'Rating_x', f'{pred}_x', p = 0.05)
        kendal_u_2 = kendall_distance_with_penalty(rated_user.copy(), rated_user.copy(), 'Movie', 'Movie', 'Rating_x', f'{pred}_x', p = 0.2)

        kendal_list.append(kendal_u)
        kendal_list_2.append(kendal_u_2)

    # Average over users for this prediction run
    kendal_pred.append(np.mean(kendal_list))
    kendal_pred_2.append(np.mean(kendal_list_2))

# Average over the prediction runs: one row per value of p
kendal = pd.DataFrame({'Kendall Distance': [np.mean(kendal_pred), np.mean(kendal_pred_2)], 'p': [0.05, 0.2]})

In [10]:
# Write the Kendall-distance table (one row per p) to disk.
kendall_csv_path = 'results/random_Kendall.csv'
kendal.to_csv(kendall_csv_path)