In [1]:
import os

# change dir for custom imports
# The target directory is resolved only once per kernel session: a bare
# os.chdir('../') climbs one more level every time this cell is re-run.
if '_PROJECT_ROOT' not in globals():
    _PROJECT_ROOT = os.path.abspath('../')
os.chdir(_PROJECT_ROOT)

In [2]:
import pickle
import numpy as np
import pandas as pd
import math
from helpers.dataset_helpers import get_genres_as_columns, get_all_genres_list
from metric.helpers import (
    get_user_genre_list,
    get_user_max_likelihood,
    get_ideal_rankings,
    build_all_likelihood_dict
)

dataset_name = 'ml-25m'

# original data from the dataset
ratings = pd.read_csv('datasets/' + dataset_name + '/shrunk/ratings_small_v2.csv')
movies = pd.read_csv('datasets/' + dataset_name + '/movies.csv')

# data from recommender
testset = pd.read_csv('output/' + dataset_name + '/test.csv')
trainset = pd.read_csv('output/' + dataset_name + '/train.csv')
rankings = pd.read_csv('output/' + dataset_name + '/rankings.csv')

# attach the movie metadata (title, genres) to every ratings/rankings frame
test_df = testset.merge(movies, how='inner', on='movieId').sort_values(by='userId')
ratings_df = ratings.merge(movies, how='inner', on='movieId').sort_values(by='userId')
rankings_df = rankings.merge(movies, how='inner', on='movieId').sort_values(by='rank').drop('title', axis=1)

# number of genres per row: 'genres' is a '|'-separated string, so split and count.
# Vectorized string ops replace the former row-wise DataFrame.apply.
test_df['count_genres'] = test_df['genres'].str.split('|').str.len()

# get all unique genres and users
genres = get_all_genres_list(ratings_df)
users = list(set(ratings_df.userId.to_list()))

# merge rankings with the real ratings; the left join keeps recommended items
# the user never rated (their rating-side columns are then missing)
rankings_hits = (
    rankings_df
    .merge(ratings_df, how='left', on=['userId', 'movieId', 'genres'])
    .sort_values(by=['userId', 'rank'])
    .drop(['title', 'timestamp'], axis=1)
)

# Build the likelihood dict (per the original note, the helper saves it under ./output/<dataset>).
# It only needs to be built once per dataset, e.g. when group validation is run
# several times with varying parameters.
build_all_likelihood_dict(users, genres, ratings_df, dataset_name)

# Load the user likelihood data back from disk.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
likelihood_path = 'output/' + dataset_name + '/likelihood.pkl'
with open(likelihood_path, 'rb') as pkl_handle:
    likelihood_dict = pickle.load(pkl_handle)

In [None]:
# small example on user likelihood - could be skipped
user_id = 12
target_genre = 'Action'

# likelihood value computed directly from the ratings
computed_likelihood = get_user_max_likelihood(user_id, target_genre, genres, ratings_df)
# value stored in the offline likelihood dict for the same genre and user
stored_likelihood = likelihood_dict[target_genre][user_id]

# A cell only displays its last expression, so return both values together;
# previously the computed value was silently discarded.
computed_likelihood, stored_likelihood

### Clustering - k-means

In [3]:
clustered_df = pd.read_csv('output/' + dataset_name + '/clusters.csv')

# Re-shape to one row per cluster holding the list of its user ids.
grouped_clusters = (
    clustered_df
    .groupby('cluster')['userId']
    .apply(list)
    .reset_index(name='users_list')
)
# de-duplicate the user ids of every cluster
grouped_clusters['users_per_cluster'] = grouped_clusters['users_list'].map(
    lambda members: list(set(members))
)
grouped_clusters = grouped_clusters[['cluster', 'users_per_cluster']]

In [None]:
# peek at the user ids stored at position 50 of the grouped clusters
grouped_clusters['users_per_cluster'].iloc[50]

## Model evaluation - alpha-beta-nDCG

### Get ideal rankings

In [None]:
# test-set rows of a single user (id 3586)
test_df.loc[test_df['userId'].eq(3586)]

In [4]:
k = 5
discarded_users = []
ideal_rankings = []

# Count the test ratings of every user once, instead of filtering the whole
# test_df again for each user inside the loop.
test_ratings_per_user = test_df['userId'].value_counts()

# get ideal rankings for every unique user in the dataset
for user in users:
    # users with k or fewer ratings in test_df are discarded
    if test_ratings_per_user.get(user, 0) <= k:
        discarded_users.append(user)
        continue

    user_ideal_ranks = get_ideal_rankings(user, likelihood_dict, test_df, k=k)
    ideal_rankings.append(
        pd.DataFrame(user_ideal_ranks, columns=["userId", "movieId", "prediction", "rank"])
    )

# NOTE: pd.concat raises ValueError if every user was discarded
user_ideal_ranks_df = pd.concat(ideal_rankings)

# merge item data into the ideal rankings, then left join to get the real ratings
user_ideal_ranks_df = user_ideal_ranks_df.merge(movies, how='inner', on='movieId').sort_values(by='userId')
user_ideal_ranks_df = user_ideal_ranks_df.merge(
    ratings_df, how='left', on=['userId', 'movieId', 'genres', 'title']
).sort_values(by=['userId', 'rank'])
user_ideal_ranks_df = user_ideal_ranks_df.drop(['title', 'timestamp'], axis=1)

In [None]:
# first two rows of the ideal rankings
user_ideal_ranks_df.iloc[:2]

### alpha-beta-nDCG - Test data

In [None]:
# number of users skipped while building the ideal rankings
# (those with k or fewer ratings in test_df)
len(discarded_users)

In [6]:
from metric.metric import transform_rankings_hits, get_user_dcg, get_dcg

# transform both the recommender rankings and the ideal rankings with the same helper
rankings_hits_transformed, rankings_hits_transformed_ideal = (
    transform_rankings_hits(frame, genres)
    for frame in (rankings_hits, user_ideal_ranks_df)
)

# DCG of a single example user on the actual and on the ideal ranking
user_id, k = 12, 5
user_dcg, user_idcg = (
    get_user_dcg(user_id, transformed, ratings_df, likelihood_dict, k)
    for transformed in (rankings_hits_transformed, rankings_hits_transformed_ideal)
)
user_idcg

Unnamed: 0,userId,movieId,prediction,rank,genres,rating,Western,Sci-Fi,Mystery,Drama,...,Animation,Action,Comedy,Horror,Crime,War,Fantasy,Musical,IMAX,DCG
4,12,911,5.0,1,Comedy|Crime|Mystery|Romance|Thriller,5.0,0.0,0.0,5.0,0.0,...,0.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.209333
3,12,923,5.0,2,Drama|Mystery,5.0,0.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054923
2,12,260,5.0,3,Action|Adventure|Sci-Fi,5.0,0.0,5.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031146
0,12,608,5.0,4,Comedy|Crime|Drama|Thriller,5.0,0.0,0.0,0.0,5.0,...,0.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.043231
1,12,2692,5.0,5,Action|Crime,5.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.006703


In [7]:
# alpha-beta-nDCG for the whole test data
# both calls share every argument except the rankings frame
_dcg_args = (ratings_df, likelihood_dict, discarded_users, k)
dcg = get_dcg(rankings_hits_transformed, *_dcg_args)
idcg = get_dcg(rankings_hits_transformed_ideal, *_dcg_args)

# element 0 of each result is used as the aggregate value; element 1 is the
# per-row frame picked up by later cells
alpha_beta_ndcg = dcg[0] / idcg[0]
alpha_beta_ndcg

0.5121970103285001

### Group validation

In [8]:
# per-row DCG frames (second element of the get_dcg results)
dcg_df, idcg_df = dcg[1], idcg[1]

# user-id lists of all clusters; keep the first one around for a quick check
all_clusters_list = grouped_clusters['users_per_cluster'].tolist()
test_array = all_clusters_list[0]
print(test_array)

[40705, 71683, 64516, 18565, 117507, 23431, 35719, 55689, 132099, 58251, 142984, 155022, 131343, 90128, 37906, 46101, 64790, 121112, 140696, 45851, 148510, 93471, 60325, 9382, 40486, 78117, 74794, 98094, 9391, 66865, 62132, 6325, 39861, 28092, 145597, 103230, 88261, 125565, 161481, 48586, 158667, 105935, 109394, 146263, 117081, 59866, 47708, 46049, 92257, 140897, 156387, 37486, 155374, 74608, 19701, 88181, 9463, 150395, 7036, 123901, 123391]


In [9]:
# NOTE: despite the variable name, the mask is negated - this keeps the DCG rows
# of users that are NOT in the first cluster (test_array)
_in_first_cluster = dcg_df['userId'].isin(test_array)
cluster_dcg_df = dcg_df.loc[~_in_first_cluster]
cluster_dcg_df.head()

Unnamed: 0,userId,movieId,prediction,rank,genres,rating,Western,Sci-Fi,Mystery,Drama,...,Animation,Action,Comedy,Horror,Crime,War,Fantasy,Musical,IMAX,DCG
2896,40961,1244,7.356726,1,Comedy|Drama|Romance,4.5,0.0,0.0,0.0,4.5,...,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.237736
3674,40961,898,7.048339,2,Comedy|Drama|Romance,5.0,0.0,0.0,0.0,5.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062074
9768,40961,953,6.867205,3,Children|Drama|Fantasy|Romance,5.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.028916
13273,40961,905,6.846769,4,Comedy|Romance,,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000117
13977,40961,923,6.824763,5,Drama|Mystery,,0.0,0.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.9e-05


In [10]:
def _ab_ndcg(dcg_rows, idcg_rows):
    """Return summed DCG divided by summed ideal DCG for the given rows.

    Both arguments are DataFrames with a 'DCG' column. When the ideal DCG sums
    to zero (for example because no rows were selected) NaN is returned instead
    of raising ZeroDivisionError, so one empty cluster cannot abort the loop.
    """
    dcg_total = sum(dcg_rows['DCG'].to_list())
    idcg_total = sum(idcg_rows['DCG'].to_list())
    if idcg_total == 0:
        return float('nan')
    return dcg_total / idcg_total


dcg_df = dcg[1]
idcg_df = idcg[1]
all_clusters_list = grouped_clusters.users_per_cluster.to_list()
# real cluster labels, so the metric is keyed by the label and not by list position
cluster_labels = grouped_clusters['cluster'].to_list()
group_metric = {}

for cluster_id, cluster in zip(cluster_labels, all_clusters_list):
    # membership masks are computed once and negated for the equivalent set
    dcg_in_cluster = dcg_df['userId'].isin(cluster)
    idcg_in_cluster = idcg_df['userId'].isin(cluster)

    # cluster ab-nDCG
    cluster_alpha_beta_ndcg = _ab_ndcg(dcg_df.loc[dcg_in_cluster], idcg_df.loc[idcg_in_cluster])

    # equivalent set ab-nDCG (dataset minus cluster = rest of the examples)
    cluster_equiv_alpha_beta_ndcg = _ab_ndcg(dcg_df.loc[~dcg_in_cluster], idcg_df.loc[~idcg_in_cluster])

    group_metric[cluster_id] = [cluster_alpha_beta_ndcg, cluster_equiv_alpha_beta_ndcg]

In [11]:
# One row per cluster: its own ab-nDCG and the ab-nDCG of the rest of the data.
group_metric_df = (
    pd.DataFrame.from_dict(group_metric, orient='index')
    .reset_index()
    .rename({'index': 'cluster', 0: 'cluster-ab-nDCG', 1: 'equiv-ab-nDCG'}, axis=1)
)
# overall ab-nDCG repeated on every row as the reference value
group_metric_df['ab-ndcg'] = alpha_beta_ndcg

# Plain column arithmetic instead of a row-wise apply: same values, no Python-level loop.
# percentage change of the cluster metric relative to the overall metric
group_metric_df['perc-change'] = (
    (group_metric_df['cluster-ab-nDCG'] - group_metric_df['ab-ndcg']) / group_metric_df['ab-ndcg']
) * 100
# difference between the cluster and the rest of the data
group_metric_df['condition-1'] = group_metric_df['cluster-ab-nDCG'] - group_metric_df['equiv-ab-nDCG']

group_metric_df.to_csv('group_ndcg.csv', index=False)