In [3]:
import pickle
import numpy as np
import pandas as pd
import math
from helpers.dataset_helpers import get_genres_as_columns, get_all_genres_list
from metric.helpers import (
    get_user_genre_list,
    get_user_max_likelihood,
    get_ideal_rankings,
    build_all_likelihood_dict
)

dataset = 'ml-latest-small'

# data from recommender
testset = pd.read_csv('output/' + dataset + '/test.csv')
trainset = pd.read_csv('output/' + dataset + '/train.csv')
rankings = pd.read_csv('output/' + dataset + '/rankings.csv')

# original data from the dataset
ratings = pd.read_csv('datasets/' + dataset + '/ratings.csv')
movies = pd.read_csv('datasets/' + dataset + '/movies.csv')

# merge item data to the ratings df
test_df = testset.merge(movies, how='inner', on='movieId').sort_values(by='userId')
ratings_df = ratings.merge(movies, how='inner', on='movieId').sort_values(by='userId')
rankings_df = rankings.merge(movies, how='inner', on='movieId').sort_values(by='rank').drop('title', axis=1)

# create a new column for the genres count
test_df['count_genres'] = test_df.apply(lambda x: len(x['genres'].split('|')), axis=1)

# get all unique genres and users
genres = get_all_genres_list(ratings_df)
users = list(set(ratings_df.userId.to_list()))

# merge rankings with the real ratings
rankings_hits = rankings_df.merge(ratings_df, how='left', on=['userId', 'movieId', 'genres']).sort_values(by=['userId', 'rank'])
rankings_hits = rankings_hits.drop(['title', 'timestamp'], axis=1)

# build the likelihood dict (dictionary will be saved in ./output/dataset)
# build it once per dataset
# build_all_likelihood_dict(users, genres, ratings_df)
# load user likelihood data
with open("./output/" + dataset + "/likelihood.pkl", "rb") as pkl_handle:
	likelihood_dict = pickle.load(pkl_handle)

In [4]:
# small example on user likelihood - could be skipped
# get user likelihood value example
user_id = 2
target_genre = 'Action'

get_user_max_likelihood(user_id, target_genre, genres, ratings_df)
# user offline dict for likelihood
likelihood_dict[target_genre][user_id - 1][1]

0.14897260273972604

### Clustering - kmeans

In [6]:
from clustering.helpers import transform_df
from clustering.kmeans import create_clsuters

# number of clusters
n = 20

transformed_df = transform_df(ratings_df)
clustered_df = create_clsuters(n, transformed_df)

# group clusters into another dataframe with different representation
grouped_clusters = clustered_df.groupby('cluster')['userId'].apply(list).reset_index(name='users_list')
grouped_clusters['users_per_cluster'] = grouped_clusters.apply(lambda x: list(set(x.users_list)), axis=1)
grouped_clusters = grouped_clusters[['cluster', 'users_per_cluster']]

  dataset.loc[dataset['genres'].str.contains(i), i] = 1


### Ideal rankings - Recommender output

In [8]:
ideal_rankings = []
# get ideal rankings example
for user in users:
    user_ideal_ranks = get_ideal_rankings(user, likelihood_dict, test_df, k=3)
    df = pd.DataFrame(user_ideal_ranks, columns=["userId", "movieId", "prediction", "rank"])
    ideal_rankings.append(df)

user_ideal_ranks_df = pd.concat(ideal_rankings)

# merge item data to the ratings df and then left joing to get real ratings
user_ideal_ranks_df = user_ideal_ranks_df.merge(movies, how='inner', on='movieId').sort_values(by='userId')
user_ideal_ranks_df = user_ideal_ranks_df.merge(ratings_df, how='left', on=['userId', 'movieId', 'genres', 'title']).sort_values(by=['userId', 'rank'])
user_ideal_ranks_df = user_ideal_ranks_df.drop(['title', 'timestamp'], axis=1)

In [9]:
user_ideal_ranks_df.head()

Unnamed: 0,userId,movieId,prediction,rank,genres,rating,Adventure,Crime,Comedy,Action,...,Mystery,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,1,1617,5.0,1,Crime|Film-Noir|Mystery|Thriller,5.0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
2,1,3671,5.0,2,Comedy|Western,5.0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,1,2459,5.0,3,Horror,5.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2,89774,5.0,1,Drama,5.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,48516,4.0,2,Crime|Drama|Thriller,4.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### alpha-beta-nDCG - Test data

In [10]:
from metric.metric import transform_rankings_hits, get_user_dcg, get_dcg

rankings_hits_transformed = transform_rankings_hits(rankings_hits, genres)
rankings_hits_transformed_ideal = transform_rankings_hits(user_ideal_ranks_df, genres)

user_id = 2
k = 4
user_dcg = get_user_dcg(user_id, rankings_hits_transformed, ratings_df, likelihood_dict, k)
user_idcg = get_user_dcg(user_id, rankings_hits_transformed_ideal, ratings_df, likelihood_dict, k)
user_idcg

Unnamed: 0,userId,movieId,prediction,rank,genres,rating,Adventure,Crime,Comedy,Action,...,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War,IMAX,DCG
5,2,89774,5.0,1,Drama,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.113014
4,2,48516,4.0,2,Crime|Drama|Thriller,4.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.086893
3,2,112552,4.0,3,Drama,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.01093


In [11]:
# alpha-beta-nDCG for the whole test data
dcg = get_dcg(rankings_hits_transformed, ratings_df, likelihood_dict, k)
idcg = get_dcg(rankings_hits_transformed_ideal, ratings_df, likelihood_dict, k)

alpha_beta_ndcg = dcg[0]/idcg[0]
alpha_beta_ndcg

0.35984493189931616

### Group validation

In [None]:
dcg_df = dcg[1]
idcg_df = idcg[1]
all_clusters_list = grouped_clusters.users_per_cluster.to_list()
print(all_clusters_list[0])
test_array = all_clusters_list[0]

In [None]:
dcg_df

In [None]:
cluster_dcg_df = dcg_df.loc[~dcg_df['userId'].isin(test_array)]
cluster_dcg_df

In [15]:
dcg_df = dcg[1]
idcg_df = idcg[1]
all_clusters_list = grouped_clusters.users_per_cluster.to_list()
group_metric = {}

for cluster_id, cluster in enumerate(all_clusters_list):
    # dcg/idcg for the cluster
    cluster_dcg_df = dcg_df.loc[dcg_df['userId'].isin(cluster)]
    cluster_idcg_df = idcg_df.loc[idcg_df['userId'].isin(cluster)]
    cluster_dcg = sum(cluster_dcg_df['DCG'].to_list())
    cluster_idcg = sum(cluster_idcg_df['DCG'].to_list())
    # cluster ab-nDCG
    cluster_alpha_beta_ndcg = cluster_dcg/cluster_idcg

    # dcg/idcg for the equivalent (cluster - dataset = rest of the examples)
    cluster_equiv_dcg_df = dcg_df.loc[~dcg_df['userId'].isin(cluster)]
    cluster_equiv_idcg_df = idcg_df.loc[~idcg_df['userId'].isin(cluster)]
    cluster_equiv_dcg = sum(cluster_equiv_dcg_df['DCG'].to_list())
    cluster_equiv_idcg = sum(cluster_equiv_idcg_df['DCG'].to_list())
    # equivalent set ab-nDCG
    cluster_equiv_alpha_beta_ndcg = cluster_equiv_dcg/cluster_equiv_idcg

    group_metric[cluster_id] = [cluster_alpha_beta_ndcg, cluster_equiv_alpha_beta_ndcg]

In [16]:
group_metric_df = pd.DataFrame.from_dict(group_metric, orient='index')\
    .reset_index()\
    .rename({'index':'cluster', 0:'cluster-ab-nDCG', 1: 'equiv-ab-nDCG'}, axis=1)
group_metric_df['ab-ndcg'] = alpha_beta_ndcg
group_metric_df['perc-change'] = group_metric_df.apply(lambda x: ((x['cluster-ab-nDCG'] - x['ab-ndcg']) / x['ab-ndcg'])*100, axis=1)
group_metric_df['condition-1'] = group_metric_df.apply(lambda x: (x['cluster-ab-nDCG'] - x['equiv-ab-nDCG']), axis=1)
group_metric_df.to_csv('group_ndcg.csv', index=False)