In [1]:
import math
import pickle

import numpy as np
import pandas as pd

from helpers.dataset_helpers import get_genres_as_columns, get_all_genres_list
from metric.helpers import (
    get_user_genre_list,
    get_user_max_likelihood,
    get_ideal_rankings,
    build_all_likelihood_dict
)
from metric.metric import transform_rankings_hits, get_user_dcg, get_dcg

# Files written by the recommender: test split, train split and the rankings.
testset = pd.read_csv('output/test.csv')
trainset = pd.read_csv('output/train.csv')
rankings = pd.read_csv('output/rankings.csv')

# Untouched files of the original dataset.
ratings = pd.read_csv('datasets/ml-latest-small/ratings.csv')
movies = pd.read_csv('datasets/ml-latest-small/movies.csv')

# Attach the movie columns to each frame via an inner join on movieId.
# test_df and ratings_df are ordered by user; rankings_df is ordered by rank
# and has the title column removed.
test_df = (
    pd.merge(testset, movies, on='movieId', how='inner')
    .sort_values('userId')
)
ratings_df = (
    pd.merge(ratings, movies, on='movieId', how='inner')
    .sort_values('userId')
)
rankings_df = (
    pd.merge(rankings, movies, on='movieId', how='inner')
    .sort_values('rank')
    .drop(columns='title')
)

# Number of pipe-separated genres of each test item.
# Vectorised string ops replace the row-wise `apply(axis=1)`: same counts for
# string values, but a missing genre yields NaN instead of raising, and an
# empty frame is handled as well.
test_df['count_genres'] = test_df['genres'].str.split('|').str.len()

# All unique genres found in the ratings.
genres = get_all_genres_list(ratings_df)

# All unique user ids, in ascending order. `list(set(...))` gives no ordering
# guarantee, while later cells index the likelihood dict by position
# (`user_id - 1`) -- presumably relying on ascending user order; sorting makes
# that order explicit and reproducible across kernels.
users = sorted(ratings_df['userId'].unique().tolist())

# Left-join the real ratings onto the recommended items (items without a
# matching rating row keep NaN in the joined columns), order by user and rank,
# then discard the columns that are not needed afterwards.
rankings_hits = (
    pd.merge(
        rankings_df,
        ratings_df,
        on=['userId', 'movieId', 'genres'],
        how='left',
    )
    .sort_values(['userId', 'rank'])
    .drop(columns=['title', 'timestamp'])
)

# The likelihood dictionary is built once per dataset and saved in ./output.
# Uncomment the next line to (re)build it:
# build_all_likelihood_dict(users, genres, ratings_df)

# Load the user likelihood data.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file
# that was produced locally by the build step above.
with open("./output/data.pkl", mode="rb") as pkl_handle:
    likelihood_dict = pickle.load(pkl_handle)

## Examples

In [2]:
# Example: likelihood value of one user for one genre.
user_id, target_genre = 2, 'Action'

# Computed directly from the ratings. The result is not displayed, because a
# cell only renders its last expression.
get_user_max_likelihood(user_id, target_genre, genres, ratings_df)

# Use the offline dict instead: entry at position `user_id - 1` of the genre,
# element 1 of that entry (presumably the likelihood value -- confirm against
# build_all_likelihood_dict).
likelihood_dict[target_genre][user_id - 1][1]

0.14897260273972604

In [27]:
# Ideal rankings example: one frame per user, built with k=3.
# A comprehension keeps the throwaway per-user objects out of the kernel
# namespace (no leftover generic `df` / `user` globals).
ideal_rankings = [
    pd.DataFrame(
        get_ideal_rankings(user, likelihood_dict, test_df, k=3),
        columns=["userId", "movieId", "prediction", "rank"],
    )
    for user in users
]

# Stack the per-user frames, attach the item data, then left-join the real
# ratings. Written as a single chain so `user_ideal_ranks_df` is bound exactly
# once, to the finished frame.
user_ideal_ranks_df = (
    pd.concat(ideal_rankings)
    .merge(movies, how='inner', on='movieId')
    .sort_values(by='userId')
    .merge(ratings_df, how='left', on=['userId', 'movieId', 'genres', 'title'])
    .sort_values(by=['userId', 'rank'])
    .drop(['title', 'timestamp'], axis=1)
)

In [28]:
# Preview the first rows of the ideal rankings joined with the real ratings.
user_ideal_ranks_df.head()

Unnamed: 0,userId,movieId,prediction,rank,genres,rating
0,1,1617,5.0,1,Crime|Film-Noir|Mystery|Thriller,5.0
1,1,3671,5.0,2,Comedy|Western,5.0
2,1,2459,5.0,3,Horror,5.0
5,2,89774,5.0,1,Drama,5.0
4,2,48516,4.0,2,Crime|Drama|Thriller,4.0


## alpha-beta-nDCG

In [31]:
# Run both ranking frames (recommender output and ideal rankings) through the
# transform applied before the DCG helpers are called.
rankings_hits_transformed = transform_rankings_hits(rankings_hits, genres)
rankings_hits_transformed_ideal = transform_rankings_hits(user_ideal_ranks_df, genres)

# Single-user example: the same helper is applied to the recommender's
# rankings and to the ideal rankings; only the ideal result is displayed.
user_id = 2
k = 3
user_dcg = get_user_dcg(user_id, rankings_hits_transformed, ratings_df, likelihood_dict, k)
user_idcg = get_user_dcg(user_id, rankings_hits_transformed_ideal, ratings_df, likelihood_dict, k)
user_idcg

Unnamed: 0,userId,movieId,prediction,rank,genres,rating,IMAX,Action,Thriller,Horror,...,Adventure,Children,Romance,Film-Noir,Musical,Fantasy,Documentary,War,Comedy,DCG
5,2,89774,5.0,1,Drama,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113014
4,2,48516,4.0,2,Crime|Drama|Thriller,4.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086893
3,2,112552,4.0,3,Drama,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01093


In [33]:
# DCG of the recommender's rankings, computed with the k set in the example cell above.
dcg = get_dcg(rankings_hits_transformed, ratings_df, likelihood_dict, k)

In [32]:
# DCG of the ideal rankings (same helper, same k) -- the denominator of the final ratio.
idcg = get_dcg(rankings_hits_transformed_ideal, ratings_df, likelihood_dict, k)

In [36]:
# Final score of this section: recommender DCG divided by ideal-ranking DCG.
dcg/idcg

0.32089282205081354