In [1]:
import pandas as pd
import numpy as np

In [2]:
import tarfile

In [3]:
tar1 = tarfile.open("../data/model_results.tar.gz")

In [31]:
tar1.getnames()

['model_results',
 'model_results/final_recommendations.csv',
 'model_results/model_test_results.csv',
 'model_results/base_model_selection.csv']

In [4]:
df_rec = pd.read_csv(tar1.extractfile(tar1.getnames()[1]), header=0)

In [5]:
df_rec

Unnamed: 0,userId,movieId,prediction
0,4,318,4.624113
1,4,2571,4.556694
2,4,1196,4.533291
3,4,260,4.530176
4,4,58559,4.517324
...,...,...,...
99995,283185,166643,4.228540
99996,283185,3916,4.179170
99997,283185,8533,4.143071
99998,283185,160980,4.082581


In [21]:
tar2 = tarfile.open("../data/sample.tar.gz")

In [7]:
tar2.getnames()

['samples',
 'samples/sample_train.csv',
 'samples/sample.csv',
 'samples/sample_test.csv']

In [34]:
sample = pd.read_csv(tar2.extractfile(tar2.getnames()[2]),header=0)

In [29]:
sample['rank'] = sample[['userId','rating']].groupby("userId").rank(method='first').squeeze()

In [30]:
sample[["userId",'movieId','rank']]

Unnamed: 0,userId,movieId,rank
0,4,1,187.0
1,4,5,37.0
2,4,6,282.0
3,4,10,188.0
4,4,11,117.0
...,...,...,...
4340399,283185,4023,92.0
4340400,283185,4025,107.0
4340401,283185,4027,50.0
4340402,283185,4039,93.0


In [25]:
combine = pd.merge(left=sample,right=df_rec,on=["userId","movieId"],how='inner')

In [36]:
combine

Unnamed: 0,userId,movieId,rating,rank,prediction
0,100704,858,3.5,16.0,4.812945
1,102684,318,5.0,42.0,4.719297
2,104454,50,3.0,13.0,3.948186
3,10518,1260,4.5,55.0,4.560972
4,105592,296,4.0,44.0,4.114065
...,...,...,...,...,...
6151,88155,318,5.0,44.0,4.630070
6152,89010,1206,3.5,12.0,4.324143
6153,89010,541,4.0,27.0,4.328405
6154,96837,318,4.0,16.0,4.595734


In [37]:
tt = combine[["userId",'rank']]
tt["rank"].apply(lambda x:1/np.log2(x+1))

0       0.244651
1       0.184289
2       0.262650
3       0.172195
4       0.182088
          ...   
6151    0.182088
6152    0.270238
6153    0.208015
6154    0.244651
6155    0.205847
Name: rank, Length: 6156, dtype: float64

In [38]:
tt

Unnamed: 0,userId,rank
0,100704,16.0
1,102684,42.0
2,104454,13.0
3,10518,55.0
4,105592,44.0
...,...,...
6151,88155,44.0
6152,89010,12.0
6153,89010,27.0
6154,96837,16.0


# NDCG func

In [35]:
def NDCG(rec,test):
    """
    if ratings are equal, we choose dense rank.
    
    Args:
        rec(pd.DataFrame): the model recommended movies and predictions for users in testset
        sample(pd.DataFrame): the testset with user and ratings for movies
    Return:
        NDCG of this serie of recommendations on the users in test
    """
    test_copy = test.copy(deep=True)
    rec_copy = rec.copy(deep=True)
    k = rec_copy.groupby('userId')['movieId'].count().unique()
    if len(k) > 1:
        print("not all users have same number of recommendations!")
        return -1
    k = k[0]
    # generate rank for each users of movies in test
    test_copy['Rank'] = test_copy[['userId','rating']].groupby("userId").rank(method='dense').squeeze()
    test_copy = test_copy[["userId",'movieId','Rank']]
    # this would filter out any irrelavent movies using inner join with the testset
    combine = pd.merge(left=test_copy,right=rec_copy,on=["userId","movieId"],how='inner')
    combine = combine[['userId','Rank']]
    combine['Rank/log2'] = combine.Rank.apply(lambda x:1/np.log2(x+1))
    DCG = combine['Rank/log2'].sum() / combine.userId.nunique()
    IDCG = sum([1/np.log2(i+1) for i in range(1,k+1)])
    return DCG/IDCG

NDCG(df_rec,sample)

0.3463440476343716