In [108]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
from sklearn.externals import joblib

import time

# %% Imports
from typing import List
from pathlib import Path
import sys
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *


In [109]:
# Run settings for this notebook.
# - The first six entries are forwarded, in this order, to get_sessions()
#   by get_validation_target().
# - 'csvs_path' is the directory main() scans for '*.csv' prediction files.
# - 'data_path' is not referenced by the code shown in this notebook.
config = dict(
    use_subset=True,
    subset_frac=0.05,
    use_validation=True,
    validation_frac=0.25,
    reference_to_nan_frac=1,
    reference_to_nan_seed=1234,
    data_path=root_dir / 'cache',
    csvs_path=root_dir / 'outputs',
)


In [110]:
def get_validation_target(config: dict):
    """Build the ground-truth table for the validation clickouts.

    Loads the sessions via ``get_sessions`` (arguments taken from ``config``),
    keeps the rows flagged as both train and validation, and from those selects
    the 'clickout item' rows whose ``reference`` is null.

    Args:
        config: mapping providing 'use_subset', 'subset_frac', 'use_validation',
            'validation_frac', 'reference_to_nan_frac' and
            'reference_to_nan_seed'.

    Returns:
        DataFrame with columns user_id, session_id, timestamp, step, target.
        ``timestamp`` is cast to int64 and integer-divided by 10**9
        (presumably nanoseconds -> seconds; confirm against get_sessions).
    """
    print('Getting sessions')

    # get_sessions takes these positionally, so the order of the keys matters.
    session_arg_keys = (
        'use_subset',
        'subset_frac',
        'use_validation',
        'validation_frac',
        'reference_to_nan_frac',
        'reference_to_nan_seed',
    )
    sessions = get_sessions(*[config[key] for key in session_arg_keys])

    # Explicit `== True` comparisons are kept so non-boolean flag values
    # (e.g. NaN) simply evaluate to False.
    in_validation = (sessions.is_train == True) & (sessions.is_validation == True)
    val = sessions.loc[in_validation]
    val = val.drop(['is_train', 'is_validation'], axis=1)
    val = val.reset_index(drop=True)

    # Clickouts whose reference is missing are the rows to be predicted.
    is_hidden_clickout = (val.action_type == 'clickout item') & (val.reference.isnull())
    wanted_cols = ['user_id', 'session_id', 'timestamp', 'step', 'target']
    target = val.loc[is_hidden_clickout, wanted_cols].copy()

    target['timestamp'] = target['timestamp'].astype(np.int64) // 10**9

    print('num_sessions_target: ', target.shape[0])

    return target



def _split_column_to_long(df: pd.DataFrame, column: str, key_cols: list):
    """Turn the space-separated strings in ``df[column]`` into one row per token.

    Returns a frame with ``key_cols``, a zero-based ``rank`` (token position)
    and ``column`` (the token), sorted by keys then rank. ``df`` is not modified.
    """
    # One column per token position; shorter lists are padded with NaN.
    wide = df[column].str.split(' ').apply(pd.Series)

    return (
        wide.merge(df, right_index=True, left_index=True)
        .drop(['item_recommendations', 'item_probs'], axis=1)
        .melt(id_vars=key_cols, value_name=column, var_name='rank')
        .dropna()  # removes the NaN padding of shorter lists
        .sort_values(key_cols + ['rank'])
        .reset_index(drop=True)
    )


def get_long_format(df: pd.DataFrame):
    """Expand per-step recommendation lists into one row per recommended item.

    ``df`` must contain the columns user_id, session_id, timestamp, step,
    item_recommendations and item_probs, the last two being space-separated
    strings. NOTE(review): any additional column in ``df`` would also be
    melted into rows; callers are assumed to pass only these six columns.

    Unlike the previous implementation, ``df`` is left untouched. Previously
    its two list columns were overwritten in place, so a second call on the
    same frame silently produced an empty result.

    Returns:
        DataFrame with columns user_id, session_id, timestamp, step,
        rank (1-based position in the list), item_recommendations,
        item_probs (float, rescaled to sum to 1 within each
        user/session/timestamp/step group) and item_probs_sum (the group sum
        before rescaling). A group whose probabilities sum to 0 yields
        NaN/inf after the division.
    """
    key_cols = ['user_id', 'session_id', 'timestamp', 'step']

    df_imp_long = _split_column_to_long(df, 'item_recommendations', key_cols)
    df_probs_long = _split_column_to_long(df, 'item_probs', key_cols)

    # Pair every item with the probability found at the same list position.
    df_long = df_imp_long.merge(df_probs_long, on=key_cols + ['rank'])
    df_long['rank'] = df_long['rank'] + 1  # zero-based position -> 1-based rank
    df_long['item_probs'] = df_long['item_probs'].astype(float)

    # Normalise probabilities within each group; the raw sum stays in the
    # output as 'item_probs_sum', as before.
    df_long['item_probs_sum'] = df_long.groupby(key_cols)['item_probs'].transform('sum')
    df_long['item_probs'] = df_long['item_probs'] / df_long['item_probs_sum']

    return df_long



def evaluate_MRR_stats(long: pd.DataFrame, target: pd.DataFrame):
    """Re-rank recommendations by probability and score them against ``target``.

    Side effects on ``long`` (the caller's frame is modified in place):
      * rows are re-sorted by user_id, session_id, timestamp, step,
        item_probs (descending) and rank (ascending, tie-breaker);
      * a 'rank2' column (1-based position within each group after the sort)
        and an 'RR' column (1 / rank2) are added.

    Args:
        long: one row per recommended item, with the four key columns plus
            item_recommendations, item_probs and rank.
        target: one row per key with the expected item in column 'target'.

    Returns:
        (MRR, num_sessions, num_null_target) where
          * MRR is the mean RR over the rows whose recommendation equals the
            target; keys whose target is never recommended do not contribute;
          * num_sessions is the number of such matching rows;
          * num_null_target counts joined rows whose target is null.
    """
    session_keys = ['user_id', 'session_id', 'timestamp', 'step']

    # Highest probability first within each group; original rank breaks ties.
    long.sort_values(
        by=session_keys + ['item_probs', 'rank'],
        ascending=[True, True, True, True, False, True],
        inplace=True,
    )

    # Running count per group gives the position after re-ranking.
    long['rank2'] = 1
    long['rank2'] = long.groupby(session_keys)['rank2'].cumsum()
    long['RR'] = 1 / long['rank2']

    # Inner join: only keys present in `target` are scored.
    scored = long.merge(target, on=session_keys)

    num_null_target = int(scored.target.isnull().sum())

    is_hit = scored.item_recommendations == scored.target
    hit_rr = scored.loc[is_hit, 'RR']

    MRR = hit_rr.mean()
    num_sessions = hit_rr.shape[0]

    return MRR, num_sessions, num_null_target



def main(config: dict):
    """Score every prediction CSV and a summed-probability blend of all of them.

    For each '*.csv' in ``config['csvs_path']`` (processed in sorted order so
    runs are reproducible) the file is expanded with ``get_long_format`` and
    scored with ``evaluate_MRR_stats``. The per-item probabilities of all files
    are then added together and the blend is scored the same way.

    Args:
        config: settings mapping; 'csvs_path' must be a ``pathlib.Path`` and
            the remaining keys are those read by ``get_validation_target``.

    Returns:
        (merged, MRR_stats):
          * merged: blended long-format frame with the key columns,
            item_recommendations, the summed item_probs and rank, plus
            whatever columns ``evaluate_MRR_stats`` adds in place;
          * MRR_stats: list of [name, MRR, num_sessions, num_null_target],
            one entry per file followed by a final 'merged' entry.

    Raises:
        FileNotFoundError: if no '*.csv' file exists in ``config['csvs_path']``.
    """
    key_cols = ['user_id', 'session_id', 'timestamp', 'step']
    item_cols = key_cols + ['item_recommendations']

    pth = config['csvs_path']
    csv_paths = sorted(pth.glob('*.csv'))
    if not csv_paths:
        # Fail early and clearly, before the expensive session load.
        raise FileNotFoundError('No *.csv files found in {}'.format(pth))

    target = get_validation_target(config)

    MRR_stats = []
    merged = None

    for fpath in csv_paths:
        print('Reading ', fpath.name)

        frame = get_long_format(df=pd.read_csv(fpath))

        # evaluate_MRR_stats re-sorts `frame` in place (highest probability
        # first); that row order is what the blend inherits below.
        MRR, num_sessions, num_null_target = \
            evaluate_MRR_stats(long=frame, target=target)

        print('MRR: ', MRR)

        MRR_stats.append([fpath.name, MRR, num_sessions, num_null_target])

        # Carry only the join keys and the probability into the blend. Helper
        # columns (item_probs_sum, rank2, RR, ...) would otherwise pick up
        # _x/_y suffixes on every merge and collide once enough files are read.
        frame = frame[item_cols + ['item_probs']].copy()

        if merged is None:
            merged = frame
            continue

        # The larger frame is the left side of a left join, so its rows are
        # all kept. NOTE(review): rows present only in the smaller frame are
        # dropped; confirm this is intended rather than an outer join.
        if merged.shape[0] > frame.shape[0]:
            merged = merged.merge(frame, on=item_cols, how='left')
        else:
            merged = frame.merge(merged, on=item_cols, how='left')

        # An item missing from one side contributes probability 0.
        merged['item_probs'] = \
            merged['item_probs_x'].fillna(0) + merged['item_probs_y'].fillna(0)
        merged = merged.drop(['item_probs_x', 'item_probs_y'], axis=1)

    # Position within each group in current row order; evaluate_MRR_stats
    # uses it only to break ties between equal summed probabilities.
    merged['rank'] = 1
    merged['rank'] = merged.groupby(key_cols)['rank'].cumsum()

    MRR, num_sessions, num_null_target = \
        evaluate_MRR_stats(long=merged, target=target)

    print('Merged MRR: ', MRR)

    MRR_stats.append(['merged', MRR, num_sessions, num_null_target])

    return merged, MRR_stats


In [111]:
# Score every CSV in config['csvs_path'] plus the blended ranking.
merged, MRR_stats = main(config)

# Last expression of the cell, so the notebook renders it as a table.
# (The bare `MRR_stats` line that used to sit here displayed nothing.)
pd.DataFrame(MRR_stats, columns=['file', 'MRR', 'num_sessions', 'num_null_target'])

Getting sessions
num_sessions_target:  10336
Reading  Impress_data_sub_005_nvars_2_output_meta.csv
MRR:  0.5183518724167401
Reading  NB_data_sub_005_sl_1_val_1_output_meta_2nd_filter_0.01.csv
MRR:  0.5904056081561067
Merged MRR:  0.5315256255393853


Unnamed: 0,file,MRR,num_sessions,num_null_target
0,Impress_data_sub_005_nvars_2_output_meta.csv,0.518352,10331,0
1,NB_data_sub_005_sl_1_val_1_output_meta_2nd_fil...,0.590406,1447,0
2,merged,0.531526,10331,0
