In [1]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
from sklearn.externals import joblib

import time

# %% Imports
from pathlib import Path
import sys
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *




In [2]:
# Run configuration shared by the cells below.
# The subset/validation/reference_to_nan entries are forwarded positionally to
# get_sessions(); their exact semantics live in that helper (not shown here).
config = {
    'use_subset': True,
    'subset_frac': 0.05,
    'use_validation': True,
    'validation_frac': 0.25,
    'reference_to_nan_frac': 1,
    'reference_to_nan_seed': 1234,

    # Number of top-ranked impressions that receive a non-zero probability
    # (passed to add_probs).
    'n_vars': 5,

    # `root_dir` is defined in the import cell.
    'data_path': root_dir / 'cache'
    }


# When no subset is taken, the file name should encode 100 %.
if not config['use_subset']:
    config['subset_frac'] = 1

# round() instead of a bare int(): 100 * 0.29 evaluates to 28.999999999999996,
# which int() would truncate to 28 and mislabel the output file.
root_path = ('Impress_data_sub_' + str(int(round(100 * config['subset_frac']))).zfill(3)) \
    + '_nvars_' + str(config['n_vars'])

config['output_meta_csv_path'] = config['data_path'] / (root_path + '_output_meta.csv')


In [3]:
print('Getting sessions')
sessions = get_sessions(config['use_subset'],
                        config['subset_frac'],
                        config['use_validation'],
                        config['validation_frac'],
                        config['reference_to_nan_frac'],
                        config['reference_to_nan_seed'])

print('Drop session with no clickout')
# The filter is only applied on a full (non-subset, non-validation) run.
if not config['use_validation'] and not config['use_subset']:
    sessions = filter_sessions_with_no_clicks(sessions)

print(sessions.dtypes)

print('Quick unit test')

# Row count vs. number of distinct index labels.
print(len(sessions.index))
print(len(sessions.index.unique()))

# Without a validation split the bookkeeping columns are filled with neutral
# values so the masks below still work.
if not config['use_validation']:
    sessions['is_validation'] = False
    sessions['target'] = np.nan  # np.NaN alias was removed in NumPy 2.0

# Boolean row masks reused by the sanity counts and the splits.
m_train = sessions.is_train == True
m_not_train = sessions.is_train == False
m_val = sessions.is_validation == True
m_not_val = sessions.is_validation == False
m_clickout = sessions.action_type == 'clickout item'
m_no_ref = sessions.reference.isnull()

# Non-null targets per partition (validation / train / test).
print(sessions.loc[m_train & m_val, 'target'].count())
print(sessions.loc[m_train & m_not_val, 'target'].count())
print(sessions.loc[m_not_train & m_not_val, 'target'].count())

# Clickouts, and clickouts with a missing reference, per partition.
print(sessions.loc[m_val & m_clickout, 'step'].count())
print(sessions.loc[m_val & m_clickout & m_no_ref, 'step'].count())

print(sessions.loc[m_train & m_not_val & m_clickout, 'step'].count())
print(sessions.loc[m_train & m_not_val & m_clickout & m_no_ref, 'step'].count())

print(sessions.loc[m_not_train & m_not_val & m_clickout, 'step'].count())
print(sessions.loc[m_not_train & m_not_val & m_clickout & m_no_ref, 'step'].count())


print('Get Splits')

# Training rows drop the impression/price/target columns; test and validation
# rows keep them and only lose the partition flags.
train = sessions.loc[m_train & m_not_val] \
                    .drop(['impressions', 'prices', 'is_train', 'is_validation', 'target'], axis=1) \
                    .reset_index(drop=True)

test = sessions.loc[m_not_train] \
                   .drop(['is_train', 'is_validation'], axis=1) \
                   .reset_index(drop=True)

val = sessions.loc[m_train & m_val] \
                   .drop(['is_train', 'is_validation'], axis=1) \
                   .reset_index(drop=True)

print('train', train.shape)
print('test', test.shape)
print('val', val.shape)

# val

Getting sessions
Drop session with no clickout
user_id                         object
session_id                      object
timestamp          datetime64[ns, UTC]
step                             int64
action_type                     object
reference                       object
platform                        object
city                            object
device                          object
current_filters                 object
impressions                     object
prices                          object
is_validation                     bool
is_train                          bool
target                          object
dtype: object
Quick unit test
893499
893499
10336
0
0
19995
10336
59661
0
26345
12856
Get Splits
train (537963, 10)
test (200069, 13)
val (155467, 13)


In [4]:
def get_long_format(df: pd.DataFrame):
    """Explode the impressions of unanswered clickouts into one row per item.

    Only rows of ``df`` with ``action_type == 'clickout item'`` and a null
    ``reference`` are used. Their pipe-separated ``impressions`` string is
    split and melted so that each displayed item becomes its own row.

    Args:
        df: Session frame with at least the columns ``user_id``,
            ``session_id``, ``timestamp``, ``step``, ``action_type``,
            ``reference`` and ``impressions``. It is not modified.

    Returns:
        A new frame with columns ``user_id``, ``session_id``, ``timestamp``
        (integer, see below), ``step``, ``rank`` (0-based position of the item
        in the impression list) and ``item_recommendations`` (the item id as a
        string), sorted by those id columns and ``rank``.
    """
    id_cols = ['user_id', 'session_id', 'timestamp', 'step']

    # Use the frame that was passed in; the previous version silently read the
    # notebook-global `val` and ignored its argument.
    wanted = (df.action_type == 'clickout item') & (df.reference.isnull())
    output = df.loc[wanted, id_cols + ['impressions']].copy()

    # Integer epoch divided by 1e9: yields seconds for nanosecond-resolution
    # timestamps (the dtype this notebook prints is datetime64[ns, UTC]).
    output['timestamp'] = output['timestamp'].astype(np.int64) // 10**9

    # One column per impression position (0, 1, 2, ...); shorter lists are
    # padded with missing values that dropna() removes after melting.
    impressions_wide = output['impressions'].str.split('\\|', expand=True)

    output_long = impressions_wide \
        .merge(output.drop(['impressions'], axis=1), right_index=True, left_index=True) \
        .melt(id_vars=id_cols,
              value_name="item_recommendations",
              var_name="rank") \
        .dropna() \
        .sort_values(id_cols + ['rank']) \
        .reset_index(drop=True)

    return output_long



def add_probs(df: pd.DataFrame, n_vars: int):
    """Assign a uniform probability to the top ``n_vars`` ranked items.

    Rows whose ``rank`` is strictly below ``n_vars`` get ``1 / n_vars``; every
    other row gets ``0.0``.

    Args:
        df: Frame with a ``rank`` column. An ``item_probs`` column is added to
            (or overwritten on) this same object.
        n_vars: Number of leading ranks that share the probability mass.

    Returns:
        ``df`` itself, with the float ``item_probs`` column set.

    Raises:
        ValueError: if ``n_vars`` is smaller than 1.
    """
    if n_vars < 1:
        raise ValueError('n_vars must be a positive integer, got {!r}'.format(n_vars))

    # Decide from the rank itself. The previous version re-tested the column
    # it had just overwritten, so for n_vars == 1 the freshly written 1.0
    # matched ">= n_vars" and every probability ended up 0.
    in_top = df['rank'] < n_vars
    df['item_probs'] = np.where(in_top, 1 / n_vars, 0.0)

    return df


def _join_non_null(block: pd.DataFrame):
    """Return, per row of ``block``, its non-missing values joined by spaces."""
    return [
        ' '.join(str(value) for value in row if pd.notna(value))
        for row in block.itertuples(index=False, name=None)
    ]


def get_output_format(df: pd.DataFrame):
    """Collapse the long per-item table into one row per clickout.

    Args:
        df: Long frame with columns ``user_id``, ``session_id``, ``timestamp``,
            ``step``, ``rank``, ``item_recommendations`` and ``item_probs``.
            It is not modified.

    Returns:
        A frame with the columns ``user_id``, ``session_id``, ``timestamp``,
        ``step`` (all as strings), ``item_recommendations`` and ``item_probs``.
        The last two hold the space-separated values of each clickout in
        ascending ``rank`` order. Rows are ordered by the four id columns
        compared as strings.

    Raises:
        ValueError: if the same clickout contains a ``rank`` more than once.
    """
    id_cols = ['user_id', 'session_id', 'timestamp', 'step']
    value_cols = ['item_recommendations', 'item_probs']

    if df.empty:
        return pd.DataFrame(columns=id_cols + value_cols)

    # Work on a copy so the caller's frame keeps its columns.
    keyed = df[id_cols + ['rank'] + value_cols].copy()
    # The ids are returned as strings, as before.
    keyed[id_cols] = keyed[id_cols].astype(str)

    # Pivot on the id columns directly. Gluing them into one '_'-joined key
    # and splitting it again breaks as soon as an id contains an underscore.
    rank_imp_wide = keyed.set_index(id_cols + ['rank']).unstack('rank')

    result = pd.DataFrame(index=rank_imp_wide.index).reset_index()

    # Select each value's rank columns by name rather than assuming there are
    # exactly 25 of them.
    for col in value_cols:
        result[col] = _join_non_null(rank_imp_wide[col])

    return result




In [5]:
# Long-format impressions of the validation clickouts, each with its
# rank-based probability.
probs = add_probs(df=get_long_format(df=val), n_vars=config['n_vars'])

# Hand over a copy so `probs` is unaffected by the reshaping step.
rank_imp_wide = get_output_format(df=probs.copy())

# Persist one row per clickout to the configured CSV, without the index.
rank_imp_wide.to_csv(path_or_buf=config['output_meta_csv_path'], index=False)

print(rank_imp_wide.shape)
rank_imp_wide

(10336, 6)


Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations,item_probs
0,00EI1R7YK601,9639ee039c1d0,1541068329,3,135917 104177 106691 1409858 4529846 3822896 3...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,00GKOZLYVI9R,8e74b912cb1b4,1541065310,16,3060180 3176094 5658130 4467826 5200924 198564...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,00J9RN4XAC2N,d2397c03bc9b4,1541065050,122,3134112 1104106 2714672 4073430 3827338 106274...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,00M5AMMLYQG5,8fd417ebd2d5b,1541062793,1,9882016 106769 48219 147360 7333050 3533848 75...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,00QD3TS82ZP1,65f6d52da6c28,1541064419,3,3538112 1518663 1749805 3537348 2230166 848674...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0
5,00VFIQZH8RZ6,b485e039d93c7,1541062460,33,6546628 147278 3905094 3087002 8980586 100137 ...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
6,00WWQYCIDPUF,a8962895bad41,1541064252,6,7929996 8767518 5742058 7195114 7325018 196079...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
7,00XDNZNRBSAH,94ab75d928c18,1541064630,4,357136 62695 1328816 67719 356436 1108711 4038...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0
8,00ZDLU2ASH0X,85ca517b4f0da,1541064284,5,552221 5050230 552281 3978936 8436080 552236 2...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
9,0124PIAGFWFO,bc904eed65f07,1541059875,3,153371 6841986 10443208 12289 12197 149812 327...,0.2 0.2 0.2 0.2 0.2 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
