In [1]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone package and only fall back to the vendored
# copy on old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import time

# %% Imports
from pathlib import Path
import sys
# Project helpers live in `<root_dir>/src`; `root_dir` is the current working
# directory at the time this cell runs.
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *




In [2]:
# Run configuration. Every value a reader might tune lives in this dict; the
# cache file paths derived from it are added below.
config = {
    # True: rebuild the train/val/test splits and encoders and write them to
    # the cache. False: load the previously cached train split and encoders.
    'save_train_test_val': True,
    'load_fitted_model': False,

    # Arguments forwarded to get_sessions().
    'use_subset': True,
    'subset_frac': 0.05,
    'use_validation': True,
    'validation_frac': 0.25,
    'reference_to_nan_frac': 1,
    'reference_to_nan_seed': 1234,

    # Options consumed by the processing helpers (passed along via `config`).
    'session_length': 1,
    'drop_no_references': True,
    'train_model_on_test_data': True,
    'add_prices': False,
    'add_hour': False,
    'cols_to_append': [],  # e.g. ['platform', 'city', 'device']
    'drop_action_type': True,

    'train_session_chunksize': 15000,
    'parts_nrows_test': 10000,
    'data_path': root_dir / 'cache',
}

# Without subsetting, file names are tagged as the full (100 %) data set.
if not config['use_subset']:
    config['subset_frac'] = 1

# Cache files are named NB_data_<percent, 3 digits>_<session_length>_<suffix>.
# round() rather than bare int(): 100 * 0.29 == 28.999999999999996, which
# int() alone would truncate to 28 and mislabel the files.
subset_pct = str(int(round(100 * config['subset_frac']))).zfill(3)
file_prefix = 'NB_data_' + subset_pct + '_' + str(config['session_length'])

cache_files = {
    'le_pickle_path': file_prefix + '_le.pickle',
    'train_csv_path': file_prefix + '_train.csv',
    'train_last_step_csv_path': file_prefix + '_train_last_step.csv',
    'test_last_step_csv_path': file_prefix + '_test_last_step.csv',
    'test_csv_path': file_prefix + '_test.csv',
    'val_csv_path': file_prefix + '_val.csv',
    # The price table is shared across subsets, so it carries no prefix.
    'prices_pickle_path': 'mean_prices.pickle',
    'model_pickle_path': file_prefix + '_model.pickle',
    'val_long_csv_path': file_prefix + '_val_long.csv',
    'output_recsys_csv_path': file_prefix + '_output_recsys.csv',
    'output_meta_csv_path': file_prefix + '_output_meta.csv',
}
for path_key, file_name in cache_files.items():
    config[path_key] = config['data_path'] / file_name


In [3]:
# Display the configured session length (also embedded in the cache file names).
config['session_length']

1

In [4]:
# Load the item metadata via the project helper.
# NOTE(review): presumably a DataFrame with one row per item — confirm in recsys_common.
meta=get_metadata()
# meta.dtypes

# Store item ids as strings.
# NOTE(review): presumably so they compare equal to the string `reference` values — confirm.
meta['item_id']=meta['item_id'].astype(str)

In [5]:
# Either rebuild the train / validation / test splits (and the label encoders)
# from the raw sessions and cache them, or load the cached train split and
# encoders from a previous run.
if config['save_train_test_val']:
    print('Getting sessions')
    sessions = get_sessions(config['use_subset'],
                            config['subset_frac'],
                            config['use_validation'],
                            config['validation_frac'],
                            config['reference_to_nan_frac'],
                            config['reference_to_nan_seed'])

    print('Filter session with no clickout')
    # Only filter on the full data set without a validation split. Plain `and`
    # replaces the original bitwise `&` on Python bools (same truth table).
    if not config['use_validation'] and not config['use_subset']:
        print('filtering sessions with clickout')
        sessions = filter_sessions_with_no_clicks(sessions)

    columns_to_encode = ['action_type', 'platform', 'city', 'device']

    if config['add_prices']:
        print('adding prices')
        prices = pd.read_pickle(config['prices_pickle_path'])
        sessions = sessions.join(prices.set_index('reference'), on='reference')
#         sessions['city_price']=sessions['city'] + sessions['mean_prices'].astype(str)
#         columns_to_encode=np.append(columns_to_encode,['city_price'])

    if config['add_hour']:
        # Combined "<city><hour of day>" feature, label-encoded like the others.
        sessions['city_hour'] = sessions['city'] + sessions.timestamp.dt.hour.astype(str)
        columns_to_encode = np.append(columns_to_encode, ['city_hour'])
        # Guard so that re-running this cell does not append the column twice.
        if 'city_hour' not in list(config['cols_to_append']):
            config['cols_to_append'] = np.append(config['cols_to_append'], ['city_hour'])

    print(sessions.dtypes)
    # sessions.head()

    print('Quick unit test')

    # Row count vs. number of unique index labels (equal when the index is unique).
    print(len(sessions.index))
    print(len(sessions.index.unique()))

    if not config['use_validation']:
        sessions['is_validation'] = False
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        sessions['target'] = np.nan

    # Boolean masks reused by the sanity counts and the splits below.
    in_train = sessions.is_train == True
    not_train = sessions.is_train == False
    in_val = sessions.is_validation == True
    not_val = sessions.is_validation == False
    is_clickout = sessions.action_type == 'clickout item'
    no_reference = sessions.reference.isnull()

    # Non-null targets per split: validation, train, test.
    print(sessions.loc[in_train & in_val, 'target'].count())
    print(sessions.loc[in_train & not_val, 'target'].count())
    print(sessions.loc[not_train & not_val, 'target'].count())

    # Clickouts per split, and how many of them have no reference.
    val_clickouts = in_val & is_clickout
    n_val_hidden = sessions.loc[val_clickouts & no_reference, 'step'].count()
    print(sessions.loc[val_clickouts, 'step'].count())
    print(n_val_hidden)

    train_clickouts = in_train & not_val & is_clickout
    print(sessions.loc[train_clickouts, 'step'].count())
    print(sessions.loc[train_clickouts & no_reference, 'step'].count())

    test_clickouts = not_train & not_val & is_clickout
    n_test_hidden = sessions.loc[test_clickouts & no_reference, 'step'].count()
    print(sessions.loc[test_clickouts, 'step'].count())
    print(n_test_hidden)

    # Number of clickouts to predict: reference-less clickouts of the
    # validation split when validating, otherwise of the test split.
    n_clickouts_test = n_val_hidden if config['use_validation'] else n_test_hidden

    # Get possible classes
    # classes=list(set(train.loc[(train.action_type=='clickout item') & (train.step>1),'reference']))
    classes = list(set(sessions.loc[is_clickout & ~no_reference, 'reference']))

    print('Train encoders and save')
    # One LabelEncoder per categorical column, fitted on all sessions.
    encoders = {}
    for col in columns_to_encode:
        encoders[col] = preprocessing.LabelEncoder().fit(sessions[col])
        print(encoders[col].classes_)

    with open(config['le_pickle_path'], 'wb') as handle:
        pickle.dump(encoders, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print('Get Splits')

    # The train split additionally drops the columns listed here; val and test
    # keep them.
    train = (
        sessions.loc[in_train & not_val]
        .drop(['impressions', 'prices', 'is_train', 'is_validation', 'target'], axis=1)
        .reset_index(drop=True)
    )
    test = (
        sessions.loc[not_train]
        .drop(['is_train', 'is_validation'], axis=1)
        .reset_index(drop=True)
    )
    val = (
        sessions.loc[in_train & in_val]
        .drop(['is_train', 'is_validation'], axis=1)
        .reset_index(drop=True)
    )

    print('train', train.shape)
    print('test', test.shape)
    print('val', val.shape)

    print('train', train.dtypes)
    print('test', test.dtypes)
    print('val', val.dtypes)

    print('Save either test or val')
    # The index is written as well; the cells that read these files back drop
    # the resulting 'Unnamed: 0' column.
    if config['use_validation']:
        val.to_csv(config['val_csv_path'])
    else:
        test.to_csv(config['test_csv_path'])

    print('delete session, test and val')
    # All of these names are bound at this point, so a plain `del` is enough.
    del sessions, test, val
    del in_train, not_train, in_val, not_val, is_clickout, no_reference
    del val_clickouts, train_clickouts, test_clickouts
    gc.collect()

    print('save train')
    train.to_csv(config['train_csv_path'])

else:
    print('loading train and encoders...')

    train = pd.read_csv(config['train_csv_path'])
    # Drop the index column that to_csv() wrote alongside the data.
    train.drop(['Unnamed: 0'], axis=1, inplace=True)

    # SECURITY: pickle.load executes arbitrary code; only load encoder files
    # that this notebook wrote into the local cache itself.
    with open(config['le_pickle_path'], 'rb') as handle:
        encoders = pickle.load(handle)

    print('done')
    

# Highest step number per training session, in order of first appearance.
# DataFrame.to_csv() returns None when given a path, so the frame is kept in
# its own variable and written in a separate statement (the original chained
# call left `last_step_per_session` set to None).
last_step_per_session = (
    train.groupby('session_id', sort=False)['step']
    .max()
    .reset_index()
)
last_step_per_session.to_csv(config['train_last_step_csv_path'])

# Keep the train column labels; the frame itself is released below.
names = train.columns


print('delete train')
# `train` is bound by both branches of the split cell, so no NameError guard
# is needed here.
del train
gc.collect()

Getting sessions
Filter session with no clickout
user_id                         object
session_id                      object
timestamp          datetime64[ns, UTC]
step                             int64
action_type                     object
reference                       object
platform                        object
city                            object
device                          object
current_filters                 object
impressions                     object
prices                          object
is_validation                     bool
is_train                          bool
target                          object
dtype: object
Quick unit test
893499
893499
10336
0
0
19995
10336
59661
0
26345
12856
Train encoders and save
['change of sort order' 'clickout item' 'filter selection'
 'interaction item deals' 'interaction item image' 'interaction item info'
 'interaction item rating' 'search for destination' 'search for item'
 'search for poi']
['AA' 'AE' 'AR' 'AT' 'AU' 'BE' 'B

# Add lost sessions back
Load the validation data (or the test data when `use_validation` is off)

In [6]:
if config['drop_no_references']:
    # Read back whichever evaluation split was cached: validation when
    # validating, otherwise test.
    eval_csv_key = 'val_csv_path' if config['use_validation'] else 'test_csv_path'
    data = pd.read_csv(config[eval_csv_key], parse_dates=['timestamp'])

    # Drop the index column that to_csv() wrote alongside the data.
    data.drop(columns=['Unnamed: 0'], inplace=True)

    val_wide = process_test_naives_bayes(data=data, metadata=meta, encoders=encoders, config=config)
    # Keep only rows whose impressions value is not 0.
    has_impressions = val_wide.impressions != 0
    val_wide = val_wide.loc[has_impressions]

    # Release a leftover `test` frame if one is still bound.
    # NOTE(review): the split cell already deletes `test`, so this is normally
    # a no-op — `data` may have been the intended target; confirm.
    if 'test' in globals():
        del test
        gc.collect()

#     val_wide_allnull=val_wide.loc[(val_wide.iloc[:,0:(2*config['session_length']-1)].T == 0).all()].copy()
    # Take an independent copy under the downstream name, then free the original.
    val_wide_allnull = val_wide.copy()
    del val_wide
    gc.collect()

Munge into same format

In [7]:
# Reshape the wide frame into the submission layout: one row per key with a
# space-separated list of recommended items and a matching list of zeros.
columns_ordered = ['user_id', 'session_id', 'timestamp', 'step', 'item_recommendations', 'item_probs']

# .copy() so the column assignments below act on an independent frame rather
# than on a selection of the previous one (avoids chained-assignment warnings).
val_wide_allnull = val_wide_allnull[['timestamp', 'impressions']].copy()

# Integer timestamp divided by 10**9.
# NOTE(review): this yields seconds only if the datetimes are nanosecond resolution — confirm.
val_wide_allnull['timestamp'] = val_wide_allnull['timestamp'].astype(np.int64) // 10**9

# The index is expected to be named 'key' and hold "<user_id>_<session_id>_<step>".
# NOTE(review): the split assumes neither id contains an underscore — confirm.
val_wide_allnull = val_wide_allnull.reset_index()
val_wide_allnull[['user_id', 'session_id', 'step']] = val_wide_allnull['key'].str.split('_', expand=True)
val_wide_allnull = val_wide_allnull.drop(columns=['key'])

# Series.str.replace() defaults to regex=False from pandas 2.0, so state the
# mode explicitly; otherwise these patterns are matched literally and nothing
# is replaced.
# Every item id (with its trailing '|', if any) becomes '0 '.
val_wide_allnull['item_probs'] = val_wide_allnull['impressions'].str.replace(
    r'[0-9]+\||[0-9]+', '0 ', regex=True)
# Pipe-separated impressions become space-separated; a literal replace suffices.
val_wide_allnull['impressions'] = val_wide_allnull['impressions'].str.replace(
    '|', ' ', regex=False)

val_wide_allnull = (
    val_wide_allnull
    .rename(columns={'impressions': 'item_recommendations'})
    .reindex(columns=columns_ordered)
)

val_wide_allnull.head()



Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations,item_probs
0,00EI1R7YK601,9639ee039c1d0,1541068329,3,135917 104177 106691 1409858 4529846 3822896 3...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,00GKOZLYVI9R,8e74b912cb1b4,1541065310,16,3060180 3176094 5658130 4467826 5200924 198564...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,00J9RN4XAC2N,d2397c03bc9b4,1541065050,122,3134112 1104106 2714672 4073430 3827338 106274...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,00M5AMMLYQG5,8fd417ebd2d5b,1541062793,1,9882016 106769 48219 147360 7333050 3533848 75...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,00QD3TS82ZP1,65f6d52da6c28,1541064419,3,3538112 1518663 1749805 3537348 2230166 848674...,0 0 0 0 0 0 0 0 0 0 0


Append with NB-fitted test data (in this notebook nothing is appended: the output is just the frame built above)

In [9]:
# `output` is another name for the same frame (no copy is made), and no other
# data is appended in this cell.
output=val_wide_allnull

save submission

In [10]:
# Alternative export without the probabilities column (currently disabled):
# output.drop('item_probs',axis=1).to_csv(config['output_recsys_csv_path'],index=False)
# Write the frame without its index into the cache directory.
# NOTE(review): the file name hard-codes "005" and does not follow
# config['subset_frac'] — confirm this is intended when the subset changes.
output.to_csv(root_dir / 'cache' / 'IMPRESS_data_005_output_meta.csv',index=False)

In [12]:
# Sanity check on the saved frame: (rows, columns); six columns are expected per `columns_ordered`.
output.shape

(10336, 6)

In [None]:
# Same object as `output`, so this reports the same shape.
val_wide_allnull.shape

In [None]:
# NOTE(review): `rank_imp_wide` is not defined in any cell visible in this
# notebook; on a fresh kernel this raises NameError unless a star import provides it.
rank_imp_wide.shape

In [None]:
# Column-wise mean of the classifier's `feature_log_prob_` array.
# NOTE(review): `clf` is not defined in any cell visible in this notebook —
# this cell needs a fitted model from elsewhere to run.
pd.DataFrame(clf.feature_log_prob_).mean()