In [None]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
from sklearn.externals import joblib

import time

# %% Imports
from pathlib import Path
import sys
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *


In [None]:
# Run configuration: data-subset / validation switches, chunk sizes and
# cache locations used throughout this notebook.
config = {
    'save_train_test_val': True,
    'load_fitted_model': False,

    'use_subset': False,
    'subset_frac': 0.05,
    'use_validation': False,
    'validation_frac': 0.25,
    'reference_to_nan_frac': 1,
    'reference_to_nan_seed': 1234,

    'session_length': 1,

    'train_session_chunksize': 5000,
    'parts_nrows_test': 5000,
    'parts_path_to_data': root_dir / 'cache' / 'parts',
    'data_path': root_dir / 'cache',
}

# When no subset is requested the whole data set is used, i.e. a fraction of 1.
if not config['use_subset']:
    config['subset_frac'] = 1

# Every artifact is named NB_data_<percent>_<session_length><suffix>, where
# <percent> is the subset fraction times 100, truncated by int() and
# zero-padded to three digits.
artifact_prefix = 'NB_data_{}_{}'.format(
    str(int(100*config['subset_frac'])).zfill(3),
    config['session_length'],
)

# (config key, file-name suffix) pairs, in the order the keys are added.
artifact_suffixes = (
    ('le_pickle_path', '_le.pickle'),
    ('train_csv_path', '_train.csv'),
    ('train_last_step_csv_path', '_train_last_step.csv'),
    ('test_csv_path', '_test.csv'),
    ('val_csv_path', '_val.csv'),
    ('model_pickle_path', '_model.pickle'),
    ('val_long_csv_path', '_val_long.csv'),
    ('output_recsys_csv_path', '_output_recsys.csv'),
    ('output_meta_csv_path', '_output_meta.csv'),
)
for config_key, file_suffix in artifact_suffixes:
    config[config_key] = config['data_path'] / (artifact_prefix + file_suffix)


In [None]:
# Load the item metadata table via the project helper (star-imported above;
# presumably from recsys_common -- TODO confirm).
meta=get_metadata()
# meta.dtypes

In [None]:
# Cast item ids to strings (presumably so they compare equal to the string
# impression ids produced by splitting the sessions -- TODO confirm).
meta['item_id']=meta['item_id'].astype(str)

In [None]:
# Sessions are only (re)loaded when the train/test/val artifacts are to be
# regenerated; otherwise this cell leaves `sessions` untouched.
if config['save_train_test_val']:
    print('Getting sessions')
    # Options are forwarded positionally, in the order get_sessions is called with.
    sessions = get_sessions(*(config[option] for option in (
        'use_subset',
        'subset_frac',
        'use_validation',
        'validation_frac',
        'reference_to_nan_frac',
        'reference_to_nan_seed',
    )))

    # The message below is printed unconditionally; the filter itself only
    # runs for a full (non-subset), non-validation load.
    print('Filter session with no clickout')
    if not (config['use_validation'] or config['use_subset']):
        print('filtering sessions with clickout')
        sessions = filter_sessions_with_no_clicks(sessions)

In [None]:
print('Split impressions and prices')
# Both columns hold '|'-separated strings; turn each cell into a list of tokens.
# Note: this rewrites the columns in place, so the cell is meant to run once
# per freshly loaded `sessions`.
for list_column in ('impressions', 'prices'):
    sessions[list_column] = sessions[list_column].str.split('\\|')

In [None]:
# Print the dtype of every column of `sessions` as a quick sanity check.
print(sessions.dtypes)
# sessions.head()

In [None]:
# Display (n_rows, n_columns) of the sessions frame.
sessions.shape

In [None]:
# Categorical session columns that get an integer label encoding.
columns_to_encode = ['action_type', 'platform', 'city', 'device']

# Fit one LabelEncoder per column on the full `sessions` frame and print the
# classes it learned. The columns themselves are not transformed here.
encoders = {}
for col in columns_to_encode:
    encoders[col] = preprocessing.LabelEncoder().fit(sessions[col])
    print(encoders[col].classes_)

# Persist the fitted encoders; they can be restored later with pickle.load
# on the same path opened in 'rb' mode.
with config['le_pickle_path'].open('wb') as handle:
    pickle.dump(encoders, handle, pickle.HIGHEST_PROTOCOL)

In [None]:
def _list_column_to_long(frame, column):
    """Unpack the list-valued `column` of `frame` into one row per list element.

    `frame` must carry a 'key' column. The result has the columns 'key',
    'variable' (position of the element inside its list) and `column`
    (the element itself), sorted by key and position.
    """
    # One column per list position; shorter lists are padded with NaN.
    wide = frame[column].apply(pd.Series)
    wide['key'] = frame['key']
    return (wide
            .melt(id_vars=['key'], value_name=column)
            .dropna()  # discard the padding of shorter lists
            .sort_values(['key', 'variable']))


def get_encoded_prices(sessions, n_splits: int):
    """Return one row per displayed item with its price replaced by a quantile bin.

    Parameters
    ----------
    sessions : pandas.DataFrame
        Needs the columns 'user_id', 'session_id', 'step', 'impressions' and
        'prices'. 'impressions' and 'prices' hold parallel lists (item ids and
        integer-like price strings); rows whose 'impressions' is null are ignored.
    n_splits : int
        Number of evenly spaced quantile levels (``np.linspace(0, 1, n_splits)``)
        used as bin edges. The median is always added as an extra edge, so an
        even ``n_splits`` yields ``n_splits + 1`` bins and an odd one yields
        ``n_splits`` bins (fewer if several quantiles coincide).

    Returns
    -------
    pandas.DataFrame
        Columns 'key' ("<user_id>_<session_id>_<step>"), 'impressions' (item id)
        and 'prices' (ordered categorical whose categories are the floats
        0.0, 1.0, ... numbering the bins from cheapest to most expensive).
    """
    with_impressions = sessions.loc[sessions['impressions'].notnull()]
    listings = with_impressions[['impressions', 'prices']].copy()
    listings['key'] = (with_impressions['user_id'] + '_'
                       + with_impressions['session_id'] + '_'
                       + with_impressions['step'].astype(str))

    # Pair every impression with the price at the same list position.
    impressions_long = _list_column_to_long(listings, 'impressions')
    prices_long = _list_column_to_long(listings, 'prices')
    encoded = impressions_long.merge(prices_long, on=['key', 'variable'])

    price_values = encoded['prices'].astype(int)

    # Compute the quantile edges directly instead of slicing describe() output
    # by position. The old slice only lined up when describe() inserted the
    # 50% row by itself; for odd n_splits it also picked up `max`, duplicating
    # the 100% edge and making pd.cut raise. The median is added explicitly so
    # the edges for even n_splits are the same as before.
    quantile_levels = np.union1d(np.linspace(0, 1, n_splits), [0.5])
    edges = np.append(0.0, price_values.quantile(quantile_levels).values)
    # Coinciding quantiles (or a minimum price of 0) would give repeated
    # edges, which pd.cut rejects.
    edges = np.unique(edges)

    # One float label per bin actually present: 0.0, 1.0, ...
    labels = np.arange(len(edges) - 1, dtype=float)

    # include_lowest keeps a price equal to the lowest edge (0) in the first
    # bin instead of turning it into NaN.
    encoded['prices'] = pd.cut(price_values, bins=edges, labels=labels,
                               include_lowest=True)

    return encoded.drop(['variable'], axis=1)
    

In [None]:
# Build the long (key, impression, binned price) table, using 10 quantile levels.
prices=get_encoded_prices(sessions,n_splits=10)



In [None]:
# Inspect the dtype of the binned 'prices' column (produced by pd.cut).
prices.prices.dtypes

In [None]:
# Cache the encoded prices as <root_dir>/cache/prices.pickle.
# NOTE(review): the path is built from root_dir directly rather than from
# config['data_path'], which currently points at the same directory.
prices.to_pickle(root_dir / 'cache' / 'prices.pickle')