In [1]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
from sklearn.externals import joblib

import time

# %% Imports
from pathlib import Path
import sys
# Anchor every project-relative path on the directory the kernel was started in.
root_dir = Path().resolve()

# Expose the project's local modules (kept under <root_dir>/src) to `import`.
src_dir = root_dir / 'src'
sys.path.append(str(src_dir))

from recsys_common import *
from recsys_naive_bayes_processing import *

In [2]:
# Run configuration: feature switches, sampling parameters and cache locations.
config = {
    'save_train_test_val': True,
    'load_fitted_model': False,

    'use_subset': False,
    'subset_frac': 0.01,
    'use_validation': False,
    'validation_frac': 0.25,
    'reference_to_nan_frac': 1,
    'reference_to_nan_seed': 1234,

    'session_length': 3,

    'train_session_chunksize': 5000,
    'parts_nrows_test': 5000,
    'parts_path_to_data': root_dir / 'cache' / 'parts',
    'data_path': root_dir / 'cache',
}

# When no subset is requested the fraction is forced to 1, which also makes
# the file names below carry the '100' tag.
if not config['use_subset']:
    config['subset_frac'] = 1

# Every artefact shares the stem 'NB_data_<pct>_<session_length>', where <pct>
# is the subset fraction as a zero-padded three-digit percentage.
# NOTE(review): int() truncates, so a fraction such as 0.29 is tagged '028'
# (100 * 0.29 == 28.999999999999996). Left as-is so existing cache file names
# keep matching.
subset_tag = str(int(100 * config['subset_frac'])).zfill(3)
artifact_stem = 'NB_data_' + subset_tag + '_' + str(config['session_length'])

# (config key, file-name suffix) pairs, in the order the keys are registered.
artifact_suffixes = (
    ('le_pickle_path', '_le.pickle'),
    ('train_csv_path', '_train.csv'),
    ('train_last_step_csv_path', '_train_last_step.csv'),
    ('test_csv_path', '_test.csv'),
    ('val_csv_path', '_val.csv'),
    ('model_pickle_path', '_model.pickle'),
    ('val_long_csv_path', '_val_long.csv'),
    ('output_recsys_csv_path', '_output_recsys.csv'),
    ('output_meta_csv_path', '_output_meta.csv'),
)

for path_key, file_suffix in artifact_suffixes:
    config[path_key] = config['data_path'] / (artifact_stem + file_suffix)



In [3]:
# Load the item metadata and cast its item ids to strings.
meta = get_metadata()
meta['item_id'] = meta['item_id'].astype(str)

# Load the session log using the sampling / validation settings from `config`.
print('Getting sessions')
sessions = get_sessions(
    config['use_subset'],
    config['subset_frac'],
    config['use_validation'],
    config['validation_frac'],
    config['reference_to_nan_frac'],
    config['reference_to_nan_seed'],
)

# The click-out filter is applied only when neither a subset nor a validation
# split is in use; the first message below is printed either way.
print('Filter session with no clickout')
if not (config['use_validation'] or config['use_subset']):
    print('filtering sessions with clickout')
    sessions = filter_sessions_with_no_clicks(sessions)

print(sessions.shape)

Getting sessions
Filter session with no clickout
filtering sessions with clickout
(17907922, 13)


In [4]:

# Turn the '|'-separated strings of both columns into lists of tokens.
# NOTE(review): this overwrites the columns in place; re-running the cell on
# already-split values presumably produces NaN -- reload `sessions` first.
for pipe_column in ('impressions', 'prices'):
    sessions[pipe_column] = sessions[pipe_column].str.split('\\|')

In [5]:
# Only rows that actually carry an impression list contribute; the same row
# mask is used for both columns so items and prices stay aligned.
has_impressions = sessions['impressions'].notnull()

# Flatten the per-row lists into one long sequence of displayed items ...
impression_lists = sessions.loc[has_impressions, 'impressions'].tolist()
imp_flat = [item_id for shown_items in impression_lists for item_id in shown_items]

# ... and the matching long sequence of prices.
price_lists = sessions.loc[has_impressions, 'prices'].tolist()
prices_flat = [price for shown_prices in price_lists for price in shown_prices]

# One row per (displayed item, displayed price) occurrence.
prices = pd.DataFrame(data={'reference': imp_flat, 'prices': prices_flat})

# Prices were split out of strings, so convert them to integers.
prices['prices'] = prices['prices'].astype(int)

In [6]:
# Collapse the occurrences to one row per reference holding its average price.
prices_by_reference = prices.groupby('reference')['prices']
prices = prices_by_reference.mean().reset_index()

In [7]:
prices.columns = ['reference', 'mean_prices']

# Number of equal-frequency price buckets to aim for.
n_splits = 10

# Compute the quantile edges explicitly instead of slicing them positionally
# out of describe(). The positional slice depended on describe()'s row layout
# and on it silently adding a 50% row, which produced n_splits + 1 unevenly
# sized bins and raised on duplicate edges whenever 0.5 was already one of the
# requested percentiles.
quantile_levels = np.linspace(0, 1, n_splits + 1)
bin_edges = np.unique(prices['mean_prices'].quantile(quantile_levels).values)

# Tied quantiles collapse into one edge above, so fewer than n_splits bins may
# remain; with fewer than two edges there is nothing to cut.
if len(bin_edges) < 2:
    raise ValueError('mean_prices needs at least two distinct values to be binned')

# include_lowest=True keeps the minimum inside the first bin, which replaces
# the artificial 0.0 edge used before. Labels stay floats (0.0, 1.0, ...) and
# the column stays categorical, now with at most n_splits categories.
prices['mean_prices'] = pd.cut(
    prices['mean_prices'],
    bins=bin_edges,
    labels=np.arange(len(bin_edges) - 1, dtype=float),
    include_lowest=True,
)



In [8]:
# Cache the per-reference price buckets on disk.
mean_prices_pickle_path = root_dir / 'cache' / 'mean_prices.pickle'
prices.to_pickle(mean_prices_pickle_path)

In [9]:
# Disabled: would attach each reference's binned mean price to the session rows.
# sessions=sessions.join(prices.set_index('reference'),on='reference')

In [10]:
# sessions