In [1]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
from sklearn.externals import joblib

import time

# %% Imports
from pathlib import Path
import sys
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *




In [2]:
config= {
    'save_train_test_val': True,
    'load_fitted_model': False,
    
    'use_subset': True,
    'subset_frac': 0.05,
    'use_validation': True,
    'validation_frac': 0.25,
    'reference_to_nan_frac': 1,
    'reference_to_nan_seed': 1234,
    
    'session_length': 1,
    'drop_no_references': False,
    'train_model_on_test_data': True,
    'add_prices': True,
    'add_hour': True,
    
    'train_session_chunksize': 15000,
    'parts_nrows_test': 10000,
    'data_path': root_dir / 'cache'
    }


if not config['use_subset']:
    config['subset_frac']=1

config['le_pickle_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) + '_le.pickle')
config['train_csv_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_train.csv')
config['train_last_step_csv_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_train_last_step.csv')
config['test_last_step_csv_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_test_last_step.csv')
config['test_csv_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_test.csv')
config['val_csv_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_val.csv')
config['prices_pickle_path']=config['data_path'] / 'mean_prices.pickle'
config['model_pickle_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_model.pickle')
config['val_long_csv_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_val_long.csv')
config['output_recsys_csv_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_output_recsys.csv')
config['output_meta_csv_path']=config['data_path'] / ('NB_data_' + str(int(100*config['subset_frac'])).zfill(3) + '_' + str(config['session_length']) +  '_output_meta.csv')


In [3]:
# meta=get_metadata()
# # meta.dtypes

# meta['item_id']=meta['item_id'].astype(str)

In [4]:
if config['save_train_test_val']:
    print('Getting sessions...')
    sessions=get_sessions(config['use_subset'],
                          config['subset_frac'],
                          config['use_validation'],
                          config['validation_frac'],
                          config['reference_to_nan_frac'],
                          config['reference_to_nan_seed'])

    
    train = sessions.loc[(sessions.is_train==True) & (sessions.is_validation==False)] \
                        .drop(['is_train','is_validation','target'],axis=1) \
                        .reset_index(drop=True)

    test = sessions.loc[sessions.is_train==False] \
                       .drop(['is_train','is_validation'],axis=1) \
                       .reset_index(drop=True)

    val = sessions.loc[(sessions.is_train==True) & (sessions.is_validation==True)] \
                       .drop(['is_train','is_validation'],axis=1) \
                       .reset_index(drop=True)

    

Getting sessions...


In [5]:
def return_impressions_RR(data: pd.DataFrame, is_train: bool):
    if is_train:
        target_col='reference'
    else:
        target_col='target'
    
    
    data_imp=data.loc[~train.impressions.isnull(),np.append(target_col,['session_id','step','impressions'])].copy()
    data_imp['impressions']=data_imp['impressions'].str.split('\\|')
    
    data_imp=data_imp.impressions.apply(pd.Series) \
    .merge(data_imp, right_index = True, left_index = True) \
    .drop(["impressions"], axis = 1)  \
    .melt(id_vars = np.append(target_col,['session_id','step']), value_name = "impressions") \
    .dropna() \
    .copy()
    
    data_imp_ans=data_imp.loc[data_imp[target_col]==data_imp.impressions].copy()
    data_imp_ans['variable']=data_imp_ans['variable']+1
    data_imp_ans['RR']=1/data_imp_ans['variable']
    

    return data_imp_ans

In [6]:
data_train=return_impressions_RR(data=train, is_train=True)

data_train.RR.mean()

0.45704799970223486

In [7]:
data_val=return_impressions_RR(data=val, is_train=False)

data_val.RR.mean()

0.5036607961910856

In [8]:
data_test=return_impressions_RR(data=test, is_train=True)

data_test.RR.mean()


0.411352665974076