In [1]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
from sklearn.externals import joblib

import time

# %% Imports
from pathlib import Path
import sys
# Resolve the current working directory once; cache/data paths below are built from it.
root_dir = Path().resolve()
# Make the project-local modules under <root_dir>/src importable (the recsys_* imports below).
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *


In [2]:
# Run configuration for the whole notebook.
config = {
    # True: rebuild the splits/encoders from the raw sessions and write them to disk.
    # False: load the previously saved train csv and encoders instead.
    'save_train_test_val': True,
    # True: load the fitted model from 'model_pickle_path' instead of fitting.
    'load_fitted_model': False,

    # The next six values are passed straight to get_sessions().
    'use_subset': True,
    'subset_frac': 0.05,
    'use_validation': True,
    'validation_frac': 0.25,
    'reference_to_nan_frac': 1,
    'reference_to_nan_seed': 1234,

    # Number of 'reference|<i>' columns; also part of every artefact file name.
    'session_length': 1,
    # True: drop wide rows whose leading step columns contain a 0.
    'drop_no_references': True,

    # Sessions per chunk when streaming the training data into partial_fit.
    'train_session_chunksize': 5000,
    # Approximate number of rows per predict_proba call.
    'parts_nrows_test': 5000,
    'parts_path_to_data': root_dir / 'cache' / 'parts',
    # Directory that holds every cached artefact listed below.
    'data_path': root_dir / 'cache'
}

if not config['use_subset']:
    config['subset_frac'] = 1

# round() rather than int(): int() truncates binary float error, e.g.
# int(100 * 0.29) == 28, which would label the files with the wrong percentage.
subset_pct = int(round(100 * config['subset_frac']))
file_prefix = 'NB_data_' + str(subset_pct).zfill(3) + '_' + str(config['session_length'])

# config key -> file-name suffix; every artefact shares the same prefix and directory.
artefact_suffixes = {
    'le_pickle_path': '_le.pickle',
    'train_csv_path': '_train.csv',
    'train_last_step_csv_path': '_train_last_step.csv',
    'test_csv_path': '_test.csv',
    'val_csv_path': '_val.csv',
    'model_pickle_path': '_model.pickle',
    'val_long_csv_path': '_val_long.csv',
    'output_recsys_csv_path': '_output_recsys.csv',
    'output_meta_csv_path': '_output_meta.csv',
}
config.update({
    path_key: config['data_path'] / (file_prefix + suffix)
    for path_key, suffix in artefact_suffixes.items()
})


In [3]:
# Item metadata table, loaded through the project helper.
meta = get_metadata()

In [4]:
# Store item ids as strings.
# NOTE(review): presumably so they compare equal to the string session references - confirm.
meta['item_id'] = meta['item_id'].astype(str)

In [5]:
# Build (or reload) the training split and the label encoders.
#
# save_train_test_val=True : read the sessions, print sanity counts, fit and pickle
#                            the label encoders, write the val/test and train csv files.
# save_train_test_val=False: read the train csv and the pickled encoders from disk.
#
# Either way the cell then writes the per-session last step csv, keeps `names`
# (train columns) and `classes` (clicked references) and releases `train`.
if config['save_train_test_val']:
    print('Getting sessions')
    sessions = get_sessions(config['use_subset'],
                            config['subset_frac'],
                            config['use_validation'],
                            config['validation_frac'],
                            config['reference_to_nan_frac'],
                            config['reference_to_nan_seed'])

    print('Filter session with no clickout')
    # Plain Python booleans: use `and`, not the bitwise `&`.
    if not config['use_validation'] and not config['use_subset']:
        print('filtering sessions with clickout')
        sessions = filter_sessions_with_no_clicks(sessions)

    print(sessions.dtypes)

    print('Quick unit test')

    # The index is expected to be unique: both numbers should match.
    print(len(sessions.index))
    print(len(sessions.index.unique()))

    if not config['use_validation']:
        sessions['is_validation'] = False
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
        sessions['target'] = np.nan

    # Row masks shared by the sanity counts and the splits below.
    train_rows = (sessions.is_train == True) & (sessions.is_validation == False)
    val_rows = (sessions.is_train == True) & (sessions.is_validation == True)
    test_rows = (sessions.is_train == False) & (sessions.is_validation == False)
    clickout_rows = sessions.action_type == 'clickout item'
    missing_reference = sessions.reference.isnull()

    # Non-null targets per split (validation, train, test).
    print(sessions.loc[val_rows, 'target'].count())
    print(sessions.loc[train_rows, 'target'].count())
    print(sessions.loc[test_rows, 'target'].count())

    # Clickouts, and clickouts with a missing reference, per split.
    print(sessions.loc[(sessions.is_validation == True) & clickout_rows, 'step'].count())
    print(sessions.loc[(sessions.is_validation == True) & clickout_rows & missing_reference, 'step'].count())

    print(sessions.loc[train_rows & clickout_rows, 'step'].count())
    print(sessions.loc[train_rows & clickout_rows & missing_reference, 'step'].count())

    print(sessions.loc[test_rows & clickout_rows, 'step'].count())
    print(sessions.loc[test_rows & clickout_rows & missing_reference, 'step'].count())

    print('Train encoders and save')

    columns_to_encode = ['action_type', 'platform', 'city', 'device']

    # One LabelEncoder per categorical column, fitted on all sessions.
    encoders = {}
    for col in columns_to_encode:
        encoders[col] = preprocessing.LabelEncoder().fit(sessions[col])
        print(encoders[col].classes_)

    with open(config['le_pickle_path'], 'wb') as handle:
        pickle.dump(encoders, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print('Get Splits')

    train = sessions.loc[train_rows] \
                    .drop(['impressions', 'prices', 'is_train', 'is_validation', 'target'], axis=1) \
                    .reset_index(drop=True)

    # The test split is every non-train row, regardless of is_validation.
    test = sessions.loc[sessions.is_train == False] \
                   .drop(['is_train', 'is_validation'], axis=1) \
                   .reset_index(drop=True)

    val = sessions.loc[val_rows] \
                  .drop(['is_train', 'is_validation'], axis=1) \
                  .reset_index(drop=True)

    print('train', train.shape)
    print('test', test.shape)
    print('val', val.shape)

    print('Save either test or val')
    if config['use_validation']:
        val.to_csv(config['val_csv_path'])
    else:
        test.to_csv(config['test_csv_path'])

    print('delete session, test and val')
    # All of these are guaranteed to exist in this branch; free them before going on.
    del sessions, test, val
    del train_rows, val_rows, test_rows, clickout_rows, missing_reference
    gc.collect()

    print('save train')
    # The index is written as the first, unnamed csv column; the readers rely on that.
    train.to_csv(config['train_csv_path'])

else:
    print('loading train and encoders...')

    train = pd.read_csv(config['train_csv_path']).drop(['Unnamed: 0'], axis=1)

    # NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
    with open(config['le_pickle_path'], 'rb') as handle:
        encoders = pickle.load(handle)

    print('done')


# Highest step of every training session, in file order (sort=False). The fit
# loop reads this file in chunks to decide how many train rows to load at a time.
# (Previously the name was bound to the None returned by to_csv.)
last_step_per_session = train.groupby('session_id', sort=False)['step'].max().reset_index()
last_step_per_session.to_csv(config['train_last_step_csv_path'])

names = train.columns

# Every reference that was clicked out in train: the classes of the classifier.
classes = list(set(train.loc[(train.action_type == 'clickout item'), 'reference']))

print('delete train')
del train
gc.collect()

Getting sessions
Filter session with no clickout
user_id                         object
session_id                      object
timestamp          datetime64[ns, UTC]
step                             int64
action_type                     object
reference                       object
platform                        object
city                            object
device                          object
current_filters                 object
impressions                     object
prices                          object
is_validation                     bool
is_train                          bool
target                          object
dtype: object
Quick unit test
893499
893499
10336
0
0
19995
10336
59661
0
26345
12856
Train encoders and save
['change of sort order' 'clickout item' 'filter selection'
 'interaction item deals' 'interaction item image' 'interaction item info'
 'interaction item rating' 'search for destination' 'search for item'
 'search for poi']
['AA' 'AE' 'AR' 'AT' 'AU' 'BE' 'B

In [6]:
# Names of the per-step reference columns ('reference|1' .. 'reference|<session_length>');
# they are dropped from the feature matrix before fitting and predicting.
reference_cols = ['reference|' + str(step) for step in range(1, config['session_length'] + 1)]
for reference_col in reference_cols:
    print(reference_col)
cols_to_drop = np.append([], reference_cols)

reference|1


### Fit NB

In [7]:
# Fit the Bernoulli naive Bayes model incrementally, or load a fitted one.
#
# The per-session last-step file is read `train_session_chunksize` sessions at a
# time; for each chunk the matching rows of the train csv are loaded, converted
# to wide format and fed to partial_fit.
if not config['load_fitted_model']:
    start = time.time()
    reader = pd.read_csv(config['train_last_step_csv_path'], chunksize=config['train_session_chunksize'])

    skiprows = 0
    clf = BernoulliNB()
    # partial_fit needs `classes` on its FIRST call. That is not necessarily the
    # first chunk: a chunk can be empty after filtering, so track it explicitly.
    classes_registered = False
    non_feature_cols = np.append(cols_to_drop, 'y')
    n_leading_cols = 2 * config['session_length'] - 1

    for chunk in reader:
        # Sum of the last steps of the sessions in this chunk.
        # NOTE(review): used as the number of train rows these sessions occupy,
        # which assumes every session has exactly steps 1..max - confirm.
        nrows = chunk.step.sum()
        train_part = pd.read_csv(config['train_csv_path'], header=0, skiprows=skiprows, nrows=nrows, names=names)

        train_wide = process_train_naives_bayes(data=train_part, metadata=meta, session_length=config['session_length'], encode=True, encoders=encoders, cols_to_encode=list(encoders.keys()))

        del train_part

        if config['drop_no_references']:
            # Keep only rows whose leading columns are all non-zero.
            train_wide_not_allnull = train_wide[(train_wide.iloc[:, 0:n_leading_cols].T != 0).all()].copy()
        else:
            train_wide_not_allnull = train_wide.copy()

        del train_wide

        print(train_wide_not_allnull.shape)

        if not train_wide_not_allnull.empty:
            fit_kwargs = {} if classes_registered else {'classes': classes}
            clf.partial_fit(train_wide_not_allnull.drop(non_feature_cols, axis=1),
                            np.ravel(train_wide_not_allnull.y),
                            **fit_kwargs)
            classes_registered = True

        # Release the chunk here so the cell also works when the reader is empty.
        del train_wide_not_allnull

        skiprows += nrows

    stop = time.time()
    print('Seconds elapsed  :', stop-start)

    # Output a pickle file for the model
    joblib.dump(clf, config['model_pickle_path'])

else:
    clf = joblib.load(config['model_pickle_path'])

(6386, 163)


  self.class_log_prior_ = (np.log(self.class_count_) -


(6598, 163)


  self.class_log_prior_ = (np.log(self.class_count_) -


(6760, 163)


  self.class_log_prior_ = (np.log(self.class_count_) -


(6956, 163)


  self.class_log_prior_ = (np.log(self.class_count_) -


(6737, 163)


  self.class_log_prior_ = (np.log(self.class_count_) -


(6541, 163)


  self.class_log_prior_ = (np.log(self.class_count_) -


(1396, 163)


  self.class_log_prior_ = (np.log(self.class_count_) -


Seconds elapsed  : 81.51977372169495


Get the evaluation data (validation or test split) in wide format

## split for partial fit

### PREDICT

In [8]:
# Load the evaluation split (validation when enabled, otherwise test) and
# bring it into the same wide layout that was used for fitting.
data = pd.read_csv(
    config['val_csv_path'] if config['use_validation'] else config['test_csv_path'],
    parse_dates=['timestamp'],
)

# The csv was written with its index; discard that unnamed first column.
data.drop(['Unnamed: 0'], axis=1, inplace=True)

val_wide = process_test_naives_bayes(
    data=data,
    metadata=meta,
    session_length=config['session_length'],
    encode=True,
    encoders=encoders,
    cols_to_encode=list(encoders.keys()),
)
# Rows whose impressions value is 0 cannot be ranked; keep the rest.
val_wide = val_wide.loc[val_wide.impressions != 0]

# NOTE(review): no `test` frame is created in this cell (the loaded frame is
# `data`), so this normally ends in the NameError branch - confirm the intent.
try:
    del test
except NameError:
    pass
else:
    gc.collect()

if not config['drop_no_references']:
    val_wide_not_allnull = val_wide.copy()
else:
    # Keep only rows whose leading columns are all non-zero.
    leading_nonzero = (val_wide.iloc[:, 0:(2*config['session_length']-1)] != 0).all(axis=1)
    val_wide_not_allnull = val_wide.loc[leading_nonzero].copy()

# '|'-separated impression string -> list of item ids.
val_wide_not_allnull['impressions'] = val_wide_not_allnull['impressions'].str.split('\\|')

try:
    del val_wide
except NameError:
    pass
else:
    gc.collect()

In [9]:
# Preview the first rows of the wide evaluation frame.
val_wide_not_allnull.head(n=5)

Unnamed: 0_level_0,action_type|1,reference|1,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,Adults Only,...,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms),platform,city,device,impressions,timestamp,target
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00EI1R7YK601_9639ee039c1d0_3,1,135917,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,32,3700,0,"[135917, 104177, 106691, 1409858, 4529846, 382...",2018-11-01 10:32:09,135917.0
00GKOZLYVI9R_8e74b912cb1b4_16,1,5658130,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,29,8600,1,"[3060180, 3176094, 5658130, 4467826, 5200924, ...",2018-11-01 09:41:50,3060180.0
00J9RN4XAC2N_d2397c03bc9b4_122,1,2714672,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,27,2111,0,"[3134112, 1104106, 2714672, 4073430, 3827338, ...",2018-11-01 09:37:30,3134112.0
00QD3TS82ZP1_65f6d52da6c28_3,4,3538112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,29,5795,0,"[3538112, 1518663, 1749805, 3537348, 2230166, ...",2018-11-01 09:26:59,3538112.0
00VFIQZH8RZ6_b485e039d93c7_33,3,3213142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,19,310,1,"[6546628, 147278, 3905094, 3087002, 8980586, 1...",2018-11-01 08:54:20,5627048.0


Create long table for results

In [10]:
# Long results table: one row per (key, impression) with the session target.
val_imp = val_wide_not_allnull.loc[:, ['impressions', 'target']].reset_index().copy()

# Spread each impressions list into one column per position and attach key/target.
val_imp_long = val_imp['impressions'].apply(pd.Series).merge(val_imp, left_index=True, right_index=True)
val_imp_long = val_imp_long.drop(['impressions'], axis=1)

# Stack the position columns into a single 'impressions' column; positions that
# do not exist for a key are NaN and are dropped.
val_imp_long = val_imp_long.melt(id_vars=['key', 'target'], value_name='impressions')
val_imp_long = (
    val_imp_long
    .drop('variable', axis=1)
    .dropna()
    .sort_values('key')
    .reset_index(drop=True)
    .copy()
)

# The intermediate frame is no longer needed.
del val_imp
gc.collect()

Get the row and column labels used to extract each impression's probability from the prediction matrix

In [11]:
# Row label (session key) of every long-table row.
row_index = val_imp_long['key'].copy()

# Column label of every long-table row: the impression itself when the model
# knows it as a class, otherwise the placeholder column 'not_in_class'.
mask_in_class = val_imp_long['impressions'].isin(clf.classes_)  # True is in class
col_index = val_imp_long['impressions'].where(mask_in_class, 'not_in_class')

Predict in chunks and extract each impression's probability on the fly

In [12]:
# Predict class probabilities in chunks and, for every (key, impression) row
# of val_imp_long, pick the probability of that impression.
print(val_wide_not_allnull.shape)

n_rows = val_wide_not_allnull.shape[0]
# At least one chunk: round() gives 0 for inputs smaller than half of
# parts_nrows_test, which used to end in a ZeroDivisionError below.
n_splits = max(1, round(n_rows / config['parts_nrows_test']))
index_split = round(n_rows / n_splits)

# Everything that is not a model feature.
non_feature_cols = np.append(cols_to_drop, ['timestamp', 'impressions', 'target'])

impression_probs = ([])
for i in range(n_splits):
    part_start = index_split * i
    # The last chunk runs to the end so rounding never loses rows.
    part_stop = None if (i + 1) == n_splits else index_split * (i + 1)
    part = val_wide_not_allnull.iloc[part_start:part_stop, :]
    if part.empty:
        continue  # nothing to predict

    pred = clf.predict_proba(part.drop(non_feature_cols, axis=1))

    preddf = pd.DataFrame(data=pred, columns=clf.classes_, index=part.index)
    preddf['not_in_class'] = 0  # need to return 0 in case impression not in clf.classes_

    # Rows of val_imp_long that belong to this chunk: from the first row of the
    # chunk's first key to the last row of its last key. This relies on
    # val_imp_long being sorted by key in the same order as val_wide_not_allnull.
    first_index_for_row_col = val_imp_long.loc[val_imp_long.key.isin(part.head(1).index)].index.min()
    last_index_for_row_col = val_imp_long.loc[val_imp_long.key.isin(part.tail(1).index)].index.max()
    long_rows = slice(first_index_for_row_col, last_index_for_row_col + 1)

    impression_probs = np.append(impression_probs,
                                 preddf.lookup(row_index[long_rows], col_index[long_rows]),
                                 axis=0)

(6549, 165)


Compute each impression's rank within its key and its reciprocal rank (RR)

In [13]:
# Attach the predicted probability of every impression.
val_imp_long['probs'] = impression_probs

# Within each key, order impressions from most to least probable.
val_imp_long.sort_values(['key', 'probs'], ascending=[True, False], inplace=True)

# 1-based position of the impression inside its key after sorting.
val_imp_long['rank'] = val_imp_long.groupby('key').cumcount() + 1

# Reciprocal rank of every impression.
val_imp_long['RR'] = 1 / val_imp_long['rank']

val_imp_long.to_csv(config['val_long_csv_path'])

Build the output file

In [14]:
# One row per key with the ranked impressions and their probabilities joined
# into space-separated strings, plus user_id / session_id / timestamp / step.
rank_imp_wide = val_imp_long.pivot(index='key', columns='rank', values=['impressions', 'probs']).copy()

# collapse column multi index for ease of indexing -> 'impressions|<rank>', 'probs|<rank>'
rank_imp_wide.columns = rank_imp_wide.columns.map(lambda x: '|'.join([str(i) for i in x]))

# Select the rank columns by name. The number of ranks equals the longest
# impression list in the data, so a fixed positional slice (previously the
# first 25 columns) picks the wrong columns whenever that number is not 25.
impression_cols = [c for c in rank_imp_wide.columns if c.startswith('impressions|')]
prob_cols = [c for c in rank_imp_wide.columns if c.startswith('probs|')]

rank_imp_wide = rank_imp_wide.join(val_wide_not_allnull['timestamp'], on='key')

# Datetime -> whole seconds since the epoch.
rank_imp_wide['timestamp'] = rank_imp_wide['timestamp'].astype(np.int64)//10**9

rank_imp_wide.reset_index(inplace=True)

# The key is '<user_id>_<session_id>_<step>'.
rank_imp_wide[['user_id','session_id','step']] = rank_imp_wide['key'].str.split('_', expand=True)

# Ranked item ids, best first; ranks a key does not have are NaN and skipped.
rank_imp_wide['item_recommendations'] = rank_imp_wide[impression_cols].apply(
    lambda x: ' '.join(x.dropna()),
    axis=1
)

# Matching probabilities in the same order.
rank_imp_wide['item_probs'] = rank_imp_wide[prob_cols].apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1
)

# Keep only the output columns, in output order.
rank_imp_wide = rank_imp_wide[['user_id', 'session_id', 'timestamp', 'step',
                               'item_recommendations', 'item_probs']]

rank_imp_wide.head()

Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations,item_probs
0,00EI1R7YK601,9639ee039c1d0,1541068329,3,104136 104132 4529846 393751 3143230 6003158 1...,2.8549866195958558e-05 6.137422847746534e-08 9...
1,00GKOZLYVI9R,8e74b912cb1b4,1541065310,16,4467826 5452024 1031940 2596290 8162422 582721...,0.09955310213551848 3.8340571319871677e-05 2.6...
2,00J9RN4XAC2N,d2397c03bc9b4,1541065050,122,3134112 1104106 1104108 4073430 4702338 585638...,2.1746880231882764e-07 9.820942034302735e-12 7...
3,00QD3TS82ZP1,65f6d52da6c28,1541064419,3,3537348 3536828 2034239 1518663 2832878 390219...,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4,00VFIQZH8RZ6,b485e039d93c7,1541062460,33,2005355 7793494 2772604 3213142 100137 9549282...,2.9137703499490515e-12 0.0 0.0 0.0 0.0 0.0 0.0...


Save the submission

In [15]:
# Submission file: the recommendations without the probability column.
submission = rank_imp_wide.drop(['item_probs'], axis=1)
submission.to_csv(config['output_recsys_csv_path'], index=False)

# Companion file that also keeps the probabilities.
rank_imp_wide.to_csv(config['output_meta_csv_path'], index=False)

In [16]:
# Mean reciprocal rank on the validation split.
#   MRR_NB        : only targets that received a non-zero probability from the model
#   MRR_parse_imp : all targets
if config['use_validation']:
    # Rows where the impression is the clicked target (compared as integers).
    is_target = val_imp_long.target.astype(int) == val_imp_long.impressions.astype(int)
    has_prob = val_imp_long.probs != 0.0

    rr_nb = val_imp_long.loc[is_target & has_prob, 'RR']
    rr_parse_imp = val_imp_long.loc[is_target, 'RR']

    MRR_NB = rr_nb.mean()
    MRR_parse_imp = rr_parse_imp.mean()

    print(MRR_NB)
    print(MRR_parse_imp)
    print('support MRR_NB: ', str(rr_nb.count()))
    # This line reports the support of MRR_parse_imp; it used to be labelled MRR_NB too.
    print('support MRR_parse_imp: ', str(rr_parse_imp.count()))

0.593750782743839
0.2656982324738837
support MRR_NB:  1936
support MRR_NB:  6544


In [17]:
# Number of targets that received a non-zero probability (support of MRR_NB).
str(
    val_imp_long.loc[
        (val_imp_long.probs != 0.0)
        & (val_imp_long.target.astype(int) == val_imp_long.impressions.astype(int)),
        'RR',
    ].count()
)

'1936'

# Add lost sessions back