In [1]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
from sklearn.externals import joblib

import time

# %% Imports
from pathlib import Path
import sys
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *




In [2]:
config= {
    'save_train_test_val': True,
    'load_fitted_model': False,
    
    'use_subset': True,
    'subset_frac': 0.05,
    'use_validation': True,
    'validation_frac': 0.25,
    'reference_to_nan_frac': 1,
    'reference_to_nan_seed': 1234,
    
    'session_length': 1,
    'drop_no_references': True,
    'train_model_on_test_data': True,
    'add_prices': False,
    'add_hour': False,
    'cols_to_append': [],#['platform','city','device'],
    'drop_action_type': True,
    
    'train_session_chunksize': 10000,
    'parts_nrows_test': 10000,
    'data_path': root_dir / 'cache'
    }


if not config['use_subset']:
    config['subset_frac']=1

root_path=('NB_data_sub_' + str(int(100*config['subset_frac'])).zfill(3) \
 + '_sl_' + str(config['session_length']) \
 + '_val_' + str(int(config['use_validation'])))                       
                       
config['le_pickle_path']=config['data_path'] / (root_path + '_le.pickle')

config['train_wide_pickle_path']=config['data_path'] / (root_path +  '_train_wide.pickle')
config['classes_pickle_path']=config['data_path'] / (root_path +  '_classes.pickle')

config['test_pickle_path']=config['data_path'] / (root_path +  '_test.pickle')
config['val_pickle_path']=config['data_path'] / (root_path +  '_val.pickle')

config['prices_pickle_path']=config['data_path'] / 'mean_prices.pickle'

config['model_pickle_path']=config['data_path'] / (root_path +  '_model.pickle')
config['val_long_csv_path']=config['data_path'] / (root_path +  '_val_long.csv')

config['output_recsys_csv_path']=config['data_path'] / (root_path +  '_output_recsys.csv')
config['output_meta_csv_path']=config['data_path'] / (root_path +  '_output_meta.csv')
config['output_meta_only_fiited_csv_path']=config['data_path'] / (root_path +  '_output_meta_only_fiited.csv')


In [3]:
meta=get_metadata()
# meta.dtypes

meta['item_id']=meta['item_id'].astype(str)

In [4]:
if config['save_train_test_val']:
    print('Getting sessions')
    sessions=get_sessions(config['use_subset'],
                          config['subset_frac'],
                          config['use_validation'],
                          config['validation_frac'],
                          config['reference_to_nan_frac'],
                          config['reference_to_nan_seed'])

    print('Drop session with no clickout')
    if (not config['use_validation']) & (not config['use_subset']):
        sessions=filter_sessions_with_no_clicks(sessions)

        
    columns_to_encode = ['action_type','platform','city','device']

    if config['add_prices']:
        print('adding prices')
        prices=pd.read_pickle(config['prices_pickle_path'])
        sessions=sessions.join(prices.set_index('reference'),on='reference')
#         sessions['city_price']=sessions['city'] + sessions['mean_prices'].astype(str)
#         columns_to_encode=np.append(columns_to_encode,['city_price'])
    
    if config['add_hour']:
        sessions['hour']=sessions.timestamp.dt.hour.astype(str)
        sessions['city_hour']=sessions['city'] + sessions['hour']
        sessions.drop('hour',axis=1,inplace=True)
        columns_to_encode=np.append(columns_to_encode,['city_hour'])
        config['cols_to_append']=np.append(config['cols_to_append'],['city_hour'])
    
    
    
    print(sessions.dtypes)
    # sessions.head()

    print('Quick unit test')

    print(len(sessions.index))
    print(len(sessions.index.unique()))

    if not config['use_validation']:
        sessions['is_validation']=False
        sessions['target']=np.NaN

    print(sessions.loc[(sessions.is_train==True) & (sessions.is_validation==True),'target'].count())
    print(sessions.loc[(sessions.is_train==True) & (sessions.is_validation==False),'target'].count())
    print(sessions.loc[(sessions.is_train==False) & (sessions.is_validation==False),'target'].count())

    print(sessions.loc[(sessions.is_validation==True) & (sessions.action_type=='clickout item'),'step'].count())
    print(sessions.loc[(sessions.is_validation==True) & (sessions.action_type=='clickout item') & (sessions.reference.isnull()),'step'].count())

    print(sessions.loc[(sessions.is_train==True) & (sessions.is_validation==False) & (sessions.action_type=='clickout item') ,'step'].count())
    print(sessions.loc[(sessions.is_train==True) & (sessions.is_validation==False) & (sessions.action_type=='clickout item') & (sessions.reference.isnull()),'step'].count())

    print(sessions.loc[(sessions.is_train==False) & (sessions.is_validation==False) & (sessions.action_type=='clickout item'),'step'].count())
    print(sessions.loc[(sessions.is_train==False) & (sessions.is_validation==False) & (sessions.action_type=='clickout item') & (sessions.reference.isnull()),'step'].count())
    
    print('Get possible classes')
#     classes=list(set(sessions.loc[(sessions.action_type=='clickout item') & ~(sessions.reference.isnull()),'reference']))
    classes=list(set(sessions.loc[(sessions.action_type=='clickout item') & ~(sessions.reference.isnull()) & (sessions.step>1),'reference']))
    
#     def get_classes(sessions):
#         refs=sessions.loc[(sessions.action_type=='clickout item') & ~(sessions.reference.isnull()),'reference'].tolist()

#         sessions['impressions']=sessions['impressions'].str.split('\\|')
#         l=sessions.loc[~(sessions.impressions.isnull()),'impressions'].tolist()

#         imp_flat = [item for sublist in l for item in sublist]
        
#         return list(set(refs+imp_flat))
    
#     classes = get_classes(sessions)

        
    
    print('Train encoders and save')
    encoders = {}
    for col in columns_to_encode:
        le = preprocessing.LabelEncoder()
        encoders[col]=le.fit(sessions[col])
        print(encoders[col].classes_)
    #     val_wide[col]=encoders[col].transform(val_wide[col])

    with open(config['le_pickle_path'], 'wb') as handle:
        pickle.dump(encoders, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # with open(config['le_pickle_path'], 'rb') as handle:
    #     b = pickle.load(handle)
    
    print('Get Splits')

    train = sessions.loc[(sessions.is_train==True) & (sessions.is_validation==False)] \
                        .drop(['impressions','prices','is_train','is_validation','target'],axis=1) \
                        .reset_index(drop=True)

    test = sessions.loc[sessions.is_train==False] \
                       .drop(['is_train','is_validation'],axis=1) \
                       .reset_index(drop=True)

    val = sessions.loc[(sessions.is_train==True) & (sessions.is_validation==True)] \
                       .drop(['is_train','is_validation'],axis=1) \
                       .reset_index(drop=True)

    print('train',train.shape)
    print('test',test.shape)
    print('val',val.shape)

    print('train',train.dtypes)
    print('test',test.dtypes)
    print('val',val.dtypes)


    print('Save either test or val')
    if config['use_validation']:
        val.to_pickle(config['val_pickle_path'])
    else:
        test.to_pickle(config['test_pickle_path'])


    print('delete session, test and val')
    try:
        del sessions, test, val
    except NameError:
        pass
    else:
        gc.collect()
     
    
    print('Calc. train_wide')
    train_wide = process_train_naives_bayes(data=train, metadata=meta, encoders=encoders, config=config)
    
    print('delete train')
    try:
        del train
    except NameError:
        pass
    else:
        gc.collect()
        
    if config['drop_no_references']:
        train_wide_not_allnull=train_wide.loc[train_wide.iloc[:,:config['session_length']].sum(axis=1)>0].copy()
    else:
        train_wide_not_allnull=train_wide.copy()

    try:
        del train_wide
    except NameError:
        pass
    else:
        gc.collect()
    
    print('save train_wide and classes')
    train_wide_not_allnull.to_pickle(config['train_wide_pickle_path'])
    
    with open(config['classes_pickle_path'], 'wb') as fp:
        pickle.dump(classes, fp)

else:
    print('loading train_wide, classes and encoders...')
    
    train_wide_not_allnull=pd.read_pickle(config['train_wide_pickle_path'])
    
    with open (config['classes_pickle_path'], 'rb') as fp:
        classes = pickle.load(fp)
    
    with open(config['le_pickle_path'], 'rb') as handle:
        encoders = pickle.load(handle)

    print('done')    

Getting sessions
Drop session with no clickout
user_id                         object
session_id                      object
timestamp          datetime64[ns, UTC]
step                             int64
action_type                     object
reference                       object
platform                        object
city                            object
device                          object
current_filters                 object
impressions                     object
prices                          object
is_validation                     bool
is_train                          bool
target                          object
dtype: object
Quick unit test
893499
893499
10336
0
0
19995
10336
59661
0
26345
12856
Get possible classes
Train encoders and save
['change of sort order' 'clickout item' 'filter selection'
 'interaction item deals' 'interaction item image' 'interaction item info'
 'interaction item rating' 'search for destination' 'search for item'
 'search for poi']
['AA' 'AE' 'AR

In [5]:
train_wide_not_allnull.shape

(41384, 160)

In [6]:
# if config['drop_no_references']:
#     train_wide_not_allnull=train_wide[(train_wide.iloc[:,0:((2*config['session_length'])-1)].T != 0).any()].copy()
# else:
#     train_wide_not_allnull=train_wide.copy()

# try:
#     del train_wide
# except NameError:
#     pass
# else:
#     gc.collect()

# train_wide_not_allnull.shape

In [7]:
train_wide_not_allnull.head()

Unnamed: 0_level_0,action_type|1,reference|1,y,1 Star,2 Star,3 Star,4 Star,5 Star,Accessible Hotel,Accessible Parking,...,Terrace (Hotel),Theme Hotel,Towels,Very Good Rating,Volleyball,Washing Machine,Water Slide,Wheelchair Accessible,WiFi (Public Areas),WiFi (Rooms)
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0017FIR55K7R_dbd605dbee1e5_3,1,2745276,2745276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0017FIR55K7R_dbd605dbee1e5_4,1,2745276,2745276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0017FIR55K7R_dbd605dbee1e5_5,1,2745276,2745276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0025B8BU0NYP_37bfe437f8b89_43,6,4773134,2829394,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
002BISXP1U1Q_8cd721ffb8e03_2,3,503071,503071,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


### Fit NB

In [8]:
cols_to_drop=([])
for i in range(config['session_length']):
    print('reference|' + str(i+1))
    cols_to_drop=np.append(cols_to_drop,'reference|' + str(i+1))
    if config['drop_action_type']:
        print('action_type|' + str(i+1))
        cols_to_drop=np.append(cols_to_drop,'action_type|' + str(i+1))
    

reference|1
action_type|1


In [9]:
if not config['load_fitted_model']:
    start = time.time()
    
    clf=BernoulliNB()
    
    
    for i in range(train_wide_not_allnull.shape[0]//config['train_session_chunksize']):
        print('%s of %s' % (i,train_wide_not_allnull.shape[0]//config['train_session_chunksize']))
        idxi = i*config['train_session_chunksize']
        idxf = ((i+1)*config['train_session_chunksize'])-1
        print([idxi,idxf])
        
        if i==0:
            print(train_wide_not_allnull.iloc[idxi:idxf,:].index[0])
            print(train_wide_not_allnull.iloc[idxi:idxf,:].index[-1])
            
            clf.partial_fit(train_wide_not_allnull.iloc[idxi:idxf,:] \
                        .drop(np.append(cols_to_drop,'y'),axis=1), \
                               np.ravel(train_wide_not_allnull.iloc[idxi:idxf,:].y), \
                               classes = classes)
            
        else:
            print(train_wide_not_allnull.iloc[idxi:idxf,:].index[0])
            print(train_wide_not_allnull.iloc[idxi:idxf,:].index[-1])
            
            clf.partial_fit(train_wide_not_allnull.iloc[idxi:idxf,:] \
                        .drop(np.append(cols_to_drop,'y'),axis=1), \
                               np.ravel(train_wide_not_allnull.iloc[idxi:idxf,:].y))
            
    
    if train_wide_not_allnull.shape[0]%config['train_session_chunksize']!=0:
        if train_wide_not_allnull.shape[0]//config['train_session_chunksize']==0:
            clf.partial_fit(train_wide_not_allnull \
                        .drop(np.append(cols_to_drop,'y'),axis=1), \
                               np.ravel(train_wide_not_allnull.y), \
                               classes = classes)
            
        else:
            idx = train_wide_not_allnull.shape[0]%config['train_session_chunksize']

            print(train_wide_not_allnull.iloc[-idx:,:].index[0])

            clf.partial_fit(train_wide_not_allnull.iloc[-idx:,:] \
                                .drop(np.append(cols_to_drop,'y'),axis=1), \
                                       np.ravel(train_wide_not_allnull.iloc[-idx:,:].y))

    stop = time.time()
    print('Seconds elapsed  :', stop-start) 
    
    # Output a pickle file for the model
    joblib.dump(clf, config['model_pickle_path'])
    
    del train_wide_not_allnull
    
else:
    clf=joblib.load(config['model_pickle_path'])

0 of 4
[0, 9999]
0017FIR55K7R_dbd605dbee1e5_3
7CCX84QYA7RZ_7d1088fa7e2c3_38
1 of 4
[10000, 19999]
7CE0S4E64ZMZ_56e01e69d5018_3
FPYETWGI3KT3_3749da5becc70_65
2 of 4
[20000, 29999]
FPYETWGI3KT3_3749da5becc70_85
P3JCVZFA5Q38_74cc0ced1d42b_2
3 of 4
[30000, 39999]
P3OV66293CG3_ff507ffe93f73_35
YMW2YM5X5O6E_739781d4f77f4_7
YMW2YM5X5O6E_739781d4f77f4_9
Seconds elapsed  : 73.2729127407074


# Train model on test data clickouts that have a reference

In [10]:
if config['train_model_on_test_data']:
    
    if config['use_validation']:
        data=pd.read_pickle(config['val_pickle_path'])
    else:
        data=pd.read_pickle(config['test_pickle_path'])
    
    # remove empty reference rows
    data=data.loc[~(data.reference.isnull())]
    train_wide=process_train_naives_bayes(data=data, metadata=meta, encoders=encoders, config=config)
    
    try:
        del data
    except NameError:
        pass
    else:
        gc.collect()
    
    if config['drop_no_references']:
        train_wide_not_allnull=train_wide.loc[train_wide.iloc[:,:config['session_length']].sum(axis=1)>0].copy()
    else:
        train_wide_not_allnull=train_wide.copy()
        
    try:
        del train_wide
    except NameError:
        pass
    else:
        gc.collect()
    
    start = time.time()
    
    for i in range(train_wide_not_allnull.shape[0]//config['train_session_chunksize']):
        print('%s of %s' % (i,train_wide_not_allnull.shape[0]//config['train_session_chunksize']))
        idxi = i*config['train_session_chunksize']
        idxf = ((i+1)*config['train_session_chunksize'])-1
        print([idxi,idxf])
        
        if i==0:
            print(train_wide_not_allnull.iloc[idxi:idxf,:].index[0])
            print(train_wide_not_allnull.iloc[idxi:idxf,:].index[-1])
            
            clf.partial_fit(train_wide_not_allnull.iloc[idxi:idxf,:] \
                        .drop(np.append(cols_to_drop,'y'),axis=1), \
                               np.ravel(train_wide_not_allnull.iloc[idxi:idxf,:].y), \
                               classes = classes)
            
        else:
            print(train_wide_not_allnull.iloc[idxi:idxf,:].index[0])
            print(train_wide_not_allnull.iloc[idxi:idxf,:].index[-1])
            
            clf.partial_fit(train_wide_not_allnull.iloc[idxi:idxf,:] \
                        .drop(np.append(cols_to_drop,'y'),axis=1), \
                               np.ravel(train_wide_not_allnull.iloc[idxi:idxf,:].y))
            
    
    if train_wide_not_allnull.shape[0]%config['train_session_chunksize']!=0:
        if train_wide_not_allnull.shape[0]//config['train_session_chunksize']==0:
            clf.partial_fit(train_wide_not_allnull \
                        .drop(np.append(cols_to_drop,'y'),axis=1), \
                               np.ravel(train_wide_not_allnull.y), \
                               classes = classes)
            
        else:
            idx = train_wide_not_allnull.shape[0]%config['train_session_chunksize']

            print(train_wide_not_allnull.iloc[-idx:,:].index[0])

            clf.partial_fit(train_wide_not_allnull.iloc[-idx:,:] \
                                .drop(np.append(cols_to_drop,'y'),axis=1), \
                                       np.ravel(train_wide_not_allnull.iloc[-idx:,:].y))

    stop = time.time()
    print('Seconds elapsed  :', stop-start) 
    
#     # Output a pickle file for the model
#     joblib.dump(clf, config['model_pickle_path'])
    
    del train_wide_not_allnull


Seconds elapsed  : 12.552316665649414


get test data in wide format

## split for partial fit

### PREDICT

In [11]:
if config['use_validation']:
    data=pd.read_pickle(config['val_pickle_path'])
else:
    data=pd.read_pickle(config['test_pickle_path'])

n_clickouts_test = data.loc[(data.action_type=='clickout item') & (data.reference.isnull()),'step'].count()

val_wide = process_test_naives_bayes(data=data, metadata=meta, encoders=encoders, config=config)
val_wide=val_wide.loc[val_wide.impressions!=0]


try:
    del data
except NameError:
    pass
else:
    gc.collect()

if config['drop_no_references']:#
    val_wide_not_allnull=val_wide.loc[val_wide.iloc[:,:config['session_length']].sum(axis=1)>0].copy()
else:
    val_wide_not_allnull=val_wide.copy()

val_wide_not_allnull['impressions']=val_wide_not_allnull['impressions'].str.split('\\|')

try:
    del val_wide
except NameError:
    pass
else:
    gc.collect()

Create long table for results

In [12]:
val_imp=val_wide_not_allnull[['impressions','target']].reset_index().copy()

val_imp_long=val_imp.impressions.apply(pd.Series) \
    .merge(val_imp, right_index = True, left_index = True) \
    .drop(["impressions"], axis = 1)  \
    .melt(id_vars = ['key','target'], value_name = "impressions") \
    .drop("variable", axis = 1) \
    .dropna() \
    .sort_values('key') \
    .reset_index(drop=True) \
    .copy()

try:
    del val_imp
except NameError:
    pass
else:
    gc.collect()

Get row, col indexes for extracting probabilities from pred

In [13]:
row_index=val_imp_long['key'].copy()
col_index=val_imp_long['impressions'].copy()

mask_in_class = col_index.isin(clf.classes_) #True is in class
col_index.loc[~mask_in_class]='not_in_class'

Do predict and munge output on the fly

In [14]:
print(val_wide_not_allnull.shape)

n_splits = round(val_wide_not_allnull.shape[0]/config['parts_nrows_test'])
if n_splits==0:
    n_splits=1
    index_split = val_wide_not_allnull.shape[0]
else:
    index_split = round(val_wide_not_allnull.shape[0]/n_splits)
    
print(index_split)

start = time.time()
impression_probs=([])
for i in range(n_splits): # n_splits
    print('%s of %s' % (i,n_splits))
    if (i+1)==n_splits:
        pred=clf.predict_proba(val_wide_not_allnull.iloc[(index_split*(i)):,:] \
                                         .drop(np.append(cols_to_drop,['timestamp', 'impressions','target']),axis=1))
        
        preddf=pd.DataFrame(data=pred,columns=clf.classes_,index=val_wide_not_allnull.iloc[(index_split*(i)):,:].index)
        preddf['not_in_class']=0 # need to return 0 in case impression not in clf.classes_
        
        first_key_last_pred=val_wide_not_allnull.iloc[(index_split*(i)):,:].head(1).index
        first_index_for_row_col=val_imp_long.loc[val_imp_long.key.isin(first_key_last_pred)].index.min()

        last_key_last_pred=val_wide_not_allnull.iloc[(index_split*(i)):,:].tail(1).index
        last_index_for_row_col=val_imp_long.loc[val_imp_long.key.isin(last_key_last_pred)].index.max()

        row_index[first_index_for_row_col:(last_index_for_row_col+1)]
        
        impression_probs=np.append(impression_probs,preddf.lookup(row_index[first_index_for_row_col:(last_index_for_row_col+1)], \
                                              col_index[first_index_for_row_col:(last_index_for_row_col+1)]),axis=0)
        
    else:
        pred=clf.predict_proba(val_wide_not_allnull.iloc[(index_split*(i)):(index_split*(i+1)),:] \
                                         .drop(np.append(cols_to_drop,['timestamp', 'impressions','target']),axis=1))
        
        
        preddf=pd.DataFrame(data=pred,columns=clf.classes_,index=val_wide_not_allnull.iloc[(index_split*(i)):(index_split*(i+1)),:].index)
        preddf['not_in_class']=0 # need to return 0 in case impression not in clf.classes_
        
        first_key_last_pred=val_wide_not_allnull.iloc[(index_split*(i)):(index_split*(i+1)),:].head(1).index
        first_index_for_row_col=val_imp_long.loc[val_imp_long.key.isin(first_key_last_pred)].index.min()

        last_key_last_pred=val_wide_not_allnull.iloc[(index_split*(i)):(index_split*(i+1)),:].tail(1).index
        last_index_for_row_col=val_imp_long.loc[val_imp_long.key.isin(last_key_last_pred)].index.max()

        row_index[first_index_for_row_col:(last_index_for_row_col+1)]
        
        impression_probs=np.append(impression_probs,preddf.lookup(row_index[first_index_for_row_col:(last_index_for_row_col+1)], \
                                              col_index[first_index_for_row_col:(last_index_for_row_col+1)]),axis=0)
        
stop = time.time()
print('Seconds elapsed  :', stop-start) 

(6549, 162)
6549
0 of 1
Seconds elapsed  : 17.109497547149658


Munge to get rank and RR

In [15]:
val_imp_long['probs'] = impression_probs

val_imp_long.sort_values(['key','probs'],ascending=[True,False],inplace=True)

val_imp_long['rank'] = 1
val_imp_long['rank'] = val_imp_long.groupby('key')['rank'].cumsum()

val_imp_long['RR']=1/val_imp_long['rank']

val_imp_long.to_csv(config['val_long_csv_path'])

get output file

In [16]:
rank_imp_wide=val_imp_long.pivot(index='key',columns='rank',values=['impressions','probs']).copy()

# collapse column multi index for ease of indexing
rank_imp_wide.columns=rank_imp_wide.columns.map(lambda x: '|'.join([str(i) for i in x]))
    
rank_imp_wide=rank_imp_wide.join(val_wide_not_allnull['timestamp'], on='key')

rank_imp_wide['timestamp']=rank_imp_wide['timestamp'].astype(np.int64)//10**9

rank_imp_wide.reset_index(inplace=True)

rank_imp_wide[['user_id','session_id','step']]=rank_imp_wide['key'].str.split('_',expand=True)

rank_imp_wide.drop(['key'],axis=1,inplace=True)

rank_imp_wide['item_recommendations'] = rank_imp_wide.iloc[:,:25].apply(
    lambda x: ' '.join(x.dropna()),
    axis=1
)

rank_imp_wide.drop(rank_imp_wide.columns[:25],axis=1,inplace=True)

rank_imp_wide['item_probs'] = rank_imp_wide.iloc[:,:25].apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1
)

rank_imp_wide.drop(rank_imp_wide.columns[:25],axis=1,inplace=True)

cols=rank_imp_wide.columns.tolist()
cols=cols[1:3]+cols[:1]+cols[3:4]+cols[-2:]
rank_imp_wide=rank_imp_wide[cols]

rank_imp_wide.head()

rank_imp_wide.to_csv(config['output_meta_only_fiited_csv_path'],index=False)

In [17]:
if config['use_validation']:
    MRR_NB=val_imp_long.loc[(val_imp_long.target.astype(int)==val_imp_long.impressions.astype(int)) \
                           & (val_imp_long.probs!=0.0),'RR'].mean()
    MRR_parse_imp=val_imp_long.loc[(val_imp_long.target.astype(int)==val_imp_long.impressions.astype(int)),'RR'].mean()
    
    print(MRR_NB)
    print(MRR_parse_imp)
    print('support MRR_NB: ', str(val_imp_long.loc[(val_imp_long.target.astype(int)==val_imp_long.impressions.astype(int)) & (val_imp_long.probs!=0.0),'RR'].count()))
    print('support MRR_NB: ', str(val_imp_long.loc[(val_imp_long.target.astype(int)==val_imp_long.impressions.astype(int)),'RR'].count()))
    print(n_clickouts_test)

0.6255510502812318
0.3352119653916944
support MRR_NB:  2793
support MRR_NB:  6544
10336


In [18]:
threshold=0.1

sum_probs=val_imp_long.groupby('key').probs.sum()
val_imp_sum=val_imp_long.join(sum_probs,on='key',rsuffix='_sum')
print(val_imp_sum.loc[(val_imp_sum.probs_sum>threshold) & (val_imp_sum.target.astype(int)==val_imp_sum.impressions.astype(int))].RR.mean())
print(val_imp_sum.loc[(val_imp_sum.probs_sum>threshold) & (val_imp_sum.target.astype(int)==val_imp_sum.impressions.astype(int))].RR.count())

0.6155831391969003
1261


# Add lost sessions back
Load test data

In [19]:
if config['drop_no_references']:
    if config['use_validation']:
        data=pd.read_pickle(config['val_pickle_path'])
    else:
        data=pd.read_pickle(config['test_pickle_path'])
        
    


    val_wide = process_test_naives_bayes(data=data, metadata=meta, encoders=encoders, config=config)
    val_wide=val_wide.loc[val_wide.impressions!=0]

    try:
        del test
    except NameError:
        pass
    else:
        gc.collect()


    val_wide_allnull=val_wide.loc[val_wide.iloc[:,:config['session_length']].sum(axis=1)==0].copy()
    
    try:
        del val_wide
    except NameError:
        pass
    else:
        gc.collect()

Munge into same format

In [20]:
columns_ordered = ['user_id', 'session_id', 'timestamp', 'step', 'item_recommendations',  'item_probs']

val_wide_allnull=val_wide_allnull[['timestamp','impressions']]
val_wide_allnull['timestamp']=val_wide_allnull['timestamp'].astype(np.int64)//10**9
val_wide_allnull.reset_index(inplace=True)
val_wide_allnull[['user_id','session_id','step']]=val_wide_allnull['key'].str.split('_',expand=True)
val_wide_allnull.drop(['key'],axis=1,inplace=True)
val_wide_allnull['item_probs']=val_wide_allnull['impressions'].str.replace('[0-9]+\\||[0-9]+','0 ')
val_wide_allnull['impressions']=val_wide_allnull['impressions'].str.replace('\\|',' ')
val_wide_allnull.rename(columns={'impressions':'item_recommendations'},inplace=True)


columns_ordered = ['user_id', 'session_id', 'timestamp', 'step', 'item_recommendations',  'item_probs']
val_wide_allnull=val_wide_allnull.reindex(columns=columns_ordered)

val_wide_allnull.head()



Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations,item_probs
0,00M5AMMLYQG5,8fd417ebd2d5b,1541062793,1,9882016 106769 48219 147360 7333050 3533848 75...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,00WWQYCIDPUF,a8962895bad41,1541064252,6,7929996 8767518 5742058 7195114 7325018 196079...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,0168S4C5E60K,016150a298e6e,1541063179,9,42109 42127 42270 137704 1577271 1750769 42146...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3,01BS6LPQ5JTX,2a35cec32ede2,1541063117,1,127745 43084 1257131 1797067 1848773 5452918 4...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,01MTZ6S2OHUA,2d123642df132,1541059612,10,1277246 1341812 1550767 1388724 2187086 124244...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


append with NB fitted test data

In [21]:
output=rank_imp_wide.append(val_wide_allnull,ignore_index=True,sort=False)

save submission

In [22]:
output.drop('item_probs',axis=1).to_csv(config['output_recsys_csv_path'],index=False)
output.to_csv(config['output_meta_csv_path'],index=False)

In [23]:
output.shape

(10336, 6)

In [24]:
val_wide_allnull.shape

(3787, 6)

In [25]:
rank_imp_wide.shape

(6549, 6)

In [26]:
val_wide_allnull.shape[0]+rank_imp_wide.shape[0]-n_clickouts_test

0

In [27]:
n_clickouts_test-output.shape[0]

0