In [63]:
import pandas as pd
import numpy as np
import gc
import pickle

from sklearn.naive_bayes import *
from sklearn import preprocessing
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone package; fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import time

# %% Imports
from pathlib import Path
import sys
# Project helpers live in <working directory>/src; make them importable.
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *


In [64]:
# Run configuration. Every later cell reads its switches and paths from this dict.
config= {
    # True: rebuild the train/val/test splits and encoders and write them to the cache.
    # False: reload the cached train CSV and encoders instead.
    'save_train_test_val': True,
    # True: load the pickled classifier instead of fitting it.
    'load_fitted_model': False,

    # The next six values are passed straight to get_sessions().
    'use_subset': True,
    'subset_frac': 0.05,
    'use_validation': True,
    'validation_frac': 0.25,
    'reference_to_nan_frac': 1,
    'reference_to_nan_seed': 1234,

    # Number of 'reference|i' / 'action_type|i' feature columns per row.
    'session_length': 1,
    # True: rows whose leading feature columns are all 0 are kept out of fit/predict.
    'drop_no_references': True,
    # True: keep fitting on the val/test rows that still carry a reference.
    'train_model_on_test_data': True,
    'add_prices': False,
    'add_hour': False,
    'cols_to_append': [],#['platform','city','device'],
    # True: 'action_type|i' columns are dropped before fit/predict.
    'drop_action_type': True,

    # Sessions per chunk when streaming the train CSV; rows per predict_proba batch.
    'train_session_chunksize': 15000,
    'parts_nrows_test': 10000,
    'data_path': root_dir / 'cache'
    }


if not config['use_subset']:
    config['subset_frac']=1

# Every cache artefact shares the prefix NB_data_<subset percent, 3 digits>_<session_length>.
# round() rather than int(): 100*frac can land just below the intended integer
# (100*0.57 == 56.99999999999999), which int() would truncate to the wrong percentage.
_subset_pct = str(round(100*config['subset_frac'])).zfill(3)
_cache_prefix = 'NB_data_' + _subset_pct + '_' + str(config['session_length'])

_cache_files = [
    ('le_pickle_path', _cache_prefix + '_le.pickle'),
    ('train_csv_path', _cache_prefix + '_train.csv'),
    ('train_last_step_csv_path', _cache_prefix + '_train_last_step.csv'),
    ('test_last_step_csv_path', _cache_prefix + '_test_last_step.csv'),
    ('test_csv_path', _cache_prefix + '_test.csv'),
    ('val_csv_path', _cache_prefix + '_val.csv'),
    # The price table is shared by all runs, so it carries no run prefix.
    ('prices_pickle_path', 'mean_prices.pickle'),
    ('model_pickle_path', _cache_prefix + '_model.pickle'),
    ('val_long_csv_path', _cache_prefix + '_val_long.csv'),
    ('output_recsys_csv_path', _cache_prefix + '_output_recsys.csv'),
    ('output_meta_csv_path', _cache_prefix + '_output_meta.csv'),
]
for _key, _filename in _cache_files:
    config[_key] = config['data_path'] / _filename


In [65]:
# Echo the configured session length (it is also part of every cache file name).
config['session_length']

1

In [66]:
# Load the item metadata through the project helper; it is later handed to the
# process_*_naives_bayes() calls as `metadata`.
meta=get_metadata()
# meta.dtypes

# Store item ids as strings (presumably so they match the string-typed item
# references in the session data -- TODO confirm against get_metadata()).
meta['item_id']=meta['item_id'].astype(str)

In [67]:
# Build (or reload) the training split and the label encoders.
#
# Names this cell leaves behind for later cells:
#   encoders          dict column -> fitted LabelEncoder
#   classes           every non-null clickout reference (label set for partial_fit)
#   n_clickouts_test  number of clickouts whose reference has to be predicted
#   names             column names of the train CSV (used when re-reading it in chunks)
#
# `classes` and `n_clickouts_test` can only be computed from the full session table,
# so they are persisted next to the encoders and reloaded in the "load" branch.
# Before, that branch left them undefined and later cells died with a NameError.
split_meta_pickle_path = config['data_path'] / config['le_pickle_path'].name.replace('_le.pickle', '_split_meta.pickle')

if config['save_train_test_val']:
    print('Getting sessions')
    sessions=get_sessions(config['use_subset'],
                          config['subset_frac'],
                          config['use_validation'],
                          config['validation_frac'],
                          config['reference_to_nan_frac'],
                          config['reference_to_nan_seed'])

    print('Filter session with no clickout')
    # Plain bools: use `and`, not the bitwise `&`.
    if (not config['use_validation']) and (not config['use_subset']):
        print('filtering sessions with clickout')
        sessions=filter_sessions_with_no_clicks(sessions)

    columns_to_encode = ['action_type','platform','city','device']

    if config['add_prices']:
        print('adding prices')
        prices=pd.read_pickle(config['prices_pickle_path'])
        sessions=sessions.join(prices.set_index('reference'),on='reference')

    if config['add_hour']:
        sessions['hour']=sessions.timestamp.dt.hour.astype(str)
        sessions['city_hour']=sessions['city'] + sessions['hour']
        sessions.drop('hour',axis=1,inplace=True)
        columns_to_encode=np.append(columns_to_encode,['city_hour'])
        # Guard so that re-running the cell does not append 'city_hour' twice.
        if 'city_hour' not in list(config['cols_to_append']):
            config['cols_to_append']=np.append(config['cols_to_append'],['city_hour'])

    print(sessions.dtypes)

    print('Quick unit test')

    print(len(sessions.index))
    print(len(sessions.index.unique()))

    if not config['use_validation']:
        sessions['is_validation']=False
        sessions['target']=np.nan  # np.NaN alias was removed in NumPy 2.0

    # Row masks shared by the sanity prints, the label set and the splits below.
    is_val_split = (sessions.is_train==True) & (sessions.is_validation==True)
    is_train_split = (sessions.is_train==True) & (sessions.is_validation==False)
    is_test_split = (sessions.is_train==False) & (sessions.is_validation==False)
    is_any_val = sessions.is_validation==True
    is_clickout = sessions.action_type=='clickout item'
    ref_missing = sessions.reference.isnull()

    # Non-null targets per split.
    print(sessions.loc[is_val_split,'target'].count())
    print(sessions.loc[is_train_split,'target'].count())
    print(sessions.loc[is_test_split,'target'].count())

    # Clickouts per split, then those with a missing reference.
    print(sessions.loc[is_any_val & is_clickout,'step'].count())
    print(sessions.loc[is_any_val & is_clickout & ref_missing,'step'].count())

    print(sessions.loc[is_train_split & is_clickout,'step'].count())
    print(sessions.loc[is_train_split & is_clickout & ref_missing,'step'].count())

    print(sessions.loc[is_test_split & is_clickout,'step'].count())
    print(sessions.loc[is_test_split & is_clickout & ref_missing,'step'].count())

    if config['use_validation']:
        n_clickouts_test = sessions.loc[is_any_val & is_clickout & ref_missing,'step'].count()
    else:
        n_clickouts_test = sessions.loc[is_test_split & is_clickout & ref_missing,'step'].count()

    # Get possible classes: every item that was ever clicked out with a known reference.
    classes=list(set(sessions.loc[is_clickout & ~ref_missing,'reference']))

    print('Train encoders and save')
    encoders = {}
    for col in columns_to_encode:
        encoders[col]=preprocessing.LabelEncoder().fit(sessions[col])
        print(encoders[col].classes_)

    with open(config['le_pickle_path'], 'wb') as handle:
        pickle.dump(encoders, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(split_meta_pickle_path, 'wb') as handle:
        pickle.dump({'classes': classes, 'n_clickouts_test': n_clickouts_test},
                    handle, protocol=pickle.HIGHEST_PROTOCOL)

    print('Get Splits')

    train = sessions.loc[is_train_split] \
                        .drop(['impressions','prices','is_train','is_validation','target'],axis=1) \
                        .reset_index(drop=True)

    # Note: the test split is every non-train row, whatever its validation flag.
    test = sessions.loc[sessions.is_train==False] \
                       .drop(['is_train','is_validation'],axis=1) \
                       .reset_index(drop=True)

    val = sessions.loc[is_val_split] \
                       .drop(['is_train','is_validation'],axis=1) \
                       .reset_index(drop=True)

    print('train',train.shape)
    print('test',test.shape)
    print('val',val.shape)

    print('train',train.dtypes)
    print('test',test.dtypes)
    print('val',val.dtypes)

    print('Save either test or val')
    if config['use_validation']:
        val.to_csv(config['val_csv_path'])
    else:
        test.to_csv(config['test_csv_path'])

    print('delete session, test and val')
    # Free the big frames (and the masks built on them) before writing train.
    del sessions, test, val
    del is_val_split, is_train_split, is_test_split, is_any_val, is_clickout, ref_missing
    gc.collect()

    print('save train')
    train.to_csv(config['train_csv_path'])

else:
    print('loading train and encoders...')

    # The CSV was written with its index as first column; read it back as the index.
    train=pd.read_csv(config['train_csv_path'],index_col=0)

    # NOTE: pickle.load can execute arbitrary code. Only load cache files that this
    # notebook wrote itself.
    with open(config['le_pickle_path'], 'rb') as handle:
        encoders = pickle.load(handle)

    if split_meta_pickle_path.exists():
        with open(split_meta_pickle_path, 'rb') as handle:
            split_meta = pickle.load(handle)
        classes = split_meta['classes']
        n_clickouts_test = split_meta['n_clickouts_test']
    else:
        # Cache written by an older version of this cell: the values cannot be rebuilt
        # from the train split alone. Say so now instead of failing much later.
        print('WARNING: %s not found; `classes` and `n_clickouts_test` are undefined. '
              'Re-run once with save_train_test_val=True before fitting or scoring.'
              % split_meta_pickle_path)

    print('done')


# Highest step per session, in file order. The fit cell sums these per chunk to
# decide how many rows of the train CSV to read.
train.groupby('session_id',sort=False)['step'].max().reset_index().to_csv(config['train_last_step_csv_path'])

names=train.columns


print('delete train')
del train
gc.collect()

Getting sessions
Filter session with no clickout
user_id                         object
session_id                      object
timestamp          datetime64[ns, UTC]
step                             int64
action_type                     object
reference                       object
platform                        object
city                            object
device                          object
current_filters                 object
impressions                     object
prices                          object
is_validation                     bool
is_train                          bool
target                          object
dtype: object
Quick unit test
893499
893499
10336
0
0
19995
10336
59661
0
26345
12856
Train encoders and save
['change of sort order' 'clickout item' 'filter selection'
 'interaction item deals' 'interaction item image' 'interaction item info'
 'interaction item rating' 'search for destination' 'search for item'
 'search for poi']
['AA' 'AE' 'AR' 'AT' 'AU' 'BE' 'B

In [68]:
# Collect the per-step feature columns that are removed before fit/predict:
# always 'reference|<step>', plus 'action_type|<step>' when drop_action_type is set.
cols_to_drop = ([])
for step_no in range(1, config['session_length'] + 1):
    step_columns = ['reference|' + str(step_no)]
    if config['drop_action_type']:
        step_columns.append('action_type|' + str(step_no))
    for column_name in step_columns:
        print(column_name)
        cols_to_drop = np.append(cols_to_drop, column_name)
    

reference|1
action_type|1


### Fit NB

In [69]:
# Fit BernoulliNB incrementally over the train CSV, or load a previously fitted model.
#
# The last-step CSV is streamed in chunks of `train_session_chunksize` sessions. The
# sum of a chunk's max steps is used as the number of train rows to read next, which
# assumes each session occupies exactly max(step) rows in the train CSV.
if not config['load_fitted_model']:
    start = time.time()
    reader=pd.read_csv(config['train_last_step_csv_path'], chunksize=config['train_session_chunksize'])

    skiprows=0
    clf=BernoulliNB()
    fit_cols_to_drop=np.append(cols_to_drop,'y')  # loop-invariant
    # partial_fit needs the full label set on its first call. The old `i==0` test
    # broke when the first chunk had no usable rows: the first real call then came
    # without `classes` and raised.
    first_fit_pending=True
    train_wide_not_allnull=None  # keeps the final `del` valid if there are no chunks
    for chunk in reader:
        nrows=chunk.step.sum()
        print('Get Train & process')
        # With skiprows > 0 the line consumed as "header" is the last data row of the
        # previous block, so blocks neither overlap nor leave gaps.
        train_part=pd.read_csv(config['train_csv_path'],header=0,skiprows=skiprows,nrows=nrows,names=names)

        train_wide = process_train_naives_bayes(data=train_part, metadata=meta, encoders=encoders, config=config)

        del train_part

        if config['drop_no_references']:
            # Keep rows with a non-zero value in the leading 2*session_length-1 columns
            # (presumably the encoded reference/action columns -- TODO confirm against
            # process_train_naives_bayes).
            has_reference=(train_wide.iloc[:,0:((2*config['session_length'])-1)] != 0).any(axis=1)
            train_wide_not_allnull=train_wide[has_reference].copy()
        else:
            train_wide_not_allnull=train_wide.copy()

        del train_wide

        print(train_wide_not_allnull.shape)

        if not train_wide_not_allnull.empty:
            fit_kwargs={'classes': classes} if first_fit_pending else {}
            clf.partial_fit(train_wide_not_allnull.drop(fit_cols_to_drop,axis=1),
                            np.ravel(train_wide_not_allnull.y),
                            **fit_kwargs)
            first_fit_pending=False

        skiprows += nrows

    stop = time.time()
    print('Seconds elapsed  :', stop-start) 

    # Output a pickle file for the model
    joblib.dump(clf, config['model_pickle_path'])

    del train_wide_not_allnull

else:
    clf=joblib.load(config['model_pickle_path'])

Get Train & process
(19745, 160)
Get Train & process
(20234, 160)
Get Train & process
(1396, 160)
Seconds elapsed  : 249.72301602363586


# Train model on test data clickouts that have a reference

In [70]:
# Optionally keep training the classifier on the validation/test rows that still
# have a reference, using the same chunked read as the main fit.
if config['train_model_on_test_data']:

    if config['use_validation']:
        test_path=config['val_csv_path']
    else:
        test_path=config['test_csv_path']
    data=pd.read_csv(test_path,parse_dates=['timestamp'])

    # Includes the 'Unnamed: 0' index column written by to_csv.
    names2=data.columns

    # Highest step per session, in file order; drives the chunk sizes below.
    data.groupby('session_id',sort=False)['step'].max().reset_index().to_csv(config['test_last_step_csv_path'])

    # The full frame is not needed during the chunk loop; release it first.
    del data
    gc.collect()

    start = time.time()
    reader=pd.read_csv(config['test_last_step_csv_path'], chunksize=config['train_session_chunksize'])

    skiprows=0
    fit_cols_to_drop=np.append(cols_to_drop,'y')  # loop-invariant

    for chunk in reader:
        nrows=chunk.step.sum()
        train_part=pd.read_csv(test_path,header=0,skiprows=skiprows,nrows=nrows,names=names2)

        # Only rows with a known reference can be used as training examples. Filter
        # and drop in one chain, which avoids an in-place drop on a .loc slice.
        train_part=train_part.loc[~(train_part.reference.isnull())] \
                             .drop(['impressions','prices','target'],axis=1)

        train_wide = process_train_naives_bayes(data=train_part, metadata=meta, encoders=encoders, config=config)

        del train_part

        if config['drop_no_references']:
            has_reference=(train_wide.iloc[:,0:((2*config['session_length'])-1)] != 0).any(axis=1)
            train_wide_not_allnull=train_wide[has_reference].copy()
        else:
            train_wide_not_allnull=train_wide.copy()


        del train_wide

        print(train_wide_not_allnull.shape)

        if not train_wide_not_allnull.empty:
            # `classes` is not passed: clf is expected to be fitted or loaded already.
            clf.partial_fit(train_wide_not_allnull.drop(fit_cols_to_drop,axis=1),
                            np.ravel(train_wide_not_allnull.y))

        skiprows += nrows

    stop = time.time()
    print('Seconds elapsed  :', stop-start) 

    # Output a pickle file for the model
    joblib.dump(clf, config['model_pickle_path'])
    
#     del train_wide_not_allnull


(7168, 160)
Seconds elapsed  : 20.790571212768555


Get test data in wide format

## split for partial fit

### PREDICT

In [71]:
# Load the evaluation split (validation or test), convert it to the wide feature
# layout and keep only the rows the model will score.
eval_csv_path = config['val_csv_path'] if config['use_validation'] else config['test_csv_path']
data = pd.read_csv(eval_csv_path, parse_dates=['timestamp'])
data.drop(['Unnamed: 0'], axis=1, inplace=True)

val_wide = process_test_naives_bayes(data=data, metadata=meta, encoders=encoders, config=config)
# Rows whose impressions value equals 0 are discarded.
has_impressions = val_wide.impressions != 0
val_wide = val_wide.loc[has_impressions]

# Drop a stale `test` frame if one is still around from an earlier cell.
try:
    del test
except NameError:
    pass
else:
    gc.collect()

if config['drop_no_references']:
    # Keep rows with at least one non-zero value in the leading feature columns.
    n_lead_cols = (2*config['session_length'])-1
    row_has_reference = (val_wide.iloc[:, 0:n_lead_cols].T != 0).any()
    val_wide_not_allnull = val_wide.loc[row_has_reference].copy()
else:
    val_wide_not_allnull = val_wide.copy()

# Turn the pipe-separated impressions string into a list of item ids.
val_wide_not_allnull['impressions'] = val_wide_not_allnull['impressions'].str.split('\\|')

# val_wide is always bound here, so it can be released unconditionally.
del val_wide
gc.collect()

Create long table for results

In [72]:
# One row per (key, displayed item): explode the impressions lists into long format.
val_imp=val_wide_not_allnull[['impressions','target']].reset_index().copy()

# melt() emits rows position by position (all first impressions, then all second
# ones, ...). The sort by key must therefore be stable to keep each key's
# impressions in display order. The default quicksort is not stable, and the display
# order is what remains when items tie on probability. NOTE(review): dropna() also
# removes rows with a missing target -- confirm that is intended when there is no
# validation target.
val_imp_long=val_imp.impressions.apply(pd.Series) \
    .merge(val_imp, right_index = True, left_index = True) \
    .drop(["impressions"], axis = 1)  \
    .melt(id_vars = ['key','target'], value_name = "impressions") \
    .drop("variable", axis = 1) \
    .dropna() \
    .sort_values('key', kind='mergesort') \
    .reset_index(drop=True)

# val_imp is always bound here; release it.
del val_imp
gc.collect()

Get row, col indexes for extracting probabilities from pred

In [73]:
# Per long-table row: the prediction row to read (key) and the class column to read
# (the item id). Items the classifier never saw are pointed at the placeholder
# column 'not_in_class'.
row_index = val_imp_long['key'].copy()

mask_in_class = val_imp_long['impressions'].isin(clf.classes_) #True is in class
col_index = val_imp_long['impressions'].where(mask_in_class, 'not_in_class')

Do predict and munge output on the fly

In [74]:
# Score the evaluation rows in batches and, for every (key, item) row of
# val_imp_long, pick that item's class probability out of the batch prediction.
print(val_wide_not_allnull.shape)

n_rows = val_wide_not_allnull.shape[0]
n_splits = round(n_rows/config['parts_nrows_test'])
if n_splits==0:
    n_splits=1
    index_split = n_rows
else:
    index_split = round(n_rows/n_splits)

print(index_split)

predict_cols_to_drop = np.append(cols_to_drop,['timestamp', 'impressions','target'])
class_index = pd.Index(clf.classes_)

# Filled in place, aligned with the rows of val_imp_long. NaN marks "not scored".
impression_probs = np.full(len(val_imp_long), np.nan)

for i in range(n_splits): # n_splits
    print('%s of %s' % (i,n_splits))
    lo = index_split*i
    hi = None if (i+1)==n_splits else index_split*(i+1)  # last batch takes the remainder
    val_part = val_wide_not_allnull.iloc[lo:hi,:]

    pred = clf.predict_proba(val_part.drop(predict_cols_to_drop,axis=1))

    # Select the long-table rows of this batch by key membership. The old positional
    # range needed both tables to share the same key order.
    in_part = row_index.isin(val_part.index).values

    # Replacement for DataFrame.lookup (removed in pandas 2.0): translate labels to
    # positions and index the probability matrix directly.
    row_pos = val_part.index.get_indexer(row_index[in_part])
    col_pos = class_index.get_indexer(col_index[in_part])

    part_probs = pred[row_pos, col_pos]
    # Items unknown to the classifier ('not_in_class' gives position -1) get 0.
    part_probs[col_pos < 0] = 0.0

    impression_probs[in_part] = part_probs

(6549, 162)
6549
0 of 1


Munge to get rank and RR

In [75]:
# Attach the probabilities, order each key's items from most to least likely,
# number them 1..n within the key and derive the reciprocal rank.
val_imp_long['probs'] = impression_probs

val_imp_long = val_imp_long.sort_values(['key', 'probs'], ascending=[True, False])

# Running position inside each key, starting at 1.
val_imp_long['rank'] = val_imp_long.groupby('key').cumcount() + 1

val_imp_long['RR'] = 1 / val_imp_long['rank']

val_imp_long.to_csv(config['val_long_csv_path'])

Build the output frame: one row per clickout with its ranked item recommendations

In [76]:
# One row per key, with the ranked item ids and their probabilities joined into
# space-separated strings.
rank_imp_wide=val_imp_long.pivot(index='key',columns='rank',values=['impressions','probs']).copy()

# collapse column multi index for ease of indexing -> 'impressions|1', 'probs|1', ...
rank_imp_wide.columns=rank_imp_wide.columns.map(lambda x: '|'.join([str(i) for i in x]))

# Select the rank columns by name. The former `iloc[:, :25]` assumed exactly 25 rank
# columns and picked the wrong ones when the longest impression list was shorter.
impression_cols=[c for c in rank_imp_wide.columns if c.startswith('impressions|')]
prob_cols=[c for c in rank_imp_wide.columns if c.startswith('probs|')]

rank_imp_wide=rank_imp_wide.join(val_wide_not_allnull['timestamp'], on='key')

# Nanosecond timestamps -> whole seconds since the epoch.
rank_imp_wide['timestamp']=rank_imp_wide['timestamp'].astype(np.int64)//10**9

rank_imp_wide.reset_index(inplace=True)

# The key is '<user_id>_<session_id>_<step>'.
rank_imp_wide[['user_id','session_id','step']]=rank_imp_wide['key'].str.split('_',expand=True)

rank_imp_wide['item_recommendations'] = rank_imp_wide[impression_cols].apply(
    lambda x: ' '.join(x.dropna()),
    axis=1
)

rank_imp_wide['item_probs'] = rank_imp_wide[prob_cols].apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1
)

# Explicit output layout instead of positional column juggling.
rank_imp_wide=rank_imp_wide[['user_id','session_id','timestamp','step','item_recommendations','item_probs']]

rank_imp_wide.head()

Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations,item_probs
0,00EI1R7YK601,9639ee039c1d0,1541068329,3,135917 104136 104132 4529846 104168 393751 314...,2.17473842370666e-06 2.17473842370666e-06 1.91...
1,00GKOZLYVI9R,8e74b912cb1b4,1541065310,16,4467826 1031940 5452024 2596290 5827212 816242...,0.09313640823079226 0.0002826738846636392 3.23...
2,00J9RN4XAC2N,d2397c03bc9b4,1541065050,122,3134112 1104106 1104108 4073430 2714672 470233...,3.641875313123964e-06 8.947170035210208e-12 2....
3,00QD3TS82ZP1,65f6d52da6c28,1541064419,3,3537348 3536828 2034239 1518663 2832878 390219...,0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4,00VFIQZH8RZ6,b485e039d93c7,1541062460,33,3087002 100137 2005355 7793494 2772604 3213142...,5.197289709582199e-07 6.34434779001736e-11 3.9...


In [77]:
# Mean reciprocal rank of the true item, over the rows where an impression equals
# the target. MRR_NB only counts hits the model gave a non-zero probability;
# MRR_parse_imp counts every hit.
if config['use_validation']:
    is_hit=val_imp_long.target.astype(int)==val_imp_long.impressions.astype(int)
    has_prob=val_imp_long.probs!=0.0

    rr_nb=val_imp_long.loc[is_hit & has_prob,'RR']
    rr_all_hits=val_imp_long.loc[is_hit,'RR']

    MRR_NB=rr_nb.mean()
    MRR_parse_imp=rr_all_hits.mean()

    print(MRR_NB)
    print(MRR_parse_imp)
    print('support MRR_NB: ', str(rr_nb.count()))
    # This line used to be labelled 'support MRR_NB' as well.
    print('support MRR_parse_imp: ', str(rr_all_hits.count()))
    print(n_clickouts_test)

0.6253893914008776
0.3351502456170596
support MRR_NB:  2793
support MRR_NB:  6544
10336


In [78]:
# Same hit-based MRR, restricted to keys whose item probabilities sum to more than
# `threshold`.
threshold = 0.1

sum_probs = val_imp_long.groupby('key')['probs'].sum()
val_imp_sum = val_imp_long.join(sum_probs, on='key', rsuffix='_sum')

above_threshold = val_imp_sum['probs_sum'] > threshold
target_is_impression = val_imp_sum['target'].astype(int) == val_imp_sum['impressions'].astype(int)
confident_hit_rr = val_imp_sum.loc[above_threshold & target_is_impression, 'RR']

print(confident_hit_rr.mean())
print(confident_hit_rr.count())

0.6155825128294519
1262


# Add lost sessions back
Load test data

In [79]:
# Recover the evaluation rows that were kept away from the model because their
# leading feature columns are all zero, so they can be added back to the output.
# NOTE(review): when drop_no_references is False this cell defines nothing, yet the
# following cells use `val_wide_allnull` unconditionally.
if config['drop_no_references']:
    source_csv = config['val_csv_path'] if config['use_validation'] else config['test_csv_path']
    data = pd.read_csv(source_csv, parse_dates=['timestamp'])
    data.drop(['Unnamed: 0'], axis=1, inplace=True)

    val_wide = process_test_naives_bayes(data=data, metadata=meta, encoders=encoders, config=config)
    keeps_impressions = val_wide.impressions != 0
    val_wide = val_wide.loc[keeps_impressions]

    # Drop a stale `test` frame if one is still around from an earlier cell.
    try:
        del test
    except NameError:
        pass
    else:
        gc.collect()

    # Rows where every leading feature column equals 0.
    n_lead_cols = 2*config['session_length']-1
    row_all_zero = (val_wide.iloc[:, 0:n_lead_cols].T == 0).all()
    val_wide_allnull = val_wide.loc[row_all_zero].copy()

    # val_wide is always bound here, so it can be released unconditionally.
    del val_wide
    gc.collect()

Munge into same format

In [80]:
# Bring the no-reference rows into the same six-column layout as rank_imp_wide.
# Items keep their displayed order and every probability is reported as 0.
columns_ordered = ['user_id', 'session_id', 'timestamp', 'step', 'item_recommendations',  'item_probs']

# Explicit copy: the column assignments below must not write into a view.
val_wide_allnull=val_wide_allnull[['timestamp','impressions']].copy()
# Nanosecond timestamps -> whole seconds since the epoch.
val_wide_allnull['timestamp']=val_wide_allnull['timestamp'].astype(np.int64)//10**9
val_wide_allnull=val_wide_allnull.reset_index()
# The key is '<user_id>_<session_id>_<step>'.
val_wide_allnull[['user_id','session_id','step']]=val_wide_allnull['key'].str.split('_',expand=True)
val_wide_allnull=val_wide_allnull.drop(['key'],axis=1)
# `regex` is passed explicitly: pandas 2.0 changed the default of str.replace to
# regex=False, which would silently turn both replacements into no-ops.
val_wide_allnull['item_probs']=val_wide_allnull['impressions'].str.replace('[0-9]+\\||[0-9]+','0 ',regex=True)
val_wide_allnull['impressions']=val_wide_allnull['impressions'].str.replace('|',' ',regex=False)
val_wide_allnull=val_wide_allnull.rename(columns={'impressions':'item_recommendations'})

val_wide_allnull=val_wide_allnull.reindex(columns=columns_ordered)

val_wide_allnull.head()



Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations,item_probs
0,00M5AMMLYQG5,8fd417ebd2d5b,1541062793,1,9882016 106769 48219 147360 7333050 3533848 75...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,00WWQYCIDPUF,a8962895bad41,1541064252,6,7929996 8767518 5742058 7195114 7325018 196079...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,0168S4C5E60K,016150a298e6e,1541063179,9,42109 42127 42270 137704 1577271 1750769 42146...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3,01BS6LPQ5JTX,2a35cec32ede2,1541063117,1,127745 43084 1257131 1797067 1848773 5452918 4...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,01MTZ6S2OHUA,2d123642df132,1541059612,10,1277246 1341812 1550767 1388724 2187086 124244...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


Append the no-reference sessions to the NB-scored test data

In [81]:
# Stack the model-ranked rows and the no-reference rows into one output frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
output=pd.concat([rank_imp_wide,val_wide_allnull],ignore_index=True,sort=False)

Save submission

In [82]:
# Submission file: the output without the probability column.
submission = output.drop(columns=['item_probs'])
submission.to_csv(config['output_recsys_csv_path'], index=False)

# Meta file: the full output, probabilities included.
output.to_csv(config['output_meta_csv_path'], index=False)

In [83]:
# Sanity check: `output` is the two frames below stacked, so its row count should be
# the sum of theirs.
output.shape

(10336, 6)

In [84]:
# Rows that were not scored by the model (no reference features).
val_wide_allnull.shape

(3787, 6)

In [85]:
# Rows ranked by the Naive Bayes model.
rank_imp_wide.shape

(6549, 6)