In [1]:
import pandas as pd
import numpy as np
import gc

from sklearn.naive_bayes import *
from sklearn import preprocessing

# %% Imports
from pathlib import Path
import sys
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *


In [2]:
# Load the item metadata table through a helper that is not defined in this notebook
# (presumably provided by the star-imported project modules — TODO confirm).
meta=get_metadata()
# meta.dtypes

In [3]:
# Cast item ids to str. NOTE(review): presumably so they compare equal to the string ids
# obtained by splitting the `impressions` column — confirm.
meta['item_id']=meta['item_id'].astype(str)

In [4]:
# Load the session log. NOTE(review): the four positional arguments are opaque from here;
# check get_sessions for their meaning before changing them.
sessions = get_sessions(True, .1, True, 0.25)

# `impressions` and `prices` arrive as pipe-delimited strings; turn each into a list.
for list_col in ('impressions', 'prices'):
    sessions[list_col] = sessions[list_col].str.split('\\|')

print(sessions.dtypes)
# sessions.head()

(1024331, 12)
(345497, 12)
user_id                         object
session_id                      object
timestamp          datetime64[ns, UTC]
step                             int64
action_type                     object
reference                       object
platform                        object
city                            object
device                          object
current_filters                 object
impressions                     object
prices                          object
is_validation                     bool
is_train                          bool
dtype: object


## Create impressions set
We will need this set later to build a cost dictionary.

In [5]:
# Collect every impression list that is present (rows without impressions are skipped).
impression_lists = sessions['impressions'].dropna().tolist()

# Flatten the per-row lists and keep the distinct item ids.
flat_list = [item_id for impression_list in impression_lists for item_id in impression_list]
set_reference = set(flat_list)
len(set_reference)

448694

# Train encoders

In [6]:
columns_to_encode = ['action_type', 'platform', 'city', 'device']

# Fit one LabelEncoder per categorical column on the full session frame
# (fit() returns the fitted encoder itself).
encoders = {
    col: preprocessing.LabelEncoder().fit(sessions[col])
    for col in columns_to_encode
}

# Show the learned classes, in the same column order as above.
for col in columns_to_encode:
    print(encoders[col].classes_)


['change of sort order' 'clickout item' 'filter selection'
 'interaction item deals' 'interaction item image' 'interaction item info'
 'interaction item rating' 'search for destination' 'search for item'
 'search for poi']
['AA' 'AE' 'AR' 'AT' 'AU' 'BE' 'BG' 'BR' 'CA' 'CH' 'CL' 'CN' 'CO' 'CZ'
 'DE' 'DK' 'EC' 'ES' 'FI' 'FR' 'GR' 'HK' 'HR' 'HU' 'ID' 'IE' 'IL' 'IN'
 'IT' 'JP' 'KR' 'MX' 'MY' 'NL' 'NO' 'NZ' 'PE' 'PH' 'PL' 'PT' 'RO' 'RS'
 'RU' 'SE' 'SG' 'SI' 'SK' 'TH' 'TR' 'TW' 'UK' 'US' 'UY' 'VN' 'ZA']
["'s-Heerenberg, Netherlands" "'s-Hertogenbosch, Netherlands"
 'A Teixeira, Spain' ... 'Žarnovica, Slovakia' 'Žilina, Slovakia'
 'Κato Platres - Pano Platres, Cyprus']
['desktop' 'mobile' 'tablet']


# Get Splits

In [7]:
# Partition the sessions using the boolean flag columns:
#   test  -> is_train is False
#   train -> is_train and not is_validation
#   dev   -> is_train and is_validation
test = sessions.loc[~sessions['is_train']].reset_index(drop=True)
train = sessions.loc[sessions['is_train'] & ~sessions['is_validation']].reset_index(drop=True)
dev = sessions.loc[sessions['is_train'] & sessions['is_validation']].reset_index(drop=True)

for split_name, split_frame in (('test', test), ('train', train), ('dev', dev)):
    print(split_name, split_frame.shape)

# The full frame is no longer referenced by name after this point.
del sessions

test (386304, 14)
train (1024331, 14)
dev (345497, 14)


# Check number of test clickouts without a reference

In [8]:
# Count clickout rows in the test split, and how many of them have no reference.
clickout_mask = test['action_type'] == 'clickout item'
nclickouts = int(clickout_mask.sum())
nclickouts_nan = int((clickout_mask & test['reference'].isna()).sum())

print('test clickouts without reference, total, and frac:',
      [nclickouts_nan, nclickouts, nclickouts_nan / nclickouts])
del test

test clickouts without reference, total, and frac: [25497, 52977, 0.4812843309360666]


In [9]:
# NOTE(review): disabled scratch code, kept for reference only. It samples a fraction of the
# last clickout rows per session in `dev`, stores their reference in a new `target` column and
# blanks the reference. It looks superseded by the process_validation(dev, frac_nan=0.488,
# seed=1234) call used later in this notebook — confirm, then delete this cell.
# frac_nan=0.488
# seed =1234
# dev['key'] = (dev['session_id'] + '_' + dev['step'].astype(str))

# last_clickout_in_session=dev.loc[dev.action_type=='clickout item'].groupby('session_id',as_index=False)['step'].max()
# last_clickout_in_session['key']=(last_clickout_in_session['session_id'] + '_' + last_clickout_in_session['step'].astype(str))
# print(last_clickout_in_session.shape)

# n_references=round(frac_nan*len(last_clickout_in_session))
# print(n_references)
# np.random.seed(seed)
# index_sampled_clickouts=np.random.choice(last_clickout_in_session.key,n_references,replace=False)
# print(index_sampled_clickouts.shape)

# dev['target']=dev.loc[dev.key.isin(index_sampled_clickouts),'reference']
# dev.loc[dev.key.isin(index_sampled_clickouts),'reference']=np.NaN

# dev.drop('key',axis=1,inplace=True)

# print(dev.shape)

In [10]:
# # dev.loc[(dev.session_id==(last_clickout_in_session.session_id)) & (dev.step.isin(last_clickout_in_session.step))].shape
# dev[dev.key.isin(last_clickout_in_session.key)].shape
# np.random.choice(last_clickout_in_session.key,n_references,replace=False)

In [11]:
# dev.loc[dev.key.isin(list(rows_to_change['key'])),'step']

# Get Fake Test Set

In [12]:
# Build the "fake test" frame from the dev split with a helper that is not defined in this
# notebook. The displayed result carries an extra `target` column.
# NOTE(review): frac_nan presumably mirrors the share of test clickouts without a reference
# printed earlier — confirm against process_validation.
dev_test = process_validation(dev, frac_nan=0.488, seed=1234)

dev_test.head()


Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,is_validation,is_train,target
0,0NA8E4AD2VY7,336d9a46b826f,2018-11-01 13:13:57+00:00,1,filter selection,Free WiFi (Combined),RU,"Yekaterinburg, Russia",desktop,Free WiFi (Combined),,,True,True,
1,0NA8E4AD2VY7,336d9a46b826f,2018-11-01 13:14:08+00:00,2,filter selection,Breakfast Included,RU,"Yekaterinburg, Russia",desktop,Free WiFi (Combined)|Breakfast Included,,,True,True,
2,0NA8E4AD2VY7,336d9a46b826f,2018-11-01 13:14:41+00:00,3,interaction item image,1045290,RU,"Yekaterinburg, Russia",desktop,,,,True,True,
3,0NA8E4AD2VY7,336d9a46b826f,2018-11-01 13:14:41+00:00,4,interaction item image,1045290,RU,"Yekaterinburg, Russia",desktop,,,,True,True,
4,0NA8E4AD2VY7,336d9a46b826f,2018-11-01 13:14:51+00:00,5,interaction item image,1045290,RU,"Yekaterinburg, Russia",desktop,,,,True,True,


# Munge for NB

Get the training data in wide format.

In [13]:
# Build the wide training matrix and its labels with the project helper,
# reusing the encoders fitted earlier in this notebook.
train_wide, y = process_train_naives_bayes(
    data=train,
    metadata=meta,
    session_length=5,
    encode=True,
    encoders=encoders,
    cols_to_encode=columns_to_encode,
)

In [14]:
# Row-wise flags computed once over the first nine columns of the wide training frame.
train_first_nine = train_wide.iloc[:, 0:9]
train_rows_all_zero = (train_first_nine == 0).all(axis=1)
train_rows_all_nonzero = (train_first_nine != 0).all(axis=1)

# Rows whose first nine columns are all zero, with their labels.
train_wide_allnull = train_wide[train_rows_all_zero].copy()
y_allnull = y[train_rows_all_zero].copy()

# NOTE(review): despite the name, "not_allnull" keeps only rows where EVERY one of the nine
# columns is nonzero; rows mixing zero and nonzero values land in neither subset.
# Confirm that this is intended.
train_wide_not_allnull = train_wide[train_rows_all_nonzero].copy()
y_not_allnull = y[train_rows_all_nonzero].copy()

# Sanity check: each print counts index mismatches between features and labels (expect 0).
print(sum(train_wide_allnull.index.values != y_allnull.index.values))
print(sum(train_wide_not_allnull.index.values != y_not_allnull.index.values))

0
0


In [15]:
# (rows, columns) of the training subset that is used to fit the model.
train_wide_not_allnull.shape

(38719, 801)

get test data in wide format

In [16]:
# Build the wide matrix for the fake test set with the project helper and the same encoders.
dev_wide = process_test_naives_bayes(
    data=dev_test,
    metadata=meta,
    session_length=5,
    encode=True,
    encoders=encoders,
    cols_to_encode=columns_to_encode,
)

# Row-wise flags over the first nine columns, mirroring the split applied to the training data.
dev_first_nine = dev_wide.iloc[:, 0:9]

dev_wide_allnull = dev_wide[(dev_first_nine == 0).all(axis=1)].copy()

# NOTE(review): keeps only rows where all nine columns are nonzero (not merely "not all zero").
dev_wide_not_allnull = dev_wide[(dev_first_nine != 0).all(axis=1)].copy()


In [17]:
# (rows, columns) of the dev subset that will be scored.
dev_wide_not_allnull.shape

(2781, 802)

## Clean up memory

In [18]:
# try:
#     del sessions, train, test, dev
# except NameError:
#     pass
# else:
#     gc.collect()

In [19]:
# columns=train_wide.dtypes.loc[((train_wide.dtypes=='category') | (train_wide.dtypes=='object'))].index
# columns
# for col in columns:
#     print(col)
#     train_wide[col]=train_wide[col].str.replace(' ','_')
    
# train_wide

### Fit NB

In [20]:
# Columns excluded from the feature matrix before fitting.
train_non_feature_cols = [
    'reference|1', 'reference|2', 'reference|3', 'reference|4', 'reference|5',
    'timestamp', 'impressions', 'prices',
]

clf = MultinomialNB()
clf.fit(
    train_wide_not_allnull.drop(train_non_feature_cols, axis=1),
    np.ravel(y_not_allnull),  # flatten the labels to 1-D
)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# Same exclusions as at fit time, plus the held-out `target` column present in the dev frame.
dev_non_feature_cols = [
    'reference|1', 'reference|2', 'reference|3', 'reference|4', 'reference|5',
    'timestamp', 'impressions', 'prices', 'target',
]

# Per-class probabilities: one row per dev sample, one column per entry of clf.classes_.
pred = clf.predict_proba(dev_wide_not_allnull.drop(dev_non_feature_cols, axis=1))

In [22]:
# (number of scored dev rows, number of classes known to the classifier).
pred.shape

(2781, 25939)

In [23]:
# pd.DataFrame(train_wide_not_allnull.iloc[:1000,:]['impressions'].values.tolist())

# pd.DataFrame(pred,columns=clf.classes_)

In [24]:
# Keep only the impression lists and the held-out target, moving the index into a column.
# NOTE(review): later cells refer to that column as 'key', so the index is presumably
# named 'key' — confirm.
dev_impression=dev_wide_not_allnull[['impressions','target']].reset_index().copy()


In [25]:
# Step 1: spread each impression list across columns (one column per list position).
impression_slots = dev_impression['impressions'].apply(pd.Series)

# Step 2: re-attach key/target by row position and drop the original list column.
impressions_wide = impression_slots.merge(dev_impression, right_index=True, left_index=True)
impressions_wide = impressions_wide.drop(["impressions"], axis=1)

# Step 3: unpivot to one row per (key, target, impression); the slot number is not needed.
impressions_melted = impressions_wide.melt(id_vars=['key', 'target'], value_name="impressions")
impressions_melted = impressions_melted.drop("variable", axis=1)

# Step 4: drop rows containing any NaN (this includes unused slots of shorter lists),
# then order by key and renumber the rows.
dev_impressions_long = (
    impressions_melted
    .dropna()
    .sort_values('key')
    .reset_index(drop=True)
    .copy()
)

In [26]:
# Wrap the probability matrix in a frame: rows labelled by the dev index, columns by class.
preddf=pd.DataFrame(data=pred,columns=clf.classes_,index=dev_wide_not_allnull.index)
# Sentinel column so that impressions unknown to the classifier resolve to probability 0.
preddf['not_in_class']=0 # need to return 0 in case impression not in clf.classes_

In [27]:
# Row labels for the probability lookup: the key of every (key, impression) pair.
row_index = dev_impressions_long['key'].copy()

# True where the impression is a class the classifier knows.
mask_in_class = dev_impressions_long['impressions'].isin(clf.classes_)

# Column labels for the lookup: the impression itself when it is a known class,
# otherwise the 'not_in_class' sentinel.
col_index = dev_impressions_long['impressions'].where(mask_in_class, 'not_in_class')

In [28]:
# Pick one probability per (key, impression) pair.
# DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0, so the labels are
# translated to integer positions and the value matrix is indexed directly. This works on
# both old and new pandas and returns the same 1-D array that lookup produced.
lookup_row_pos = preddf.index.get_indexer(row_index)
lookup_col_pos = preddf.columns.get_indexer(col_index)

# get_indexer marks missing labels with -1; fail loudly instead of silently reading
# the last row/column through negative indexing.
if (lookup_row_pos < 0).any():
    raise KeyError('One or more row labels was not found')
if (lookup_col_pos < 0).any():
    raise KeyError('One or more column labels was not found')

impression_probs = preddf.values[lookup_row_pos, lookup_col_pos]

In [29]:
# Attach the looked-up probability to each (key, impression) row; the array is in row order.
dev_impressions_long['probs'] = impression_probs

In [30]:
# Within each key, order impressions from most to least probable.
dev_impressions_long = dev_impressions_long.sort_values(
    ['key', 'probs'],
    ascending=[True, False],
)

In [31]:
# 1-based position of each impression within its key, following the current row order
# (rows are expected to be sorted by descending probability within each key).
dev_impressions_long['rank'] = dev_impressions_long.groupby('key').cumcount() + 1

In [32]:
# Keep the row of each key whose impression equals the held-out target, provided the model
# assigned it a nonzero probability.
# NOTE(review): targets with probability 0 (e.g. items unknown to the classifier) are excluded,
# so any metric computed on this frame covers only a subset of the dev rows.
is_scored_target = (
    (dev_impressions_long['target'] == dev_impressions_long['impressions'])
    & (dev_impressions_long['probs'] != 0.0)
)

# Explicit copy: this frame gets a new column assigned later, and writing into a .loc slice
# of another frame can raise SettingWithCopyWarning.
dev_in_class = dev_impressions_long.loc[is_scored_target, ['key', 'rank']].copy()

In [33]:
# Reciprocal rank of the true target for each retained key.
dev_in_class['RR']=1/dev_in_class['rank']

In [34]:
# Mean reciprocal rank over the rows kept in dev_in_class only (targets that received a
# nonzero probability), not over every dev row.
dev_in_class['RR'].mean()

0.4528290484293987

In [40]:
# Coverage check: all dev rows -> rows passing the nonzero filter -> rows where the target
# was ranked with a nonzero probability.
print(dev_wide.shape)
print(dev_wide_not_allnull.shape)
print(dev_in_class.shape)

(10087, 802)
(2781, 802)
(886, 3)
