In [1]:
import pandas as pd
import numpy as np
import gc

from sklearn.naive_bayes import *
from sklearn import preprocessing

# %% Imports
from pathlib import Path
import sys
root_dir = Path().resolve()
sys.path.append(str(root_dir / 'src'))

from recsys_common import *
from recsys_naive_bayes_processing import *


In [2]:
meta=get_metadata()
# meta.dtypes

In [3]:
meta['item_id']=meta['item_id'].astype(str)

In [4]:
sessions=get_sessions(True,.05,True,0.25)
sessions['impressions']=sessions['impressions'].str.split('\\|')
sessions['prices']=sessions['prices'].str.split('\\|')
print(sessions.dtypes)
# sessions.head()

user_id                         object
session_id                      object
timestamp          datetime64[ns, UTC]
step                             int64
action_type                     object
reference                       object
platform                        object
city                            object
device                          object
current_filters                 object
impressions                     object
prices                          object
is_validation                     bool
is_train                          bool
dtype: object


# Train encoders

In [5]:
columns_to_encode = ['action_type','platform','city','device']

encoders = {}
for col in columns_to_encode:
    le = preprocessing.LabelEncoder()
    encoders[col]=le.fit(sessions[col])
    print(encoders[col].classes_)
#     dev_wide[col]=encoders[col].transform(dev_wide[col])


['change of sort order' 'clickout item' 'filter selection'
 'interaction item deals' 'interaction item image' 'interaction item info'
 'interaction item rating' 'search for destination' 'search for item'
 'search for poi']
['AA' 'AE' 'AR' 'AT' 'AU' 'BE' 'BG' 'BR' 'CA' 'CH' 'CL' 'CN' 'CO' 'CZ'
 'DE' 'DK' 'EC' 'ES' 'FI' 'FR' 'GR' 'HK' 'HR' 'HU' 'ID' 'IE' 'IL' 'IN'
 'IT' 'JP' 'KR' 'MX' 'MY' 'NL' 'NO' 'NZ' 'PE' 'PH' 'PL' 'PT' 'RO' 'RS'
 'RU' 'SE' 'SG' 'SI' 'SK' 'TH' 'TR' 'TW' 'UK' 'US' 'UY' 'VN' 'ZA']
['A Teixeira, Spain' 'Aachen, Germany' 'Aadorf, Switzerland' ...
 'Żarki, Poland' 'Žabljak, Montenegro' 'Žilina, Slovakia']
['desktop' 'mobile' 'tablet']


# Get Splits

In [6]:
test = sessions.loc[sessions.is_train==False].reset_index(drop=True)
# train = sessions.loc[(sessions.is_train==True)]
train = sessions.loc[(sessions.is_train==True) & (sessions.is_validation==False)].reset_index(drop=True)
dev = sessions.loc[(sessions.is_train==True) & (sessions.is_validation==True)].reset_index(drop=True)

print('test',test.shape)
print('train',train.shape)
print('dev',dev.shape)

test (200069, 14)
train (537963, 14)
dev (155467, 14)


# Check number of test clickouts without a reference

In [7]:
nclickouts_nan = test.loc[(test.action_type=='clickout item') & (test.reference.isna()),'reference'].shape[0]
nclickouts = test.loc[(test.action_type=='clickout item'),'reference'].shape[0]
print('test clickouts without reference, total, and frac:', [nclickouts_nan,nclickouts,nclickouts_nan/nclickouts])

test clickouts without reference, total, and frac: [12856, 26345, 0.4879863351679636]


# Get Fake Test Set

In [8]:
dev_test,gt = process_validation(dev, frac_nan=0.488, seed=1234)

dev_test.head()
gt.head()
dev_test.loc[dev_test.reference.isnull(),'target']=list(gt['reference'])
dev_test

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,is_validation,is_train,target
0,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:15:30+00:00,1,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
1,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:15:30+00:00,2,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
2,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:15:40+00:00,3,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
3,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:15:40+00:00,4,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
4,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:15:40+00:00,5,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
5,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:15:50+00:00,6,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
6,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:16:00+00:00,7,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
7,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:16:00+00:00,8,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
8,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:16:00+00:00,9,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,
9,02SRUT1NQYH1,3599a6f709eab,2018-11-01 09:16:10+00:00,10,interaction item image,2795374,FI,"Krakow, Poland",mobile,,,,True,True,


# Munge for NB

get training dat ain wide format

In [9]:
train_wide, y = process_train_naives_bayes(data=train, metadata=meta, session_length=5, encode = True,encoders=encoders,cols_to_encode=columns_to_encode)

In [10]:
train_wide_allnull=train_wide[(train_wide.iloc[:,0:9].T == 0).all()].copy()
y_allnull = y[(train_wide.iloc[:,0:9].T == 0).all()].copy()

train_wide_not_allnull=train_wide[(train_wide.iloc[:,0:9].T != 0).all()].copy()
y_not_allnull = y[(train_wide.iloc[:,0:9].T != 0).all()].copy()

print(sum(train_wide_allnull.index.values!=y_allnull.index.values))
print(sum(train_wide_not_allnull.index.values!=y_not_allnull.index.values))

0
0


In [24]:
train_wide

Unnamed: 0_level_0,action_type|1,action_type|2,action_type|3,action_type|4,action_type|5,reference|1,reference|2,reference|3,reference|4,reference|5,...,Water Slide|5,Wheelchair Accessible|5,WiFi (Public Areas)|5,WiFi (Rooms)|5,platform,city,device,timestamp,impressions,prices
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0017FIR55K7R_dbd605dbee1e5_2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,2,4361,1,2018-11-01 03:20:56+00:00,"[2745276, 2729988, 9467438, 9803260, 10344076]","[25, 115, 48, 60, 221]"
0017FIR55K7R_dbd605dbee1e5_3,1,0,0,0,0,2745276,0,0,0,0,...,0.0,0.0,0.0,0.0,2,4361,1,2018-11-01 03:22:32+00:00,"[2745276, 2729988, 9467438, 9803260, 10344076]","[25, 115, 48, 60, 221]"
0017FIR55K7R_dbd605dbee1e5_4,1,1,0,0,0,2745276,2745276,0,0,0,...,0.0,0.0,0.0,0.0,2,4361,1,2018-11-01 03:25:33+00:00,"[2745276, 2729988, 9467438, 9803260, 10344076]","[25, 115, 48, 60, 221]"
0017FIR55K7R_dbd605dbee1e5_5,1,1,1,0,0,2745276,2745276,2745276,0,0,...,0.0,0.0,0.0,0.0,2,4361,1,2018-11-01 03:27:02+00:00,"[2745276, 2729988, 9467438, 9803260, 10344076]","[25, 115, 48, 60, 221]"
001TEVEVUEBE_394f5ad9aa596_1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,31,490,0,2018-11-01 05:12:52+00:00,"[3366614, 2670592, 6625742, 2881004, 5708270, ...","[61, 80, 102, 87, 126, 85, 26, 81, 50]"
0025B8BU0NYP_37bfe437f8b89_43,6,5,4,4,4,4773134,4773134,4773134,4773134,4773134,...,0.0,1.0,1.0,1.0,4,2320,1,2018-11-01 07:35:38+00:00,"[8937, 8948, 3814822, 8938, 4202300, 8916, 133...","[85, 125, 32, 72, 359, 1592, 66, 93, 58, 31, 6..."
002BISXP1U1Q_8cd721ffb8e03_2,3,0,0,0,0,503071,0,0,0,0,...,0.0,0.0,0.0,0.0,38,5099,0,2018-11-01 06:37:11+00:00,"[503071, 1155657, 1377982, 4543792, 1824669, 1...","[130, 179, 354, 146, 234, 239, 287, 385, 278, ..."
002J73UJ7Z1T_f721d12954e48_51,4,4,3,4,4,6623894,6623894,6623894,6623894,6623894,...,0.0,0.0,0.0,0.0,7,358,1,2018-11-01 00:43:10+00:00,"[1776335, 1827211, 1827619, 1827699, 1828421, ...","[73, 60, 59, 52, 81, 68, 72, 90, 72, 72, 47, 5..."
002J73UJ7Z1T_f721d12954e48_52,1,4,4,3,4,6623894,6623894,6623894,6623894,6623894,...,0.0,0.0,0.0,0.0,7,358,1,2018-11-01 00:43:52+00:00,"[1776335, 1827211, 1827619, 1827699, 1828421, ...","[73, 60, 59, 52, 81, 68, 72, 90, 72, 72, 47, 5..."
002J73UJ7Z1T_f721d12954e48_53,1,1,4,4,3,6623894,6623894,6623894,6623894,6623894,...,0.0,0.0,0.0,0.0,7,358,1,2018-11-01 00:44:15+00:00,"[1776335, 1827211, 1827619, 1827699, 1828421, ...","[73, 60, 59, 52, 81, 68, 72, 90, 72, 72, 47, 5..."


get test data in wide format

In [11]:
dev_wide = process_test_naives_bayes(data=dev_test, metadata=meta, session_length=5, encode = True,encoders=encoders,cols_to_encode=columns_to_encode)

dev_wide_allnull=dev_wide[(dev_wide.iloc[:,0:9].T == 0).all()].copy()

dev_wide_not_allnull=dev_wide[(dev_wide.iloc[:,0:9].T != 0).all()].copy()


In [12]:
dev_wide_not_allnull.shape

(1809, 802)

## Clean up memory

In [13]:
try:
    del sessions, train, test, dev
except NameError:
    pass
else:
    gc.collect()

In [14]:
# columns=train_wide.dtypes.loc[((train_wide.dtypes=='category') | (train_wide.dtypes=='object'))].index
# columns
# for col in columns:
#     print(col)
#     train_wide[col]=train_wide[col].str.replace(' ','_')
    
# train_wide

### Fit NB

In [15]:
clf = MultinomialNB()
clf.fit(train_wide_not_allnull.drop(['reference|1', 'reference|2', 'reference|3',
       'reference|4', 'reference|5','timestamp','impressions','prices'],axis=1),np.ravel(y_not_allnull))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
pred=clf.predict_proba(dev_wide_not_allnull.drop(['reference|1', 'reference|2', 'reference|3',
       'reference|4', 'reference|5','timestamp','impressions','prices','target'],axis=1))

In [17]:
pred.shape

(1809, 14084)

In [25]:
# pd.DataFrame(train_wide_not_allnull.iloc[:1000,:]['impressions'].values.tolist())

# pd.DataFrame(pred,columns=clf.classes_)

In [20]:
dev_impression=dev_wide_not_allnull[['impressions','target']].reset_index().copy()
dev_impression.head()


Unnamed: 0,key,impressions,target
0,00VFIQZH8RZ6_b485e039d93c7_33,"[6546628, 147278, 3905094, 3087002, 8980586, 1...",5627048
1,019M5PODL0RN_05bd08cd6d306_41,"[5833518, 1038762, 109133, 851456, 1398107, 32...",851456
2,01VPWUCD4E94_65f1cf611b920_19,"[2201134, 13863, 2275636, 15494, 5548, 5614, 1...",5611
3,01XT8XWUV9ZA_077b15a79c9aa_24,"[3205032, 37528, 6831812, 6481364, 2900967, 49...",3770184
4,02ENVWY65X9H_7394b61ec8376_15,"[83485, 81902, 352661, 124234, 2015597, 313647...",84803


In [21]:
dev_impressions_long=dev_impression.impressions.apply(pd.Series) \
    .merge(dev_impression, right_index = True, left_index = True) \
    .drop(["impressions"], axis = 1)  \
    .melt(id_vars = ['key','target'], value_name = "impressions") \
    .drop("variable", axis = 1) \
    .dropna() \
    .sort_values('key') \
    .reset_index(drop=True) \
    .copy()

In [30]:
preddf=pd.DataFrame(data=pred.T,index=clf.classes_,columns=dev_wide_not_allnull.index).T

In [32]:
pred_comb=dev_impression.join(preddf, on='key')

In [None]:
pred_comb[lambda x: x]