In [1]:
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
 
# Notebook-wide pandas display settings.
# Use the fully-qualified option key: the bare 'precision' pattern is not
# resolved unambiguously by newer pandas releases and makes set_option raise.
pd.set_option('display.precision', 5)
# Render every float with exactly five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)
# Show up to 200 rows before pandas truncates a displayed frame/series.
pd.options.display.max_rows = 200

In [2]:
# Load the raw train/test CSVs and report (rows, columns) for each of them.
train_df = pd.read_csv('data/train/train_beh.csv')
test_df = pd.read_csv('data/test/test_beh_b.csv')
for frame in (train_df, test_df):
    print(frame.shape)

(934282, 5)
(95669, 4)


In [3]:
# Peek at the first five training rows.
# NOTE(review): in the recorded output the timestamps sit under 'Unnamed: 3'
# while 'page_tm' is empty - the CSV header looks shifted; confirm upstream.
train_df.head(5)

Unnamed: 0,id,flag,page_no,Unnamed: 3,page_tm
0,U64F0C9,0,SZA,2019-06-30 12:44:27,
1,U64F0C9,0,CQE,2019-06-25 16:15:11,
2,U64F0C9,0,AAO,2019-06-30 12:44:17,
3,U64F0C9,0,CQE,2019-06-17 13:50:12,
4,U64F0C9,0,AAO,2019-06-17 13:50:08,


In [4]:
# Peek at the first five test rows (same layout as train, minus 'flag').
# NOTE(review): timestamps appear under 'Unnamed: 2' and 'page_tm' is empty
# in the recorded output - same apparent header shift as the training file.
test_df.head(5)

Unnamed: 0,id,page_no,Unnamed: 2,page_tm
0,U441F8F,CQA,2019-06-11 22:27:25,
1,U441F8F,XAI,2019-06-17 17:42:39,
2,U441F8F,CQA,2019-06-17 17:42:02,
3,U441F8F,XAI,2019-06-17 17:42:35,
4,U441F8F,CQE,2019-06-17 17:42:49,


In [5]:
# Discard the two trailing columns of each frame; only 'id', 'page_no'
# (and 'flag' for train) are used by the cells below.
train_df = train_df.drop(columns=['Unnamed: 3', 'page_tm'])
test_df = test_df.drop(columns=['Unnamed: 2', 'page_tm'])

In [6]:
# Mean of 'flag' over all training rows of each page_no (a per-page target rate).
ts = train_df.groupby('page_no')['flag'].mean()

In [7]:
# Display the mean flag per page_no.
ts

page_no
AAO   0.14911
BWA   0.14964
BWE   0.15111
CQA   0.14872
CQB   0.10983
CQC   0.11446
CQD   0.11807
CQE   0.14303
CTR   0.16426
EGA   0.20592
EGB   0.20355
FDA   0.08304
FLS   0.14900
FTR   0.18709
GBA   0.14286
JF2   0.10745
JJD   0.07105
JJK   0.06067
LC0   0.12835
LCT   0.09617
MSG   0.15527
MTA   0.08891
SYK   0.05318
SZA   0.14571
SZD   0.14481
TRN   0.17293
XAG   0.15286
XAI   0.14844
ZY1   0.11111
Name: flag, dtype: float64

In [8]:
# Turn the per-page mean series into a one-column frame indexed by page_no.
pno_target = ts.to_frame()
pno_target

Unnamed: 0_level_0,flag
page_no,Unnamed: 1_level_1
AAO,0.14911
BWA,0.14964
BWE,0.15111
CQA,0.14872
CQB,0.10983
CQC,0.11446
CQD,0.11807
CQE,0.14303
CTR,0.16426
EGA,0.20592


In [9]:
# Copy the page_no index into a regular 'pno' column, then name the two
# columns positionally: the mean flag becomes 'target_score'.
pno_target = pno_target.assign(pno=pno_target.index)
pno_target.columns = ['target_score', 'pno']

In [10]:
# Display the lookup table: one row per page_no with 'target_score' and 'pno'.
pno_target

Unnamed: 0_level_0,target_score,pno
page_no,Unnamed: 1_level_1,Unnamed: 2_level_1
AAO,0.14911,AAO
BWA,0.14964,BWA
BWE,0.15111,BWE
CQA,0.14872,CQA
CQB,0.10983,CQB
CQC,0.11446,CQC
CQD,0.11807,CQD
CQE,0.14303,CQE
CTR,0.16426,CTR
EGA,0.20592,EGA


In [11]:
# Remove the label so train_df has the same columns as test_df.
train_df = train_df.drop(columns=['flag'])

In [12]:
# Display test_df, which now holds only 'id' and 'page_no'.
test_df

Unnamed: 0,id,page_no
0,U441F8F,CQA
1,U441F8F,XAI
2,U441F8F,CQA
3,U441F8F,XAI
4,U441F8F,CQE
...,...,...
95664,U46C5B4,CQE
95665,U46C5B4,CQA
95666,U46C5B4,CQA
95667,U46C5B4,CQE


In [13]:
# Stack the test rows under the training rows so page counts are built for
# every id at once, and add a constant 'cnt' column to be summed later.
# NOTE(review): this cell rebinds train_df from itself, so running it a
# second time appends test_df again - run it exactly once per kernel.
train_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
train_df = train_df.assign(cnt=1)
train_df.shape

(1029951, 3)

In [14]:
# Display the combined train+test frame with its constant 'cnt' column.
train_df

Unnamed: 0,id,page_no,cnt
0,U64F0C9,SZA,1
1,U64F0C9,CQE,1
2,U64F0C9,AAO,1
3,U64F0C9,CQE,1
4,U64F0C9,AAO,1
...,...,...,...
1029946,U46C5B4,CQE,1
1029947,U46C5B4,CQA,1
1029948,U46C5B4,CQA,1
1029949,U46C5B4,CQE,1


In [15]:
# Build an id x page_no matrix of visit counts by summing the constant 'cnt'.
# aggfunc is given as the string 'sum' rather than np.sum: newer pandas
# emits a FutureWarning for the NumPy callable, and the string form selects
# the same groupby sum on every pandas version.
table = pd.pivot_table(
    train_df,
    values='cnt',
    index=['id'],
    columns=['page_no'],
    aggfunc='sum',
)
# An (id, page_no) pair that never occurs comes out as NaN; treat it as 0 visits.
table = table.fillna(0)

In [16]:
# Add one '<page>_ts' feature per page: visit count * page target score * 10.
# Only columns that have a row in pno_target are processed. This keeps the
# cell safe to re-run (the generated '*_ts' columns and a later 'id' column
# are ignored instead of raising KeyError) and skips any page_no that has no
# target score because it never appears in the labelled training rows.
page_cols = [col for col in table.columns if col in pno_target.index]
for col in page_cols:
    table[col + '_ts'] = table[col] * pno_target.loc[col, 'target_score'] * 10

In [17]:
# Mirror the id index into an ordinary trailing column, then show the table.
table = table.assign(id=table.index)
table

page_no,AAO,BWA,BWE,CQA,CQB,CQC,CQD,CQE,CTR,EGA,...,MSG_ts,MTA_ts,SYK_ts,SZA_ts,SZD_ts,TRN_ts,XAG_ts,XAI_ts,ZY1_ts,id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U0001B8,14.00000,0.00000,0.00000,17.00000,0.00000,2.00000,0.00000,15.00000,0.00000,0.00000,...,1.55273,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,U0001B8
U000437,3.00000,0.00000,0.00000,6.00000,0.00000,0.00000,0.00000,3.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,6.91724,0.00000,0.00000,0.00000,U000437
U0013E0,88.00000,1.00000,0.00000,206.00000,1.00000,0.00000,4.00000,4.00000,1.00000,0.00000,...,1.55273,0.00000,0.00000,11.65670,0.00000,31.12758,3.05723,0.00000,0.00000,U0013E0
U0015B2,1075.00000,20.00000,9.00000,510.00000,2.00000,1.00000,2.00000,21.00000,22.00000,0.00000,...,45.02912,0.00000,1.06352,90.33939,1.44811,1876.30128,0.00000,0.00000,0.00000,U0015B2
U0016FF,12.00000,0.00000,0.00000,18.00000,0.00000,0.00000,0.00000,31.00000,4.00000,0.00000,...,1.55273,0.00000,0.00000,1.45709,0.00000,3.45862,0.00000,0.00000,0.00000,U0016FF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UFFF2E7,2.00000,3.00000,1.00000,11.00000,1.00000,1.00000,0.00000,14.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,1.45709,1.44811,3.45862,0.00000,17.81288,0.00000,UFFF2E7
UFFF441,132.00000,1.00000,0.00000,200.00000,0.00000,0.00000,0.00000,5.00000,0.00000,0.00000,...,18.63274,0.00000,0.00000,1.45709,1.44811,0.00000,0.00000,10.39085,0.00000,UFFF441
UFFF7F4,47.00000,0.00000,0.00000,64.00000,0.00000,0.00000,1.00000,4.00000,0.00000,0.00000,...,24.84365,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,UFFF7F4
UFFFC56,2.00000,0.00000,0.00000,66.00000,0.00000,0.00000,0.00000,4.00000,1.00000,3.00000,...,1.55273,0.00000,0.00000,24.77048,8.68866,3.45862,0.00000,0.00000,0.00000,UFFFC56


In [18]:
# Check the feature table size as (rows, columns).
table.shape

(13145, 59)

In [19]:
# List the feature columns: raw page counts, their '_ts' versions, then 'id'.
table.columns

Index(['AAO', 'BWA', 'BWE', 'CQA', 'CQB', 'CQC', 'CQD', 'CQE', 'CTR', 'EGA',
       'EGB', 'FDA', 'FLS', 'FTR', 'GBA', 'JF2', 'JJD', 'JJK', 'LC0', 'LCT',
       'MSG', 'MTA', 'SYK', 'SZA', 'SZD', 'TRN', 'XAG', 'XAI', 'ZY1', 'AAO_ts',
       'BWA_ts', 'BWE_ts', 'CQA_ts', 'CQB_ts', 'CQC_ts', 'CQD_ts', 'CQE_ts',
       'CTR_ts', 'EGA_ts', 'EGB_ts', 'FDA_ts', 'FLS_ts', 'FTR_ts', 'GBA_ts',
       'JF2_ts', 'JJD_ts', 'JJK_ts', 'LC0_ts', 'LCT_ts', 'MSG_ts', 'MTA_ts',
       'SYK_ts', 'SZA_ts', 'SZD_ts', 'TRN_ts', 'XAG_ts', 'XAI_ts', 'ZY1_ts',
       'id'],
      dtype='object', name='page_no')

In [20]:
# Persist the feature table. index=False leaves the index out of the file;
# the ids are still written because they were copied into the 'id' column.
table.to_csv('data/beh_pageno_target_encoding.csv', index=False)

In [16]:
# Build the same id x page_no visit-count matrix from the test rows only.
# NOTE(review): this cell carries an out-of-order execution count and
# rebinds `table`, replacing the train+test feature table built earlier -
# confirm it is still wanted before relying on `table` after this point.
test_df['cnt'] = 1
# 'sum' instead of np.sum: same aggregation, without the FutureWarning that
# newer pandas raises for the NumPy callable.
table = pd.pivot_table(
    test_df,
    values='cnt',
    index=['id'],
    columns=['page_no'],
    aggfunc='sum',
)
# Missing (id, page_no) combinations mean zero visits.
table = table.fillna(0)