# CrossTargetEncoder, TargetEncoder class

The `TargetEncoder` class implements target encoding for a single train/validation split (`X_train`, `X_val`).

The `CrossTargetEncoder` class fits a separate `TargetEncoder` for each cross-validation fold.

## Imports

In [1]:
import pandas as pd
import os
import requests
import numpy as np
import yaml
import logging
from sklearn.metrics import roc_auc_score
from crosspredict.iterator import Iterator
from crosspredict.crossval import CrossLightgbmModel
from crosspredict.target_encoder import CrossTargetEncoder, TargetEncoder
from category_encoders import WOEEncoder
logging.basicConfig(level=logging.INFO)

## Load data

In [2]:
# Download the training CSV once and cache it next to the tests; later runs
# reuse the cached copy instead of hitting the network again.
file_url = 'https://boosters.pro/api/ch/files/pub/onetwotrip_challenge_train.csv'
file_path = '../tests/onetwotrip_challenge_train.csv'
if not os.path.isfile(file_path):
    response = requests.get(file_url)
    # Fail loudly on an HTTP error instead of caching an error page as the CSV.
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed before it is read back.
    with open(file_path, 'wb') as csv_file:
        csv_file.write(response.content)

# Read from the same path that was just checked/written (single source of truth).
df = pd.read_csv(file_path)

# Hold out 20% of the unique users as the test set; splitting by `userid`
# keeps every row of a given user on one side of the split.
unique_clients = pd.Series(df['userid'].unique())
test_users = unique_clients.sample(frac=0.2, random_state=0)
val_idx = df['userid'].isin(test_users)
test = df[val_idx].copy()
train = df[~val_idx].copy()

## TargetEncoder example

In [3]:
# Splitter that builds repeated K-fold splits grouped by the `userid` column.
iter_df = Iterator(
    n_repeats=3,
    n_splits=10,
    random_state=0,
    col_client='userid',
    cv_byclient=True,
)

# Run through every split without doing any work: once the loop finishes,
# X_train / X_val hold the last generated fold, which the next cells use.
for i, (X_train, X_val) in enumerate(iter_df.split(train)):
    continue

Using RepeatedKFold by column group "userid"


In [4]:
# TargetEncoder accepts the same splitting parameters as Iterator; here they
# configure the inner (double) cross-validation inside the X_train part of a single fold.
# Additional parameters:
#   `cols`          - columns to encode
#   `col_encoded`   - target column
#   `encoder_class` - category_encoders class

simple_encoder = TargetEncoder(
    encoder_class=WOEEncoder,
    n_splits=3,
    n_repeats=1,
    random_state=0,
    col_client='userid',
    cv_byclient=True,
    col_encoded='goal1',
    cols=['field1', 'field2', 'field3', 'field4'],
)
simple_encoder.fit(X_train)

<crosspredict.target_encoder._target_encoder.TargetEncoder at 0x7f581264f210>

In [5]:
# Encode the X_train dataset the encoder was fitted on
# (mean prediction if n_repeats>1, because then more than one model can predict each row)
simple_encoder.transform(X_train)

Unnamed: 0,encoded_field1,encoded_field2,encoded_field3,encoded_field4
0,-0.463403,0.093969,0.113434,0.078877
1,-0.213632,0.005103,0.078200,-0.050764
2,-0.252676,-0.198786,-0.049519,-0.065034
4,-0.021756,-0.052728,-0.134602,0.084184
5,-0.160879,0.045104,0.014076,0.078877
...,...,...,...,...
196049,-0.127791,0.062602,0.053194,0.094973
196050,-0.035787,0.121828,-0.085495,0.078877
196052,-0.249904,0.031877,-0.134602,-0.050764
196054,-0.200901,0.093969,0.144979,0.078877


In [6]:
# Encode the held-out validation fold `X_val` (mean prediction from all models)
simple_encoder.predict(X_val)

Unnamed: 0,encoded_field1,encoded_field2,encoded_field3,encoded_field4
16,-0.035520,-0.055500,0.042379,0.076159
22,-0.227570,-0.286093,-0.237577,-0.274263
24,-0.270356,0.030928,0.138657,0.076159
78,-0.177357,0.165028,-0.097532,0.076159
84,-0.512491,0.165028,-0.097532,0.076159
...,...,...,...,...
195984,-0.177357,0.165028,0.068076,0.076159
196026,-0.270356,0.030928,0.018795,0.307825
196029,-0.177357,-0.036723,0.134974,0.076159
196051,0.004552,0.120051,0.068076,-0.066884


## CrossTargetEncoder example

In [7]:
# Same as TargetEncoder, but takes an additional `iterator` parameter.
# The same `iterator` is later passed to CrossLightgbmModel so that the cross-validation folds match.

In [7]:
# Outer splitter: its folds are shared by the encoder below and by the model trained later.
iter_df = Iterator(
    n_repeats=1,
    n_splits=3,
    random_state=0,
    col_client='userid',
    cv_byclient=True,
)

# Besides `iterator`, the remaining arguments are the same kind of settings
# that TargetEncoder takes (splitting parameters, target column, columns to encode).
cross_encoder = CrossTargetEncoder(
    iterator=iter_df,
    encoder_class=WOEEncoder,
    n_splits=3,
    n_repeats=1,
    random_state=0,
    col_client='userid',
    cv_byclient=True,
    col_encoded='goal1',
    cols=['field3', 'field2', 'field11', 'field23', 'field18', 'field20'],
)
cross_encoder.fit(train)

Using RepeatedKFold by column group "userid"


In [8]:
# `transform` takes a `fold` parameter - the number of the fold to encode.
# The first split of `iter_df` is taken here, presumably so that it corresponds to `fold=0`.

X_train, X_val = next(iter_df.split(train))

encoded_train, encoded_test = cross_encoder.transform(fold=0, train=X_train, test=X_val)

In [9]:
# `predict` encodes the held-out test dataset (mean prediction from all models from all TargetEncoders)
cross_encoder.predict(test)

Unnamed: 0,encoded_field3,encoded_field2,encoded_field11,encoded_field23,encoded_field18,encoded_field20
3,-0.140829,-0.075079,0.010212,0.078315,0.005202,-0.003783
6,-0.066498,-0.075079,-0.053754,0.154461,-0.042450,0.005543
9,0.021738,-0.229410,0.010212,0.078315,-0.042315,-0.020129
12,-0.081345,0.181179,-0.148919,0.078315,-0.015237,0.019826
14,-0.261834,-0.296218,-0.031003,-0.035564,-0.042315,-0.020129
...,...,...,...,...,...,...
196039,0.042939,-0.035236,-0.036774,-0.103551,-0.042315,-0.020129
196042,0.062696,0.098558,0.013271,-0.049208,-0.042450,-0.003783
196045,0.042939,-0.035236,0.098156,-0.092917,-0.042450,0.028625
196046,0.042939,-0.035236,-0.007281,-0.035905,0.116902,0.028625


In [10]:
# Show the list of newly created (encoded) column names
cross_encoder._targetencoded_cols

In [11]:
# Model features: every original column except the target and the two id
# columns, followed by the columns produced by the cross target encoder.
non_feature_cols = ['goal1', 'orderid', 'userid']
all_cols = df.columns.values
feature_name = np.append(
    all_cols[~np.isin(all_cols, non_feature_cols)],
    cross_encoder._targetencoded_cols,
)

# LightGBM parameters (binary objective, AUC metric); every seed is pinned to 0.
params = {
    'bagging_fraction': 0.849285747554019,
    'bagging_freq': 5,
    'bagging_seed': 0,
    'boosting_type': 'gbdt',
    'data_random_seed': 0,
    'drop_seed': 0,
    'feature_fraction': 0.8212766928844304,
    'feature_fraction_seed': 0,
    'lambda_l1': 0.8955546599539566,
    'lambda_l2': 1.4423261095989717,
    'learning_rate': 0.03,
    'max_bin': 255,
    'max_depth': 43,
    'metric': 'auc',
    'min_data_in_leaf': 149,
    'min_sum_hessian_in_leaf': 1.804477623298885,
    'num_leaves': 363,
    'objective': 'binary',
    'seed': 0,
    'verbose': -1,
}

In [12]:
# Cross-validated LightGBM model. It receives the same `iter_df` the encoder
# was built with, plus the fitted `cross_encoder` itself.
model_class = CrossLightgbmModel(
    iterator=iter_df,
    feature_name=feature_name,
    params=params,
    cols_cat=['field3', 'field2', 'field11', 'field23', 'field18', 'field20'],
    num_boost_round=9999,
    early_stopping_rounds=50,
    valid=True,
    random_state=0,
    col_target='goal1',
    cross_target_encoder=cross_encoder,
)
result = model_class.fit(train)

INFO:crosspredict.crossval._crossval:{'bagging_fraction': 0.849285747554019, 'bagging_freq': 5, 'bagging_seed': 0, 'boosting_type': 'gbdt', 'data_random_seed': 0, 'drop_seed': 0, 'feature_fraction': 0.8212766928844304, 'feature_fraction_seed': 0, 'lambda_l1': 0.8955546599539566, 'lambda_l2': 1.4423261095989717, 'learning_rate': 0.03, 'max_bin': 255, 'max_depth': 43, 'metric': 'auc', 'min_data_in_leaf': 149, 'min_sum_hessian_in_leaf': 1.804477623298885, 'num_leaves': 363, 'objective': 'binary', 'seed': 0, 'verbose': -1}
INFO:crosspredict.crossval._crossval:REPEAT FOLDS 0 START
INFO:crosspredict.crossval._crossval:	CROSSVALIDATION FOLD 0 ENDS with best ROCAUC = 0.69380416031959
INFO:crosspredict.crossval._crossval:	CROSSVALIDATION FOLD 1 ENDS with best ROCAUC = 0.7019798370214689
INFO:crosspredict.crossval._crossval:	CROSSVALIDATION FOLD 2 ENDS with best ROCAUC = 0.7025274328889723
INFO:crosspredict.crossval._crossval:{'loss': -0.6988591595331693, 'status': 'ok', 'std': 0.005149967831115

In [13]:
# ROC AUC on the held-out test users, printed next to the cross-validation summary.
test_auc = roc_auc_score(test['goal1'], model_class.predict(test))
print(test_auc, result)

0.7153827793509867 {'loss': -0.6988591595331693, 'status': 'ok', 'std': 0.005149967831115862, 'score_max': 0.6988591595331693, 'scores_all': [0.69380416031959, 0.7019798370214689, 0.7025274328889723], 'num_boost': 64}
