### Supervised categorical encodings

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
train = pd.read_csv("data/amazon-employee-access-challenge/train.csv")
test = pd.read_csv("data/amazon-employee-access-challenge/test.csv")

In [7]:
target = "ACTION"
col4train = [x for x in train.columns if x not in [target, "ROLE_TITLE"]]
y = train[target].values

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier

def get_model():
    params = {
        "n_estimators" : 300,
        "n_jobs" : 3,
        "random_state":5436,
    }
    return ExtraTreesClassifier(**params)

### Simple Target Encoding
타겟의 평균으로 각 고유값을 인코딩

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
class TargetEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, columns_names ):
        self.columns_names = columns_names
        self.learned_values = {}
        self.dataset_mean = np.nan
    
    def fit(self, X, y, **fit_params):
        X_ = X.copy()
        self.learned_values = {}
        X_["__target__"] = y
        for c in [x for x in X_.columns if x in self.columns_names]:
            self.learned_values[c] = (X_[[c, "__target__"]]
                                     .groupby(c)["__target__"].mean()
                                     .reset_index())
        self.dataset_mean = np.mean(y)
        return self
    
    def transform(self, X, **fit_params):
        transformed_X = X[self.columns_names].copy()
        for c in transformed_X.columns:
            transformed_X[c] = (transformed_X[[c]]
                               .merge(self.learned_values[c], on = c, how = 'left')
                               )["__target__"]
            transformed_X = transformed_X.fillna(self.dataset_mean)
            return transformed_X
        
    def fit_transform(self, X, y, **fit_params):
        self.fit(X,y)
        return self.transform(X)
    

In [18]:
skf = StratifiedKFold(n_splits=5, random_state=5451, shuffle=True)
te = TargetEncoding(columns_names=col4train)
X_tr = te.fit_transform(train, y).values

scores = []
tr_scores = []
for train_index, test_index in skf.split(train, y):
    train_df, valid_df = X_tr[train_index], X_tr[test_index]
    train_y, valid_y = y[train_index], y[test_index]
    
    model = get_model()
    model.fit(train_df, train_y)
    
    predictions = model.predict_proba(valid_df)[:,1]
    scores.append(roc_auc_score(valid_y, predictions))
    
    train_preds = model.predict_proba(train_df)[:,1]
    tr_scores.append(roc_auc_score(train_y, train_preds))
    
print("Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}".format(
    np.mean(tr_scores), np.mean(scores), np.std(scores)))

Train AUC score: 1.0000 Valid AUC score: 0.9313, STD: 0.0035


### Target Encoding Smoothing