# Mean target encoding

1) Split train on n-folds; (Next steps will be described for fold1 and fold2 as trainset and fold3 as validation. For other combinations everything is the same)

2) Split new train (fold1 + fold2) on n-folds one more time;

![image](../pictures/mean_encoding.png)

3) Calculate mean for each categorical feature on subfold1 and subfold2, then replace categorical value on subfold3. Then repeat the same for different combinations of folds.

![image](../pictures/mean_encoding2.png)

4) Calculate mean for each categorical feature on the whole new train (fold1 + fold2) and replace each categorical value on fold3.

![image](../pictures/mean_encoding3.png)

5) Perform 2-4 steps for each validation combination.

6) Now you have new datasets for each validation combination.

![image](../pictures/mean_encoding4.png)

7) Calculate mean for each categorical feature on the whole train (fold1 + fold2 + fold3) and replace each categorical value on test.

8) Enjoy :)

![image](../pictures/mean_encoding5.png)

Some practical advices by **Stas Semenov** and how he applied this approach on BNP Paribas Competition: https://www.youtube.com/watch?v=g335THJxkto

In [1]:
import warnings

import numpy as np
import pandas as pd


class MeanTargetEncoding:
    def __init__(self, c=10):
        self.c = c
        self.global_mean = 0
        self.features = []
        self.values = dict()

    def fit(self, data, y, features="all"):
        if features == "all":
            self.features = sorted([i for i in data.columns if data[i].dtype == "O"])
        else:
            assert all(feature in data.columns for feature in features)
            self.features = features

        self.global_mean = np.mean(y)

        f = {"y": ["size", "mean"]}

        for col in self.features:
            self.values[col] = dict()
            temp = pd.DataFrame({"y": y, col: data[col]}).groupby([col]).agg(f)

            self.values[col] = (
                    (temp["y"]["mean"] * temp["y"]["size"] + self.global_mean * self.c) /
                    (temp["y"]["size"] + self.c)
            ).to_dict()

        return self.values

    def fit_transform(self, data, y, features="all", inplace=True):

        self.fit(data, y, features)
        return self.transform(data, inplace=inplace)

    def transform(self, data, inplace=True):
        if not inplace:
            new_data = data.copy()
            self._apply_mean_encoding(new_data)
            return new_data
        self._apply_mean_encoding(data)

    def _apply_mean_encoding(self, data):
        for col in self.values:
            if col in data.columns:
                temp = pd.DataFrame.from_dict(
                    self.values[col], orient="index").reset_index()
                temp.columns = [col, "value"]
                data = pd.merge(data, temp, how="left").fillna(self.global_mean)
                data[col] = data["value"]
                del data["value"]
                data[col] = data[col].astype("float32")
            else:
                warnings.warn("Column " + col + " is missed in this dataset.")


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

In [3]:
data = pd.read_csv("../data/telecom_churn.csv")
y = data["Churn"].astype('int8')
data.drop(["Churn"], axis=1, inplace=True)
data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [4]:
data["International plan"].value_counts()

No     3010
Yes     323
Name: International plan, dtype: int64

In [5]:
# split data on train/test
train, test, y_train, y_test = train_test_split(
    data, 
    y, 
    test_size=0.2, 
    random_state=1, 
    stratify=y
)

print(train.shape, y_train.shape, test.shape, y_test.shape)

(2666, 19) (2666,) (667, 19) (667,)


In [6]:
# 1)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [7]:
def create_new_df_with_categorical_encodings(new_train, new_train_y, new_val, cols):
    se = MeanTargetEncoding()
    new_skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    val_dfs = []
    # 2)
    for new_train_split, new_val_split in new_skf.split(new_train, new_train_y):
        # 3)
        se.fit(
            new_train.iloc[new_train_split], 
            new_train_y.iloc[new_train_split], 
            features=cols
        )
        val_dfs.append(
            se.transform(new_train.iloc[new_val_split], inplace=False)
        )
    # 4)
    se.fit(new_train, new_train_y, features=cols)
    main_val = se.transform(new_val, inplace=False)
    return val_dfs, main_val

In [8]:
%%time

new_train_dfs = []
new_val_dfs = []
main_train_dfs = []

for train_split, val_split in skf.split(train, y_train): 
    # 5)
    temp_train_dfs, temp_val_df = create_new_df_with_categorical_encodings(
        train.iloc[train_split], 
        y_train.iloc[train_split], 
        train.iloc[val_split], 
        ["International plan"]
    )
    # 6)
    new_train_dfs.append(temp_train_dfs)
    new_val_dfs.append(temp_val_df)
    # 7)
    se = MeanTargetEncoding()
    se.fit(train.iloc[train_split], y.iloc[train_split], ["International plan"])
    main_train_dfs.append(
        se.transform(train.iloc[val_split], inplace=False)
    )
    
se.fit(train, y, features=["International plan"])
main_test = se.transform(test, inplace=False)

CPU times: user 220 ms, sys: 0 ns, total: 220 ms
Wall time: 219 ms


In [9]:
# check results
new_val_dfs[2]["International plan"].value_counts()

No     801
Yes     87
Name: International plan, dtype: int64

In [10]:
temp = pd.concat([train.iloc[train_split], y_train.iloc[train_split]], axis=1)
temp.groupby(["International plan"])["Churn"].mean()

International plan
No     0.114642
Yes    0.427746
Name: Churn, dtype: float64

In [11]:
temp.groupby(["International plan"])["Churn"].size()

International plan
No     1605
Yes     173
Name: Churn, dtype: int64

In [12]:
temp["Churn"].mean()

0.14510686164229472

In [13]:
(
    (0.114642 * 1605 + 10*0.14510686164229472) / (10 + 1605),
    (0.427746 * 173 + 10*0.14510686164229472) / (10 + 173)
)

(0.11483063691419378, 0.41230123834110893)

In [14]:
# append parts together
new_train_dfs = [pd.concat(i, axis=0) for i in new_train_dfs]
new_train_dfs[2].head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
672,IL,151,408,No,No,0,175.3,106,29.8,144.3,87,12.27,160.2,88,7.21,11.8,5,3.19,0
240,NJ,138,510,No,No,0,220.2,89,37.43,88.3,125,7.51,195.3,79,8.79,12.9,5,3.48,0
148,LA,121,408,No,No,0,181.5,121,30.86,218.4,98,18.56,161.6,103,7.27,8.5,5,2.3,1
881,GA,86,510,No,No,0,124.1,82,21.1,202.6,120,17.22,289.6,119,13.03,6.7,8,1.81,3
1233,IL,48,510,No,No,0,128.2,71,21.79,48.1,78,4.09,116.3,80,5.23,8.9,3,2.4,0


In [15]:
new_train_dfs[2]["International plan"].value_counts()

No     1605
Yes     173
Name: International plan, dtype: int64

In [16]:
new_val_dfs[2].head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
2465,IN,88,415,No,No,0,183.5,93,31.2,170.5,80,14.49,193.8,88,8.72,8.3,5,2.24,3
2062,ME,140,415,No,No,0,159.1,104,27.05,269.8,106,22.93,220.4,116,9.92,10.3,4,2.78,1
2604,MD,106,415,No,No,0,208.3,89,35.41,169.4,67,14.4,102.0,90,4.59,15.9,4,4.29,3
1462,CO,37,408,No,No,0,199.5,107,33.92,207.5,110,17.64,83.9,123,3.78,8.1,4,2.19,2
2318,WV,106,510,No,No,0,194.8,133,33.12,213.4,73,18.14,190.8,92,8.59,11.5,7,3.11,0


# Feature Interactions as Features

In [17]:
import xgbfir
import xgboost as xgb

In [18]:
data = pd.read_csv("../data/telecom_churn.csv")
y = data["Churn"].astype('int8')
data.drop(["Churn"], axis=1, inplace=True)

train_cols = [col for col in data.columns if data[col].dtype != 'O']

In [19]:
parameters = {
    #default
    'objective': 'reg:logistic',
    'eta': 0.1,
    'silent': 1,
    "nthread": -1,
    "random_seed": 1,
    "eval_metric": 'auc',
    
    # regularization parameters
    'max_leaves': 20,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    
    #lightgbm approach
    'tree_method': 'hist',
    'grow_policy': 'lossguide'
}

num_rounds = 10000

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
xgb_train = xgb.DMatrix(data[train_cols], y.values, feature_names=train_cols)

results = xgb.cv(
    parameters, 
    xgb_train, 
    num_rounds, 
    early_stopping_rounds=10,
    folds=skf, 
    verbose_eval=10
)

[0]	train-auc:0.832862+0.00472396	test-auc:0.807838+0.0305292
[10]	train-auc:0.920047+0.00706104	test-auc:0.875377+0.0158608
[20]	train-auc:0.9368+0.00316342	test-auc:0.880931+0.0104092
[30]	train-auc:0.954769+0.00407126	test-auc:0.882399+0.00904475


In [20]:
results.shape

(28, 4)

In [21]:
model = xgb.train(parameters, xgb_train, num_boost_round=30)

In [22]:
xgbfir.saveXgbFI(
    model, 
    feature_names=train_cols, 
    OutputXlsxFile="xgbfir_importance.xlsx"
)

## Importance metrics

<img src="https://raw.githubusercontent.com/Far0n/xgbfi/master/doc/ScoresExample_small.png">

In [23]:
train.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
672,IL,151,408,No,No,0,175.3,106,29.8,144.3,87,12.27,160.2,88,7.21,11.8,5,3.19,0
2465,IN,88,415,No,No,0,183.5,93,31.2,170.5,80,14.49,193.8,88,8.72,8.3,5,2.24,3
473,WV,72,510,No,Yes,33,96.6,59,16.42,315.4,98,26.81,163.3,117,7.35,6.2,4,1.67,4
2062,ME,140,415,No,No,0,159.1,104,27.05,269.8,106,22.93,220.4,116,9.92,10.3,4,2.78,1
2604,MD,106,415,No,No,0,208.3,89,35.41,169.4,67,14.4,102.0,90,4.59,15.9,4,4.29,3


In [24]:
data["Customer service calls|Total day calls"] = data["Total day calls"] / data["Customer service calls"]

In [25]:
train_cols = [col for col in data.columns if data[col].dtype != 'O']

In [26]:
parameters = {
    #default
    'objective': 'reg:logistic',
    'eta': 0.1,
    'silent': 1,
    "nthread": -1,
    "random_seed": 1,
    "eval_metric": 'auc',
    
    # regularization parameters
    'max_leaves': 20,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    
    #lightgbm approach
    'tree_method': 'hist',
    'grow_policy': 'lossguide'
}

xgb_train = xgb.DMatrix(data[train_cols], y.values, feature_names=train_cols)

results = xgb.cv(
    parameters, 
    xgb_train, 
    num_rounds, 
    early_stopping_rounds=10,
    folds=skf, 
    verbose_eval=10
)

[0]	train-auc:0.8331+0.00561297	test-auc:0.808334+0.0311378
[10]	train-auc:0.913019+0.00531814	test-auc:0.872306+0.0158523
[20]	train-auc:0.937659+0.00195639	test-auc:0.877066+0.0111481
[30]	train-auc:0.95554+0.00372294	test-auc:0.878916+0.00757684
[40]	train-auc:0.970289+0.00265366	test-auc:0.877866+0.00669572


In [28]:
results.tail()

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
33,0.959531,0.00345,0.880121,0.007309
34,0.960762,0.00351,0.880681,0.006965
35,0.961822,0.004567,0.880632,0.006483
36,0.963895,0.003681,0.88143,0.007258
37,0.965006,0.003348,0.881748,0.006707
