In [281]:
import numpy as np
import pandas as pd
import feature_engine as fe
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer

### feature-engine

Проверим, как работают функции для заполнения пропущенных данных из пакета feature-engine

In [282]:
def create_test_df():
    """
    Build the small demo frame used throughout the notebook.

    Returns a new 5-row DataFrame on every call with two numeric columns and
    one string column; each column contains exactly one missing value.
    """
    columns = {
        "SomeVariable": [1, 2, 4, 5, None],
        "OneMoreVariable": [12, 66, 12, None, 99],
        "SomeCategoricalVariable": ["Biba", "I", "Boba", "Dva", None],
    }
    return pd.DataFrame(columns)

Пробный датафрейм

In [283]:
# Build the demo frame and display it (last expression of the cell).
test_df = create_test_df()
test_df

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,,Dva
4,,99.0,


###### feature_engine.imputation.MeanMedianImputer(imputation_method='median', variables=None)
При наличии качественных переменных метод просто игнорирует их и заполняет только числовые.


###### feature_engine.imputation.CategoricalImputer(imputation_method='missing', fill_value='Missing', variables=None, return_object=False, ignore_format=False)
Заполняет качественные переменные значением по выбору или самым частым значением 

In [10]:
# Median imputation demo with variables=None; in the output below only the
# two numeric columns are filled, the string column keeps its gap.
test_df = create_test_df()
obj = MeanMedianImputer(imputation_method='median', variables=None)
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,39.0,Dva
4,3.0,99.0,


In [11]:
# Same demo with imputation_method='mean'; the string column is again left untouched
# (see the output below).
obj = MeanMedianImputer(imputation_method='mean', variables=None)
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,47.25,Dva
4,3.0,99.0,


In [285]:
# Constant imputation demo; in the output below only the string column's gap
# is replaced by fill_value, the numeric gaps stay missing.
obj = CategoricalImputer(fill_value = "Data Scientista")
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,,Dva
4,,99.0,Data Scientista


### Homebrew
А этот вариант я успел написать до того, как Антон скинул готовое решение. Я заглянул в сурсы feature-engine - они делают плюс-минус то же самое, только у меня работа с категориальными переменными при наличии желания добавляется в тот же класс. А еще у меня есть мода, а у них нет. 

In [277]:
class missing_filler_category():

    """
    Transformer that replaces every missing value in a table, whatever the
    column, with one constant.
    params::filling_category the constant written in place of missing values
    """

    def __init__(self, filling_category="filler"):
        self.filling_category = filling_category

    def fit(self, x, y=None):
        """Nothing is learned; a copy of ``x`` is kept on ``self.x`` and ``self`` is returned."""
        self.x = x.copy()
        return self

    def transform(self, x, y=None):
        """
        Return a filled copy of ``x``; the input frame itself is not modified.

        The returned frame is also stored on ``self.x``.
        """
        self.x = x.copy().fillna(self.filling_category)
        return self.x
    
    
class missing_filler_mean():

    """
    Fills missing values in numerical columns with the column mean and missing
    values in the declared categorical columns with a special category.
    The means are learned in ``fit`` and re-used by ``transform``. If you do not
    wish to fill categorical columns at all, just do not pass "categorical_variables".
    params::categorical_variables column name or list of column names to treat as categorical
    params::filling_category a value to fill the missing values in categorical variables
    params::y unused; accepted only for backward compatibility
    """

    def __init__(self, categorical_variables=None, filling_category="filler", y=None):
        # A fresh list per instance: a shared mutable default would be aliased
        # by every instance created without this argument.
        self.categorical_variables = [] if categorical_variables is None else categorical_variables
        self.filling_category = filling_category

    def _categorical_columns(self):
        """Return the declared categorical columns as a list (a single name is wrapped)."""
        declared = self.categorical_variables
        if isinstance(declared, str):
            return [declared] if declared else []
        return list(declared)

    def fit(self, x, y=None):
        """Learn the per-column means of ``x``; returns ``self``."""
        # numeric_only=True: without it pandas >= 2.0 raises TypeError as soon as
        # the frame contains a string column.
        self.mean_to_fill = x.mean(numeric_only=True)
        return self

    def transform(self, x, y=None):
        """
        Return a filled copy of ``x`` (the input is not modified).

        Raises AttributeError if ``fit`` has not been called.
        """
        filled_table = x.copy()
        categorical_columns = self._categorical_columns()
        if categorical_columns:
            # Categorical columns are filled first so that the mean fill below
            # never touches them, even when they hold numbers.
            filled_table[categorical_columns] = \
                filled_table[categorical_columns].fillna(self.filling_category)

        # fillna with a Series fills column by column, matching on column name.
        return filled_table.fillna(self.mean_to_fill)
    
    
    
class missing_filler_median():

    """
    Fills missing values in numerical columns with the column median and missing
    values in the declared categorical columns with a special category.
    The medians are learned in ``fit`` and re-used by ``transform``. If you do not
    wish to fill categorical columns at all, just do not pass "categorical_variables".
    params::categorical_variables column name or list of column names to treat as categorical
    params::filling_category a value to fill the missing values in categorical variables
    """

    def __init__(self, categorical_variables=None, filling_category="filler"):
        # A fresh list per instance: a shared mutable default would be aliased
        # by every instance created without this argument.
        self.categorical_variables = [] if categorical_variables is None else categorical_variables
        self.filling_category = filling_category

    def _categorical_columns(self):
        """Return the declared categorical columns as a list (a single name is wrapped)."""
        declared = self.categorical_variables
        if isinstance(declared, str):
            return [declared] if declared else []
        return list(declared)

    def fit(self, x, y=None):
        """Learn the per-column medians of ``x``; returns ``self``."""
        # numeric_only=True: without it pandas >= 2.0 raises TypeError as soon as
        # the frame contains a string column.
        self.median_to_fill = x.median(numeric_only=True)
        return self

    def transform(self, x, y=None):
        """
        Return a filled copy of ``x`` (the input is not modified).

        Raises AttributeError if ``fit`` has not been called.
        """
        filled_table = x.copy()
        categorical_columns = self._categorical_columns()
        if categorical_columns:
            # Categorical columns are filled first so that the median fill below
            # never touches them, even when they hold numbers.
            filled_table[categorical_columns] = \
                filled_table[categorical_columns].fillna(self.filling_category)

        # fillna with a Series fills column by column, matching on column name.
        return filled_table.fillna(self.median_to_fill)
    
    
class missing_filler_mode():

    """
    Fills missing values with the column mode. If a column has several modes,
    the behaviour is the following:
    1) For the declared categorical variables, the first of the modes is used
    2) For numerical variables, the mean of the modes is used
    Non-numerical columns that are not declared categorical are left untouched.

    The fill values are learned in ``fit`` and re-used by ``transform``. If
    ``transform`` is called on an instance that was never fitted, the values are
    computed from the frame being transformed.

    params::categorical_variables column name or list of column names to treat as categorical
    params::filling_category unused; accepted only for backward compatibility
    """

    def __init__(self, categorical_variables=None, filling_category="filler"):
        self.filling_category = filling_category
        # A fresh list per instance: a shared mutable default would be aliased
        # by every instance created without this argument.
        self.categorical_variables = [] if categorical_variables is None else categorical_variables

    def _categorical_columns(self):
        """Return the declared categorical columns as a list (a single name is wrapped)."""
        declared = self.categorical_variables
        if isinstance(declared, str):
            return [declared] if declared else []
        return list(declared)

    def _modes_to_fill(self, x):
        """Return a {column: fill value} dict computed from the modes of ``x``."""
        categorical_columns = self._categorical_columns()
        unknown = [column for column in categorical_columns if column not in x.columns]
        if unknown:
            raise KeyError(f"categorical variables not found in the data: {unknown}")

        fill_values = {}
        for column in x.columns:
            modes = x[column].mode()
            if modes.empty:
                # The column holds nothing but missing values: no mode to fill with.
                continue
            if column in categorical_columns:
                fill_values[column] = modes.iloc[0]
            elif pd.api.types.is_numeric_dtype(modes):
                fill_values[column] = modes.mean()
        return fill_values

    def fit(self, x, y=None):
        """Learn the per-column fill values from ``x``; returns ``self``."""
        self.mode_to_fill = self._modes_to_fill(x)
        return self

    def transform(self, x, y=None):
        """Return a filled copy of ``x`` (the input is not modified)."""
        fill_values = getattr(self, "mode_to_fill", None)
        if fill_values is None:
            # Never fitted: fall back to values computed from ``x`` itself.
            fill_values = self._modes_to_fill(x)

        table_to_fill = x.copy()
        for column, value in fill_values.items():
            if column in table_to_fill.columns:
                table_to_fill[column] = table_to_fill[column].fillna(value)
        return table_to_fill

Проверим, как работает

In [273]:
# Constant filler demo: every gap, in numeric columns too, receives the same
# value (see the output below).
test_df = create_test_df()
obj = missing_filler_category( "Grazhdanina")
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,Grazhdanina,Dva
4,Grazhdanina,99.0,Grazhdanina


In [278]:
# The mean filler is run on a fresh demo frame for each way of declaring the
# categorical columns: a single name, a list of names, and the single name again.
mean_demo_configs = (
    "SomeCategoricalVariable",
    ["OneMoreVariable", "SomeCategoricalVariable"],
    "SomeCategoricalVariable",
)
for categorical_variables in mean_demo_configs:
    test_df = create_test_df()
    obj = missing_filler_mean(categorical_variables, "Grazhdanina")
    obj.fit(test_df)
    print(obj.transform(test_df))

   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0            12.00                    Biba
1           2.0            66.00                       I
2           4.0            12.00                    Boba
3           5.0            47.25                     Dva
4           3.0            99.00             Grazhdanina
   SomeVariable OneMoreVariable SomeCategoricalVariable
0           1.0            12.0                    Biba
1           2.0            66.0                       I
2           4.0            12.0                    Boba
3           5.0     Grazhdanina                     Dva
4           3.0            99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0            12.00                    Biba
1           2.0            66.00                       I
2           4.0            12.00                    Boba
3           5.0            47.25                     Dva
4           3.0            99.00     

In [279]:
# Every configuration gets a freshly built frame, so the cell does not rely on
# a `test_df` left over from a previously executed cell.
median_demo_configs = (
    ["SomeCategoricalVariable", "OneMoreVariable"],
    ["SomeCategoricalVariable"],
    "SomeCategoricalVariable",
)
for categorical_variables in median_demo_configs:
    test_df = create_test_df()
    obj = missing_filler_median(categorical_variables, "Grazhdanina")
    obj.fit(test_df)
    print(obj.transform(test_df))

   SomeVariable OneMoreVariable SomeCategoricalVariable
0           1.0            12.0                    Biba
1           2.0            66.0                       I
2           4.0            12.0                    Boba
3           5.0     Grazhdanina                     Dva
4           3.0            99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             39.0                     Dva
4           3.0             99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             39.0                     Dva
4           3.0             99.0     

In [280]:
# Every configuration gets a freshly built frame, so the cell does not rely on
# a `test_df` left over from a previously executed cell.
mode_demo_configs = (
    ["SomeCategoricalVariable"],
    ["SomeCategoricalVariable", "OneMoreVariable"],
    "SomeCategoricalVariable",
)
for categorical_variables in mode_demo_configs:
    test_df = create_test_df()
    obj = missing_filler_mode(categorical_variables, "Grazhdanina")
    obj.fit(test_df)
    print(obj.transform(test_df))

   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.0                    Biba
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.0                    Biba
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.

#### Проверим, как работает в пайплайне

##### Выводы:
1) Обратите внимание: WoE не работает, если для какой-либо категории категориальной переменной нет наблюдений хотя бы в одном из классов таргета.

In [143]:
import time
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, metrics, model_selection
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from hyperopt import hp
# from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# for HyperOpt class
import lightgbm as lgb
import xgboost as xgb
# import catboost as ctb
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [137]:
# новый пакет!
from feature_engine.encoding import WoEEncoder
from feature_engine.creation import CombineWithReferenceFeature
from feature_engine.selection import RecursiveFeatureAddition

In [136]:
from sklearn.pipeline import Pipeline

In [357]:
# Load the pre-split train/test samples from parquet files (paths are relative
# to the notebook). The labels are the `target` column of the y_* files.
X_train = pd.read_parquet('../datasets/15_hmeq/samples/X_train.parquet')
X_test  = pd.read_parquet('../datasets/15_hmeq/samples/X_test.parquet')
y_train = pd.read_parquet('../datasets/15_hmeq/samples/y_train.parquet').target
y_test  = pd.read_parquet('../datasets/15_hmeq/samples/y_test.parquet').target

In [358]:
# Read the factor description; the cells below use its 'cat_vals' and
# 'num_vals' lists of column names.
with open('../datasets/15_hmeq/factors.json') as json_file:
    factors_dict = json.load(json_file)

# NOTE(review): the bare expression below has no effect in the middle of a cell.
factors_dict['cat_vals']
# 'target' is dropped from the categorical list; list.remove raises ValueError
# if it is not there.
factors_dict['cat_vals'].remove('target')
# Seed passed to the LightGBM model further down.
seed = 42

def Gini(y, y_pred):
    """
    Compute the Gini coefficient as ``2 * ROC AUC - 1``, print it and return it.

    y: true labels, passed to ``roc_auc_score`` as is.
    y_pred: predicted scores, passed to ``roc_auc_score`` as is.
    """
    gini = 2 * roc_auc_score(y, y_pred) - 1
    print(f"Gini: {gini}")
    return gini



In [359]:
# Display the factor lists ('target' has been removed from cat_vals by the previous cell).
factors_dict

{'cat_vals': ['REASON', 'JOB', 'DEROG', 'DELINQ'],
 'num_vals': ['LOAN',
  'MORTDUE',
  'VALUE',
  'YOJ',
  'CLAGE',
  'NINQ',
  'CLNO',
  'DEBTINC']}

In [360]:
# Column dtypes of the training features.
X_train.dtypes

LOAN         int64
MORTDUE    float64
VALUE      float64
REASON      object
JOB         object
YOJ        float64
DEROG       object
DELINQ      object
CLAGE      float64
NINQ       float64
CLNO       float64
DEBTINC    float64
dtype: object

In [361]:
# Per-DELINQ totals of the numeric columns together with the shifted target.
# The target is shifted by -1 before summing, so (assuming a 0/1 target - TODO
# confirm) the `y` column is minus the number of target-0 rows in each group.
X_train_y = X_train.assign(y=y_train - 1)
# numeric_only=True keeps the object columns out of the sum; recent pandas
# versions no longer drop them silently.
X_train_y.groupby("DELINQ").sum(numeric_only=True)

Unnamed: 0_level_0,LOAN,MORTDUE,VALUE,YOJ,CLAGE,NINQ,CLNO,DEBTINC,y
DELINQ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,63486500,230916500.0,348124600.0,28409.55,592117.412058,3662.0,68449.0,94453.65856,-2895
1.0,9922300,35483110.0,48307370.0,4449.6,92881.655166,718.0,12159.0,12045.029674,-347
2.0,3240900,13812320.0,16800470.0,1860.8,34267.129584,245.0,4786.0,4651.663149,-107
3.0,1510100,6228849.0,9550298.0,925.3,17243.198925,174.0,2499.0,1756.372258,-47
4.0,1191700,5718352.0,6958213.0,551.2,13603.115863,86.0,1975.0,1068.35861,-26
5.0,594800,2377663.0,3262527.0,345.3,6977.89348,41.0,880.0,325.275801,-6


feature_engine работает в пайплайне:

In [362]:
# Pipeline steps. WoE encoding is applied to the columns listed in factors_dict['cat_vals'].
woe = WoEEncoder(variables = factors_dict['cat_vals'])
# Every column of X_train is used both as a variable to combine and as a
# reference variable, with the single operation 'mul'.
feat_eng = CombineWithReferenceFeature(
    variables_to_combine = list(X_train.columns),
    reference_variables = list(X_train.columns),
    operations = ['mul']
)
# Mean imputation with variables=None, followed by a constant label for the
# categorical gaps.
missing_impute = MeanMedianImputer(
    imputation_method='mean', variables=None
)
missing_cat = CategoricalImputer(
    fill_value = "Data Scientista"
)
lgbm_mdl = LGBMClassifier(
    num_leaves = 10,
    learning_rate = .1,
    reg_alpha = 8,
    reg_lambda = 8,
    random_state = seed
)
# NOTE(review): the same lgbm_mdl object is handed to the feature selector and
# used as the final pipeline step - confirm the selector works on its own copy.
feat_sel = RecursiveFeatureAddition(
    lgbm_mdl,
    threshold = 0.005
)

# Order of the steps: both imputers run before the WoE encoder, then feature
# combination, feature selection and the classifier.
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('impute_missing_categorical', missing_cat),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)

print("With imputation:")
mdl_pipe_impute.fit(X_train, y_train)
# Gini on the training sample, then on the test sample; column 1 of
# predict_proba is used as the score.
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])


With imputation:
Gini: 0.8875928618436604
Gini: 0.8190815450643776


0.8190815450643776

Доморощенная версия тоже работает в пайплайне и дает такие же результаты: 

In [367]:
# Same pipeline with the home-made mean filler as the only imputation step.
# woe, feat_eng, feat_sel and lgbm_mdl are not defined in this cell: they must
# already exist in the kernel.
missing_impute = missing_filler_mean( 
                                     filling_category = "missing",
                                     categorical_variables = factors_dict["cat_vals"])
print("With imputation:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)
mdl_pipe_impute.fit(X_train, y_train)
# Gini on the training sample, then on the test sample.
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])

# Second run: identical steps, with the median filler instead of the mean one.
missing_impute = missing_filler_median( 
                                     filling_category = "missing",
                                     categorical_variables = factors_dict["cat_vals"])
print("With imputation, median:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

# Last expression of the cell: its return value is shown as the cell output.
Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])

With imputation:
Gini: 0.8875928618436604
Gini: 0.8190815450643776
With imputation, median:
Gini: 0.868677634424833
Gini: 0.8029957081545067


0.8029957081545067

## Выводы:

1. Версии feature_engine и моя работают в пайплайне, дают одинаковые результаты
2. В классах собственного изготовления метод fit должен принимать датасет в качестве аргумента
3. WoE не принимает датасет, в котором хотя бы у одной категории какой-либо категориальной переменной отсутствуют наблюдения хотя бы одного из классов таргета