In [15]:
import numpy as np
import pandas as pd
import feature_engine as fe
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine import imputation
from sklearn.impute import KNNImputer

### feature-engine

Проверим, как работают [функции для заполнения пропущенных данных из пакета feature-engine](https://feature-engine.readthedocs.io/en/1.1.x/imputation/index.html)

In [3]:
def create_test_df():
    test_df = pd.DataFrame({"SomeVariable" : [1,2,4,5,None], 
                           "OneMoreVariable" : [12, 66, 12, None, 99], 
                           "SomeCategoricalVariable" : ["Biba", "I", "Boba", "Dva", None]})
    return test_df

Пробный дата-тейбл

In [3]:
test_df = create_test_df()
test_df

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,,Dva
4,,99.0,


###### feature_engine.imputation.MeanMedianImputer(imputation_method='median', variables=None)
При наличии качественных переменных, метод их просто игнорирует и заполняет числовые.


###### feature_engine.imputation.CategoricalImputer(imputation_method='missing', fill_value='Missing', variables=None, return_object=False, ignore_format=False)
Заполняет качественные переменные значением по выбору или самым частым значением 

In [4]:
test_df = create_test_df()
obj = MeanMedianImputer(imputation_method='median', variables=None)
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,39.0,Dva
4,3.0,99.0,


In [5]:
obj = MeanMedianImputer(imputation_method='mean', variables=None)
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,47.25,Dva
4,3.0,99.0,


In [6]:
obj = CategoricalImputer(fill_value = "Data Scientista")
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,,Dva
4,,99.0,Data Scientista


##### feature_engine.imputation.ArbitraryNumberImputer(arbitrary_number=999, variables=None, imputer_dict=None)
Заполняет пропуски числом по выбору
##### feature_engine.imputation.RandomSampleImputer(random_state=None, seed='general', seeding_method='add', variables=None)
Заполняет пропуски случайным числом из уже имеющихся в датасете

!Обратить внимание: CategoricalImputer заполняет только категориальные переменные, а ArbitraryNumberImputer - только количественные

In [15]:
obj = imputation.ArbitraryNumberImputer()
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,999.0,Dva
4,999.0,99.0,


In [51]:
obj = imputation.RandomSampleImputer()
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,66.0,Dva
4,4.0,99.0,Dva


### sklearn.impute

Проверим, как работают [функции для заполнения пропущенных данных из пакета sklearn.impute](https://feature-engine.readthedocs.io/en/1.1.x/imputation/index.html)

##### sklearn.impute.KNNImputer(*, missing_values=nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False)

Заполняет пропуски с помощью KNN. Можно выбрать, чем заполнены пропуски, с помощью missing_values (по дефолту - любые формы nan). Есть гиперпараметры для оптимизации - количественный n_neighbours и качественные metric и weights. 

In [2]:
def create_test_df(nans = True):
    x1 = np.random.normal(size = 10000)
    x2 = np.random.normal(size = 10000)*x1
    x3 = np.random.normal(size = 10000) + x2
    if nans == True:
        x4 = np.random.choice(a = [np.random.normal(), None], size = 10000)
        xcat = np.random.choice(a = ["a", "b", "c", "d", None], size = 10000)
    else:
        x4 = np.random.choice(a = [np.random.normal()], size = 10000)
        xcat = np.random.choice(a = ["a", "b", "c", "d", "e"], size = 10000)
    target = np.random.choice(a = [0,1], size = 10000)

    test_df = pd.DataFrame({"x1" : x1,
                           "x2" : x2,
                           "x3" : x3,
                           "x4" : x4,
                            "xcat" : xcat,
                            "target" : target
                           } )
    return test_df

Сразу же обнаруживаем, что функция не умеет игнорировать качественные переменные. Потом для этого и других багов напишем дочерний класс.

In [5]:
obj = impute.KNNImputer()
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

ValueError: could not convert string to float: 'd'

А еще он возвращает 2-d array, теряя при этом названия столбцов

In [76]:
obj = impute.KNNImputer()
test_df = create_test_df()
test_df = test_df.drop("xcat", 1)
obj.fit(test_df)
obj.transform(test_df)

array([[ 0.4905574 , -0.26862924, -0.72125618, -0.18686868,  1.        ],
       [-1.39761923, -0.21595223, -0.54791356, -0.18686868,  1.        ],
       [-0.47396901, -0.40941988,  0.86991798, -0.18686868,  1.        ],
       ...,
       [ 0.12021848, -0.10752047,  0.61437338, -0.18686868,  0.        ],
       [-0.37652069, -0.25623558, -1.34070241, -0.18686868,  1.        ],
       [ 1.14568947,  0.31375386, -0.22497819, -0.18686868,  1.        ]])

Приведем в божеский вид

In [74]:
transformed_table = obj.transform(test_df)
transformed_table = pd.DataFrame(transformed_table)
transformed_table.columns = test_df.columns
transformed_table

Unnamed: 0,x1,x2,x3,x4,target
0,-2.296535,1.614464,1.808018,-0.819636,0.0
1,0.330350,0.170367,-0.818577,-0.819636,0.0
2,0.492528,0.514476,-0.653851,-0.819636,0.0
3,-0.534076,-0.786875,-0.222213,-0.819636,1.0
4,0.543864,-0.280548,-1.401941,-0.819636,1.0
...,...,...,...,...,...
9995,-0.311699,-0.178914,-1.073968,-0.819636,1.0
9996,2.646492,-3.146069,-2.097949,-0.819636,0.0
9997,-0.195642,0.120157,-0.319205,-0.819636,1.0
9998,0.849675,-1.534753,-1.640004,-0.819636,1.0


Теперь сделаем класс, который будет это делать сразу, чтобы можно было его не напрягаясь встроить в пайплайн:

In [110]:
class SeparatedDF():
    def __init__(self, X, categorical_variables = []):
        self.X_numeric = X.drop(categorical_variables, 1)
        self.X_categorical = X.copy()[categorical_variables]
        
class KNNImputerSeparated(KNNImputer):
    
    def __init__(self, categorical_variables, *,
                          missing_values=np.nan,
                          n_neighbors=5, weights='uniform',
                          metric='nan_euclidean', copy=True,
                          add_indicator=False):
        
        super().__init__(missing_values = missing_values,
                          n_neighbors = n_neighbors,
                          weights = weights,
                          metric = metric,
                          copy = copy,
                          add_indicator = add_indicator)
        
        self.categorical_variables = categorical_variables
        self.obj = KNNImputer()

    def fit(self, X, y = None):
        
        df = SeparatedDF(X, self.categorical_variables)
        self.obj.fit(df.X_numeric)
        
        return self
        
    def transform(self, X, y = None):
        
        df = SeparatedDF(X, self.categorical_variables)
        
        fitted_df = self.obj.transform(df.X_numeric)
        fitted_df = pd.DataFrame(fitted_df)
        fitted_df.columns = df.X_numeric.columns
        fitted_df = pd.concat([fitted_df, df.X_categorical], axis = 1)
        
        return fitted_df


In [75]:
obj = KNNImputerSeparated(categorical_variables = ["xcat"], n_neighbors = 6)
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,x1,x2,x3,x4,target,xcat
0,1.630365,-1.364086,-1.705694,0.447851,0.0,c
1,1.239528,-0.298811,-0.656311,0.447851,1.0,a
2,1.899336,-0.690572,-0.119609,0.447851,0.0,c
3,0.359527,-0.163658,-0.306700,0.447851,1.0,d
4,0.618871,-0.587284,-0.329049,0.447851,0.0,d
...,...,...,...,...,...,...
9995,-2.234815,5.456012,5.963234,0.447851,1.0,a
9996,0.967139,0.010661,-1.388447,0.447851,1.0,d
9997,1.081831,-0.306719,-1.596662,0.447851,1.0,b
9998,0.257836,-0.285029,1.461673,0.447851,1.0,


Проверим, что дочерний класс дает те же значения для численных столбцов:

In [91]:
test_df = create_test_df()

obj = KNNImputerSeparated(['xcat'])
obj.fit(test_df)
child_fitted = obj.transform(test_df)
child_fitted = child_fitted.drop("xcat", axis = 1)

obj = KNNImputer()
test_df_local = test_df.drop("xcat",1)
obj.fit(test_df_local)
parent_fitted = obj.transform(test_df_local)
parent_fitted = pd.DataFrame(parent_fitted,
                             columns = child_fitted.columns)
assert child_fitted.equals(parent_fitted)

### Homebrewn
А этот вариант я успел написать до того, как Антон скинул готовое решение. Я заглянул в сурсы feature-engine - они делают плюс-минус то же самое, только у меня работа с категориальными переменными при наличии желания добавляется в тот же класс. А еще у меня есть мода, а у них нет. 

In [277]:
class missing_filler_category():
    
    """
    Returns a table with all the missing values filled with special category.
    params::x DataFrame with the data
    params::filling_category a value to fill the missing values
    """
    
    def __init__(self,  filling_category = "filler"):
        self.filling_category = filling_category
        
    def fit(self, x, y = None):
        self.x = x.copy()
        return self
    
    def transform(self, x, y = None):
        self.x = x.copy()
        self.x.fillna(self.filling_category, inplace = True)
        filled_table = self.x
        return filled_table
    
    
class missing_filler_mean():
    
    """
    Returns a table with all the missing values in numerical columns filled with mean and all the 
    missing values in categorical columns filled with special category. If you do not wish to fill
    categorical at all, just do not pass "categorical_variables" parameter.
    params::x DataFrame with the data
    params::categorical_variables list of categorical varaibles
    params::filling_category a value to fill the missing values in categorical variables
    """
    
    def __init__(self, categorical_variables = [], filling_category = "filler", y = None):
        self.categorical_variables = categorical_variables
        self.filling_category = filling_category
        
    def fit(self, x, y= None):
        self.x = x.copy()
        self.mean_to_fill = self.x.mean()
        return self
    
    def transform(self, x, y= None):
        self.x = x.copy()
        filled_table = self.x.copy()
        if len(self.categorical_variables) != 0:
            filled_table[self.categorical_variables] = \
            filled_table[self.categorical_variables].fillna(self.filling_category)
            
        filled_table = filled_table.fillna(self.mean_to_fill)

        return filled_table
    
    
    
class missing_filler_median():
    
    """
    Returns a table with all the missing values in numerical columns filled with median and all the 
    missing values in categorical columns filled with special category. If you do not wish to fill
    categorical at all, just do not pass "categorical_variables" parameter.
    params::x DataFrame with the data
    params::categorical_variables list of categorical varaibles
    params::filling_category a value to fill the missing values in categorical variables
    """
    
    def __init__(self, categorical_variables = [], filling_category = "filler"):

        self.categorical_variables = categorical_variables

        self.filling_category = filling_category
        
    def fit(self, x, y = None):
        self.x = x.copy()
        self.median_to_fill = self.x.median()
        return self
    
    def transform(self, x, y = None):
        self.x = x.copy()
        filled_table = self.x.copy()
        
        if len(self.categorical_variables) != 0:
            
            filled_table[self.categorical_variables] = \
            filled_table[self.categorical_variables].fillna(self.filling_category)
            
        filled_table = \
        filled_table.fillna(self.median_to_fill)
            
        return filled_table
    
    
class missing_filler_mode():
    
    """
    Returns a table with all the missing values in numerical columns filled with mode. If there are several 
    modes, the behaviour is the foolowing:
    1) For categorical variables from the input, the first element of list of modes is used
    2) For numerical variables, the mean of modes is used
    
    params::x DataFrame with the data
    params::categorical_variables list of categorical varaibles
    """
    
    def __init__(self,categorical_variables = [], filling_category = "filler"):
        self.filling_category = filling_category
        self.categorical_variables = categorical_variables

    def fit(self, x, y = None):
        return self
    
    def transform(self, x, y = None):
        self.x = x.copy()
        if (type(self.categorical_variables) == list):
            self.non_categorical_variables = \
            list(set(self.categorical_variables).symmetric_difference(list(x.columns)))
        else:
            all_variables = list(self.x.columns).copy()
            all_variables.remove(self.categorical_variables)
            self.non_categorical_variables = \
            all_variables
            
        table_to_fill = self.x.copy()
        categorical_table = table_to_fill[self.categorical_variables].copy()
        non_categorical_table = table_to_fill[self.non_categorical_variables].copy()

        if len(self.categorical_variables) == 0:
            categorical_table = pd.DataFrame(index = non_categorical_table.index())
        
        table_to_fill[self.categorical_variables] = \
        categorical_table.fillna(categorical_table.mode().loc[0])
        
        table_to_fill[self.non_categorical_variables] = \
        non_categorical_table.fillna(non_categorical_table.mode().mean())
        
        return table_to_fill

Проверим, как работает

In [273]:
test_df = create_test_df()
obj = missing_filler_category( "Grazhdanina")
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,Grazhdanina,Dva
4,Grazhdanina,99.0,Grazhdanina


In [278]:
test_df = create_test_df()
obj = missing_filler_mean("SomeCategoricalVariable", "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_mean(["OneMoreVariable","SomeCategoricalVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_mean("SomeCategoricalVariable", "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0            12.00                    Biba
1           2.0            66.00                       I
2           4.0            12.00                    Boba
3           5.0            47.25                     Dva
4           3.0            99.00             Grazhdanina
   SomeVariable OneMoreVariable SomeCategoricalVariable
0           1.0            12.0                    Biba
1           2.0            66.0                       I
2           4.0            12.0                    Boba
3           5.0     Grazhdanina                     Dva
4           3.0            99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0            12.00                    Biba
1           2.0            66.00                       I
2           4.0            12.00                    Boba
3           5.0            47.25                     Dva
4           3.0            99.00     

In [279]:
obj = missing_filler_median(["SomeCategoricalVariable", "OneMoreVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_median(["SomeCategoricalVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_median("SomeCategoricalVariable", "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

   SomeVariable OneMoreVariable SomeCategoricalVariable
0           1.0            12.0                    Biba
1           2.0            66.0                       I
2           4.0            12.0                    Boba
3           5.0     Grazhdanina                     Dva
4           3.0            99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             39.0                     Dva
4           3.0             99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             39.0                     Dva
4           3.0             99.0     

In [280]:
obj = missing_filler_mode(["SomeCategoricalVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_mode(["SomeCategoricalVariable", "OneMoreVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_mode("SomeCategoricalVariable", "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.0                    Biba
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.0                    Biba
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.

#### Проверим, как работает в пайплайне

##### Выводы:
1) Обратить внимание, что woe не работает, когда для одного из классов категориальной переменной нет наблюдений в одном из классов таргета

In [92]:
import time
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, metrics, model_selection
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from hyperopt import hp
# from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# for HyperOpt class
import lightgbm as lgb
import xgboost as xgb
# import catboost as ctb
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [93]:
# новый пакет!
from feature_engine.encoding import WoEEncoder
from feature_engine.creation import CombineWithReferenceFeature
from feature_engine.selection import RecursiveFeatureAddition

In [94]:
from sklearn.pipeline import Pipeline

In [95]:
X_train = pd.read_parquet('../datasets/15_hmeq/samples/X_train.parquet')
X_test  = pd.read_parquet('../datasets/15_hmeq/samples/X_test.parquet')
y_train = pd.read_parquet('../datasets/15_hmeq/samples/y_train.parquet').target
y_test  = pd.read_parquet('../datasets/15_hmeq/samples/y_test.parquet').target

In [96]:
with open('../datasets/15_hmeq/factors.json') as json_file:
    factors_dict = json.load(json_file)

factors_dict['cat_vals']
factors_dict['cat_vals'].remove('target')
seed = 42

def Gini(y, y_pred):
    res = roc_auc_score(y, y_pred) * 2 - 1
    print(f"Gini: {res}")
    return(res)



In [97]:
factors_dict

{'cat_vals': ['REASON', 'JOB', 'DEROG', 'DELINQ'],
 'num_vals': ['LOAN',
  'MORTDUE',
  'VALUE',
  'YOJ',
  'CLAGE',
  'NINQ',
  'CLNO',
  'DEBTINC']}

In [360]:
X_train.dtypes

LOAN         int64
MORTDUE    float64
VALUE      float64
REASON      object
JOB         object
YOJ        float64
DEROG       object
DELINQ      object
CLAGE      float64
NINQ       float64
CLNO       float64
DEBTINC    float64
dtype: object

In [98]:
X_train_y = X_train.copy()
X_train_y["y"] = y_train
X_train_y.y = X_train_y.y - 1
X_train_y.groupby("DELINQ").sum()

Unnamed: 0_level_0,LOAN,MORTDUE,VALUE,YOJ,CLAGE,NINQ,CLNO,DEBTINC,y
DELINQ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,63486500,230916500.0,348124600.0,28409.55,592117.412058,3662.0,68449.0,94453.65856,-2895
1.0,9922300,35483110.0,48307370.0,4449.6,92881.655166,718.0,12159.0,12045.029674,-347
2.0,3240900,13812320.0,16800470.0,1860.8,34267.129584,245.0,4786.0,4651.663149,-107
3.0,1510100,6228849.0,9550298.0,925.3,17243.198925,174.0,2499.0,1756.372258,-47
4.0,1191700,5718352.0,6958213.0,551.2,13603.115863,86.0,1975.0,1068.35861,-26
5.0,594800,2377663.0,3262527.0,345.3,6977.89348,41.0,880.0,325.275801,-6


feature_engine работает в пайплайне:

In [100]:
woe = WoEEncoder(variables = factors_dict['cat_vals'])
feat_eng = CombineWithReferenceFeature(
    variables_to_combine = list(X_train.columns),
    reference_variables = list(X_train.columns),
    operations = ['mul']
)
missing_impute = MeanMedianImputer(
    imputation_method='mean', variables=None
)
missing_cat = CategoricalImputer(
    fill_value = "Data Scientista"
)
lgbm_mdl = LGBMClassifier(
    num_leaves = 10,
    learning_rate = .1,
    reg_alpha = 8,
    reg_lambda = 8,
    random_state = seed
)
feat_sel = RecursiveFeatureAddition(
    lgbm_mdl,
    threshold = 0.005
)

mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('impute_missing_categorical', missing_cat),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)

print("With imputation:")
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])


With imputation:
Gini: 0.8875928618436604
Gini: 0.8190815450643776


0.8190815450643776

Доморощенная версия тоже работает в пайплайне и дает такие же результаты: 

In [367]:
missing_impute = missing_filler_mean( 
                                     filling_category = "missing",
                                     categorical_variables = factors_dict["cat_vals"])
print("With imputation:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])

missing_impute = missing_filler_median( 
                                     filling_category = "missing",
                                     categorical_variables = factors_dict["cat_vals"])
print("With imputation, median:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])

With imputation:
Gini: 0.8875928618436604
Gini: 0.8190815450643776
With imputation, median:
Gini: 0.868677634424833
Gini: 0.8029957081545067


0.8029957081545067

KNNImputer после модификации тоже работает:

In [109]:
missing_impute = KNNImputerSeparated(
                    categorical_variables = factors_dict["cat_vals"])
print("With imputation:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('impute_missing_categorical', missing_cat),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])


With imputation:
Gini: 0.83916212678694
Gini: 0.738


0.738

## Выводы:

1. Версии feature_engine и моя работают в пайплайне, дают одинаковые результаты
2. В классах собственного изготовления метод fit должен принимать датасет в качестве аргумента
3. WOE не принимает датасет, в котором есть хотя бы одна категория в одной из категориальных переменных, в которой нет хотя бы одной из категорий таргета
4. Обратить внимание: CategoricalImputer заполняет только категориальные переменные, а ArbitraryNumberImputer - только количественные
5. sklearn.impute.KNNImputer выдает ошибку при наличии категориальных переменных во фрейме
6. А еще он возвращает 2-d array, теряя при этом названия столбцов
7. Но я написал класс-обертку KNNImputerSeparated(categorical_variables, ...), который встраивается в пайплайн, предлагаю использовать его
