In [1]:
import numpy as np
import pandas as pd
import feature_engine as fe
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine import imputation
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
def create_test_df(nans = True):
    x1 = np.random.normal(size = 10000)
    x2 = np.random.normal(size = 10000)*x1
    x3 = np.random.normal(size = 10000) + x2
    if nans == True:
        x4 = np.random.choice(a = [1, 0], size = 10000)
        xcat = np.random.choice(a = ["a", "b", "c", "d", None], size = 10000)
    else:
        x4 = np.random.choice(a = [1], size = 10000)
        xcat = np.random.choice(a = ["a", "b", "c", "d", "e"], size = 10000)
    target = np.random.choice(a = [0,1], size = 10000)
    x5 = np.random.normal(size = 10000)

    test_df = pd.DataFrame({"x1" : x1,
                           "x2" : x2,
                           "x3" : x3,
                           "x4" : x4,
                            "xcat" : xcat,
                            "target" : target
                           } )
    
    test_df.x4 = np.where(test_df.x4 == 1, x5, None)
    return test_df

### feature-engine

Проверим, как работают [функции для заполнения пропущенных данных из пакета feature-engine](https://feature-engine.readthedocs.io/en/1.1.x/imputation/index.html)

In [3]:
def create_test_df():
    test_df = pd.DataFrame({"SomeVariable" : [1,2,4,5,None], 
                           "OneMoreVariable" : [12, 66, 12, None, 99], 
                           "SomeCategoricalVariable" : ["Biba", "I", "Boba", "Dva", None]})
    return test_df

Пробный дата-тейбл

In [3]:
test_df = create_test_df()
test_df

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,,Dva
4,,99.0,


###### feature_engine.imputation.MeanMedianImputer(imputation_method='median', variables=None)
При наличии качественных переменных, метод их просто игнорирует и заполняет числовые.


###### feature_engine.imputation.CategoricalImputer(imputation_method='missing', fill_value='Missing', variables=None, return_object=False, ignore_format=False)
Заполняет качественные переменные значением по выбору или самым частым значением 

In [4]:
test_df = create_test_df()
obj = MeanMedianImputer(imputation_method='median', variables=None)
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,39.0,Dva
4,3.0,99.0,


In [5]:
obj = MeanMedianImputer(imputation_method='mean', variables=None)
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,47.25,Dva
4,3.0,99.0,


In [6]:
obj = CategoricalImputer(fill_value = "Data Scientista")
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,,Dva
4,,99.0,Data Scientista


##### feature_engine.imputation.ArbitraryNumberImputer(arbitrary_number=999, variables=None, imputer_dict=None)
Заполняет пропуски числом по выбору
##### feature_engine.imputation.RandomSampleImputer(random_state=None, seed='general', seeding_method='add', variables=None)
Заполняет пропуски случайным числом из уже имеющихся в датасете

!Обратить внимание: CategoricalImputer заполняет только категориальные переменные, а ArbitraryNumberImputer - только количественные

In [15]:
obj = imputation.ArbitraryNumberImputer()
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,999.0,Dva
4,999.0,99.0,


In [51]:
obj = imputation.RandomSampleImputer()
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,66.0,Dva
4,4.0,99.0,Dva


### sklearn.impute

Проверим, как работают [функции для заполнения пропущенных данных из пакета sklearn.impute](https://feature-engine.readthedocs.io/en/1.1.x/imputation/index.html)

##### sklearn.impute.KNNImputer(*, missing_values=nan, n_neighbors=5, weights='uniform', metric='nan_euclidean', copy=True, add_indicator=False)

Заполняет пропуски с помощью KNN. Можно выбрать, чем заполнены пропуски, с помощью missing_values (по дефолту - любые формы nan). Есть гиперпараметры для оптимизации - количественный n_neighbours и качественные metric и weights. 

Сразу же обнаруживаем, что функция не умеет игнорировать качественные переменные. Потом для этого и других багов напишем дочерний класс.

In [5]:
obj = impute.KNNImputer()
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

ValueError: could not convert string to float: 'd'

А еще он возвращает 2-d array, теряя при этом названия столбцов

In [76]:
obj = impute.KNNImputer()
test_df = create_test_df()
test_df = test_df.drop("xcat", 1)
obj.fit(test_df)
obj.transform(test_df)

array([[ 0.4905574 , -0.26862924, -0.72125618, -0.18686868,  1.        ],
       [-1.39761923, -0.21595223, -0.54791356, -0.18686868,  1.        ],
       [-0.47396901, -0.40941988,  0.86991798, -0.18686868,  1.        ],
       ...,
       [ 0.12021848, -0.10752047,  0.61437338, -0.18686868,  0.        ],
       [-0.37652069, -0.25623558, -1.34070241, -0.18686868,  1.        ],
       [ 1.14568947,  0.31375386, -0.22497819, -0.18686868,  1.        ]])

Приведем в божеский вид

In [74]:
transformed_table = obj.transform(test_df)
transformed_table = pd.DataFrame(transformed_table)
transformed_table.columns = test_df.columns
transformed_table

Unnamed: 0,x1,x2,x3,x4,target
0,-2.296535,1.614464,1.808018,-0.819636,0.0
1,0.330350,0.170367,-0.818577,-0.819636,0.0
2,0.492528,0.514476,-0.653851,-0.819636,0.0
3,-0.534076,-0.786875,-0.222213,-0.819636,1.0
4,0.543864,-0.280548,-1.401941,-0.819636,1.0
...,...,...,...,...,...
9995,-0.311699,-0.178914,-1.073968,-0.819636,1.0
9996,2.646492,-3.146069,-2.097949,-0.819636,0.0
9997,-0.195642,0.120157,-0.319205,-0.819636,1.0
9998,0.849675,-1.534753,-1.640004,-0.819636,1.0


Теперь сделаем функцию, которая будет переопределять класс так, чтобы можно было его не напрягаясь встроить в пайплайн (функцию - потому что в дальнейшем придется для других классов делать то же самое):

In [37]:
def teach_to_separate(imputer_class):
    class SeparatedDF():
        def __init__(self, X, categorical_variables = []):
            self.X_numeric = X.drop(categorical_variables, 1)
            self.X_categorical = X.copy()[categorical_variables]

    class ImputerSeparated(imputer_class):

        def __init__(self, categorical_variables, **kwargs):

            super().__init__(**kwargs)

            self.categorical_variables = categorical_variables
            self.obj = imputer_class()

        def fit(self, X, y = None):

            df = SeparatedDF(X, self.categorical_variables)
            self.obj.fit(df.X_numeric)

            return self

        def transform(self, X, y = None):

            df = SeparatedDF(X, self.categorical_variables)

            fitted_df = self.obj.transform(df.X_numeric)
            fitted_df = pd.DataFrame(fitted_df)
            fitted_df.columns = df.X_numeric.columns
            fitted_df = pd.concat([fitted_df, df.X_categorical], axis = 1)

            return fitted_df
    return ImputerSeparated

def assert_identical_results_separated(imputer_class_basic,
                                       imputer_class_modified,
                                       categorical_variables, 
                                       test_df = create_test_df()):

    obj = imputer_class_modified(categorical_variables)
    obj.fit(test_df)
    child_fitted = obj.transform(test_df)
    child_fitted = child_fitted.drop(categorical_variables, axis = 1)
    print(child_fitted)
    obj = imputer_class_basic()
    test_df_local = test_df.drop(categorical_variables,1)
    obj.fit(test_df_local)
    parent_fitted = obj.transform(test_df_local)
    parent_fitted = pd.DataFrame(parent_fitted,
                                 columns = child_fitted.columns)
    print(parent_fitted)
    assert child_fitted.equals(parent_fitted)

In [189]:
KNNImputerSeparated = teach_to_separate(KNNImputer)

In [190]:
obj = KNNImputerSeparated(categorical_variables = ["xcat"], n_neighbors = 6)
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,x1,x2,x3,x4,target,xcat
0,-0.854386,-1.085760,-0.509892,0.548853,0.0,c
1,1.122344,-0.121229,-0.960090,0.024724,1.0,b
2,2.579596,7.045227,6.712655,0.717949,0.0,d
3,0.827951,0.792939,1.047094,0.295139,1.0,
4,0.307618,-0.242927,-0.430720,0.389447,0.0,
...,...,...,...,...,...,...
9995,-0.124569,-0.167455,-0.215072,-0.081915,1.0,a
9996,-0.063285,0.091194,-0.651454,0.798114,1.0,
9997,-0.285216,0.029076,0.252618,0.217586,1.0,d
9998,-0.297463,-0.067652,-0.968421,1.005322,0.0,c


Проверим, что дочерний класс дает те же значения для численных столбцов (если ничего, кроме принтов, не произошло, то работает):

In [191]:
assert_identical_results_separated(KNNImputer, KNNImputerSeparated, ["xcat"])

            x1        x2        x3        x4  target
0    -1.175799 -0.048057 -1.039715  1.887637     1.0
1    -0.952326 -0.066390  2.505456 -1.252388     0.0
2     0.145049 -0.063837 -0.326285  0.148106     1.0
3    -0.011194  0.000947  0.884393  0.469292     0.0
4     0.349066 -0.425200 -0.855749 -0.225708     1.0
...        ...       ...       ...       ...     ...
9995  1.535798 -2.296633  0.711070 -0.630990     0.0
9996  0.219119  0.215200 -0.038490 -0.392901     1.0
9997 -1.900352 -3.995766 -4.387440  0.264144     1.0
9998  0.343509 -0.024115 -1.422368 -0.193456     1.0
9999 -0.070089  0.048583  1.998205  0.416006     1.0

[10000 rows x 5 columns]
            x1        x2        x3        x4  target
0    -1.175799 -0.048057 -1.039715  1.887637     1.0
1    -0.952326 -0.066390  2.505456 -1.252388     0.0
2     0.145049 -0.063837 -0.326285  0.148106     1.0
3    -0.011194  0.000947  0.884393  0.469292     0.0
4     0.349066 -0.425200 -0.855749 -0.225708     1.0
...        ...      

##### sklearn.impute.IterativeImputer(estimator=None, *, missing_values=nan, sample_posterior=False, max_iter=10, tol=0.001, n_nearest_features=None, initial_strategy='mean', imputation_order='ascending', skip_complete=False, min_value=- inf, max_value=inf, verbose=0, random_state=None, add_indicator=False)

Делает предикшн для каждой из фич на основе остальных и заполняет им пропуски. Я не понял, какой estimator он использует по дефолту и как должен выглядеть эстиматор, которым можно его заменить, но он работает при незаполненной переменной estimator. Тут есть несколько потенциальных переменных для гипероптимизации: max_iter - количество итераций, после которых функция вернет окончательную оценку (насколько я понимаю, она сначала заполняет все переменные, потом делает новый предикшн, перезаполняет их и т.д.); n_nearest_features - количество ближайших (по корреляции) фич, которые функция использует для предикшнов, tol - tolerance of the stopping condition

Та же проблема, что с KNN (не работает со смешанными таблицами)

In [4]:
obj = IterativeImputer()
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

ValueError: could not convert string to float: 'c'

In [192]:
IterativeImputerSeparated = teach_to_separate(IterativeImputer)

In [54]:
obj = IterativeImputerSeparated(["xcat"])
test_df = create_test_df()
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,x1,x2,x3,x4,target,xcat
0,-0.534940,-0.138062,-0.844898,0.158684,0.0,
1,-0.541844,-0.897978,-0.973873,-0.001151,1.0,
2,-0.117848,-0.046087,-0.250023,-0.936007,1.0,d
3,0.140135,0.290983,-2.031201,-0.001726,0.0,c
4,1.637557,-0.184460,-0.654217,-0.521677,0.0,
...,...,...,...,...,...,...
9995,0.325635,0.033993,-0.570681,0.000701,0.0,
9996,-0.143977,-0.103711,-1.056011,-0.000186,0.0,d
9997,0.477057,-0.266036,-2.041189,1.625381,0.0,b
9998,0.097728,-0.072437,-0.264021,0.000215,1.0,d


In [55]:
assert_identical_results_separated(IterativeImputer, IterativeImputerSeparated, ["xcat"])

            x1        x2        x3        x4  target
0     0.273006 -0.193791 -0.999903  0.762098     0.0
1     0.654977  0.458439  0.935853  0.336748     1.0
2     1.836535 -3.361335 -2.052113 -0.008909     1.0
3     0.845131  0.507075  0.718444  0.825801     1.0
4     0.035952 -0.027294 -0.107981 -0.285675     0.0
...        ...       ...       ...       ...     ...
9995 -0.114499 -0.187512  0.221656 -0.180320     1.0
9996  0.644926  0.219449 -0.753907 -0.004112     1.0
9997  0.117010 -0.119661 -2.393662 -0.005735     1.0
9998  0.130185  0.087813  3.334400 -0.001727     0.0
9999 -0.812767  1.194934  2.083174  0.122515     1.0

[10000 rows x 5 columns]
            x1        x2        x3        x4  target
0     0.273006 -0.193791 -0.999903  0.762098     0.0
1     0.654977  0.458439  0.935853  0.336748     1.0
2     1.836535 -3.361335 -2.052113 -0.008909     1.0
3     0.845131  0.507075  0.718444  0.825801     1.0
4     0.035952 -0.027294 -0.107981 -0.285675     0.0
...        ...      

### Homebrewn
А этот вариант я успел написать до того, как Антон скинул готовое решение. Я заглянул в сурсы feature-engine - они делают плюс-минус то же самое, только у меня работа с категориальными переменными при наличии желания добавляется в тот же класс. А еще у меня есть мода, а у них нет. 

In [277]:
class missing_filler_category():
    
    """
    Returns a table with all the missing values filled with special category.
    params::x DataFrame with the data
    params::filling_category a value to fill the missing values
    """
    
    def __init__(self,  filling_category = "filler"):
        self.filling_category = filling_category
        
    def fit(self, x, y = None):
        self.x = x.copy()
        return self
    
    def transform(self, x, y = None):
        self.x = x.copy()
        self.x.fillna(self.filling_category, inplace = True)
        filled_table = self.x
        return filled_table
    
    
class missing_filler_mean():
    
    """
    Returns a table with all the missing values in numerical columns filled with mean and all the 
    missing values in categorical columns filled with special category. If you do not wish to fill
    categorical at all, just do not pass "categorical_variables" parameter.
    params::x DataFrame with the data
    params::categorical_variables list of categorical varaibles
    params::filling_category a value to fill the missing values in categorical variables
    """
    
    def __init__(self, categorical_variables = [], filling_category = "filler", y = None):
        self.categorical_variables = categorical_variables
        self.filling_category = filling_category
        
    def fit(self, x, y= None):
        self.x = x.copy()
        self.mean_to_fill = self.x.mean()
        return self
    
    def transform(self, x, y= None):
        self.x = x.copy()
        filled_table = self.x.copy()
        if len(self.categorical_variables) != 0:
            filled_table[self.categorical_variables] = \
            filled_table[self.categorical_variables].fillna(self.filling_category)
            
        filled_table = filled_table.fillna(self.mean_to_fill)

        return filled_table
    
    
    
class missing_filler_median():
    
    """
    Returns a table with all the missing values in numerical columns filled with median and all the 
    missing values in categorical columns filled with special category. If you do not wish to fill
    categorical at all, just do not pass "categorical_variables" parameter.
    params::x DataFrame with the data
    params::categorical_variables list of categorical varaibles
    params::filling_category a value to fill the missing values in categorical variables
    """
    
    def __init__(self, categorical_variables = [], filling_category = "filler"):

        self.categorical_variables = categorical_variables

        self.filling_category = filling_category
        
    def fit(self, x, y = None):
        self.x = x.copy()
        self.median_to_fill = self.x.median()
        return self
    
    def transform(self, x, y = None):
        self.x = x.copy()
        filled_table = self.x.copy()
        
        if len(self.categorical_variables) != 0:
            
            filled_table[self.categorical_variables] = \
            filled_table[self.categorical_variables].fillna(self.filling_category)
            
        filled_table = \
        filled_table.fillna(self.median_to_fill)
            
        return filled_table
    
    
class missing_filler_mode():
    
    """
    Returns a table with all the missing values in numerical columns filled with mode. If there are several 
    modes, the behaviour is the foolowing:
    1) For categorical variables from the input, the first element of list of modes is used
    2) For numerical variables, the mean of modes is used
    
    params::x DataFrame with the data
    params::categorical_variables list of categorical varaibles
    """
    
    def __init__(self,categorical_variables = [], filling_category = "filler"):
        self.filling_category = filling_category
        self.categorical_variables = categorical_variables

    def fit(self, x, y = None):
        return self
    
    def transform(self, x, y = None):
        self.x = x.copy()
        if (type(self.categorical_variables) == list):
            self.non_categorical_variables = \
            list(set(self.categorical_variables).symmetric_difference(list(x.columns)))
        else:
            all_variables = list(self.x.columns).copy()
            all_variables.remove(self.categorical_variables)
            self.non_categorical_variables = \
            all_variables
            
        table_to_fill = self.x.copy()
        categorical_table = table_to_fill[self.categorical_variables].copy()
        non_categorical_table = table_to_fill[self.non_categorical_variables].copy()

        if len(self.categorical_variables) == 0:
            categorical_table = pd.DataFrame(index = non_categorical_table.index())
        
        table_to_fill[self.categorical_variables] = \
        categorical_table.fillna(categorical_table.mode().loc[0])
        
        table_to_fill[self.non_categorical_variables] = \
        non_categorical_table.fillna(non_categorical_table.mode().mean())
        
        return table_to_fill

Проверим, как работает

In [273]:
test_df = create_test_df()
obj = missing_filler_category( "Grazhdanina")
obj.fit(test_df)
obj.transform(test_df)

Unnamed: 0,SomeVariable,OneMoreVariable,SomeCategoricalVariable
0,1.0,12.0,Biba
1,2.0,66.0,I
2,4.0,12.0,Boba
3,5.0,Grazhdanina,Dva
4,Grazhdanina,99.0,Grazhdanina


In [278]:
test_df = create_test_df()
obj = missing_filler_mean("SomeCategoricalVariable", "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_mean(["OneMoreVariable","SomeCategoricalVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_mean("SomeCategoricalVariable", "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0            12.00                    Biba
1           2.0            66.00                       I
2           4.0            12.00                    Boba
3           5.0            47.25                     Dva
4           3.0            99.00             Grazhdanina
   SomeVariable OneMoreVariable SomeCategoricalVariable
0           1.0            12.0                    Biba
1           2.0            66.0                       I
2           4.0            12.0                    Boba
3           5.0     Grazhdanina                     Dva
4           3.0            99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0            12.00                    Biba
1           2.0            66.00                       I
2           4.0            12.00                    Boba
3           5.0            47.25                     Dva
4           3.0            99.00     

In [279]:
obj = missing_filler_median(["SomeCategoricalVariable", "OneMoreVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_median(["SomeCategoricalVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_median("SomeCategoricalVariable", "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

   SomeVariable OneMoreVariable SomeCategoricalVariable
0           1.0            12.0                    Biba
1           2.0            66.0                       I
2           4.0            12.0                    Boba
3           5.0     Grazhdanina                     Dva
4           3.0            99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             39.0                     Dva
4           3.0             99.0             Grazhdanina
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             39.0                     Dva
4           3.0             99.0     

In [280]:
obj = missing_filler_mode(["SomeCategoricalVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_mode(["SomeCategoricalVariable", "OneMoreVariable"], "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

test_df = create_test_df()
obj = missing_filler_mode("SomeCategoricalVariable", "Grazhdanina")
obj.fit(test_df)
print(obj.transform(test_df))

   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.0                    Biba
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.0                    Biba
   SomeVariable  OneMoreVariable SomeCategoricalVariable
0           1.0             12.0                    Biba
1           2.0             66.0                       I
2           4.0             12.0                    Boba
3           5.0             12.0                     Dva
4           3.0             99.

#### Проверим, как работает в пайплайне

##### Выводы:
1) Обратить внимание, что woe не работает, когда для одного из классов категориальной переменной нет наблюдений в одном из классов таргета

In [5]:
import time
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, metrics, model_selection
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from hyperopt import hp
# from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# for HyperOpt class
import lightgbm as lgb
import xgboost as xgb
# import catboost as ctb
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [6]:
# новый пакет!
from feature_engine.encoding import WoEEncoder
from feature_engine.creation import CombineWithReferenceFeature
from feature_engine.selection import RecursiveFeatureAddition

In [7]:
from sklearn.pipeline import Pipeline

In [8]:
X_train = pd.read_parquet('../datasets/15_hmeq/samples/X_train.parquet')
X_test  = pd.read_parquet('../datasets/15_hmeq/samples/X_test.parquet')
y_train = pd.read_parquet('../datasets/15_hmeq/samples/y_train.parquet').target
y_test  = pd.read_parquet('../datasets/15_hmeq/samples/y_test.parquet').target

In [9]:
with open('../datasets/15_hmeq/factors.json') as json_file:
    factors_dict = json.load(json_file)

factors_dict['cat_vals']
factors_dict['cat_vals'].remove('target')
seed = 42

def Gini(y, y_pred):
    res = roc_auc_score(y, y_pred) * 2 - 1
    print(f"Gini: {res}")
    return(res)



In [10]:
factors_dict

{'cat_vals': ['REASON', 'JOB', 'DEROG', 'DELINQ'],
 'num_vals': ['LOAN',
  'MORTDUE',
  'VALUE',
  'YOJ',
  'CLAGE',
  'NINQ',
  'CLNO',
  'DEBTINC']}

In [203]:
X_train.dtypes

LOAN         int64
MORTDUE    float64
VALUE      float64
REASON      object
JOB         object
YOJ        float64
DEROG       object
DELINQ      object
CLAGE      float64
NINQ       float64
CLNO       float64
DEBTINC    float64
dtype: object

In [204]:
X_train_y = X_train.copy()
X_train_y["y"] = y_train
X_train_y.y = X_train_y.y - 1
X_train_y.groupby("DELINQ").sum()

Unnamed: 0_level_0,LOAN,MORTDUE,VALUE,YOJ,CLAGE,NINQ,CLNO,DEBTINC,y
DELINQ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,63486500,230916500.0,348124600.0,28409.55,592117.412058,3662.0,68449.0,94453.65856,-2895
1.0,9922300,35483110.0,48307370.0,4449.6,92881.655166,718.0,12159.0,12045.029674,-347
2.0,3240900,13812320.0,16800470.0,1860.8,34267.129584,245.0,4786.0,4651.663149,-107
3.0,1510100,6228849.0,9550298.0,925.3,17243.198925,174.0,2499.0,1756.372258,-47
4.0,1191700,5718352.0,6958213.0,551.2,13603.115863,86.0,1975.0,1068.35861,-26
5.0,594800,2377663.0,3262527.0,345.3,6977.89348,41.0,880.0,325.275801,-6


feature_engine работает в пайплайне:

In [11]:
woe = WoEEncoder(variables = factors_dict['cat_vals'])
feat_eng = CombineWithReferenceFeature(
    variables_to_combine = list(X_train.columns),
    reference_variables = list(X_train.columns),
    operations = ['mul']
)
missing_impute = MeanMedianImputer(
    imputation_method='mean', variables=None
)
missing_cat = CategoricalImputer(
    fill_value = "Data Scientista"
)
lgbm_mdl = LGBMClassifier(
    num_leaves = 10,
    learning_rate = .1,
    reg_alpha = 8,
    reg_lambda = 8,
    random_state = seed
)
feat_sel = RecursiveFeatureAddition(
    lgbm_mdl,
    threshold = 0.005
)

mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('impute_missing_categorical', missing_cat),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)

print("With imputation:")
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])


With imputation:
Gini: 0.8875928618436604
Gini: 0.8190815450643776


0.8190815450643776

Доморощенная версия тоже работает в пайплайне и дает такие же результаты: 

In [367]:
missing_impute = missing_filler_mean( 
                                     filling_category = "missing",
                                     categorical_variables = factors_dict["cat_vals"])
print("With imputation:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])

missing_impute = missing_filler_median( 
                                     filling_category = "missing",
                                     categorical_variables = factors_dict["cat_vals"])
print("With imputation, median:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])

With imputation:
Gini: 0.8875928618436604
Gini: 0.8190815450643776
With imputation, median:
Gini: 0.868677634424833
Gini: 0.8029957081545067


0.8029957081545067

KNNImputer после модификации тоже работает:

In [27]:
KNNImputerSeparated = teach_to_separate(KNNImputer)

In [46]:
missing_impute = KNNImputerSeparated(
                    categorical_variables = factors_dict["cat_vals"])
print("With imputation:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('impute_missing_categorical', missing_cat),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)
mdl_pipe_impute.fit(X_train, y_train)
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])


With imputation:
       LOAN   MORTDUE     VALUE   REASON      JOB   YOJ DEROG DELINQ  \
0     26300   78851.0  109090.0  DebtCon   Office   8.0   0.0    0.0   
1     15700   48093.0   65686.0  DebtCon  ProfExe   3.0   0.0    0.0   
2     20000  126324.0  171450.0     None      Mgr  26.0  None    5.0   
3     11000   50716.0   61492.0  DebtCon    Other   3.0   0.0    1.0   
4      8700   77377.0   97070.0  DebtCon      Mgr   3.0   0.0    0.0   
...     ...       ...       ...      ...      ...   ...   ...    ...   
4721  20000       NaN  128042.0  HomeImp    Other   0.0   0.0    0.0   
4722  28400   46126.0   66942.0  DebtCon   Office  18.0   0.0    0.0   
4723  28800  204455.0  245685.0  HomeImp    Other   8.0   0.0    0.0   
4724  32000   92400.0  215000.0     None     Self   2.5   1.0    3.0   
4725   8800   51740.0   60852.0  DebtCon    Other  23.0   0.0    0.0   

           CLAGE  NINQ  CLNO    DEBTINC  
0     405.430429   0.0  44.0  42.341626  
1     149.906379   1.0  12.0  30.4

0.738

In [20]:
factors_dict

{'cat_vals': ['REASON', 'JOB', 'DEROG', 'DELINQ'],
 'num_vals': ['LOAN',
  'MORTDUE',
  'VALUE',
  'YOJ',
  'CLAGE',
  'NINQ',
  'CLNO',
  'DEBTINC']}

In [30]:
IterativeImputerSeparated = teach_to_separate(IterativeImputer)

In [47]:
obj = IterativeImputerSeparated(factors_dict["cat_vals"])
test_df = create_test_df()
obj.fit(X_train)
obj.transform(X_train)

       LOAN   MORTDUE     VALUE   REASON      JOB   YOJ DEROG DELINQ  \
0     26300   78851.0  109090.0  DebtCon   Office   8.0   0.0    0.0   
1     15700   48093.0   65686.0  DebtCon  ProfExe   3.0   0.0    0.0   
2     20000  126324.0  171450.0     None      Mgr  26.0  None    5.0   
3     11000   50716.0   61492.0  DebtCon    Other   3.0   0.0    1.0   
4      8700   77377.0   97070.0  DebtCon      Mgr   3.0   0.0    0.0   
...     ...       ...       ...      ...      ...   ...   ...    ...   
4721  20000       NaN  128042.0  HomeImp    Other   0.0   0.0    0.0   
4722  28400   46126.0   66942.0  DebtCon   Office  18.0   0.0    0.0   
4723  28800  204455.0  245685.0  HomeImp    Other   8.0   0.0    0.0   
4724  32000   92400.0  215000.0     None     Self   2.5   1.0    3.0   
4725   8800   51740.0   60852.0  DebtCon    Other  23.0   0.0    0.0   

           CLAGE  NINQ  CLNO    DEBTINC  
0     405.430429   0.0  44.0  42.341626  
1     149.906379   1.0  12.0  30.486361  
2     329



Unnamed: 0,LOAN,MORTDUE,VALUE,YOJ,CLAGE,NINQ,CLNO,DEBTINC,REASON,JOB,DEROG,DELINQ
0,26300.0,78851.000000,109090.0,8.0,405.430429,0.0,44.0,42.341626,DebtCon,Office,0.0,0.0
1,15700.0,48093.000000,65686.0,3.0,149.906379,1.0,12.0,30.486361,DebtCon,ProfExe,0.0,0.0
2,20000.0,126324.000000,171450.0,26.0,329.566667,1.0,28.0,33.910140,,Mgr,,5.0
3,11000.0,50716.000000,61492.0,3.0,90.230330,0.0,20.0,38.255862,DebtCon,Other,0.0,1.0
4,8700.0,77377.000000,97070.0,3.0,68.854877,2.0,24.0,38.217726,DebtCon,Mgr,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4721,20000.0,90250.271265,128042.0,0.0,314.533333,3.0,21.0,34.363970,HomeImp,Other,0.0,0.0
4722,28400.0,46126.000000,66942.0,18.0,180.668308,2.0,21.0,33.769035,DebtCon,Office,0.0,0.0
4723,28800.0,204455.000000,245685.0,8.0,123.396747,1.0,42.0,42.345471,HomeImp,Other,0.0,0.0
4724,32000.0,92400.000000,215000.0,2.5,165.333333,1.0,40.0,36.009596,,Self,1.0,3.0


А вот это пока не работает и я еще не разобрался почему

In [52]:
IterativeImputerSeparated = teach_to_separate(IterativeImputer)
missing_impute = IterativeImputerSeparated(factors_dict["cat_vals"])
print("With imputation:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('impute_missing_categorical', missing_cat),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)

mdl_pipe_impute.fit(X_train, y_train)
                    
Gini(y_train, mdl_pipe_impute.predict_proba(X_train)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_test)[:, 1])


With imputation:


ValueError: could not convert string to float: 'DebtCon'

In [53]:
IterativeImputerSeparated = teach_to_separate(IterativeImputer)
missing_impute = IterativeImputerSeparated([])
X_train_test = X_train.drop(factors_dict["cat_vals"], 1).copy()
print("With imputation:")
mdl_pipe_impute = Pipeline(
    [('impute_missing', missing_impute),('encode', woe), ('feat_eng', feat_eng), ('feat_select', feat_sel), ('lgbm', lgbm_mdl)]
)

mdl_pipe_impute.fit(X_train_test, y_train)
                    
Gini(y_train, mdl_pipe_impute.predict_proba(X_train_test)[:, 1])

Gini(y_test, mdl_pipe_impute.predict_proba(X_train_test)[:, 1])


With imputation:




KeyError: "None of [Index(['REASON', 'JOB', 'DEROG', 'DELINQ'], dtype='object')] are in the [columns]"

## Выводы:

1. Версии feature_engine и моя работают в пайплайне, дают одинаковые результаты
2. В классах собственного изготовления метод fit должен принимать датасет в качестве аргумента
3. WOE не принимает датасет, в котором есть хотя бы одна категория в одной из категориальных переменных, в которой нет хотя бы одной из категорий таргета
4. Обратить внимание: CategoricalImputer заполняет только категориальные переменные, а ArbitraryNumberImputer - только количественные (комментарий МБ: у обоих классов есть аргумент `variables`, который позволяет выбрать подмножество признаков, пропуски в которых заполняем)
5. sklearn.impute.KNNImputer выдает ошибку при наличии категориальных переменных во фрейме (комментарий МБ: может просто наследуюемся от него и чуть-чуть поправим `fit`?)
6. А еще он возвращает 2-d array, теряя при этом названия столбцов
7. Но я написал класс-обертку KNNImputerSeparated(categorical_variables, ...), который встраивается в пайплайн, предлагаю использовать его
