In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [1]:
import numpy as np
import pandas as pd
import featuretools as ft
from featuretools import selection
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.pipeline import make_union, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from category_encoders import OrdinalEncoder
from category_encoders.count import CountEncoder

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2, mutual_info_classif

from mlxtend.feature_selection import SequentialFeatureSelector

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

from nni.algorithms.feature_engineering.gradient_selector import FeatureGradientSelector
from nni.algorithms.feature_engineering.gbdt_selector import GBDTSelector

from sklearn.feature_selection import RFECV


pd.options.display.max_rows = 100

In [127]:
ft.__version__

'0.23.1'

In [123]:
n_rows = 10000
train = pd.read_csv('data/application_train.csv', nrows=n_rows)
test = pd.read_csv('data/application_test.csv', nrows=n_rows)
bureau = pd.read_csv('data/bureau.csv', nrows=3*n_rows)
bureau_balance = pd.read_csv('data/bureau_balance.csv', nrows=2*n_rows)
cash_balance = pd.read_csv('data/POS_CASH_balance.csv', nrows=2*n_rows)
card_balance = pd.read_csv('data/credit_card_balance.csv', nrows=2*n_rows)
prev_app = pd.read_csv('data/previous_application.csv', nrows=2*n_rows)
payments = pd.read_csv('data/installments_payments.csv', nrows=2*n_rows)

![File connection columns](https://storage.googleapis.com/kaggle-media/competitions/home-credit/home_credit.png)

In [4]:
train

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,111629,0,Revolving loans,F,Y,Y,1,135000.0,270000.0,13500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
9996,111630,0,Cash loans,F,Y,Y,0,166500.0,1006920.0,51543.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
9997,111631,0,Cash loans,M,Y,Y,0,225000.0,497520.0,53581.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
9998,111632,0,Cash loans,F,N,N,0,360000.0,348264.0,27643.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0


In [5]:
# Merge application data
train['Test'] = False
test['Test'] = True
test['TARGET'] = np.nan
app = pd.concat([train, test], ignore_index=True, sort=False)

In [6]:
# Create an entity set
es = ft.EntitySet(id='applications')

In [7]:
# Add dataframe to entityset
es = es.entity_from_dataframe(
    entity_id='applications',
    dataframe=app,
    index='SK_ID_CURR'
)

In [8]:
# Manually define datatypes in app dataframe
variable_types = {
    'FLAG_MOBIL': ft.variable_types.Boolean,
    'FLAG_EMP_PHONE': ft.variable_types.Boolean,
    'FLAG_WORK_PHONE': ft.variable_types.Boolean,
    'FLAG_CONT_MOBILE': ft.variable_types.Boolean,
    'FLAG_PHONE': ft.variable_types.Boolean,
    'FLAG_EMAIL': ft.variable_types.Boolean,
    'REG_REGION_NOT_LIVE_REGION': ft.variable_types.Boolean,
    'REG_REGION_NOT_WORK_REGION': ft.variable_types.Boolean,
    'LIVE_REGION_NOT_WORK_REGION': ft.variable_types.Boolean,
    'REG_CITY_NOT_LIVE_CITY': ft.variable_types.Boolean,
    'REG_CITY_NOT_WORK_CITY': ft.variable_types.Boolean,
    'LIVE_CITY_NOT_WORK_CITY': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_2': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_3': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_4': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_5': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_6': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_7': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_8': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_9': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_10': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_11': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_12': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_13': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_14': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_15': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_16': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_17': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_18': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_19': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_20': ft.variable_types.Boolean,
    'FLAG_DOCUMENT_21': ft.variable_types.Boolean,
}

# Add dataframe to entityset, using manual datatypes
es = es.entity_from_dataframe(
    entity_id='applications',
    dataframe=app,
    index='SK_ID_CURR',
    variable_types=variable_types
)

In [9]:
BOOL = ft.variable_types.Boolean
ID = ft.variable_types.Id

# Add bureau dataframe to entityset
es = es.entity_from_dataframe(
    entity_id='bureau',
    dataframe=bureau,
    index='SK_ID_BUREAU',
    variable_types={'SK_ID_CURR': ID}
)

# Add bureau_balance dataframe to entityset
es = es.entity_from_dataframe(
    entity_id='bureau_balance',
    dataframe=bureau_balance,
    index='New',
    variable_types={'SK_ID_BUREAU': ID}
)

# Add cash_balance dataframe to entityset
es = es.entity_from_dataframe(
    entity_id='cash_balance',
    dataframe=cash_balance,
    index='New',
    variable_types={
        'SK_ID_PREV': ID,
        'SK_ID_CURR': ID
    }
)

# Add card_balance dataframe to entityset
es = es.entity_from_dataframe(
    entity_id='card_balance',
    dataframe=card_balance,
    index='New',
    variable_types={
        'SK_ID_PREV': ID,
        'SK_ID_CURR': ID
    }
)
                              
# Add prev_app dataframe to entityset
es = es.entity_from_dataframe(
    entity_id='prev_app',
    dataframe=prev_app,
    index='SK_ID_PREV',
    variable_types={
        'SK_ID_CURR': ID,
        'NFLAG_LAST_APPL_IN_DAY': BOOL
    }
)

# Add payments dataframe to entityset
es = es.entity_from_dataframe(
    entity_id='payments',
    dataframe=payments,
    index='New',
    variable_types={
        'SK_ID_PREV': ID,
        'SK_ID_CURR': ID
    }
)



In [10]:
es

Entityset: applications
  Entities:
    applications [Rows: 20000, Columns: 123]
    bureau [Rows: 30000, Columns: 17]
    bureau_balance [Rows: 20000, Columns: 4]
    cash_balance [Rows: 20000, Columns: 9]
    card_balance [Rows: 20000, Columns: 24]
    prev_app [Rows: 20000, Columns: 37]
    payments [Rows: 20000, Columns: 9]
  Relationships:
    No relationships

In [11]:
# Define relationships between dataframes
relationships = [
    # parent_entity   parent_variable  child_entity      child_variable
    ('applications', 'SK_ID_CURR',   'bureau',         'SK_ID_CURR'),
    ('bureau',       'SK_ID_BUREAU', 'bureau_balance', 'SK_ID_BUREAU'),
    ('applications', 'SK_ID_CURR',   'prev_app',       'SK_ID_CURR'),
    ('applications', 'SK_ID_CURR',   'cash_balance',   'SK_ID_CURR'),
    ('applications', 'SK_ID_CURR',   'payments',       'SK_ID_CURR'),
    ('applications', 'SK_ID_CURR',   'card_balance',   'SK_ID_CURR')
]

# Create the relationships
for pe, pv, ce, cv in relationships:
    es = es.add_relationship(ft.Relationship(es[pe][pv], es[ce][cv]))

In [12]:
es['applications']

Entity: applications
  Variables:
    SK_ID_CURR (dtype: index)
    TARGET (dtype: numeric)
    NAME_CONTRACT_TYPE (dtype: categorical)
    CODE_GENDER (dtype: categorical)
    FLAG_OWN_CAR (dtype: categorical)
    FLAG_OWN_REALTY (dtype: categorical)
    CNT_CHILDREN (dtype: numeric)
    AMT_INCOME_TOTAL (dtype: numeric)
    AMT_CREDIT (dtype: numeric)
    AMT_ANNUITY (dtype: numeric)
    AMT_GOODS_PRICE (dtype: numeric)
    NAME_TYPE_SUITE (dtype: categorical)
    NAME_INCOME_TYPE (dtype: categorical)
    NAME_EDUCATION_TYPE (dtype: categorical)
    NAME_FAMILY_STATUS (dtype: categorical)
    NAME_HOUSING_TYPE (dtype: categorical)
    REGION_POPULATION_RELATIVE (dtype: numeric)
    DAYS_BIRTH (dtype: numeric)
    DAYS_EMPLOYED (dtype: numeric)
    DAYS_REGISTRATION (dtype: numeric)
    DAYS_ID_PUBLISH (dtype: numeric)
    OWN_CAR_AGE (dtype: numeric)
    OCCUPATION_TYPE (dtype: categorical)
    CNT_FAM_MEMBERS (dtype: numeric)
    REGION_RATING_CLIENT (dtype: numeric)
    REGION_RATI

In [13]:
ft.list_primitives()

Unnamed: 0,name,type,dask_compatible,koalas_compatible,description
0,std,aggregation,True,True,Computes the dispersion relative to the mean v...
1,mode,aggregation,False,False,Determines the most commonly repeated value.
2,mean,aggregation,True,True,Computes the average for a list of values.
3,sum,aggregation,True,True,"Calculates the total addition, ignoring `NaN`."
4,time_since_first,aggregation,False,False,Calculates the time elapsed since the first da...
5,avg_time_between,aggregation,False,False,Computes the average number of seconds between...
6,median,aggregation,False,False,Determines the middlemost number in a list of ...
7,num_true,aggregation,True,False,Counts the number of `True` values.
8,trend,aggregation,False,False,Calculates the trend of a variable over time.
9,min,aggregation,True,True,"Calculates the smallest value, ignoring `NaN` ..."


In [14]:
agg_primitives =  ['count', 'mean', 'num_unique']
trans_primitives = ['cum_sum']
dfs_feat, dfs_defs = ft.dfs(
    entityset=es,
    target_entity='applications',
    trans_primitives=trans_primitives,
    agg_primitives=agg_primitives, 
    max_features=1000,
    chunk_size=4000,
    verbose=True,                            
    max_depth=3,
    n_jobs=-1
)

Built 547 features
Fewer chunks (5), than workers (16) consider reducing the chunk size                                                   
EntitySet scattered to 16 workers in 9 seconds                                                                         
Elapsed: 00:27 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [16]:
dfs_feat

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CUM_SUM(MEAN(prev_app.CUM_SUM(HOUR_APPR_PROCESS_START))),CUM_SUM(MEAN(prev_app.CUM_SUM(NFLAG_INSURED_ON_APPROVAL))),CUM_SUM(MEAN(prev_app.CUM_SUM(RATE_DOWN_PAYMENT))),CUM_SUM(MEAN(prev_app.CUM_SUM(RATE_INTEREST_PRIMARY))),CUM_SUM(MEAN(prev_app.CUM_SUM(RATE_INTEREST_PRIVILEGED))),CUM_SUM(MEAN(prev_app.CUM_SUM(SELLERPLACE_AREA))),CUM_SUM(NUM_UNIQUE(bureau_balance.bureau.CREDIT_ACTIVE)),CUM_SUM(NUM_UNIQUE(bureau_balance.bureau.CREDIT_CURRENCY)),CUM_SUM(NUM_UNIQUE(bureau_balance.bureau.CREDIT_TYPE)),CUM_SUM(NUM_UNIQUE(bureau_balance.bureau.SK_ID_CURR))
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1.0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,,,,,,,,,,
100003,0.0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,,,,,,,,,,
100004,0.0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,,,,,,,,,,
100006,0.0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,,,,,,,,,,
100007,0.0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,167736.0,2644.0,,,,8409030.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172551,,Cash loans,F,N,Y,0,135000.0,454500.0,29173.5,454500.0,...,,,,,,,,,,
172556,,Cash loans,M,Y,N,1,180000.0,500490.0,52555.5,450000.0,...,,,,,,,,,,
172562,,Cash loans,F,N,Y,0,202500.0,523152.0,37336.5,463500.0,...,,,,,,,,,,
172570,,Cash loans,M,N,Y,1,382500.0,967500.0,31338.0,967500.0,...,,,,,,,,,,


In [17]:
#remove_low_information_features(feature_matrix)
#remove_highly_correlated_features(feature_matrix)
#remove_highly_null_features(feature_matrix)
#remove_single_value_features(feature_matrix)

filtered = selection.remove_low_information_features(dfs_feat)

In [130]:
filtered

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CUM_SUM(MEAN(prev_app.CUM_SUM(HOUR_APPR_PROCESS_START))),CUM_SUM(MEAN(prev_app.CUM_SUM(NFLAG_INSURED_ON_APPROVAL))),CUM_SUM(MEAN(prev_app.CUM_SUM(RATE_DOWN_PAYMENT))),CUM_SUM(MEAN(prev_app.CUM_SUM(RATE_INTEREST_PRIMARY))),CUM_SUM(MEAN(prev_app.CUM_SUM(RATE_INTEREST_PRIVILEGED))),CUM_SUM(MEAN(prev_app.CUM_SUM(SELLERPLACE_AREA))),CUM_SUM(NUM_UNIQUE(bureau_balance.bureau.CREDIT_ACTIVE)),CUM_SUM(NUM_UNIQUE(bureau_balance.bureau.CREDIT_CURRENCY)),CUM_SUM(NUM_UNIQUE(bureau_balance.bureau.CREDIT_TYPE)),CUM_SUM(NUM_UNIQUE(bureau_balance.bureau.SK_ID_CURR))
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1.0,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,,,,,,,,,,
100003,0.0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,,,,,,,,,,
100004,0.0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,,,,,,,,,,
100006,0.0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,,,,,,,,,,
100007,0.0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,167736.0,2644.0,,,,8409030.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172551,,Cash loans,F,N,Y,0,135000.0,454500.0,29173.5,454500.0,...,,,,,,,,,,
172556,,Cash loans,M,Y,N,1,180000.0,500490.0,52555.5,450000.0,...,,,,,,,,,,
172562,,Cash loans,F,N,Y,0,202500.0,523152.0,37336.5,463500.0,...,,,,,,,,,,
172570,,Cash loans,M,N,Y,1,382500.0,967500.0,31338.0,967500.0,...,,,,,,,,,,


In [148]:
# Split data back into test + train
train = dfs_feat.loc[~app['Test'].values, :].copy()
test = dfs_feat.loc[app['Test'].values, :].copy()

# Target labels
train_y = train['TARGET']

# Remove test/train indicator column and target column
train.drop(columns=['Test', 'TARGET'], inplace=True)
test.drop(columns=['Test', 'TARGET'], inplace=True)
skf = StratifiedKFold(5, random_state=42, shuffle=True)

In [38]:
%%time
# Classification pipeline OrdinalEncoder LightGBM
lgbm_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')), # для заполнения пропущенных значений
#     ('scaler', RobustScaler()),
    ('classifier', LGBMClassifier())
])

# Cross-validated roc auc
scores = cross_val_score(
    lgbm_pipeline, train, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.725 +/- 0.014
CPU times: total: 1min 7s
Wall time: 6.26 s


In [39]:
%%time
# Classification pipeline CountEncoder LightGBM
lgbm_pipeline = Pipeline([
    ('label-encoder', CountEncoder(min_group_size=1)),
    ('imputer', SimpleImputer(strategy='median')), # для заполнения пропущенных значений
#     ('scaler', RobustScaler()),
    ('classifier', CalibratedClassifierCV( # Калибровка вероятности
                        estimator=LGBMClassifier(),
                        method='isotonic'))
])

# Cross-validated roc auc
scores = cross_val_score(
    lgbm_pipeline, train, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.729 +/- 0.018
CPU times: total: 4min 47s
Wall time: 21.2 s


# Feature Selection

## Filter methods

Constant

In [40]:
%%time
# Classification pipeline w/ LightGBM
lgbm_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('constant_remover', VarianceThreshold(0.0)),
#     ('scaler', RobustScaler()),
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.725 +/- 0.014
CPU times: total: 1min 8s
Wall time: 6.53 s


Quasi-Constant

In [41]:
%%time
# Classification pipeline w/ LightGBM
lgbm_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('quasi_constant_remover', VarianceThreshold(.99 * (1 - .99))),
#     ('scaler', RobustScaler()),
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.731 +/- 0.006
CPU times: total: 1min 7s
Wall time: 6.24 s


In [42]:
quasi_constant_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('quasi_constant_remover', VarianceThreshold(.99 * (1 - .99)))])

train_qc = quasi_constant_pipeline.fit_transform(train, train_y)

In [44]:
train.shape, train_qc.shape

((10000, 545), (10000, 402))

In [131]:
%%time
class CorrSelector:
    def __init__(self, threshold):
        self.threshold = threshold
    
    def fit(self, X, y=None):
        corr = np.absolute(np.corrcoef(X, rowvar=False))
        upper = corr*np.triu(np.ones(corr.shape), k=1).astype(np.bool)
        self.to_drop = [column for column in range(upper.shape[1]) if any(upper[:,column] >= self.threshold)]
        return self
    
    def transform(self, X):
        return np.delete(X, self.to_drop, axis=1)
    
    
lgbm_pipeline = Pipeline([
#     ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('quasi_constant_remover', VarianceThreshold(.99 * (1 - .99))),
    ('correlated_remover', CorrSelector(.99)),
#     ('scaler', RobustScaler()),
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train_qc, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Cross-validated ROC_AUC: 0.726 +/- 0.006
CPU times: total: 24.9 s
Wall time: 2.48 s


In [46]:
correlated_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('quasi_constant_remover', VarianceThreshold(.99 * (1 - .99))),
    ('correlated_remover', CorrSelector(.95)),])

train_cor = correlated_pipeline.fit_transform(train, train_y)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [47]:
train.shape, train_cor.shape

((10000, 545), (10000, 158))

Mutual Information

In [48]:
%%time
# Взаимная Информация
lgbm_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
#     ('quasi_constant_remover', VarianceThreshold(.99 * (1 - .99))),
#     ('correlated_remover', CorrSelector(.9)),
    ('mutual_info_remover', SelectPercentile(mutual_info_classif, percentile=95)),
#     ('scaler', RobustScaler()),
#     ('imputer', SimpleImputer(strategy='median')),
#     ('classifier', CalibratedClassifierCV(
#                         base_estimator=LGBMClassifier(),
#                         method='isotonic'))
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.725 +/- 0.013
CPU times: total: 2min 6s
Wall time: 1min 5s


## Embeded Methods

LinearSVC

In [49]:
%%time
# Классификация линейных опорных векторов (передается в Pipeline с помощью SelectFromModel - реализацию feature_selection в sklearn)
lgbm_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1",dual=False))),
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))



Cross-validated ROC_AUC: 0.732 +/- 0.007
CPU times: total: 1min 23s
Wall time: 43.6 s


LogisticRegression

In [50]:
%%time
lgbm_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    
    ('feature_selection', SelectFromModel(LogisticRegression(penalty="l1",solver='liblinear'))),
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.728 +/- 0.006
CPU times: total: 2min 40s
Wall time: 2min 9s


ExtraTreesClassifier

In [51]:
%%time
lgbm_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    
    ('feature_selection', SelectFromModel(ExtraTreesClassifier(n_estimators=50))),
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.718 +/- 0.008
CPU times: total: 41.3 s
Wall time: 9.32 s


In [149]:
quasi_constant_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('quasi_constant_remover', VarianceThreshold(.99 * (1 - .99))),
    ('scaler', StandardScaler()),

])

train_f = quasi_constant_pipeline.fit_transform(train, train_y)
train_f.shape

(10000, 402)

In [53]:
%%time
lgbm_pipeline = Pipeline([
    ('classifier', LGBMClassifier())
])


scores = cross_val_score(
    lgbm_pipeline, train_f, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.731 +/- 0.006
CPU times: total: 46.6 s
Wall time: 2.98 s


## Wrapper Methods

In [54]:
%%time
sfs = SequentialFeatureSelector(
    RandomForestClassifier(), 
    k_features='best', 
    forward=False, 
    floating=False,
    scoring='roc_auc',
    cv=None,
    verbose=1,
    n_jobs=-1
)

sfs = sfs.fit(train_f[:100], train_y[:100].values)

print(len(sfs.k_feature_idx_))

# print the final prediction score.
print(sfs.k_score_)

# transform to the newly selected features.
train_f_sfs = sfs.transform(train_f)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 402 out of 402 | elapsed:   12.0s finished
Features: 401/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 370 out of 401 | elapsed:    5.8s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 401 out of 401 | elapsed:    6.1s finished
Features: 400/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:    6.1s finished
Features: 399/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 c

[Parallel(n_jobs=-1)]: Done 377 out of 377 | elapsed:    6.0s finished
Features: 376/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 376 out of 376 | elapsed:    6.0s finished
Features: 375/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 344 out of 375 | elapsed:    5.6s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:    5.9s finished
Features: 374/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 374 out of 374 | elapsed:    5

Features: 350/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:    5.5s finished
Features: 349/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 318 out of 349 | elapsed:    5.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 349 out of 349 | elapsed:    5.5s finished
Features: 348/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 317 out of 348 | elapsed:    5.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 348 out of 348 | ela

Features: 321/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 290 out of 321 | elapsed:    4.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 321 out of 321 | elapsed:    4.9s finished
Features: 320/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    4.9s finished
Features: 319/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 288 out of 319 | elapsed:    4.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 319 out of 319 | elapsed:    5.0s finished
Features: 318/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel

Features: 292/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 292 out of 292 | elapsed:    4.5s finished
Features: 291/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 260 out of 291 | elapsed:    4.3s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 291 out of 291 | elapsed:    4.5s finished
Features: 290/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 290 out of 290 | elapsed:    4.5s finished
Features: 289/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 258 out of 289 | elapsed:    4.2s remaining:    0.4s
[Parallel

Features: 260/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 260 out of 260 | elapsed:    4.1s finished
Features: 259/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 228 out of 259 | elapsed:    3.9s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 259 out of 259 | elapsed:    4.1s finished
Features: 258/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 258 out of 258 | elapsed:    4.1s finished
Features: 257/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 226 out of 257 | elapsed:    3.9s remaining:    0.4s
[Parallel

Features: 230/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 230 out of 230 | elapsed:    4.0s finished
Features: 229/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 198 out of 229 | elapsed:    3.7s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 229 out of 229 | elapsed:    3.9s finished
Features: 228/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 228 out of 228 | elapsed:    3.9s finished
Features: 227/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 196 out of 227 | elapsed:    3.7s remaining:    0.5s
[Parallel

Features: 198/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 198 out of 198 | elapsed:    3.3s finished
Features: 197/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 166 out of 197 | elapsed:    3.1s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 197 out of 197 | elapsed:    3.3s finished
Features: 196/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 196 out of 196 | elapsed:    3.3s finished
Features: 195/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 164 out of 195 | elapsed:    3.1s remaining:    0.5s
[Parallel

[Parallel(n_jobs=-1)]: Done 168 out of 168 | elapsed:    2.9s finished
Features: 167/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 136 out of 167 | elapsed:    2.7s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 167 out of 167 | elapsed:    2.8s finished
Features: 166/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 166 out of 166 | elapsed:    2.8s finished
Features: 165/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 134 out of 165 | elapsed:    2.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 165 out of 165 | elapsed:    2.8s finished
Features: 164/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Features: 136/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 136 out of 136 | elapsed:    2.4s finished
Features: 135/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 104 out of 135 | elapsed:    2.2s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    2.3s finished
Features: 134/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 134 out of 134 | elapsed:    2.4s finished
Features: 133/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 102 out of 133 | elapsed:    2.1s remaining:    0.6s
[Parallel

Features: 105/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  74 out of 105 | elapsed:    1.7s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:    1.9s finished
Features: 104/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 104 out of 104 | elapsed:    1.9s finished
Features: 103/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  72 out of 103 | elapsed:    1.7s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done 103 out of 103 | elapsed:    1.9s finished
Features: 102/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.7s
[Parallel

[Parallel(n_jobs=-1)]: Done  76 out of  76 | elapsed:    1.5s finished
Features: 75/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  44 out of  75 | elapsed:    1.3s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    1.4s finished
Features: 74/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  74 out of  74 | elapsed:    1.4s finished
Features: 73/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  42 out of  73 | elapsed:    1.3s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  73 out of  73 | elapsed:    1.4s finished
Features: 72/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Par

[Parallel(n_jobs=-1)]: Done  38 out of  38 | elapsed:    0.8s finished
Features: 37/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 out of  37 | elapsed:    0.8s finished
Features: 36/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    0.8s finished
Features: 35/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:    0.8s finished
Features: 34/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:    0.8s finished
Features: 33/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    0.7s finished
Features: 32/1[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 

391
1.0000000000000002
CPU times: total: 22min 21s
Wall time: 23min 15s


In [55]:
train_f_sfs = sfs.transform(train_f)

In [56]:
%%time
lgbm_pipeline = Pipeline([
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train_f_sfs, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.727 +/- 0.015
CPU times: total: 45.4 s
Wall time: 2.88 s


In [143]:
train_f_sfs.shape

(10000, 391)

# RFECV

In [57]:
%%time
# Рекурсивное исключение функций с перекрестной проверкой для выбора функций

# svc = LinearSVC(penalty='l1', dual=False)
svc = LogisticRegression(penalty="l1",solver='liblinear')

rfecv = RFECV(estimator=svc, step=1, cv=None, min_features_to_select=200,
              scoring='roc_auc', verbose=1)
rfecv.fit(train_f[:1000], train_y[:1000])

Fitting estimator with 402 features.
Fitting estimator with 401 features.
Fitting estimator with 400 features.
Fitting estimator with 399 features.
Fitting estimator with 398 features.
Fitting estimator with 397 features.
Fitting estimator with 396 features.
Fitting estimator with 395 features.
Fitting estimator with 394 features.
Fitting estimator with 393 features.
Fitting estimator with 392 features.
Fitting estimator with 391 features.
Fitting estimator with 390 features.
Fitting estimator with 389 features.
Fitting estimator with 388 features.
Fitting estimator with 387 features.
Fitting estimator with 386 features.
Fitting estimator with 385 features.
Fitting estimator with 384 features.
Fitting estimator with 383 features.
Fitting estimator with 382 features.
Fitting estimator with 381 features.
Fitting estimator with 380 features.
Fitting estimator with 379 features.
Fitting estimator with 378 features.
Fitting estimator with 377 features.
Fitting estimator with 376 features.
F

Fitting estimator with 381 features.
Fitting estimator with 380 features.
Fitting estimator with 379 features.
Fitting estimator with 378 features.
Fitting estimator with 377 features.
Fitting estimator with 376 features.
Fitting estimator with 375 features.
Fitting estimator with 374 features.
Fitting estimator with 373 features.
Fitting estimator with 372 features.
Fitting estimator with 371 features.
Fitting estimator with 370 features.
Fitting estimator with 369 features.
Fitting estimator with 368 features.
Fitting estimator with 367 features.
Fitting estimator with 366 features.
Fitting estimator with 365 features.
Fitting estimator with 364 features.
Fitting estimator with 363 features.
Fitting estimator with 362 features.
Fitting estimator with 361 features.
Fitting estimator with 360 features.
Fitting estimator with 359 features.
Fitting estimator with 358 features.
Fitting estimator with 357 features.
Fitting estimator with 356 features.
Fitting estimator with 355 features.
F

Fitting estimator with 360 features.
Fitting estimator with 359 features.
Fitting estimator with 358 features.
Fitting estimator with 357 features.
Fitting estimator with 356 features.
Fitting estimator with 355 features.
Fitting estimator with 354 features.
Fitting estimator with 353 features.
Fitting estimator with 352 features.
Fitting estimator with 351 features.
Fitting estimator with 350 features.
Fitting estimator with 349 features.
Fitting estimator with 348 features.
Fitting estimator with 347 features.
Fitting estimator with 346 features.
Fitting estimator with 345 features.
Fitting estimator with 344 features.
Fitting estimator with 343 features.
Fitting estimator with 342 features.
Fitting estimator with 341 features.
Fitting estimator with 340 features.
Fitting estimator with 339 features.
Fitting estimator with 338 features.
Fitting estimator with 337 features.
Fitting estimator with 336 features.
Fitting estimator with 335 features.
Fitting estimator with 334 features.
F

Fitting estimator with 340 features.
Fitting estimator with 339 features.
Fitting estimator with 338 features.
Fitting estimator with 337 features.
Fitting estimator with 336 features.
Fitting estimator with 335 features.
Fitting estimator with 334 features.
Fitting estimator with 333 features.
Fitting estimator with 332 features.
Fitting estimator with 331 features.
Fitting estimator with 330 features.
Fitting estimator with 329 features.
Fitting estimator with 328 features.
Fitting estimator with 327 features.
Fitting estimator with 326 features.
Fitting estimator with 325 features.
Fitting estimator with 324 features.
Fitting estimator with 323 features.
Fitting estimator with 322 features.
Fitting estimator with 321 features.
Fitting estimator with 320 features.
Fitting estimator with 319 features.
Fitting estimator with 318 features.
Fitting estimator with 317 features.
Fitting estimator with 316 features.
Fitting estimator with 315 features.
Fitting estimator with 314 features.
F

Fitting estimator with 319 features.
Fitting estimator with 318 features.
Fitting estimator with 317 features.
Fitting estimator with 316 features.
Fitting estimator with 315 features.
Fitting estimator with 314 features.
Fitting estimator with 313 features.
Fitting estimator with 312 features.
Fitting estimator with 311 features.
Fitting estimator with 310 features.
Fitting estimator with 309 features.
Fitting estimator with 308 features.
Fitting estimator with 307 features.
Fitting estimator with 306 features.
Fitting estimator with 305 features.
Fitting estimator with 304 features.
Fitting estimator with 303 features.
Fitting estimator with 302 features.
Fitting estimator with 301 features.
Fitting estimator with 300 features.
Fitting estimator with 299 features.
Fitting estimator with 298 features.
Fitting estimator with 297 features.
Fitting estimator with 296 features.
Fitting estimator with 295 features.
Fitting estimator with 294 features.
Fitting estimator with 293 features.
F

In [58]:
%%time
lgbm_pipeline = Pipeline([
    ('classifier', LGBMClassifier())
])

train_f_rfecv = rfecv.transform(train_f)


scores = cross_val_score(
    lgbm_pipeline, train_f_rfecv, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.731 +/- 0.007
CPU times: total: 44.6 s
Wall time: 2.92 s


In [59]:
train_f_rfecv.shape

(10000, 393)

## Permutation Importance

In [61]:
from eli5.sklearn import PermutationImportance
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel
roc_scorer = make_scorer(roc_auc_score,needs_proba=True)

In [62]:
perm = PermutationImportance(LogisticRegression(penalty="l1", solver='liblinear'), scoring=roc_scorer, cv=None)
perm.fit(train_f[:10000], train_y[:10000].values)

In [63]:
perm_sel = SelectFromModel(perm, threshold=0.001, prefit=True)
train_f_perm = perm_sel.transform(train_f)

In [64]:
train_f_perm.shape

(10000, 98)

In [65]:
%%time
lgbm_pipeline = Pipeline([
    ('label-encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    
#     ('feature_selection', SelectFromModel(perm, threshold=0.05, prefit=False)),
    ('classifier', LGBMClassifier())
])

scores = cross_val_score(
    lgbm_pipeline, train_f_perm, train_y, 
    cv=skf, scoring='roc_auc'
)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f' % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.734 +/- 0.013
CPU times: total: 16.9 s
Wall time: 1.28 s


## GBDTSelector

In [151]:
%%time
gbdt = GBDTSelector()
gbdt.fit(
    train_f,
    train_y,
    lgb_params={'boosting_type':'gbdt','metric':'AUC','verbosity':-1},
    eval_ratio=0.25,
    early_stopping_rounds=10,
    importance_type='gain',
    num_boost_round=100
)



[1]	valid_0's auc: 0.697337
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.708376
[3]	valid_0's auc: 0.707401
[4]	valid_0's auc: 0.711736
[5]	valid_0's auc: 0.70651
[6]	valid_0's auc: 0.705347
[7]	valid_0's auc: 0.709064
[8]	valid_0's auc: 0.717238
[9]	valid_0's auc: 0.717797
[10]	valid_0's auc: 0.713492
[11]	valid_0's auc: 0.715177
[12]	valid_0's auc: 0.71578
[13]	valid_0's auc: 0.714185
[14]	valid_0's auc: 0.71936
[15]	valid_0's auc: 0.718733
[16]	valid_0's auc: 0.715566
[17]	valid_0's auc: 0.717438
[18]	valid_0's auc: 0.715193
[19]	valid_0's auc: 0.715595
[20]	valid_0's auc: 0.713221
[21]	valid_0's auc: 0.714197
[22]	valid_0's auc: 0.713728
[23]	valid_0's auc: 0.715478
[24]	valid_0's auc: 0.716519
Early stopping, best iteration is:
[14]	valid_0's auc: 0.71936
CPU times: total: 3.22 s
Wall time: 249 ms


In [84]:
train_f_gbdt = train_f[:,gbdt.get_selected_features(100)]

In [85]:
%%time
lgbm_pipeline = Pipeline([
    ('classifier', LGBMClassifier())
])


roc_scorer = make_scorer(roc_auc_score,needs_proba=True)
scores = cross_val_score(lgbm_pipeline, train_f_gbdt, train_y, 
                         cv=skf, scoring=roc_scorer)
print('Cross-validated ROC_AUC: %0.3f +/- %0.3f'
      % (scores.mean(), scores.std()))

Cross-validated ROC_AUC: 0.728 +/- 0.008
CPU times: total: 13.2 s
Wall time: 830 ms
