In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import SCORERS
from sklearn.metrics import roc_auc_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

In [2]:
from hcdr_applications import load_datasets, preprocessing_transformations, make_prep_pipeline

In [3]:
# Load the raw tables from DATA_DIR via the project helper `load_datasets`.
# The result is indexed by dataset name in the cells below
# (e.g. datasets['application_train']).
DATA_DIR = "../data"
# Full set of table names, kept for reference; only the two application
# tables are loaded in this notebook:
# ds_names = ("application_train", "application_test", "bureau","bureau_balance","credit_card_balance","installments_payments",
#             "previous_application","POS_CASH_balance")
ds_names = ("application_train", "application_test")
datasets = load_datasets(DATA_DIR, ds_names)

In [4]:
# Hand-picked categorical attributes (20 names, order preserved).
cat_selected = [
    "CODE_GENDER",
    "NAME_EDUCATION_TYPE",
    "NAME_INCOME_TYPE",
    "FLAG_WORK_PHONE",
    "ORGANIZATION_TYPE",
    "FLAG_OWN_CAR",
    "OCCUPATION_TYPE",
    "REG_CITY_NOT_LIVE_CITY",
    "NAME_FAMILY_STATUS",
    "FLAG_PHONE",
    "FLAG_OWN_REALTY",
    "FLAG_DOCUMENT_6",
    "FLAG_DOCUMENT_18",
    "WEEKDAY_APPR_PROCESS_START",
    "NAME_TYPE_SUITE",
    "FLAG_DOCUMENT_16",
    "NAME_HOUSING_TYPE",
    "EMERGENCYSTATE_MODE",
    "FLAG_DOCUMENT_3",
    "WALLSMATERIAL_MODE",
]

In [5]:
# Target column is read from the raw training table before the table is
# handed to the project helper `preprocessing_transformations`.
y = datasets['application_train']['TARGET']
X = preprocessing_transformations(datasets['application_train'])
# Hold out 20% of the rows, stratified on the target, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
# NOTE(review): unlike X, the Kaggle test table is kept raw here (it is not
# passed through preprocessing_transformations) — confirm that is intended
# before using it for predictions.
X_kaggle_test = datasets['application_test']

# cat_selected=None: the hand-picked `cat_selected` list is NOT passed here.
# Presumably None makes the helper keep every categorical attribute — confirm
# in hcdr_applications.make_prep_pipeline.
data_prep_pipeline, num_attribs_total, cat_attribs = make_prep_pipeline(cat_selected=None)

In [6]:
%%time
# Sparse (L1-penalised) logistic regression used only to decide which of the
# prepared features survive. SelectFromModel fits its own clone of it.
# NOTE(review): C=0.006404 is presumably the result of an earlier
# regularisation search — confirm its origin.
l1_estimator = LogisticRegression(
    penalty='l1',
    C=0.006404,
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)

# Final classifier trained on the features kept by the selector.
final_classifier = LogisticRegression(class_weight='balanced')

# Preparation -> L1 feature selection -> classifier.
full_pipeline_with_predictor = Pipeline(steps=[
    ("preparation", data_prep_pipeline),
    ('L1_selector', SelectFromModel(estimator=l1_estimator)),
    ("model", final_classifier),
])
model = full_pipeline_with_predictor.fit(X_train, y_train)

CPU times: user 22.5 s, sys: 4.03 s, total: 26.5 s
Wall time: 12.6 s


In [26]:
# Probability of the positive class (column 1) for the train and hold-out
# splits, scored in that order.
y_train_pred_proba, y_test_pred_proba = (
    model.predict_proba(split)[:, 1] for split in (X_train, X_test)
)
# Displayed as (train AUC, test AUC).
(
    roc_auc_score(y_train, y_train_pred_proba),
    roc_auc_score(y_test, y_test_pred_proba),
)

(0.749646813560819, 0.7508619874299979)

In [27]:
# Second entry of the preparation union, taken as the categorical branch
# (assumes transformer_list[1] is the categorical pipeline — confirm in
# make_prep_pipeline).
cat_pipeline = data_prep_pipeline.transformer_list[1][1]
ohe_categories = cat_pipeline.named_steps['ohe'].categories_

# One "<attribute>_<category>" name per one-hot column, in encoder order.
cat_features = [
    f'{base}_{c}'
    for base, ohe_c in zip(cat_attribs, ohe_categories)
    for c in ohe_c
]

# Numeric names first, then the expanded categorical names.
features = [*num_attribs_total, *cat_features]
total_num_features = len(features)
(total_num_features, len(num_attribs_total), len(cat_features))

(255, 50, 205)

In [28]:
# Fitted L1 selector step and the boolean mask of the columns it kept.
selector_model = full_pipeline_with_predictor.named_steps['L1_selector']
support_mask = selector_model.get_support()

# Names of the kept columns, in pipeline order.
selected_features = list(np.array(features)[support_mask])

# Displayed as (#selected features, #input attributes, #expanded features).
(
    len(selected_features),
    len(num_attribs_total) + len(cat_attribs),
    total_num_features,
)

(75, 97, 255)

In [15]:
# Positive-class probabilities and 0.5-thresholded labels for both splits.
y_train_pred_proba = model.predict_proba(X_train)[:, 1]
y_test_pred_proba = model.predict_proba(X_test)[:, 1]
y_train_pred = y_train_pred_proba > 0.5
y_test_pred = y_test_pred_proba > 0.5

# 5-fold cross-validated ROC AUC of the whole pipeline on the training split.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc', verbose=1)

# Create the experiment log only once; on re-runs the existing log is kept and
# a new row is appended to it.
if 'expLog' not in globals():
    log_columns = [
        "exp_name",
        "Train AUC",
        "5-fold Valid AUC",
        "5-fold Valid AUC std",
        "Test  AUC",
    ]
    expLog = pd.DataFrame(columns=log_columns)

# Label: <#selected features>(<#input attributes>.<#expanded features>).
n_selected = len(selected_features)
n_input_attribs = len(num_attribs_total + cat_attribs)
exp_name = f"Baseline_{n_selected}({n_input_attribs}.{total_num_features})_features"

# Train AUC, CV mean, CV std, test AUC — all rounded to 4 decimals.
rounded_metrics = np.round(
    [
        roc_auc_score(y_train, y_train_pred_proba),
        scores.mean(),
        scores.std(),
        roc_auc_score(y_test, y_test_pred_proba),
    ],
    4,
)
expLog.loc[len(expLog)] = [exp_name, *rounded_metrics]
expLog

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min finished


Unnamed: 0,exp_name,Train AUC,5-fold Valid AUC,5-fold Valid AUC std,Test AUC
0,Baseline_77(70.199)_features,0.7496,0.747,0.0051,0.7509


### Interpreting the L1 Selector Estimator

In [22]:
# Hand-recorded snapshots of the one-hot-expanded feature names kept by the
# L1 selector in earlier runs.
#
# Earlier, larger snapshot — kept as a comment for reference only:
# ['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_EMPLOYED', 'DAYS_BIRTH',
#  'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION', 'OWN_CAR_AGE',
#  'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
#  'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
#  'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
#  'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START',
#  'annuity_income_percentage', 'car_to_birth_ratio', 'children_ratio', 'credit_to_annuity_ratio',
#  'credit_to_goods_ratio', 'credit_to_income_ratio', 'days_employed_percentage',
#  'income_credit_percentage', 'income_per_child', 'payment_rate', 'phone_to_birth_ratio',
#  'phone_to_employ_ratio', 'external_source_mean', 'child_to_non_child_ratio', 'credit_per_person',
#  'CODE_GENDER_F', 'CODE_GENDER_M', 'FLAG_OWN_CAR_N', 'FLAG_OWN_CAR_Y', 'FLAG_OWN_REALTY_N',
#  'FLAG_OWN_REALTY_Y', 'NAME_TYPE_SUITE_Unaccompanied', 'NAME_INCOME_TYPE_State servant',
#  'NAME_INCOME_TYPE_Working', 'NAME_EDUCATION_TYPE_Higher education',
#  'NAME_EDUCATION_TYPE_Incomplete higher', 'NAME_EDUCATION_TYPE_Lower secondary',
#  'NAME_EDUCATION_TYPE_Secondary / secondary special', 'NAME_FAMILY_STATUS_Civil marriage',
#  'NAME_FAMILY_STATUS_Married', 'NAME_FAMILY_STATUS_Widow', 'NAME_HOUSING_TYPE_House / apartment',
#  'NAME_HOUSING_TYPE_Municipal apartment', 'NAME_HOUSING_TYPE_Rented apartment',
#  'OCCUPATION_TYPE_Accountants', 'OCCUPATION_TYPE_Core staff', 'OCCUPATION_TYPE_Drivers',
#  'OCCUPATION_TYPE_High skill tech staff', 'OCCUPATION_TYPE_Laborers',
#  'OCCUPATION_TYPE_Low-skill Laborers', 'OCCUPATION_TYPE_Managers', 'OCCUPATION_TYPE_Sales staff',
#  'OCCUPATION_TYPE_Security staff', 'OCCUPATION_TYPE_missing', 'WALLSMATERIAL_MODE_Panel',
#  'WALLSMATERIAL_MODE_Stone, brick', 'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_missing',
#  'WEEKDAY_APPR_PROCESS_START_FRIDAY', 'WEEKDAY_APPR_PROCESS_START_MONDAY',
#  'WEEKDAY_APPR_PROCESS_START_SATURDAY', 'WEEKDAY_APPR_PROCESS_START_SUNDAY',
#  'WEEKDAY_APPR_PROCESS_START_TUESDAY', 'WEEKDAY_APPR_PROCESS_START_WEDNESDAY',
#  'ORGANIZATION_TYPE_Business Entity Type 2', 'ORGANIZATION_TYPE_Business Entity Type 3',
#  'ORGANIZATION_TYPE_Construction', 'ORGANIZATION_TYPE_Industry: type 3',
#  'ORGANIZATION_TYPE_Industry: type 9', 'ORGANIZATION_TYPE_Military', 'ORGANIZATION_TYPE_School',
#  'ORGANIZATION_TYPE_Self-employed', 'ORGANIZATION_TYPE_Trade: type 2',
#  'ORGANIZATION_TYPE_Trade: type 7', 'ORGANIZATION_TYPE_Transport: type 3',
#  'FLAG_WORK_PHONE_0', 'FLAG_WORK_PHONE_1', 'FLAG_PHONE_0', 'FLAG_PHONE_1', 'FLAG_EMAIL_0',
#  'FLAG_EMAIL_1', 'REG_REGION_NOT_LIVE_REGION_0', 'REG_REGION_NOT_LIVE_REGION_1',
#  'REG_REGION_NOT_WORK_REGION_0', 'REG_REGION_NOT_WORK_REGION_1', 'REG_CITY_NOT_LIVE_CITY_0',
#  'REG_CITY_NOT_LIVE_CITY_1', 'FLAG_DOCUMENT_3_0', 'FLAG_DOCUMENT_3_1', 'FLAG_DOCUMENT_5_0',
#  'FLAG_DOCUMENT_5_1', 'FLAG_DOCUMENT_6_0', 'FLAG_DOCUMENT_6_1', 'FLAG_DOCUMENT_8_1',
#  'FLAG_DOCUMENT_16_0', 'FLAG_DOCUMENT_16_1', 'FLAG_DOCUMENT_18_0', 'FLAG_DOCUMENT_18_1']

# Later snapshot (86 names). It is assigned for real so that the following
# `len(x)` cell works on a fresh kernel instead of relying on a value of `x`
# left over from an earlier session.
x = [
    # numeric / engineered features
    'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_EMPLOYED', 'DAYS_BIRTH',
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION',
    'OWN_CAR_AGE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
    'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'REGION_RATING_CLIENT_W_CITY',
    'HOUR_APPR_PROCESS_START', 'annuity_income_percentage', 'car_to_birth_ratio', 'children_ratio',
    'credit_to_annuity_ratio', 'credit_to_goods_ratio', 'credit_to_income_ratio',
    'days_employed_percentage', 'income_credit_percentage', 'income_per_child', 'payment_rate',
    'phone_to_birth_ratio', 'phone_to_employ_ratio', 'external_source_mean',
    'child_to_non_child_ratio', 'credit_per_person',
    # one-hot encoded categorical levels
    'CODE_GENDER_F', 'CODE_GENDER_M', 'FLAG_OWN_CAR_N', 'FLAG_OWN_CAR_Y', 'FLAG_OWN_REALTY_N',
    'FLAG_OWN_REALTY_Y', 'NAME_TYPE_SUITE_Unaccompanied', 'NAME_INCOME_TYPE_State servant',
    'NAME_INCOME_TYPE_Working', 'NAME_EDUCATION_TYPE_Higher education',
    'NAME_EDUCATION_TYPE_Lower secondary', 'NAME_EDUCATION_TYPE_Secondary / secondary special',
    'NAME_FAMILY_STATUS_Civil marriage', 'NAME_FAMILY_STATUS_Married', 'NAME_FAMILY_STATUS_Widow',
    'NAME_HOUSING_TYPE_House / apartment', 'NAME_HOUSING_TYPE_Municipal apartment',
    'OCCUPATION_TYPE_Accountants', 'OCCUPATION_TYPE_Core staff', 'OCCUPATION_TYPE_Drivers',
    'OCCUPATION_TYPE_Laborers', 'OCCUPATION_TYPE_Low-skill Laborers', 'OCCUPATION_TYPE_Sales staff',
    'OCCUPATION_TYPE_missing', 'WALLSMATERIAL_MODE_Panel', 'EMERGENCYSTATE_MODE_No',
    'EMERGENCYSTATE_MODE_missing', 'WEEKDAY_APPR_PROCESS_START_MONDAY',
    'WEEKDAY_APPR_PROCESS_START_SATURDAY', 'WEEKDAY_APPR_PROCESS_START_TUESDAY',
    'WEEKDAY_APPR_PROCESS_START_WEDNESDAY', 'ORGANIZATION_TYPE_Business Entity Type 3',
    'ORGANIZATION_TYPE_Construction', 'ORGANIZATION_TYPE_Industry: type 9',
    'ORGANIZATION_TYPE_Military', 'ORGANIZATION_TYPE_Self-employed',
    'FLAG_WORK_PHONE_0', 'FLAG_WORK_PHONE_1', 'FLAG_PHONE_0', 'FLAG_PHONE_1',
    'REG_CITY_NOT_LIVE_CITY_0', 'REG_CITY_NOT_LIVE_CITY_1', 'FLAG_DOCUMENT_3_0', 'FLAG_DOCUMENT_3_1',
    'FLAG_DOCUMENT_6_0', 'FLAG_DOCUMENT_6_1', 'FLAG_DOCUMENT_16_0', 'FLAG_DOCUMENT_16_1',
    'FLAG_DOCUMENT_18_0', 'FLAG_DOCUMENT_18_1',
]

In [23]:
# Number of names in the feature list `x`.
# NOTE(review): `x` must already be defined in the kernel when this cell runs.
len(x)

86

In [29]:
def _base_attribute(feature, num_attribs, cat_attribs):
    """Map an expanded feature name back to the input attribute it came from.

    Parameters
    ----------
    feature : str
        A numeric attribute name, or a one-hot name of the form
        "<attribute>_<category>".
    num_attribs : sequence of str
        Numeric attribute names; these map to themselves.
    cat_attribs : sequence of str
        Categorical attribute names that were one-hot encoded.

    Returns
    -------
    str
        The owning attribute name.

    Notes
    -----
    Dropping only the last "_"-separated token is wrong when the category
    value itself contains an underscore (e.g. "NAME_TYPE_SUITE_Other_A" would
    become "NAME_TYPE_SUITE_Other"). The known categorical attribute names are
    therefore matched as prefixes, and the longest match wins so that an
    attribute whose name is a prefix of another cannot steal the feature.
    The last-token heuristic remains only as a fallback for names that match
    no known attribute.
    """
    if feature in num_attribs:
        return feature
    owners = [attrib for attrib in cat_attribs if feature.startswith(f'{attrib}_')]
    if owners:
        return max(owners, key=len)
    return '_'.join(feature.split('_')[:-1])


# Numeric attributes that survived selection (their names are unchanged).
selected_num_attribs = set(f for f in selected_features if f in num_attribs_total)
# Categorical attributes with at least one surviving one-hot column.
selected_cat_attribs = set(
    _base_attribute(f, num_attribs_total, cat_attribs)
    for f in selected_features
    if f not in num_attribs_total
)
# Every input attribute that contributes at least one selected feature.
selected_attribs = selected_num_attribs | selected_cat_attribs

In [30]:
# Numeric attributes kept by the selector, how many were kept, and how many
# numeric attributes were offered in total.
selected_num_attribs, len(selected_num_attribs), len(num_attribs_total)

({'AMT_ANNUITY',
  'AMT_GOODS_PRICE',
  'AMT_REQ_CREDIT_BUREAU_DAY',
  'AMT_REQ_CREDIT_BUREAU_MON',
  'AMT_REQ_CREDIT_BUREAU_QRT',
  'AMT_REQ_CREDIT_BUREAU_WEEK',
  'AMT_REQ_CREDIT_BUREAU_YEAR',
  'DAYS_BIRTH',
  'DAYS_EMPLOYED',
  'DAYS_ID_PUBLISH',
  'DAYS_REGISTRATION',
  'DEF_30_CNT_SOCIAL_CIRCLE',
  'DEF_60_CNT_SOCIAL_CIRCLE',
  'EXT_SOURCE_1',
  'EXT_SOURCE_2',
  'EXT_SOURCE_3',
  'HOUR_APPR_PROCESS_START',
  'OBS_30_CNT_SOCIAL_CIRCLE',
  'OWN_CAR_AGE',
  'REGION_POPULATION_RELATIVE',
  'REGION_RATING_CLIENT_W_CITY',
  'annuity_income_percentage',
  'car_to_birth_ratio',
  'child_to_non_child_ratio',
  'children_ratio',
  'credit_per_person',
  'credit_to_goods_ratio',
  'credit_to_income_ratio',
  'days_employed_percentage',
  'external_source_mean',
  'income_per_child',
  'payment_rate',
  'phone_to_birth_ratio',
  'phone_to_employ_ratio'},
 34,
 50)

In [31]:
# Categorical attributes with at least one selected one-hot column, how many
# there are, and how many categorical attributes were offered in total.
selected_cat_attribs, len(selected_cat_attribs), len(cat_attribs)

({'CODE_GENDER',
  'EMERGENCYSTATE_MODE',
  'FLAG_DOCUMENT_16',
  'FLAG_DOCUMENT_18',
  'FLAG_DOCUMENT_3',
  'FLAG_DOCUMENT_6',
  'FLAG_OWN_CAR',
  'FLAG_OWN_REALTY',
  'FLAG_PHONE',
  'FLAG_WORK_PHONE',
  'NAME_EDUCATION_TYPE',
  'NAME_FAMILY_STATUS',
  'NAME_HOUSING_TYPE',
  'NAME_INCOME_TYPE',
  'NAME_TYPE_SUITE',
  'OCCUPATION_TYPE',
  'ORGANIZATION_TYPE',
  'REG_CITY_NOT_LIVE_CITY',
  'WALLSMATERIAL_MODE',
  'WEEKDAY_APPR_PROCESS_START'},
 20,
 47)

In [32]:
# Input attributes (numeric or categorical) that contributed no selected feature.
unused_attribs = set(num_attribs_total).union(cat_attribs).difference(selected_attribs)

In [33]:
# Attributes dropped entirely by the selector, and how many there are.
unused_attribs, len(unused_attribs)

({'AMT_CREDIT',
  'AMT_INCOME_TOTAL',
  'AMT_REQ_CREDIT_BUREAU_HOUR',
  'CNT_CHILDREN',
  'CNT_FAM_MEMBERS',
  'DAYS_LAST_PHONE_CHANGE',
  'FLAG_CONT_MOBILE',
  'FLAG_DOCUMENT_10',
  'FLAG_DOCUMENT_11',
  'FLAG_DOCUMENT_12',
  'FLAG_DOCUMENT_13',
  'FLAG_DOCUMENT_14',
  'FLAG_DOCUMENT_15',
  'FLAG_DOCUMENT_17',
  'FLAG_DOCUMENT_19',
  'FLAG_DOCUMENT_2',
  'FLAG_DOCUMENT_20',
  'FLAG_DOCUMENT_21',
  'FLAG_DOCUMENT_4',
  'FLAG_DOCUMENT_5',
  'FLAG_DOCUMENT_7',
  'FLAG_DOCUMENT_8',
  'FLAG_DOCUMENT_9',
  'FLAG_EMAIL',
  'FLAG_EMP_PHONE',
  'FLAG_MOBIL',
  'HOUSETYPE_MODE',
  'LIVE_CITY_NOT_WORK_CITY',
  'LIVE_REGION_NOT_WORK_REGION',
  'NAME_CONTRACT_TYPE',
  'OBS_60_CNT_SOCIAL_CIRCLE',
  'REGION_RATING_CLIENT',
  'REG_CITY_NOT_WORK_CITY',
  'REG_REGION_NOT_LIVE_REGION',
  'REG_REGION_NOT_WORK_REGION',
  'car_to_employ_ratio',
  'cnt_non_child',
  'credit_per_child',
  'credit_per_non_child',
  'credit_to_annuity_ratio',
  'income_credit_percentage',
  'income_per_non_child',
  'income_pe