# utils

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, precision_recall_curve, f1_score
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
import datetime
import warnings
from joblib import dump
warnings.filterwarnings('ignore')

from sklearn.base import BaseEstimator, TransformerMixin

def load_data(in_path, name):
    """Read the CSV located at ``in_path`` and return it as a DataFrame.

    ``name`` is accepted for the caller's convenience but is not used here.
    """
    return pd.read_csv(in_path)

def load_datasets(DATA_DIR, ds_names):
    """Load ``<DATA_DIR>/<ds_name>.csv`` for every name in ``ds_names``.

    Returns a dict mapping each dataset name to the object produced by
    ``load_data`` for the corresponding file.
    """
    return {
        table: load_data(os.path.join(DATA_DIR, f'{table}.csv'), table)
        for table in ds_names
    }
    
def pct(x):
    """Express the fraction ``x`` as a percentage rounded to three decimals."""
    scaled = 100 * x
    return round(scaled, 3)

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks a fixed set of columns.

    ``transform`` indexes its input with ``attribute_names`` and returns the
    ``.values`` of the selection.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; the selector itself is returned."""
        return self

    def transform(self, X):
        """Return the ``.values`` of the selected columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [3]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that picks a fixed set of columns.

    Unlike a ``.values``-based selector, ``transform`` hands back the result
    of indexing ``X`` directly, without converting it.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; the selector itself is returned."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``attribute_names``."""
        wanted = self.attribute_names
        return X[wanted]


def show_scores(y, y_pred, percentages=False):
    """Print precision/recall/F1/accuracy and draw the confusion matrix.

    Parameters
    ----------
    y : true labels; must expose ``.shape`` (it is read for the sample count).
    y_pred : predicted labels, compared element-wise against ``y``.
    percentages : when true, the confusion matrix cells are rescaled to
        ``100 * count / y.shape[0]`` before plotting.
    """
    matrix = confusion_matrix(y, y_pred)
    if percentages:
        matrix = 100*matrix/y.shape[0]

    print('scores\n')
    # The labels are padded so the printed values line up.
    for label, scorer in (('precision', precision_score),
                          ('recall   ', recall_score),
                          ('f1       ', f1_score)):
        print(label, scorer(y, y_pred))
    print('accuracy ', np.sum(y == y_pred)/y.shape[0])

    # The heatmap is drawn on the axes that plt.subplot() just made current.
    ax = plt.subplot()
    sns.heatmap(matrix, annot=True, fmt='3.1f')

    # labels and title
    ax.set(xlabel='Predicted labels',
           ylabel='True labels',
           title='Confusion Matrix')

# loading the datasets

In [None]:
# Folder that load_datasets() joins with "<name>.csv" for every table below.
DATA_DIR = "home-credit-default-risk"
ds_names = (
    "application_train",
    "application_test",
    "bureau",
    "bureau_balance",
    "credit_card_balance",
    "installments_payments",
    "previous_application",
    "POS_CASH_balance",
)

# name -> loaded table
datasets = load_datasets(DATA_DIR, ds_names)

# transformations

In [5]:
def transform_days(X):
    """Blank out positive entries and flip the sign of the rest.

    Every value ``> 0`` becomes NaN; all other values are negated, so
    non-positive inputs come back as non-negative numbers.

    The input is never modified: pandas objects go through ``mask`` (which
    returns a new object) and anything else is copied into a float array
    first.

    Parameters
    ----------
    X : pandas DataFrame/Series or array-like of numbers.

    Returns
    -------
    Same kind of pandas object as ``X``, or a float ``numpy.ndarray`` for
    non-pandas input.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        return -X.mask(X > 0)

    # A float copy is required because NaN cannot be stored in an int array.
    values = np.array(X, dtype=float)
    values[values > 0] = np.nan
    # return np.log1p(-1*X)  -- log-scaled variant kept for reference
    return -values

In [6]:
def preprocessing_transformations(df, inplace=False):
    """Apply the stateless, per-column clean-ups to an application frame.

    Steps, all written to the returned frame:
      * ``log1p`` on the right-skewed columns (``AMT_ANNUITY``);
      * ``log(1 + max - x)`` on the left-skewed columns (currently none);
      * ``transform_days`` on ``DAYS_EMPLOYED``;
      * missing ``OWN_CAR_AGE`` filled with 0;
      * placeholder categories (``'Unknown'`` / ``'XNA'``) turned into NaN.

    Parameters
    ----------
    df : DataFrame containing the columns named above.
    inplace : when true, ``df`` itself is modified and returned; otherwise
        the work is done on a copy and ``df`` is left untouched.

    Returns
    -------
    The transformed DataFrame.
    """
    df_new = df if inplace else df.copy()

    right_skewed = ['AMT_ANNUITY']
    left_skewed = []
    days = ['DAYS_EMPLOYED']

    def transform_left_skewed(X):
        return np.log(1 + np.max(X) - X)

    df_new[right_skewed] = np.log1p(df[right_skewed])
    if left_skewed:  # nothing to assign while the list is empty
        df_new[left_skewed] = transform_left_skewed(df[left_skewed])
    df_new[days] = transform_days(df[days])

    # others
    df_new['OWN_CAR_AGE'] = df_new['OWN_CAR_AGE'].fillna(0)

    # Replace on the output frame and assign the result back. A chained
    # ``df[col].replace(..., inplace=True)`` would touch the input frame (or
    # nothing at all under pandas copy-on-write) and leave the copy uncleaned.
    placeholders = {
        'NAME_FAMILY_STATUS': 'Unknown',
        'ORGANIZATION_TYPE': 'XNA',
        'CODE_GENDER': 'XNA',
    }
    for column, token in placeholders.items():
        df_new[column] = df_new[column].replace(token, np.nan)

    return df_new


def add_new_features(df, inplace=False):
    """Append the derived ratio and aggregate columns to an application frame.

    Each new column is either a quotient of two existing columns (optionally
    with 1 added to the denominator), the row-wise mean of the three
    ``EXT_SOURCE_*`` columns, or the ``cnt_non_child`` difference.

    Parameters
    ----------
    df : DataFrame holding the source columns referenced below.
    inplace : when true, columns are added to ``df`` itself; otherwise to a
        copy.

    Returns
    -------
    The frame that received the new columns.
    """
    X = df if inplace else df.copy()

    def ratio(top, bottom, shift=0):
        # Quotient of two columns; a non-zero shift is added to the
        # denominator before dividing.
        denominator = X[bottom]
        if shift:
            denominator = shift + denominator
        return X[top] / denominator

    # (new column, numerator, denominator, denominator shift), in output order
    first_batch = (
        ('annuity_income_percentage', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL', 0),
        ('car_to_birth_ratio', 'OWN_CAR_AGE', 'DAYS_BIRTH', 0),
        ('car_to_employ_ratio', 'OWN_CAR_AGE', 'DAYS_EMPLOYED', 1),
        ('children_ratio', 'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 0),
        ('credit_to_annuity_ratio', 'AMT_CREDIT', 'AMT_ANNUITY', 0),
        ('credit_to_goods_ratio', 'AMT_CREDIT', 'AMT_GOODS_PRICE', 0),
        ('credit_to_income_ratio', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 0),
        ('days_employed_percentage', 'DAYS_EMPLOYED', 'DAYS_BIRTH', 0),
        ('income_credit_percentage', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 0),
        ('income_per_child', 'AMT_INCOME_TOTAL', 'CNT_CHILDREN', 1),
        ('income_per_person', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS', 0),
        ('payment_rate', 'AMT_ANNUITY', 'AMT_CREDIT', 0),
        ('phone_to_birth_ratio', 'DAYS_LAST_PHONE_CHANGE', 'DAYS_BIRTH', 0),
        ('phone_to_employ_ratio', 'DAYS_LAST_PHONE_CHANGE', 'DAYS_EMPLOYED', 1),
    )
    for feature, top, bottom, shift in first_batch:
        X[feature] = ratio(top, bottom, shift)

    ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
    X['external_source_mean'] = X[ext_sources].mean(axis=1)
    X['cnt_non_child'] = X['CNT_FAM_MEMBERS'] - X['CNT_CHILDREN']

    # These depend on cnt_non_child, so they must come after it.
    second_batch = (
        ('child_to_non_child_ratio', 'CNT_CHILDREN', 'cnt_non_child', 0),
        ('income_per_non_child', 'AMT_INCOME_TOTAL', 'cnt_non_child', 0),
        ('credit_per_person', 'AMT_CREDIT', 'CNT_FAM_MEMBERS', 0),
        ('credit_per_child', 'AMT_CREDIT', 'CNT_CHILDREN', 1),
        ('credit_per_non_child', 'AMT_CREDIT', 'cnt_non_child', 0),
    )
    for feature, top, bottom, shift in second_batch:
        X[feature] = ratio(top, bottom, shift)

    return X

# agg features

In [None]:
from transform_agg_merge import cash_transform,cashAppsFeaturesAggregater,install_transform,instlmntAppsFeaturesAggregater,credit_transform,creditAppsFeaturesAggregater

appl_train = preprocessing_transformations(datasets['application_train'])

# Build one per-applicant aggregate table from each auxiliary dataset.
cash = datasets['POS_CASH_balance']
cashData_aggr = cashAppsFeaturesAggregater(cash_transform(cash))
install = datasets['installments_payments']
instlmntData_aggr = instlmntAppsFeaturesAggregater(install_transform(install))
credit = datasets['credit_card_balance']
creditData_aggr = creditAppsFeaturesAggregater(credit_transform(credit))

# Chain the left joins so every aggregate ends up in data_aggr. Merging each
# table onto appl_train separately would overwrite the previous result and
# keep only the last aggregate.
# NOTE(review): assumes the three aggregate tables do not share column names
# other than SK_ID_CURR; otherwise pandas adds _x/_y suffixes -- confirm.
data_aggr = appl_train
for aggregated in (cashData_aggr, instlmntData_aggr, creditData_aggr):
    data_aggr = data_aggr.merge(aggregated, how='left', on=['SK_ID_CURR'])

# pipeline

In [7]:
def _make_one_hot_encoder():
    """Build a dense, unknown-tolerant one-hot encoder on any sklearn version."""
    try:
        # Keyword introduced in scikit-learn 1.2.
        return OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    except TypeError:
        # Older releases only accept the original ``sparse`` keyword, which
        # newer releases have removed.
        return OneHotEncoder(sparse=False, handle_unknown="ignore")


def make_prep_pipeline():
    """Assemble the feature-preparation step for the application table.

    The numerical branch adds the engineered features, selects the numeric
    columns, mean-imputes and standardises them. The categorical branch
    selects the categorical/flag columns, fills gaps with the constant
    ``'missing'`` and one-hot encodes them. Both branches are combined in a
    ``FeatureUnion``.

    Returns
    -------
    tuple
        ``(data_prep_pipeline, num_attribs_total, cat_attribs)``: the
        ``FeatureUnion`` plus the column lists fed to each branch.
    """
    # numerical and categorical pipelines
    num_attribs = ['AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'AMT_CREDIT', 'REGION_POPULATION_RELATIVE',
                   'DAYS_EMPLOYED', 'DAYS_BIRTH', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_ID_PUBLISH',
                   'DAYS_REGISTRATION', 'DAYS_LAST_PHONE_CHANGE', 'OWN_CAR_AGE', 'OBS_30_CNT_SOCIAL_CIRCLE',
                   'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE',
                   'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
                   'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
                   'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
                   'HOUR_APPR_PROCESS_START']
    # Columns created by add_new_features(); they exist only after that step.
    new_features = ['annuity_income_percentage', 'car_to_birth_ratio', 'car_to_employ_ratio', 'children_ratio',
                    'credit_to_annuity_ratio', 'credit_to_goods_ratio', 'credit_to_income_ratio', 'days_employed_percentage',
                    'income_credit_percentage', 'income_per_child', 'income_per_person', 'payment_rate', 'phone_to_birth_ratio',
                    'phone_to_employ_ratio', 'external_source_mean', 'cnt_non_child', 'child_to_non_child_ratio',
                    'income_per_non_child', 'credit_per_person', 'credit_per_child', 'credit_per_non_child']
    num_attribs_total = num_attribs + new_features

    num_pipeline = Pipeline([
        ('new_features', FunctionTransformer(add_new_features)),
        ('selector', DataFrameSelector(num_attribs_total)),
        ('imputer', SimpleImputer(strategy='mean')),
        ('std_scaler', StandardScaler()),
    ])

    cat_attribs = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
                   'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
                   'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE',
                   'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
                   'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY']
    # FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21
    cat_attribs += [f'FLAG_DOCUMENT_{number}' for number in range(2, 22)]

    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('ohe', _make_one_hot_encoder())
    ])

    data_prep_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline)
    ])
    return (data_prep_pipeline, num_attribs_total, cat_attribs)

In [63]:
from sklearn.ensemble import GradientBoostingClassifier

def main():
    """Grid-search a gradient-boosting pipeline and report its scores.

    Reads ``application_train`` from the module-level ``datasets`` dict,
    holds out a stratified 20% test split, and tunes a pipeline made of
    feature preparation, L1-logistic feature selection and gradient boosting
    with 3-fold ROC-AUC grid search. The train/CV/test scores and timings of
    the best candidate are printed.

    Returns
    -------
    pandas.DataFrame
        One-row results table (the same table that is printed).
    """
    # split train and test data
    y = datasets['application_train']['TARGET']
    X = preprocessing_transformations(datasets['application_train'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42)

    # create numerical and categorical pipelines
    data_prep_pipeline, _, _ = make_prep_pipeline()

    # hyper-parameter grid for the boosting step
    params = {
        'gb__n_estimators': (5000, 7000),
        'gb__n_iter_no_change': (50, 80),
        'gb__learning_rate': (0.05, 0.001),
        'gb__min_samples_split': (5, 10),
        'gb__subsample': (0.4, 0.8),
    }

    np.random.seed(42)
    full_pipeline_with_predictor = Pipeline([
        ("preparation", data_prep_pipeline),
        ('L1_selector', SelectFromModel(LogisticRegression(
            C=0.006404,
            penalty='l1',
            solver='liblinear',
            class_weight='balanced',
            random_state=0))),
        # An explicit random_state is needed for repeatable runs: the global
        # NumPy seed set above is not shared with the n_jobs=-1 worker
        # processes.
        ("gb", GradientBoostingClassifier(max_features='log2',
                                          max_leaf_nodes=6,
                                          random_state=42))
    ])

    # perform grid search
    grid_search = GridSearchCV(full_pipeline_with_predictor, params, scoring='roc_auc', cv=3,
                               n_jobs=-1, verbose=3)
    grid_search.fit(X_train, y_train)

    print('trained')

    # Score the refitted best estimator on the train and held-out test splits
    best_model = grid_search.best_estimator_
    y_train_pred_proba = best_model.predict_proba(X_train)[:, 1]
    y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
    best_train_score = np.round(roc_auc_score(y_train, y_train_pred_proba), 5)
    best_test_score = np.round(roc_auc_score(y_test, y_test_pred_proba), 5)

    # Cross-validation statistics of the winning candidate
    cv_results = grid_search.cv_results_
    best_index = grid_search.best_index_
    best_cv_score = np.round(grid_search.best_score_, 5)
    best_cv_std = np.round(cv_results['std_test_score'][best_index], 5)
    mean_fit_time = np.round(cv_results['mean_fit_time'][best_index], 5)
    mean_score_time = np.round(cv_results['mean_score_time'][best_index], 5)

    print("Best Parameters:")
    print(grid_search.best_params_)

    # Record the results. The frame is built directly because
    # DataFrame.append no longer exists in pandas 2.x.
    results = pd.DataFrame(
        [{
            "ExpID": 'GradientBoosting',
            "Train Score": best_train_score,
            "CV Score": best_cv_score,
            "CV Score std": best_cv_std,
            "Test Score": best_test_score,
            "Train Time(s)": mean_fit_time,
            "Test Time(s)": mean_score_time,
        }],
        columns=[
            "ExpID",
            "Train Score",
            "CV Score",
            "CV Score std",
            "Test Score",
            "Train Time(s)",
            "Test Time(s)",
        ],
    )

    # report
    print(results)
    return results

# Run the experiment only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 33.3min


KeyboardInterrupt: 

In [None]:
 # Append one row to the results table. ``.loc`` has to be indexed with the
 # new row label; assigning to the ``.loc`` attribute itself raises.
 # NOTE(review): in the visible code ``results`` and the score/time variables
 # are locals of main(); they must exist at module scope for this cell to
 # run -- confirm.
 results.loc[len(results)] = ['GradientBoostingClassifier',
                              best_train_score,
                              best_cv_score,
                              best_cv_std,
                              best_test_score,
                              mean_fit_time,
                              mean_score_time]