# Imports

In [1]:
import functools
import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearnex import patch_sklearn

patch_sklearn()
warnings.filterwarnings("ignore")

sns.set_theme()
sns.set_style("ticks")
sns.despine()

%matplotlib inline

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
def seed_everything(seed=42):
    """Seed the random number sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Integer seed applied to every source (default 42).
    """
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
    # presumably only influences child processes - confirm if hash order matters.
    os.environ["PYTHONHASHSEED"] = str(seed)

    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything()

# Configuration

In [3]:
class Config:
    """Central place for paths, search budget and XGBoost parameter sets."""

    # Directory that holds the prepared CSV files.
    DATA_DIR = "../input/home-loan-prediction-prepared-datasets"

    # Number of Optuna trials per hyperparameter search.
    N_TRIALS = 100

    # Metric handed to XGBoost as ``eval_metric`` (see STATIC_PARAMS).
    EVAL_METRIC = "logloss"

    # Baseline values for the tunable parameters.
    # NOTE(review): these look like XGBoost's library defaults and are not
    # referenced by the code visible in this notebook - confirm before relying on them.
    DEFAULT_VALUES = {
        # tree size / ensemble size
        "max_depth": 6,
        "n_estimators": 100,
        # regularisation
        "alpha": 0.0,
        "lambda": 1.0,
        # optimisation
        "learning_rate": 0.3,
        # column subsampling
        "colsample_bytree": 1.0,
        "colsample_bylevel": 1.0,
        # misc
        "min_child_weight": 1.0,
        "scale_pos_weight": 1.0,
        "sampling_method": "uniform",
        "early_stopping_rounds": None,
    }

    # Parameters that are never tuned; merged into every model's params.
    STATIC_PARAMS = dict(
        tree_method="gpu_hist",
        use_label_encoder=False,
        n_jobs=2,
        predictor="gpu_predictor",
        max_bin=1024,
        eval_metric=EVAL_METRIC,
    )

    @classmethod
    def filepath(cls, filename):
        """Return ``filename`` joined onto ``DATA_DIR``."""
        full_path = os.path.join(cls.DATA_DIR, filename)
        return full_path

# Load Dataset

In [4]:
# Read the prepared train and test CSVs from Config.DATA_DIR.
train_df, test_df = (
    pd.read_csv(Config.filepath(csv_name))
    for csv_name in ("train_with_new_features.csv", "test_with_new_features.csv")
)

In [5]:
# Preview the first rows of the training frame as loaded (before imputation/scaling).
train_df.head()

Unnamed: 0,sk_id_curr,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,target,loan_rate,loan_income_ratio,annuity_income_ratio,es1_is_missing,es3_is_missing,application_is_incomplete,kfold
0,450407,Cash loans,F,N,Y,1,67500.0,227520.0,11065.5,180000.0,...,0.0,2.0,0,20.561204,3.370667,0.163933,True,True,False,1
1,271298,Cash loans,M,Y,Y,1,247500.0,1882372.5,65560.5,1719000.0,...,1.0,3.0,0,28.711991,7.605545,0.264891,False,False,True,2
2,122238,Cash loans,M,Y,Y,1,180000.0,101880.0,10827.0,90000.0,...,0.0,1.0,0,9.409809,0.566,0.06015,True,False,False,1
3,305311,Cash loans,M,N,N,0,81000.0,405000.0,20677.5,405000.0,...,0.0,2.0,0,19.586507,5.0,0.255278,True,False,True,2
4,414121,Cash loans,F,N,Y,0,157500.0,888840.0,29506.5,675000.0,...,0.0,2.0,0,30.123532,5.643429,0.187343,False,False,True,2


In [6]:
# Preview the first rows of the test frame as loaded (before imputation/scaling).
test_df.head()

Unnamed: 0,sk_id_curr,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,loan_rate,loan_income_ratio,annuity_income_ratio,es1_is_missing,es3_is_missing,application_is_incomplete
0,367294,Cash loans,F,N,Y,0,180000.0,265306.5,25317.0,252000.0,...,0.0,1.0,2.0,5.0,10.479381,1.473925,0.14065,True,True,False
1,439847,Cash loans,F,N,Y,0,202500.0,346500.0,21069.0,346500.0,...,0.0,0.0,0.0,5.0,16.445963,1.711111,0.104044,True,False,False
2,380562,Cash loans,M,Y,N,0,360000.0,545040.0,36553.5,450000.0,...,0.0,0.0,1.0,5.0,14.910747,1.514,0.101538,False,False,False
3,407238,Cash loans,F,N,Y,0,135000.0,307557.0,20682.0,265500.0,...,1.0,0.0,1.0,2.0,14.870757,2.2782,0.1532,False,False,True
4,239910,Cash loans,F,N,Y,0,157500.0,1056447.0,31018.5,922500.0,...,,,,,34.05861,6.7076,0.196943,True,True,True


# Training Loop

In [7]:
def train(df, test_df, params, verbose=True):
    """Cross-validate an XGBoost classifier and collect its predictions.

    For every distinct label in ``df["kfold"]`` a fresh ``xgb.XGBClassifier``
    is fitted on the rows of all other folds and evaluated on the held-out
    fold. Out-of-fold and test probabilities of the positive class are kept.

    Args:
        df: Training frame holding the feature columns plus ``target`` and
            ``kfold``. It is copied, so the caller's frame is not modified.
        test_df: Test features. Passed unchanged to ``predict_proba``, so it
            must contain exactly the feature columns the model is trained on.
        params: Tunable XGBoost parameters. ``Config.STATIC_PARAMS`` is
            layered on top of a copy; the caller's dict is left untouched.
        verbose: When ``True``, print the per-fold and the averaged macro F1.

    Returns:
        Tuple ``(oof_preds, test_preds, avg_f1)``:
        ``oof_preds`` is a float array with the out-of-fold positive-class
        probability of every row of ``df``; ``test_preds`` is the mean over
        folds of the positive-class probabilities on ``test_df``; ``avg_f1``
        is the macro F1 averaged over the folds.

    Raises:
        ValueError: If ``df`` contains no fold labels.
    """
    # Work on a copy so that the original dataset does not change
    df = df.copy()
    # Float column (NaN until filled) so the returned array is numeric rather
    # than an object array of pd.NA / float objects.
    df["preds"] = np.nan

    drop = ["target", "preds", "kfold"]

    # Add static params (parameters that are not tuned) into a NEW dict.
    # Updating in place would leak STATIC_PARAMS into the caller's dict
    # (e.g. the best_params shown in the notebook).
    params = {**params, **Config.STATIC_PARAMS}

    # Derive the folds from the data instead of assuming exactly 0..4
    folds = sorted(df["kfold"].dropna().unique())
    if not folds:
        raise ValueError("df contains no 'kfold' labels to cross-validate on")

    # Running sum of the per-fold macro F1, averaged after the loop
    total_f1 = 0.0

    # Test-set probabilities predicted by each fold's model
    test_preds = []

    for fold_number, fold in enumerate(folds, start=1):
        is_val = df["kfold"] == fold

        # Training features and labels: every row outside the current fold
        train_part = df[~is_val]
        y_train = train_part["target"]
        X_train = train_part.drop(drop, axis=1)

        # Validation features and labels: the held-out fold
        val_part = df[is_val]
        y_val = val_part["target"]
        X_val = val_part.drop(drop, axis=1)

        # Initialize model
        clf = xgb.XGBClassifier(**params)

        # The validation set is the LAST eval_set entry ("validation_1")
        clf.fit(
            X=X_train,
            y=y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=False,
        )

        # Hard-label predictions on the validation set for the F1 score
        val_pred = clf.predict(X_val)
        f1 = metrics.f1_score(y_val, val_pred, average="macro")

        if verbose is True:
            print(f"Fold {fold_number} - Macro F1 = {f1: .4f}")

        total_f1 += f1

        # Out-of-fold probabilities of the positive class, stored in the df.
        # The original comment states these will be used in the meta model.
        df.loc[is_val, "preds"] = clf.predict_proba(X_val)[:, 1]

        # Test probabilities of the positive class for this fold
        test_preds.append(clf.predict_proba(test_df)[:, 1])

    avg_f1 = total_f1 / len(folds)

    if verbose is True:
        # The averaged value is the macro F1 computed above, not a ROC AUC
        print(f"Overall Macro F1 = {avg_f1: .4f}")

    # Final test prediction: mean probability across the fold models
    test_preds = np.vstack(test_preds).mean(axis=0)

    # Return out-of-fold preds, test preds and the averaged macro F1
    return df["preds"].values, test_preds, avg_f1

# Optuna Objective

In [8]:
def objective(trial, train_df, test_df):
    """Optuna objective: sample XGBoost hyperparameters and score them via ``train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters and handed to
            the XGBoost pruning callback.
        train_df: Training frame, forwarded to ``train`` as ``df``.
        test_df: Test frame, forwarded to ``train``.

    Returns:
        The third element returned by ``train`` (the fold-averaged macro F1).
    """
    # suggest_float replaces the deprecated suggest_uniform; with no ``log`` or
    # ``step`` argument it samples from the same uniform range.
    params = {
        "max_depth": trial.suggest_int("max_depth", 1, 11),
        "n_estimators": trial.suggest_int("n_estimators", 5, 500),
        "alpha": trial.suggest_float("alpha", 0.0, 5.0),
        "lambda": trial.suggest_float("lambda", 1.0, 5.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.8, log=True),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 20, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.2, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 100),
        "sampling_method": trial.suggest_categorical("sampling_method", ["uniform", "gradient_based"]),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 5, 20, step=5),
    }

    # "validation_1" is the second eval_set entry in train(), i.e. the held-out fold.
    # NOTE(review): the callback reports Config.EVAL_METRIC ("logloss", lower is
    # better) as the intermediate value, while the study in
    # hyperparameter_search is created with direction="maximize". The pruner
    # therefore appears to rank intermediate values in the wrong direction.
    # The same callback instance is also reused for every fold inside train().
    # Confirm the intended pruning behaviour.
    obs_k = f"validation_1-{Config.EVAL_METRIC}"
    params["callbacks"] = [optuna.integration.XGBoostPruningCallback(trial, obs_k)]

    _, _, f1 = train(df=train_df, test_df=test_df, params=params, verbose=False)
    return f1

# Hyperparameter Search

In [9]:
def hyperparameter_search(train_df, test_df, n_trials=Config.N_TRIALS):
    """Run an Optuna study over ``objective`` and return its best parameters.

    Args:
        train_df: Training frame forwarded to ``objective``.
        test_df: Test frame forwarded to ``objective``.
        n_trials: Number of trials to run (defaults to ``Config.N_TRIALS``).

    Returns:
        ``study.best_params`` of the finished study.
    """
    # Maximising study with a fixed-seed TPE sampler and Hyperband pruning.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.HyperbandPruner(),
    )

    # Bind the two frames so Optuna only has to supply the trial.
    def run_trial(trial):
        return objective(trial, train_df=train_df, test_df=test_df)

    study.optimize(run_trial, n_trials=n_trials, gc_after_trial=True)

    return study.best_params

# Preprocessing Before Encoding Categorical Features

In [10]:
# Column-name lists are stored as one-column CSVs (column header: "column").
# NOTE(review): "catgeorical" is kept exactly as written - presumably it matches
# the file name in the dataset; confirm before correcting the spelling.
cat_feats = pd.read_csv(Config.filepath("catgeorical_features.csv"))["column"].to_list()
num_feats = pd.read_csv(Config.filepath("numerical_features.csv"))["column"].to_list()

In [11]:
# Preprocessors fitted on the training data and re-used for the test data.
imputer = IterativeImputer(max_iter=100)
scaler = MinMaxScaler()
cat_imputer = SimpleImputer(strategy="most_frequent")

# Order matters: numerical columns are imputed first, then scaled;
# categorical columns are filled with their most frequent value.
for _step, _cols in ((imputer, num_feats), (scaler, num_feats), (cat_imputer, cat_feats)):
    train_df[_cols] = _step.fit_transform(train_df[_cols])

In [12]:
# Apply the already fitted preprocessors to the test data (transform only,
# no refitting), in the same order used for the training data.
for _step, _cols in ((imputer, num_feats), (scaler, num_feats), (cat_imputer, cat_feats)):
    test_df[_cols] = _step.transform(test_df[_cols])

# XGBoost with Label Encoding

In [13]:
# Integer-code the categorical columns; categories that were not seen during
# fitting are encoded as -1 instead of raising.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Encode into copies so train_df / test_df keep their original categories.
train_le_df = train_df.copy()
test_le_df = test_df.copy()

train_le_df[cat_feats] = encoder.fit_transform(train_df[cat_feats])
test_le_df[cat_feats] = encoder.transform(test_df[cat_feats])

In [14]:
# Preview the ordinal-encoded training frame.
train_le_df.head()

Unnamed: 0,sk_id_curr,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,target,loan_rate,loan_income_ratio,annuity_income_ratio,es1_is_missing,es3_is_missing,application_is_incomplete,kfold
0,450407,0.0,0.0,0.0,1.0,1.0,0.000354,0.045573,0.036855,0.034792,...,0.0,2.0,0,0.336063,0.068381,0.121219,1.0,1.0,0.0,1
1,271298,0.0,1.0,1.0,1.0,1.0,0.001893,0.45877,0.249386,0.418631,...,1.0,3.0,0,0.554768,0.154416,0.195973,0.0,0.0,1.0,2
2,122238,0.0,1.0,1.0,1.0,1.0,0.001316,0.014202,0.035925,0.012346,...,0.0,1.0,0,0.036844,0.011401,0.044372,1.0,0.0,0.0,1
3,305311,0.0,1.0,0.0,0.0,0.0,0.000469,0.089888,0.074342,0.090909,...,0.0,2.0,0,0.30991,0.101482,0.188855,1.0,0.0,1.0,2
4,414121,0.0,0.0,0.0,1.0,0.0,0.001123,0.210697,0.108775,0.158249,...,0.0,2.0,0,0.592643,0.114554,0.138553,0.0,0.0,1.0,2


In [15]:
# Preview the ordinal-encoded test frame.
test_le_df.head()

Unnamed: 0,sk_id_curr,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,loan_rate,loan_income_ratio,annuity_income_ratio,es1_is_missing,es3_is_missing,application_is_incomplete
0,367294,0.0,0.0,0.0,1.0,0.0,0.001316,0.055008,0.092436,0.05275,...,0.0,1.0,2.0,5.0,0.065544,0.029847,0.103979,1.0,1.0,0.0
1,439847,0.0,0.0,0.0,1.0,0.0,0.001508,0.075281,0.075869,0.076319,...,0.0,0.0,0.0,5.0,0.225641,0.034665,0.076874,1.0,0.0,0.0
2,380562,0.0,1.0,1.0,0.0,0.0,0.002854,0.124854,0.136258,0.102132,...,0.0,0.0,1.0,5.0,0.184448,0.030661,0.075018,0.0,0.0,0.0
3,407238,0.0,0.0,0.0,1.0,0.0,0.000931,0.065557,0.074359,0.056117,...,1.0,0.0,1.0,2.0,0.183375,0.046186,0.113271,0.0,0.0,1.0
4,239910,0.0,0.0,0.0,1.0,0.0,0.001123,0.252546,0.114672,0.219978,...,0.0,0.0,0.0,0.0,0.69823,0.136174,0.145661,1.0,1.0,1.0


## Find Optimal Hyperparameters

In [16]:
# Remove the applicant id from the model inputs. errors="ignore" keeps this
# cell re-runnable: a second run no longer fails with KeyError once the column
# has already been dropped.
train_le_df = train_le_df.drop(columns="sk_id_curr", errors="ignore")

# Read the ids from test_df, which still holds the column after
# test_le_df has lost it, so a re-run yields the same ids.
test_sk_id = test_df["sk_id_curr"].to_numpy()
test_le_df = test_le_df.drop(columns="sk_id_curr", errors="ignore")

best_params = hyperparameter_search(train_df=train_le_df, test_df=test_le_df)

[32m[I 2022-11-22 23:18:39,827][0m A new study created in memory with name: no-name-0206dc47-4548-44a6-b25b-1aab6ae30072[0m
[32m[I 2022-11-22 23:19:10,039][0m Trial 0 finished with value: 0.5397271738567078 and parameters: {'max_depth': 5, 'n_estimators': 476, 'alpha': 3.6599697090570253, 'lambda': 2.6208630215377515, 'learning_rate': 0.05007246338409305, 'scale_pos_weight': 1.5957084694148351, 'colsample_bytree': 0.24646688973455957, 'colsample_bylevel': 0.8929409166199482, 'min_child_weight': 60.510386162577674, 'sampling_method': 'uniform', 'early_stopping_rounds': 20}. Best is trial 0 with value: 0.5397271738567078.[0m
[32m[I 2022-11-22 23:19:36,403][0m Trial 1 finished with value: 0.6142852901803758 and parameters: {'max_depth': 10, 'n_estimators': 110, 'alpha': 0.9091248360355031, 'lambda': 1.3433656868034294, 'learning_rate': 0.08146314642362987, 'scale_pos_weight': 4.816414530907083, 'colsample_bytree': 0.5455560149136927, 'colsample_bylevel': 0.43298331215843355, 'min_

In [17]:
# Show the best hyperparameters found for the label-encoded data.
best_params

{'max_depth': 10,
 'n_estimators': 110,
 'alpha': 0.9091248360355031,
 'lambda': 1.3433656868034294,
 'learning_rate': 0.08146314642362987,
 'scale_pos_weight': 4.816414530907083,
 'colsample_bytree': 0.5455560149136927,
 'colsample_bylevel': 0.43298331215843355,
 'min_child_weight': 61.573436577515565,
 'sampling_method': 'gradient_based',
 'early_stopping_rounds': 10}

## Train Model with Optimal Hyperparameters

In [18]:
# Cross-validate once more with the tuned parameters, printing per-fold scores.
train_preds, test_preds, f1 = train(train_le_df, test_le_df, params=best_params, verbose=True)

Fold 1 - Macro F1 =  0.6139
Fold 2 - Macro F1 =  0.6142
Fold 3 - Macro F1 =  0.6135
Fold 4 - Macro F1 =  0.6159
Fold 5 - Macro F1 =  0.6139
Overall ROC AUC =  0.6143


## Create Submission

In [19]:
# Averaged test probabilities of at least 0.5 become class 1, the rest class 0.
submission_le = pd.DataFrame({"SK_ID_CURR": test_sk_id}).assign(
    TARGET=(test_preds >= 0.5).astype(int)
)
submission_le.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,367294,0
1,439847,0
2,380562,0
3,407238,0
4,239910,1


In [20]:
# Write the label-encoding submission to the working directory, without the index column.
submission_le.to_csv("./submission_le.csv", index=False)

# XGBoost with One-Hot Encoding

In [21]:
# One column per category; binary features keep a single column (drop="if_binary")
# and categories unseen during fitting are ignored at transform time.
encoder = OneHotEncoder(handle_unknown="ignore", drop="if_binary")

# The encoder output is converted to a dense array before it is attached.
encoded_features = encoder.fit_transform(train_df[cat_feats]).toarray()
encoded_cols = encoder.get_feature_names_out()

# drop() returns a new frame, so train_df itself keeps its categorical columns.
train_ohe_df = train_df.drop(columns=cat_feats)
train_ohe_df[encoded_cols] = encoded_features

In [22]:
# Encode the test data with the encoder already held in `encoder`
# (transform only - it is not refitted here).
encoded_features = encoder.transform(test_df[cat_feats]).toarray()
encoded_cols = encoder.get_feature_names_out()

# drop() returns a new frame, so test_df itself keeps its categorical columns.
test_ohe_df = test_df.drop(columns=cat_feats)
test_ohe_df[encoded_cols] = encoded_features

In [23]:
# Preview the one-hot encoded training frame.
train_ohe_df.head()

Unnamed: 0,sk_id_curr,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,days_id_publish,...,def_60_cnt_social_circle_2.0,def_60_cnt_social_circle_3.0,def_60_cnt_social_circle_4.0,def_60_cnt_social_circle_5.0,def_60_cnt_social_circle_6.0,def_60_cnt_social_circle_7.0,def_60_cnt_social_circle_24.0,es1_is_missing_True,es3_is_missing_True,application_is_incomplete_True
0,450407,0.000354,0.045573,0.036855,0.034792,0.254009,0.451163,0.956789,0.761701,0.881717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,271298,0.001893,0.45877,0.249386,0.418631,0.257,0.370427,0.99464,0.756107,0.659251,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,122238,0.001316,0.014202,0.035925,0.012346,0.309272,0.595754,0.657771,0.924154,0.284349,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,305311,0.000469,0.089888,0.074342,0.090909,0.491595,0.702292,0.671114,0.725401,0.276516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,414121,0.001123,0.210697,0.108775,0.158249,0.256321,0.500903,0.980739,0.939657,0.275576,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
# Preview the one-hot encoded test frame.
test_ohe_df.head()

Unnamed: 0,sk_id_curr,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,days_id_publish,...,def_60_cnt_social_circle_2.0,def_60_cnt_social_circle_3.0,def_60_cnt_social_circle_4.0,def_60_cnt_social_circle_5.0,def_60_cnt_social_circle_6.0,def_60_cnt_social_circle_7.0,def_60_cnt_social_circle_24.0,es1_is_missing_True,es3_is_missing_True,application_is_incomplete_True
0,367294,0.001316,0.055008,0.092436,0.05275,0.446855,0.213866,0.702825,0.84344,0.287012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,439847,0.001508,0.075281,0.075869,0.076319,0.361433,0.288731,0.921505,0.690041,0.429892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,380562,0.002854,0.124854,0.136258,0.102132,0.134897,0.688403,0.895377,0.787538,0.165909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,407238,0.000931,0.065557,0.074359,0.056117,0.115414,0.818767,0.918825,0.95644,0.652201,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,239910,0.001123,0.252546,0.114672,0.219978,0.143358,0.390583,21.390967,0.554792,0.713458,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


## Find Optimal Hyperparameters

In [25]:
# Remove the applicant id from the model inputs. errors="ignore" keeps this
# cell re-runnable: a second run no longer fails with KeyError once the column
# has already been dropped.
train_ohe_df = train_ohe_df.drop(columns="sk_id_curr", errors="ignore")

# Read the ids from test_df, which still holds the column after
# test_ohe_df has lost it, so a re-run yields the same ids.
test_sk_id = test_df["sk_id_curr"].to_numpy()
test_ohe_df = test_ohe_df.drop(columns="sk_id_curr", errors="ignore")

best_params = hyperparameter_search(train_df=train_ohe_df, test_df=test_ohe_df)

[32m[I 2022-11-22 23:28:19,073][0m A new study created in memory with name: no-name-a3f23221-78ec-452e-bb0c-ec6260efde51[0m
[32m[I 2022-11-22 23:29:37,743][0m Trial 0 finished with value: 0.5385109043719727 and parameters: {'max_depth': 5, 'n_estimators': 476, 'alpha': 3.6599697090570253, 'lambda': 2.6208630215377515, 'learning_rate': 0.05007246338409305, 'scale_pos_weight': 1.5957084694148351, 'colsample_bytree': 0.24646688973455957, 'colsample_bylevel': 0.8929409166199482, 'min_child_weight': 60.510386162577674, 'sampling_method': 'uniform', 'early_stopping_rounds': 20}. Best is trial 0 with value: 0.5385109043719727.[0m
[32m[I 2022-11-22 23:30:36,880][0m Trial 1 finished with value: 0.6164268967999607 and parameters: {'max_depth': 10, 'n_estimators': 110, 'alpha': 0.9091248360355031, 'lambda': 1.3433656868034294, 'learning_rate': 0.08146314642362987, 'scale_pos_weight': 4.816414530907083, 'colsample_bytree': 0.5455560149136927, 'colsample_bylevel': 0.43298331215843355, 'min_

In [26]:
# Show the best hyperparameters found for the one-hot encoded data.
best_params

{'max_depth': 5,
 'n_estimators': 139,
 'alpha': 4.143687545759647,
 'lambda': 1.7756602567007318,
 'learning_rate': 0.0754614387734387,
 'scale_pos_weight': 5.0823419597214565,
 'colsample_bytree': 0.31273937997981016,
 'colsample_bylevel': 0.8417575846032317,
 'min_child_weight': 8.380513724297312,
 'sampling_method': 'uniform',
 'early_stopping_rounds': 5}

## Train Model with Optimal Hyperparameters

In [27]:
# Cross-validate once more with the tuned parameters, printing per-fold scores.
train_preds, test_preds, f1 = train(train_ohe_df, test_ohe_df, params=best_params, verbose=True)

Fold 1 - Macro F1 =  0.6172
Fold 2 - Macro F1 =  0.6223
Fold 3 - Macro F1 =  0.6144
Fold 4 - Macro F1 =  0.6150
Fold 5 - Macro F1 =  0.6183
Overall ROC AUC =  0.6174


## Create Submission

In [28]:
# Averaged test probabilities of at least 0.5 become class 1, the rest class 0.
submission_ohe = pd.DataFrame({"SK_ID_CURR": test_sk_id}).assign(
    TARGET=(test_preds >= 0.5).astype(int)
)
submission_ohe.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,367294,0
1,439847,0
2,380562,0
3,407238,0
4,239910,1


In [29]:
# Write the one-hot-encoding submission to the working directory, without the index column.
submission_ohe.to_csv("./submission_ohe.csv", index=False)