## Introduction


The goal of this classification task is to predict whether an applicant should be approved for a loan or not based on relevant features. In this notebook, we will perform some exploratory data analysis, and then train and tune an XGBoost classification model.


In [1]:
%%capture
!pip install botorch optuna-integration

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import optuna
from optuna.integration.botorch import BoTorchSampler

import warnings
import joblib
from pathlib import Path
from typing import Dict, Tuple, Union

# Global matplotlib settings: 150 dpi figures drawn with the 'ggplot' style.
plt.rcParams['figure.dpi'] = 150
plt.style.use('ggplot')

## Load and inspect the data

In [3]:
# Read both competition CSV files, using their `id` column as the index.
path = Path('/kaggle/input/playground-series-s4e10/')
train, test = (
    pd.read_csv(path / csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Report the (rows, columns) shape of both data splits, one per line
print(
    f'Training data shape: {train.shape}',
    f'Testing data shape: {test.shape}',
    sep='\n',
)

Training data shape: (58645, 12)
Testing data shape: (39098, 11)


In [5]:
# Preview the first five training rows
train.head(n=5)

Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


## Exploratory Data Analysis

### Missing values 

We begin by checking for missing values. The check below returns an empty result, so the training data has no missing values.

In [6]:
def filter_greater_than(s: pd.Series, threshold: float=0.0) -> pd.Series:
    """Return the entries of ``s`` whose value is strictly above ``threshold``.

    Parameters
    ----------
    s : pd.Series
        Series to filter.
    threshold : float, default 0.0
        Exclusive lower bound; entries equal to it are dropped.

    Returns
    -------
    pd.Series
        The surviving entries, with their original index labels.
    """
    above_threshold = s.gt(threshold)
    return s.loc[above_threshold]

# Number of missing values per training column, keeping only the columns that
# have at least one. Despite the `_perc` suffix these are counts, not percentages.
missing_perc = filter_greater_than(train.isna().sum(), 0)
missing_perc

Series([], dtype: int64)

### Target distribution

The target variable is `loan_status`, which is 1 if the applicant was approved for a loan and 0 otherwise. Only about 14% of the applicants in the training data were approved, so this is an imbalanced classification problem.

In [7]:
# Fraction of training rows in each target class, ordered by class label
(
    train['loan_status']
    .value_counts(normalize=True)
    .sort_index()
)

loan_status
0    0.857618
1    0.142382
Name: proportion, dtype: float64

### Feature analysis 

TODO

## Prepare data

In [8]:
# Split the training frame into the feature matrix and the target vector
# (the target is kept as a plain NumPy array).
X = train.drop(columns='loan_status')
y = train['loan_status'].to_numpy()

In [9]:
# Collect the non-numeric feature columns for one-hot encoding. The list is
# derived from the feature matrix X rather than from `train`, so the target
# column can never end up among the columns handed to the encoder.
categorical_columns = X.select_dtypes(exclude=['number']).columns.tolist()
print(categorical_columns)

['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']


## XGBoost model

In [10]:
def return_estimator(params: Union[Dict, None] = None):
    """Build an unfitted preprocessing + XGBoost pipeline.

    The ``'preproc'`` step one-hot encodes the columns listed in the
    notebook-level ``categorical_columns`` and passes all remaining columns
    through unchanged, returning a pandas DataFrame. The ``'xgb'`` step is an
    ``XGBClassifier`` using the ``'hist'`` tree method.

    Parameters
    ----------
    params : dict, optional
        Keyword arguments forwarded to ``XGBClassifier``. ``None`` (the
        default) or an empty dict leaves every other setting at its default.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline with the steps ``'preproc'`` and ``'xgb'``.
    """
    # Avoid a shared mutable default argument.
    xgb_params = {} if params is None else params

    # handle_unknown='ignore' encodes a category that was not present at fit
    # time as all zeros instead of raising, so a rare level missing from a
    # training fold (or new in the test set) cannot abort scoring.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    return Pipeline([
        (
            'preproc',
            ColumnTransformer(
                transformers=[('ohe', encoder, categorical_columns)],
                remainder='passthrough',
                sparse_threshold=0
            ).set_output(transform='pandas')
        ),
        ('xgb', XGBClassifier(tree_method='hist', **xgb_params))
    ])


## Tuning hyperparameters via optuna

In [11]:
def fit_and_test_model(
    X: pd.DataFrame, y: np.ndarray,
    train_index: np.ndarray, valid_index: np.ndarray,
    params: Dict[str, Union[str, int, float]] = {}
) -> float:
    """Fit a fresh estimator on one split and score it on the held-out rows.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix; rows are selected by position.
    y : np.ndarray
        Target values aligned with the rows of ``X``.
    train_index, valid_index : np.ndarray
        Positional row indices used for fitting and for validation.
    params : dict
        Hyperparameters handed to ``return_estimator``.

    Returns
    -------
    float
        ROC AUC on the validation rows, computed from the second column of
        ``predict_proba``.
    """
    estimator = return_estimator(params)
    estimator.fit(X.iloc[train_index], y[train_index])

    holdout_proba = estimator.predict_proba(X.iloc[valid_index])[:, 1]
    return roc_auc_score(y[valid_index], holdout_proba)

def optuna_objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean cross-validated ROC AUC for one sampled configuration.

    Nine XGBoost hyperparameters are drawn from ``trial``. The resulting
    configuration is scored with ``fit_and_test_model`` on each split of a
    shuffled 5-fold ``StratifiedKFold`` (``random_state=0``) over the
    notebook-level ``X`` and ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the five fold AUC values.
    """
    params = {
        # boosting schedule
        'n_estimators': trial.suggest_int('n_estimators', 50, 1500, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.75, log=True),
        # tree size
        'max_depth': trial.suggest_int('max_depth', 1, 12, log=True),
        # row / column subsampling
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        # regularisation
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 100, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 1000, log=True)
    }

    # The fixed random_state gives every trial the same five splits.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X, y):
        fold_scores.append(fit_and_test_model(X, y, fit_rows, holdout_rows, params))
    return np.mean(fold_scores)


# Silence optuna's ExperimentalWarning and any FutureWarning for the duration
# of the search only; the previous warning filters are restored afterwards.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", optuna.exceptions.ExperimentalWarning)
    warnings.simplefilter("ignore", FutureWarning)

    study = optuna.create_study(
        study_name='xgboost',
        direction='maximize',
        sampler=BoTorchSampler(seed=2, n_startup_trials=10),
    )

    # run optuna for a maximum of 120 trials or 1 hr wall clock
    study.optimize(optuna_objective, n_trials=120, timeout=3600)


# save the runs
_ = joblib.dump(study, 'xgboost_auc.pkl')


[I 2024-10-02 01:04:48,869] A new study created in memory with name: xgboost
[I 2024-10-02 01:04:53,524] Trial 0 finished with value: 0.9123639508933221 and parameters: {'n_estimators': 219, 'learning_rate': 0.0011872426915762793, 'max_depth': 3, 'subsample': 0.7176611963091384, 'colsample_bytree': 0.4783310218787401, 'reg_alpha': 9.397522821096242e-06, 'reg_lambda': 6.94764639026943e-07, 'gamma': 0.01558510252222362, 'min_child_weight': 7.924356787008142}. Best is trial 0 with value: 0.9123639508933221.
[I 2024-10-02 01:04:56,613] Trial 1 finished with value: 0.9341194497375664 and parameters: {'n_estimators': 123, 'learning_rate': 0.06106576779720579, 'max_depth': 3, 'subsample': 0.5672899726724667, 'colsample_bytree': 0.5622203091391718, 'reg_alpha': 4.570448196536859e-07, 'reg_lambda': 0.11695433673868773, 'gamma': 3.4653964518555913, 'min_child_weight': 30.388587370815138}. Best is trial 1 with value: 0.9341194497375664.
[I 2024-10-02 01:05:12,283] Trial 2 finished with value: 0.9

In [12]:
# Optimization history of the study, with the objective labelled 'CV AUC'
fig = optuna.visualization.plot_optimization_history(
    study, target_name='CV AUC'
).update_layout(autosize=True, width=800, height=600)
fig.show()

In [13]:
# Hyperparameter importances estimated from the finished study
fig = optuna.visualization.plot_param_importances(study).update_layout(
    autosize=True, width=800, height=400
)
fig.show()

In [14]:
# Tabulate every trial (number, score, duration in seconds, parameters),
# best AUC first, and write the table to disk.
results = (
    study.trials_dataframe(attrs=('number', 'value', 'duration', 'params'))
    .rename(columns={'value': 'AUC'})
    .assign(duration=lambda trials: trials['duration'] / np.timedelta64(1, 's'))
    .sort_values(by='AUC', ascending=False)
)
results.to_csv('cv_AUC_history.csv', index=False)

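The best hyperparameters are as follows. Note that several of them (`n_estimators`, `subsample`, `reg_lambda` and `min_child_weight`) lie on or next to a boundary of their search range, so widening those ranges may improve the result further.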

In [15]:
# Hyperparameters of the best-scoring trial
study.best_trial.params

{'n_estimators': 1500,
 'learning_rate': 0.10999995057648126,
 'max_depth': 4,
 'subsample': 0.9999999999999999,
 'colsample_bytree': 0.49681888251576845,
 'reg_alpha': 8.919028743833996e-06,
 'reg_lambda': 9.999999999999998,
 'gamma': 0.09787383664032377,
 'min_child_weight': 1.0}

## Final model

In [16]:
# Refit on the full training set with the best hyperparameters from the study
model = return_estimator(study.best_params).fit(X, y)

# save model
_ = joblib.dump(model, 'xgboost_loan_approval.pkl')

## Test predictions and submission

In [17]:
# One row per test id, with the predicted probability taken from the second
# column of predict_proba.
submission = pd.DataFrame(
    dict(
        id=test.index.to_list(),
        loan_status=model.predict_proba(test)[:, 1],
    )
)

# save submission
submission.to_csv('submission.csv', index=False)

In [18]:
# check submission output
# Preview the first five submission rows
submission.head(n=5)

Unnamed: 0,id,loan_status
0,58645,0.997225
1,58646,0.016744
2,58647,0.624585
3,58648,0.007407
4,58649,0.038111
