# Imports

In [54]:
import pandas as pd
import numpy as np

# Data exploration

In [64]:
filename = "./application_data.csv"

df = pd.read_csv(filename)

df.shape

(307511, 122)

In [65]:
columns_to_select = [
    'SK_ID_CURR',
    'TARGET',
    'NAME_CONTRACT_TYPE',
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'AMT_ANNUITY',
    'AMT_GOODS_PRICE',
    'NAME_INCOME_TYPE',
    'DAYS_EMPLOYED',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_MON'
]

new_df = df[columns_to_select]

def nan_counts(df):
    nan_columns = {}

    for column in df.columns:
        nan_count = df[column].isna().sum()  # Count NaN values in the column
        if nan_count > 0:  # Only include columns with NaN values
            nan_columns[column] = nan_count

    # Display columns with NaN values and their counts
    print("Columns with NaN values:")
    for col, count in nan_columns.items():
        print(f"{col}: {count} NaN values")

print(nan_counts(new_df))
print(new_df.shape)
new_df.head()

Columns with NaN values:
AMT_ANNUITY: 12 NaN values
AMT_GOODS_PRICE: 278 NaN values
EXT_SOURCE_1: 173378 NaN values
EXT_SOURCE_2: 660 NaN values
EXT_SOURCE_3: 60965 NaN values
OBS_30_CNT_SOCIAL_CIRCLE: 1021 NaN values
OBS_60_CNT_SOCIAL_CIRCLE: 1021 NaN values
AMT_REQ_CREDIT_BUREAU_HOUR: 41519 NaN values
AMT_REQ_CREDIT_BUREAU_DAY: 41519 NaN values
AMT_REQ_CREDIT_BUREAU_WEEK: 41519 NaN values
AMT_REQ_CREDIT_BUREAU_MON: 41519 NaN values
None
(307511, 18)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,DAYS_EMPLOYED,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON
0,100002,1,Cash loans,202500.0,406597.5,24700.5,351000.0,Working,-637,0.083037,0.262949,0.139376,2.0,2.0,0.0,0.0,0.0,0.0
1,100003,0,Cash loans,270000.0,1293502.5,35698.5,1129500.0,State servant,-1188,0.311267,0.622246,,1.0,1.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,67500.0,135000.0,6750.0,135000.0,Working,-225,,0.555912,0.729567,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,135000.0,312682.5,29686.5,297000.0,Working,-3039,,0.650442,,2.0,2.0,,,,
4,100007,0,Cash loans,121500.0,513000.0,21865.5,513000.0,Working,-3038,,0.322738,,0.0,0.0,0.0,0.0,0.0,0.0


# Data pre-processing

### Additional imports

In [57]:
from sklearn.preprocessing import LabelEncoder, RobustScaler, PowerTransformer
from sklearn.impute import KNNImputer
from joblib import Parallel, delayed
from tqdm import tqdm

### 1. Combine hour/day/week/month columns
Fill NA with 0, sum their values since each bracket above the other is non-inclusive.

In [58]:
hdwm = ['AMT_REQ_CREDIT_BUREAU_HOUR',
        'AMT_REQ_CREDIT_BUREAU_DAY', 
        'AMT_REQ_CREDIT_BUREAU_WEEK', 
        'AMT_REQ_CREDIT_BUREAU_MON']

new_df.loc[:, hdwm] = new_df.loc[:, hdwm].fillna(0)

new_df['AMT_REQ_CREDIT_BUREAU_MONTH'] = new_df['AMT_REQ_CREDIT_BUREAU_HOUR'] + new_df['AMT_REQ_CREDIT_BUREAU_DAY'] + new_df['AMT_REQ_CREDIT_BUREAU_WEEK'] + new_df['AMT_REQ_CREDIT_BUREAU_MON']

new_df.drop(columns=['AMT_REQ_CREDIT_BUREAU_HOUR', 
                    'AMT_REQ_CREDIT_BUREAU_DAY', 
                    'AMT_REQ_CREDIT_BUREAU_WEEK', 
                    'AMT_REQ_CREDIT_BUREAU_MON'
                    ], inplace=True)

new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['AMT_REQ_CREDIT_BUREAU_MONTH'] = new_df['AMT_REQ_CREDIT_BUREAU_HOUR'] + new_df['AMT_REQ_CREDIT_BUREAU_DAY'] + new_df['AMT_REQ_CREDIT_BUREAU_WEEK'] + new_df['AMT_REQ_CREDIT_BUREAU_MON']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.drop(columns=['AMT_REQ_CREDIT_BUREAU_HOUR',


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,DAYS_EMPLOYED,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_MONTH
0,100002,1,Cash loans,202500.0,406597.5,24700.5,351000.0,Working,-637,0.083037,0.262949,0.139376,2.0,2.0,0.0
1,100003,0,Cash loans,270000.0,1293502.5,35698.5,1129500.0,State servant,-1188,0.311267,0.622246,,1.0,1.0,0.0
2,100004,0,Revolving loans,67500.0,135000.0,6750.0,135000.0,Working,-225,,0.555912,0.729567,0.0,0.0,0.0
3,100006,0,Cash loans,135000.0,312682.5,29686.5,297000.0,Working,-3039,,0.650442,,2.0,2.0,0.0
4,100007,0,Cash loans,121500.0,513000.0,21865.5,513000.0,Working,-3038,,0.322738,,0.0,0.0,0.0


### 2. Fill missing values with their median value or 0.
Chose this method of handling missing values because there isn't that many missing values that would skew the distribution.

Fill OBS_30_CNT_SOCIAL_CIRCLE and OBS_60_CNT_SOCIAL_CIRCLE with 0, since NaN is the median.

    AMT_ANNUITY: 12 NaN values
    AMT_GOODS_PRICE: 278 NaN values
    OBS_30_CNT_SOCIAL_CIRCLE: 1021 NaN values
    OBS_60_CNT_SOCIAL_CIRCLE: 1021 NaN values

In [59]:
fill_median = ['AMT_ANNUITY', 'AMT_GOODS_PRICE']
fill_zero = ['OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE']

for column in fill_median:
    median_value = new_df[column].median()
    new_df[column].fillna(median_value, inplace=True)

new_df[fill_zero] = new_df[fill_zero].fillna(0)

new_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  new_df[column].fillna(median_value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(v

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,DAYS_EMPLOYED,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_MONTH
0,100002,1,Cash loans,202500.0,406597.5,24700.5,351000.0,Working,-637,0.083037,0.262949,0.139376,2.0,2.0,0.0
1,100003,0,Cash loans,270000.0,1293502.5,35698.5,1129500.0,State servant,-1188,0.311267,0.622246,,1.0,1.0,0.0
2,100004,0,Revolving loans,67500.0,135000.0,6750.0,135000.0,Working,-225,,0.555912,0.729567,0.0,0.0,0.0
3,100006,0,Cash loans,135000.0,312682.5,29686.5,297000.0,Working,-3039,,0.650442,,2.0,2.0,0.0
4,100007,0,Cash loans,121500.0,513000.0,21865.5,513000.0,Working,-3038,,0.322738,,0.0,0.0,0.0


### 3. Label encoding
Replace categorical data with numerical values.

In [48]:
label_cols = ['NAME_CONTRACT_TYPE', 'NAME_INCOME_TYPE']
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    new_df[col] = le.fit_transform(new_df[col])
    label_encoders[col] = le

new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col] = le.fit_transform(new_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[col] = le.fit_transform(new_df[col])


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,DAYS_EMPLOYED,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_MONTH
0,100002,1,0,202500.0,406597.5,24700.5,351000.0,7,-637,0.083037,0.262949,0.139376,2.0,2.0,0.0
1,100003,0,0,270000.0,1293502.5,35698.5,1129500.0,4,-1188,0.311267,0.622246,,1.0,1.0,0.0
2,100004,0,1,67500.0,135000.0,6750.0,135000.0,7,-225,,0.555912,0.729567,0.0,0.0,0.0
3,100006,0,0,135000.0,312682.5,29686.5,297000.0,7,-3039,,0.650442,,2.0,2.0,0.0
4,100007,0,0,121500.0,513000.0,21865.5,513000.0,7,-3038,,0.322738,,0.0,0.0,0.0


In [49]:
for col, encoder in label_encoders.items():
    print(f"Mapping for {col}:")
    for i, class_label in enumerate(encoder.classes_):
        print(f"{class_label}: {i}")

Mapping for NAME_CONTRACT_TYPE:
Cash loans: 0
Revolving loans: 1
Mapping for NAME_INCOME_TYPE:
Businessman: 0
Commercial associate: 1
Maternity leave: 2
Pensioner: 3
State servant: 4
Student: 5
Unemployed: 6
Working: 7


### 4. Robust Scaling
Scale non-zero positive numerical data (with long right tails, positive skew), to retain interpretability even with outliers.

In [21]:
robust_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']
scaler = RobustScaler()
new_df[robust_cols] = scaler.fit_transform(new_df[robust_cols])

new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[robust_cols] = scaler.fit_transform(new_df[robust_cols])


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,DAYS_EMPLOYED,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_MONTH
0,100002,1,0,0.615,-0.198521,-0.011205,-0.22449,7,-637,0.083037,0.262949,0.139376,2.0,2.0,0.0
1,100003,0,0,1.365,1.448012,0.597361,1.540816,4,-1188,0.311267,0.622246,,1.0,1.0,0.0
2,100004,0,1,-0.885,-0.70274,-1.004482,-0.714286,7,-225,,0.555912,0.729567,0.0,0.0,0.0
3,100006,0,0,-0.135,-0.372874,0.264691,-0.346939,7,-3039,,0.650442,,2.0,2.0,0.0
4,100007,0,0,-0.285,-0.000986,-0.168078,0.142857,7,-3038,,0.322738,,0.0,0.0,0.0


### 5. Yeo-Johnson Transformation
Adapative scaling for numerical data, including negative, zero and positive values.

In [22]:
yj_cols = ['DAYS_EMPLOYED', 'OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE']
pt = PowerTransformer(method='yeo-johnson')
new_df[yj_cols] = pt.fit_transform(df[yj_cols])

new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[yj_cols] = pt.fit_transform(df[yj_cols])


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,DAYS_EMPLOYED,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_MONTH
0,100002,1,0,0.615,-0.198521,-0.011205,-0.22449,7,0.276851,0.083037,0.262949,0.139376,0.989412,1.000236,0.0
1,100003,0,0,1.365,1.448012,0.597361,1.540816,4,0.137695,0.311267,0.622246,,0.50402,0.515863,0.0
2,100004,0,1,-0.885,-0.70274,-1.004482,-0.714286,7,0.362878,,0.555912,0.729567,-0.891926,-0.888367,0.0
3,100006,0,0,-0.135,-0.372874,0.264691,-0.346939,7,-0.435923,,0.650442,,0.989412,1.000236,0.0
4,100007,0,0,-0.285,-0.000986,-0.168078,0.142857,7,-0.435582,,0.322738,,-0.891926,-0.888367,0.0


### 6. K-Nearest Neighbours Imputation
Replace NaN values with the likely values based on the values from the nearest neighbours.
Preserves the shape of the distribution while retaining the relationship between other features.
Chose this method of handling missing values because there is a large % of missing data in the EXT_SOURCE features.

    EXT_SOURCE_1: 173378 NaN values
    EXT_SOURCE_2: 660 NaN values
    EXT_SOURCE_3: 60965 NaN values

In [23]:
knn_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

temp_df = new_df[['SK_ID_CURR', 'TARGET'] + knn_cols].copy()

imputer = KNNImputer(n_neighbors=3)
temp_df[knn_cols] = imputer.fit_transform(temp_df[knn_cols])
new_df[knn_cols] = temp_df[knn_cols]

print(new_df.head())

   SK_ID_CURR  TARGET  NAME_CONTRACT_TYPE  AMT_INCOME_TOTAL  AMT_CREDIT  \
0      100002       1                   0             0.615   -0.198521   
1      100003       0                   0             1.365    1.448012   
2      100004       0                   1            -0.885   -0.702740   
3      100006       0                   0            -0.135   -0.372874   
4      100007       0                   0            -0.285   -0.000986   

   AMT_ANNUITY  AMT_GOODS_PRICE  NAME_INCOME_TYPE  DAYS_EMPLOYED  \
0    -0.011205        -0.224490                 7       0.276851   
1     0.597361         1.540816                 4       0.137695   
2    -1.004482        -0.714286                 7       0.362878   
3     0.264691        -0.346939                 7      -0.435923   
4    -0.168078         0.142857                 7      -0.435582   

   EXT_SOURCE_1  EXT_SOURCE_2  EXT_SOURCE_3  OBS_30_CNT_SOCIAL_CIRCLE  \
0      0.083037      0.262949      0.139376                  0.9894

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[knn_cols] = temp_df[knn_cols]


### Check cleaned dataset

In [24]:
print(nan_counts(new_df))
print(new_df.shape)
new_df.head()

new_df.columns

Columns with NaN values:
OBS_30_CNT_SOCIAL_CIRCLE: 1021 NaN values
OBS_60_CNT_SOCIAL_CIRCLE: 1021 NaN values
None
(307511, 15)


Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_INCOME_TYPE',
       'DAYS_EMPLOYED', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
       'OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
       'AMT_REQ_CREDIT_BUREAU_MONTH'],
      dtype='object')

## Output cleaned dataset
Just for other group memebers to work on

In [25]:
new_df.to_csv('application_data_clean.csv', index=False, encoding='utf-8')

# Import cleaned dataset
Run the code from here if testing!!

So I don't have to do the preprocessing steps again. (KNN imputation took almost an hour.)

Label encoder mappings for reference:
```
Mapping for NAME_CONTRACT_TYPE:
Cash loans: 0
Revolving loans: 1
Mapping for NAME_INCOME_TYPE:
Businessman: 0
Commercial associate: 1
Maternity leave: 2
Pensioner: 3
State servant: 4
Student: 5
Unemployed: 6
Working: 7
```

## Basic imports

In [66]:
import pandas as pd
import numpy as np

In [69]:
filename = "./application_data_clean.csv"
df = pd.read_csv(filename)

def nan_counts(df):
    nan_columns = {}

    for column in df.columns:
        nan_count = df[column].isna().sum()  # Count NaN values in the column
        if nan_count > 0:  # Only include columns with NaN values
            nan_columns[column] = nan_count

    # Display columns with NaN values and their counts
    print("Columns with NaN values:")
    for col, count in nan_columns.items():
        print(f"{col}: {count} NaN values")

print(df.shape)
print(nan_counts(df))
df.head()

(307511, 15)
Columns with NaN values:
None


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_INCOME_TYPE,DAYS_EMPLOYED,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_MONTH
0,100002,1,0,0.615,-0.198521,-0.011205,-0.22449,7,0.276851,0.083037,0.262949,0.139376,0.989412,1.000236,0.0
1,100003,0,0,1.365,1.448012,0.597361,1.540816,4,0.137695,0.311267,0.622246,0.443125,0.50402,0.515863,0.0
2,100004,0,1,-0.885,-0.70274,-1.004482,-0.714286,7,0.362878,0.587744,0.555912,0.729567,-0.891926,-0.888367,0.0
3,100006,0,0,-0.135,-0.372874,0.264691,-0.346939,7,-0.435923,0.756675,0.650442,0.345748,0.989412,1.000236,0.0
4,100007,0,0,-0.285,-0.000986,-0.168078,0.142857,7,-0.435582,0.593377,0.322738,0.615033,-0.891926,-0.888367,0.0


In [None]:
logreg_param_grid = {
    'C': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'max_iter': [80, 100, 120, 150],
    'tol': [1e-6, 1e-5, 1e-4, 1e-3],
    'class_weight': [None, 'balanced'],
    'fit_intercept': [True, False]
}

perform_grid_search("Logistic Regression", LogisticRegression, logreg_param_grid)

# Best Hyperparameters for Logistic Regression: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
# Best Hyperparameters for Logistic Regression: {'C': 0.05, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
# Best Hyperparameters for Logistic Regression: {'C': 0.05, 'max_iter': 80, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 1e-05}
# 1min 35.9s runtime
# Best Hyperparameters for Logistic Regression: {'C': 0.05, 'class_weight': None, 'fit_intercept': True, 'max_iter': 80, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 1e-06}
# 162min 53s runtime

1920 fits failed out of a total of 11520.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1920 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Yean Keat\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Yean Keat\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Yean Keat\AppData\Local\Packages\PythonSoftwareFoun

Best Hyperparameters for Logistic Regression: {'C': 0.05, 'class_weight': None, 'fit_intercept': True, 'max_iter': 80, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 1e-06}


## Train, test and validation splits

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold

X = df.drop(columns=['SK_ID_CURR', 'TARGET'])
y = df['TARGET']

# stratified train-test split (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=123
)

# distribution of TARGET in train and test sets
print("Train TARGET distribution:\n", y_train.value_counts(normalize=True))
print("Test TARGET distribution:\n", y_test.value_counts(normalize=True))
print("-" * 30)

# stratified k-fold on the training set
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

# stratified k-fold split on the training set
for fold_index, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    print(f"Fold {fold_index + 1}")
    print("TRAIN indices:", train_index)
    print("VALIDATION indices:", val_index)
    print("Train TARGET distribution:\n", y_fold_train.value_counts(normalize=True))
    print("Validation TARGET distribution:\n", y_fold_val.value_counts(normalize=True))
    print("-" * 30)

Train TARGET distribution:
 TARGET
0    0.919273
1    0.080727
Name: proportion, dtype: float64
Test TARGET distribution:
 TARGET
0    0.919266
1    0.080734
Name: proportion, dtype: float64
------------------------------
Fold 1
TRAIN indices: [     0      4      5 ... 215251 215252 215253]
VALIDATION indices: [     1      2      3 ... 215254 215255 215256]
Train TARGET distribution:
 TARGET
0    0.919278
1    0.080722
Name: proportion, dtype: float64
Validation TARGET distribution:
 TARGET
0    0.919265
1    0.080735
Name: proportion, dtype: float64
------------------------------
Fold 2
TRAIN indices: [     0      1      2 ... 215254 215255 215256]
VALIDATION indices: [    14     16     18 ... 215243 215249 215252]
Train TARGET distribution:
 TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64
Validation TARGET distribution:
 TARGET
0    0.919278
1    0.080722
Name: proportion, dtype: float64
------------------------------
Fold 3
TRAIN indices: [     1      2      3 ..

## Basic testing of some models
1. Logistic regression
2. ~~SVC~~
3. Decision tree
4. Random forest
5. XGBoost
6. Voting classifier

#### Imports, initialisation and helper functions

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
from tqdm import tqdm

# model initialization
models = {
    "Logistic Regression": LogisticRegression(),
    #"Support Vector Classifier": SVC(probability=True),    # takes too long to run for each fold
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Voting Classifier": VotingClassifier(estimators=[
        ('lr', LogisticRegression()),
        #('svc', SVC(probability=True)),
        #('dt', DecisionTreeClassifier()),
        ('rf', RandomForestClassifier()),        
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
    ], voting='soft')
}

# function to save the model
def save_model(model, model_name):
    joblib.dump(model, f"{model_name}.joblib")
    print(f"{model_name} saved.")

# function to print evaluation metrics
def print_evaluation_metrics(results):
    """
    Print evaluation metrics for each model stored in the results list.

    Parameters:
    results (list): A list containing model names and their evaluation metrics.
    """
    for model_results in results:
        model_name = model_results[0]
        print(f"\n{model_name} Results:")
        print("Split | Accuracy | Precision | Recall | F1 Score | AUC")
        print("-" * 60)
        
        for i in range(1, len(model_results)):
            metrics = model_results[i]
            print(f"{i:5d} | {metrics[0]:.4f} | {metrics[1]:.4f} | {metrics[2]:.4f} | {metrics[3]:.4f} | {metrics[4]:.4f}")

# store results for evaluation metrics
results = []

## Training and evaluation of models

In [10]:
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    
    # to store metrics for each fold
    model_metrics = [model_name]
    
    for fold_index, (train_index, val_index) in tqdm(enumerate(skf.split(X_train, y_train)), total=n_splits):
        X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)
        y_pred_val = model.predict(X_fold_val)
        
        # metrics
        accuracy = accuracy_score(y_fold_val, y_pred_val)
        precision = precision_score(y_fold_val, y_pred_val)
        recall = recall_score(y_fold_val, y_pred_val)
        f1 = f1_score(y_fold_val, y_pred_val)
        auc = roc_auc_score(y_fold_val, model.predict_proba(X_fold_val)[:, 1])
        
        # store metrics for this fold
        model_metrics.append((accuracy, precision, recall, f1, auc))
    
    # final training on the whole training set and evaluate on test set
    model.fit(X_train, y_train)
    
    # predict on test set
    y_pred_test = model.predict(X_test)
    
    # calculate test metrics
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_precision = precision_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    test_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    
    # append test metrics to results
    model_metrics.append((test_accuracy, test_precision, test_recall, test_f1, test_auc))
    
    # save model
    save_model(model, model_name)

    # add to results for printing later
    results.append(model_metrics)


Training Logistic Regression...


100%|██████████| 3/3 [00:02<00:00,  1.44it/s]


Logistic Regression saved.

Training Decision Tree...


100%|██████████| 3/3 [00:07<00:00,  2.39s/it]


Decision Tree saved.

Training Random Forest...


100%|██████████| 3/3 [02:21<00:00, 47.13s/it]


Random Forest saved.

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

100%|██████████| 3/3 [00:09<00:00,  3.29s/it]
Parameters: { "use_label_encoder" } are not used.



XGBoost saved.

Training Voting Classifier...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

100%|██████████| 3/3 [02:18<00:00, 46.09s/it]
Parameters: { "use_label_encoder" } are not used.



Voting Classifier saved.


### Print evaluation metrics

In [11]:
print_evaluation_metrics(results)


Logistic Regression Results:
Split | Accuracy | Precision | Recall | F1 Score | AUC
------------------------------------------------------------
    1 | 0.9194 | 0.6429 | 0.0031 | 0.0062 | 0.7263
    2 | 0.9194 | 0.5854 | 0.0041 | 0.0082 | 0.7197
    3 | 0.9194 | 0.6098 | 0.0043 | 0.0086 | 0.7250
    4 | 0.9193 | 0.5000 | 0.0026 | 0.0051 | 0.7247

Decision Tree Results:
Split | Accuracy | Precision | Recall | F1 Score | AUC
------------------------------------------------------------
    1 | 0.8516 | 0.1427 | 0.1674 | 0.1541 | 0.5395
    2 | 0.8516 | 0.1336 | 0.1528 | 0.1425 | 0.5329
    3 | 0.8513 | 0.1429 | 0.1685 | 0.1547 | 0.5399
    4 | 0.8502 | 0.1301 | 0.1505 | 0.1396 | 0.5311

Random Forest Results:
Split | Accuracy | Precision | Recall | F1 Score | AUC
------------------------------------------------------------
    1 | 0.9194 | 0.5361 | 0.0090 | 0.0177 | 0.6932
    2 | 0.9193 | 0.5034 | 0.0126 | 0.0246 | 0.6972
    3 | 0.9189 | 0.4277 | 0.0123 | 0.0238 | 0.6990
    4 | 0.919

# Fine-tuning models
Logistic regression, Random Forest, and XGBoost were the more performant models, and the combination of those models using the Voting Classifier ensemble gave better results than any of the individual models themselves.

Use GridSearchCV to find the best hyperparameters for each model, and put them through the voting classifier with those hyperparameters.

In [14]:
from sklearn.model_selection import GridSearchCV

def perform_grid_search(model_name, model_class, param_grid):
    grid_search = GridSearchCV(estimator=model_class(), param_grid=param_grid,
                               cv=5, n_jobs=-1)
    
    grid_search.fit(X_train, y_train)
    
    print(f"Best Hyperparameters for {model_name}: {grid_search.best_params_}")

## Logistic regression

liblinear solver to test l1 vs l2 penalty

In [35]:
logreg_param_grid = {
    'C': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'max_iter': [80, 100, 120, 150],
    'tol': [1e-6, 1e-5, 1e-4, 1e-3],
    'class_weight': [None, 'balanced'],
    'fit_intercept': [True, False]
}

perform_grid_search("Logistic Regression", LogisticRegression, logreg_param_grid)

# Best Hyperparameters for Logistic Regression: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
# Best Hyperparameters for Logistic Regression: {'C': 0.05, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
# Best Hyperparameters for Logistic Regression: {'C': 0.05, 'max_iter': 80, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 1e-05}
# 1min 35.9s runtime
# Best Hyperparameters for Logistic Regression: {'C': 0.05, 'class_weight': None, 'fit_intercept': True, 'max_iter': 80, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 1e-06}
# 162min 53s runtime

1920 fits failed out of a total of 11520.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1920 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Yean Keat\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Yean Keat\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Yean Keat\AppData\Local\Packages\PythonSoftwareFoun

Best Hyperparameters for Logistic Regression: {'C': 0.05, 'class_weight': None, 'fit_intercept': True, 'max_iter': 80, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 1e-06}


## Random Forest

In [29]:
rf_param_grid = {
    'n_estimators': [100, 150, 200],
    'criterion': ['log_loss'],
    'max_features': ['sqrt'],
    'max_depth': [None, 20, 30],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [2, 3, 4],
    'bootstrap': [True, False]
}

perform_grid_search("Random Forest", RandomForestClassifier, rf_param_grid)

# Best Hyperparameters for Random Forest: {'criterion': 'log_loss', 'n_estimators': 100}

# Best Hyperparameters for Random Forest: {'criterion': 'log_loss', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

# Best Hyperparameters for Random Forest: {'bootstrap': True, 'criterion': 'log_loss', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 150}
# 155m 23.6s runtime

Best Hyperparameters for Random Forest: {'bootstrap': True, 'criterion': 'log_loss', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 150}


## XGBoost

In [24]:
xgb_param_grid = {
    'device': ["cuda"],
    'n_estimators': [100, 150, 200],
    'max_depth': [5, 7, 12],
    'learning_rate': [0.1, 0.3],
    'subsample': [0.5, 1],
    'colsample_bytree': [0.8, 1],
    'gamma': [0, 0.01],
    'lambda': [0.8, 1],
    'sampling_method': ['uniform', 'gradient_based']
}

perform_grid_search("XGBoost", XGBClassifier, xgb_param_grid)

# Best Hyperparameters for XGBoost: {'colsample_bytree': 0.8, 'device': 'cuda', 'gamma': 0, 'lambda': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150, 'sampling_method': 'gradient_based', 'subsample': 0.5}
# 123m 5.9s runtime

Best Hyperparameters for XGBoost: {'colsample_bytree': 0.8, 'device': 'cuda', 'gamma': 0, 'lambda': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150, 'sampling_method': 'gradient_based', 'subsample': 0.5}


## Get metrics for models using the fine-tuned hyperparameters

In [38]:
# best hyperparams for Logistic Regression
log_reg_params = {
    'C': 0.05,
    'class_weight': None,
    'fit_intercept': True,
    'max_iter': 80,
    'penalty': 'l1',
    'solver': 'liblinear',
    'tol': 1e-06
}

# best hyperparams for Random Forest
rf_params = {
    'bootstrap': True,
    'criterion': 'log_loss',
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_leaf': 4,
    'min_samples_split': 6,
    'n_estimators': 150
}

# best hyperparams for XGBoost
xgb_params = {
    'colsample_bytree': 0.8,
    'device': 'cuda',
    'gamma': 0,
    'lambda': 0.8,
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 150,
    'sampling_method': 'gradient_based',
    'subsample': 0.5
}

# model initialization
models = {
    "Logistic Regression": LogisticRegression(**log_reg_params),
    "Random Forest": RandomForestClassifier(**rf_params),
    "XGBoost": XGBClassifier(**xgb_params, use_label_encoder=False, eval_metric='logloss')
}

### in case you restarted the kernel ###

# function to save the model
def save_model(model, model_name):
    joblib.dump(model, f"{model_name}.joblib")
    print(f"{model_name} saved.")

# function to print evaluation metrics
def print_evaluation_metrics(results):
    """
    Print evaluation metrics for each model stored in the results list.

    Parameters:
    results (list): A list containing model names and their evaluation metrics.
    """
    for model_results in results:
        model_name = model_results[0]
        print(f"\n{model_name} Results:")
        print("Split | Accuracy | Precision | Recall | F1 Score | AUC")
        print("-" * 60)
        
        for i in range(1, len(model_results)):
            metrics = model_results[i]
            print(f"{i:5d} | {metrics[0]:.4f} | {metrics[1]:.4f} | {metrics[2]:.4f} | {metrics[3]:.4f} | {metrics[4]:.4f}")

# store results for evaluation metrics
results = []

In [39]:
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    
    # to store metrics for each fold
    model_metrics = [model_name]
    
    for fold_index, (train_index, val_index) in tqdm(enumerate(skf.split(X_train, y_train)), total=n_splits):
        X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        model.fit(X_fold_train, y_fold_train)
        y_pred_val = model.predict(X_fold_val)
        
        # metrics
        accuracy = accuracy_score(y_fold_val, y_pred_val)
        precision = precision_score(y_fold_val, y_pred_val)
        recall = recall_score(y_fold_val, y_pred_val)
        f1 = f1_score(y_fold_val, y_pred_val)
        auc = roc_auc_score(y_fold_val, model.predict_proba(X_fold_val)[:, 1])
        
        # store metrics for this fold
        model_metrics.append((accuracy, precision, recall, f1, auc))
    
    # final training on the whole training set and evaluate on test set
    model.fit(X_train, y_train)
    
    # predict on test set
    y_pred_test = model.predict(X_test)
    
    # calculate test metrics
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_precision = precision_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)
    test_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    
    # append test metrics to results
    model_metrics.append((test_accuracy, test_precision, test_recall, test_f1, test_auc))
    
    # save model
    save_model(model, model_name)

    # add to results for printing later
    results.append(model_metrics)


Training Logistic Regression...


100%|██████████| 3/3 [00:05<00:00,  1.99s/it]


Logistic Regression saved.

Training Random Forest...


100%|██████████| 3/3 [02:49<00:00, 56.59s/it]


Random Forest saved.

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

100%|██████████| 3/3 [00:05<00:00,  1.83s/it]
Parameters: { "use_label_encoder" } are not used.



XGBoost saved.


## Evaluation metrics for individual models after fine-tuning

In [40]:
print_evaluation_metrics(results)


Logistic Regression Results:
Split | Accuracy | Precision | Recall | F1 Score | AUC
------------------------------------------------------------
    1 | 0.9194 | 0.6667 | 0.0024 | 0.0048 | 0.7262
    2 | 0.9194 | 0.6538 | 0.0029 | 0.0058 | 0.7195
    3 | 0.9195 | 0.7407 | 0.0035 | 0.0069 | 0.7249
    4 | 0.9193 | 0.5625 | 0.0024 | 0.0048 | 0.7246

Random Forest Results:
Split | Accuracy | Precision | Recall | F1 Score | AUC
------------------------------------------------------------
    1 | 0.9194 | 0.6078 | 0.0054 | 0.0106 | 0.7191
    2 | 0.9194 | 0.5521 | 0.0092 | 0.0180 | 0.7155
    3 | 0.9194 | 0.5377 | 0.0098 | 0.0193 | 0.7198
    4 | 0.9193 | 0.5152 | 0.0068 | 0.0135 | 0.7227

XGBoost Results:
Split | Accuracy | Precision | Recall | F1 Score | AUC
------------------------------------------------------------
    1 | 0.9194 | 0.5339 | 0.0109 | 0.0213 | 0.7330
    2 | 0.9191 | 0.4681 | 0.0152 | 0.0294 | 0.7310
    3 | 0.9195 | 0.5464 | 0.0173 | 0.0335 | 0.7307
    4 | 0.9193 | 0.

# Voting Classifier v2

#### run this in case you restarted your kernel

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
from tqdm import tqdm

# helper functions
def save_model(model, model_name):
    joblib.dump(model, f"{model_name}.joblib")
    print(f"{model_name} saved.")

def print_evaluation_metrics(results):
    """
    Print evaluation metrics for each model stored in the results list.
    
    Parameters:
    results (list): A list containing model names and their evaluation metrics.
    """
    for model_results in results:
        model_name = model_results[0]
        print(f"\n{model_name} Results:")
        print("Split | Accuracy | Precision | Recall | F1 Score | AUC")
        print("-" * 60)
        
        for i in range(1, len(model_results)):
            metrics = model_results[i]
            print(f"{i:5d} | {metrics[0]:.4f} | {metrics[1]:.4f} | {metrics[2]:.4f} | {metrics[3]:.4f} | {metrics[4]:.4f}")

# import cleaned dataset
filename = "./application_data_clean.csv"
df = pd.read_csv(filename)

# train-validation-test split
X = df.drop(columns=['SK_ID_CURR', 'TARGET'])
y = df['TARGET']

# stratified train-test split (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=123
)

# stratified k-fold on the training set
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

for fold_index, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

## Baseline Voting Classifier using fine-tuned hyperparameters of individual models

In [48]:
# best hyperparameters for each model
log_reg_params = {
    'C': 0.05,
    'class_weight': None,
    'fit_intercept': True,
    'max_iter': 80,
    'penalty': 'l1',
    'solver': 'liblinear',
    'tol': 1e-06
}

rf_params = {
    'bootstrap': True,
    'criterion': 'log_loss',
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_leaf': 4,
    'min_samples_split': 6,
    'n_estimators': 150
}

xgb_params = {
    'colsample_bytree': 0.8,
    'device': 'cuda',
    'gamma': 0,
    'lambda': 0.8,
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 150,
    'sampling_method': 'gradient_based',
    'subsample': 0.5
}

# initialize models with best hyperparameters
log_reg_best = LogisticRegression(**log_reg_params)
rf_best = RandomForestClassifier(**rf_params)
xgb_best = XGBClassifier(**xgb_params, use_label_encoder=False, eval_metric='logloss')

# Voting classifier with best hyperparameters
voting_clf_best_indiv = VotingClassifier(estimators=[
    ('lr', log_reg_best), ('rf', rf_best), ('xgb', xgb_best)], voting='soft')

# store results for evaluation metrics
results = []

model_name = "Voting Classifier"
print(f"\nTraining {model_name}...")

# to store metrics for each fold
model_metrics = [model_name]

for fold_index, (train_index, val_index) in tqdm(enumerate(skf.split(X_train, y_train)), total=n_splits):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

    voting_clf_best_indiv.fit(X_fold_train, y_fold_train)
    y_pred_val = voting_clf_best_indiv.predict(X_fold_val)
    
    # metrics
    accuracy = accuracy_score(y_fold_val, y_pred_val)
    precision = precision_score(y_fold_val, y_pred_val)
    recall = recall_score(y_fold_val, y_pred_val)
    f1 = f1_score(y_fold_val, y_pred_val)
    auc = roc_auc_score(y_fold_val, voting_clf_best_indiv.predict_proba(X_fold_val)[:, 1])
    
    # store metrics for this fold
    model_metrics.append((accuracy, precision, recall, f1, auc))

# final training on the whole training set and evaluate on test set
voting_clf_best_indiv.fit(X_train, y_train)

# predict on test set
y_pred_test = voting_clf_best_indiv.predict(X_test)

# calculate test metrics
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)
test_auc = roc_auc_score(y_test, voting_clf_best_indiv.predict_proba(X_test)[:, 1])

# append test metrics to results
model_metrics.append((test_accuracy, test_precision, test_recall, test_f1, test_auc))

# save model
save_model(voting_clf_best_indiv, model_name)

# add to results for printing later
results.append(model_metrics)


Training Voting Classifier...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

100%|██████████| 3/3 [02:52<00:00, 57.51s/it]
Parameters: { "use_label_encoder" } are not used.



Voting Classifier saved.


### Evaluation metrics of baseline Voting Classifier with fine-tuned individual models

In [49]:
print_evaluation_metrics(results)


Voting Classifier Results:
Split | Accuracy | Precision | Recall | F1 Score | AUC
------------------------------------------------------------
    1 | 0.9195 | 0.7857 | 0.0038 | 0.0076 | 0.7349
    2 | 0.9194 | 0.5571 | 0.0067 | 0.0133 | 0.7311
    3 | 0.9196 | 0.6479 | 0.0079 | 0.0157 | 0.7333
    4 | 0.9194 | 0.5714 | 0.0043 | 0.0085 | 0.7355


## Fine-tuning hyperparameters for Voting Classifier

#### again, in case restarted kernel

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
from tqdm import tqdm

# helper functions
def save_model(model, model_name):
    joblib.dump(model, f"{model_name}.joblib")
    print(f"{model_name} saved.")

def print_evaluation_metrics(results):
    """
    Print evaluation metrics for each model stored in the results list.
    
    Parameters:
    results (list): A list containing model names and their evaluation metrics.
    """
    for model_results in results:
        model_name = model_results[0]
        print(f"\n{model_name} Results:")
        print("Split | Accuracy | Precision | Recall | F1 Score | AUC")
        print("-" * 60)
        
        for i in range(1, len(model_results)):
            metrics = model_results[i]
            print(f"{i:5d} | {metrics[0]:.4f} | {metrics[1]:.4f} | {metrics[2]:.4f} | {metrics[3]:.4f} | {metrics[4]:.4f}")

# import cleaned dataset
filename = "./application_data_clean.csv"
df = pd.read_csv(filename)

# train-validation-test split
X = df.drop(columns=['SK_ID_CURR', 'TARGET'])
y = df['TARGET']

# stratified train-test split (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=123
)

# stratified k-fold on the training set
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

## Finding optimal weights for each model

In [54]:
from scipy.optimize import minimize
# best hyperparameters for each model
log_reg_params = {
    'C': 0.05,
    'class_weight': None,
    'fit_intercept': True,
    'max_iter': 80,
    'penalty': 'l1',
    'solver': 'liblinear',
    'tol': 1e-06
}

rf_params = {
    'bootstrap': True,
    'criterion': 'log_loss',
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_leaf': 4,
    'min_samples_split': 6,
    'n_estimators': 150
}

xgb_params = {
    'colsample_bytree': 0.8,
    'device': 'cuda',
    'gamma': 0,
    'lambda': 0.8,
    'learning_rate': 0.1,
    'max_depth': 7,
    'n_estimators': 150,
    'sampling_method': 'gradient_based',
    'subsample': 0.5
}

# initialise models with best hyperparameters
log_reg_best = LogisticRegression(**log_reg_params)
rf_best = RandomForestClassifier(**rf_params)
xgb_best = XGBClassifier(**xgb_params, use_label_encoder=False, eval_metric='logloss')

# initialise voting classifier
voting_clf_best = VotingClassifier(estimators=[
    ('lr', log_reg_best), ('rf', rf_best), ('xgb', xgb_best)], voting='soft')

# custom scoring function
def custom_scorer(weights, X, y):
    voting_clf_best.set_params(weights=weights)
    voting_clf_best.fit(X, y)
    y_pred = voting_clf_best.predict(X)
    return -accuracy_score(y, y_pred)  # Negative because minimize looks for the lowest score

# initial weights
initial_weights = [1/3, 1/3, 1/3]

# sse the minimize function
opt_result = minimize(custom_scorer, initial_weights, args=(X_train, y_train), 
                      method='SLSQP', bounds=[(0,1),(0,1),(0,1)], 
                      constraints={'type': 'eq', 'fun': lambda w: 1 - sum(w)},
                      options={'maxiter': 30})

# optimal weights
optimal_weights = opt_result.x
print(f"Optimal Weights: {optimal_weights}")

# 91min 10.7s runtime

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Optimal Weights: [0.33267453 0.33367605 0.33364942]


## Optimal weights with soft voting

In [60]:
# Define the voting classifier
voting_clf_best_soft = VotingClassifier(estimators=[
    ('lr', log_reg_best), ('rf', rf_best), ('xgb', xgb_best)], voting='soft', weights=optimal_weights)

# store results for evaluation metrics
results = []

model_name = "Voting Classifier (soft voting)"
print(f"\nTraining {model_name} with optimized weights...")

# to store metrics for each fold
model_metrics = [model_name]

for fold_index, (train_index, val_index) in tqdm(enumerate(skf.split(X_train, y_train)), total=n_splits):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

    voting_clf_best_soft.fit(X_fold_train, y_fold_train)
    y_pred_val = voting_clf_best_soft.predict(X_fold_val)
    
    # metrics
    accuracy = accuracy_score(y_fold_val, y_pred_val)
    precision = precision_score(y_fold_val, y_pred_val)
    recall = recall_score(y_fold_val, y_pred_val)
    f1 = f1_score(y_fold_val, y_pred_val)
    auc = roc_auc_score(y_fold_val, voting_clf_best_soft.predict_proba(X_fold_val)[:, 1])
    
    # store metrics for this fold
    model_metrics.append((accuracy, precision, recall, f1, auc))

# final training on the whole training set and evaluate on test set
voting_clf_best_soft.fit(X_train, y_train)

# predict on test set
y_pred_test = voting_clf_best_soft.predict(X_test)

# calculate test metrics
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test)
test_recall = recall_score(y_test, y_pred_test)
test_f1 = f1_score(y_test, y_pred_test)
test_auc = roc_auc_score(y_test, voting_clf_best_soft.predict_proba(X_test)[:, 1])

# append test metrics to results
model_metrics.append((test_accuracy, test_precision, test_recall, test_f1, test_auc))

# save model
save_model(voting_clf_best_soft, model_name)

# add to results for printing later
results.append(model_metrics)


Training Voting Classifier (soft voting) with optimized weights...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

100%|██████████| 3/3 [03:18<00:00, 66.18s/it]
Parameters: { "use_label_encoder" } are not used.



Voting Classifier (soft voting) saved.


In [61]:
print_evaluation_metrics(results)


Voting Classifier (soft voting) Results:
Split | Accuracy | Precision | Recall | F1 Score | AUC
------------------------------------------------------------
    1 | 0.9195 | 0.7419 | 0.0040 | 0.0079 | 0.7347
    2 | 0.9193 | 0.5070 | 0.0062 | 0.0123 | 0.7307
    3 | 0.9196 | 0.6875 | 0.0076 | 0.0150 | 0.7334
    4 | 0.9194 | 0.5902 | 0.0048 | 0.0096 | 0.7354
