In [27]:
#Imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import optuna
import pickle

In [28]:
file_path = '../Data/Raw_data/application_train.csv'
df_train = pd.read_csv(file_path)

baseline_features = ['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','DAYS_BIRTH']

df_baseline = df_train.copy()

for col in baseline_features:
    df_baseline[col].fillna(df_baseline[col].median(), inplace=True)
df_baseline[baseline_features]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_baseline[col].fillna(df_baseline[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_baseline[col].fillna(df_baseline[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

Unnamed: 0,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,DAYS_BIRTH
0,202500.0,406597.5,24700.5,-9461
1,270000.0,1293502.5,35698.5,-16765
2,67500.0,135000.0,6750.0,-19046
3,135000.0,312682.5,29686.5,-19005
4,121500.0,513000.0,21865.5,-19932
...,...,...,...,...
307506,157500.0,254700.0,27558.0,-9327
307507,72000.0,269550.0,12001.5,-20775
307508,153000.0,677664.0,29979.0,-14966
307509,171000.0,370107.0,20205.0,-11961


In [29]:
X = df_baseline[baseline_features]
y = df_baseline['TARGET']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

model = LogisticRegression(random_state=10)
model.fit(X_train, y_train)

In [30]:
val_preds = model.predict_proba(X_val)[:, 1]
baseline_auc = roc_auc_score(y_val, val_preds)

print(f"Baseline Model Validation AUC: {baseline_auc:.4f}")

Baseline Model Validation AUC: 0.5901


In [31]:
numeric_cols = df_train.select_dtypes(include=np.number).columns.tolist()
numeric_cols.remove('TARGET')
numeric_cols.remove('SK_ID_CURR')

X = df_train[numeric_cols]
y = df_train['TARGET']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val) 

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

model = LogisticRegression(random_state=10)
model.fit(X_train_scaled, y_train)

val_preds = model.predict_proba(X_val_scaled)[:, 1]
numeric_auc = roc_auc_score(y_val, val_preds)

print(f"Baseline Model AUC: {baseline_auc:.4f}")
print(f"Model with all numeric features AUC: {numeric_auc:.4f}")

Baseline Model AUC: 0.5901
Model with all numeric features AUC: 0.7346


In [32]:
y = df_train['TARGET']
X = df_train.drop(columns=['TARGET', 'SK_ID_CURR'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

X_train_num = X_train[numeric_cols].copy()
X_val_num = X_val[numeric_cols].copy()

imputer = SimpleImputer(strategy='median')
X_train_num_imputed = pd.DataFrame(imputer.fit_transform(X_train_num), columns=numeric_cols, index=X_train.index)
X_val_num_imputed = pd.DataFrame(imputer.transform(X_val_num), columns=numeric_cols, index=X_val.index)

scaler = StandardScaler()
X_train_num_scaled = pd.DataFrame(scaler.fit_transform(X_train_num_imputed), columns=numeric_cols, index=X_train.index)
X_val_num_scaled = pd.DataFrame(scaler.transform(X_val_num_imputed), columns=numeric_cols, index=X_val.index)

X_train_cat = X_train[categorical_cols].copy()
X_val_cat = X_val[categorical_cols].copy()

X_train_cat_encoded = pd.get_dummies(X_train_cat)
X_val_cat_encoded = pd.get_dummies(X_val_cat)

train_cols = X_train_cat_encoded.columns

X_val_cat_final = X_val_cat_encoded.reindex(columns=train_cols, fill_value=0)

X_train_final = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis=1)
X_val_final = pd.concat([X_val_num_scaled, X_val_cat_final], axis=1)

model = LogisticRegression(random_state=10, C=0.01)
model.fit(X_train_final, y_train)

val_preds = model.predict_proba(X_val_final)[:, 1]
final_auc = roc_auc_score(y_val, val_preds)

print(f"Baseline Model AUC: {baseline_auc:.4f}")
print(f"Model with all numeric features AUC: {numeric_auc:.4f}")
print(f"Model with all features (numeric + categorical) AUC: {final_auc:.4f}")

Baseline Model AUC: 0.5901
Model with all numeric features AUC: 0.7346
Model with all features (numeric + categorical) AUC: 0.7446


In [33]:
y = df_train['TARGET']
X = df_train.drop(columns=['TARGET', 'SK_ID_CURR'])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

print("Engineering new features...\n 1) total loan amount to income ratio \n 2) loan installment amount to income ratio \n 3) loan duration ")
for df in [X_train, X_val]:
    df['CREDIT_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['ANNUITY_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['CREDIT_TERM'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
print("New features created.")

numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

X_train_num = X_train[numeric_cols].copy()
X_val_num = X_val[numeric_cols].copy()

imputer = SimpleImputer(strategy='median')
X_train_num_imputed = pd.DataFrame(imputer.fit_transform(X_train_num), columns=numeric_cols, index=X_train.index)
X_val_num_imputed = pd.DataFrame(imputer.transform(X_val_num), columns=numeric_cols, index=X_val.index)

scaler = StandardScaler()
X_train_num_scaled = pd.DataFrame(scaler.fit_transform(X_train_num_imputed), columns=numeric_cols, index=X_train.index)
X_val_num_scaled = pd.DataFrame(scaler.transform(X_val_num_imputed), columns=numeric_cols, index=X_val.index)

X_train_cat = X_train[categorical_cols].copy()
X_val_cat = X_val[categorical_cols].copy()

X_train_cat_encoded = pd.get_dummies(X_train_cat)
X_val_cat_encoded = pd.get_dummies(X_val_cat)

train_cols = X_train_cat_encoded.columns
X_val_cat_final = X_val_cat_encoded.reindex(columns=train_cols, fill_value=0)

# --- 6. Combine Features and Train Model ---
X_train_final = pd.concat([X_train_num_scaled, X_train_cat_encoded], axis=1)
X_val_final = pd.concat([X_val_num_scaled, X_val_cat_final], axis=1)

model = LogisticRegression(random_state=10, C=0.01)
model.fit(X_train_final, y_train)

val_preds = model.predict_proba(X_val_final)[:, 1]
final_auc = roc_auc_score(y_val, val_preds)

print(f"Model with all features (numeric + categorical) AUC: {final_auc:.4f}")

Engineering new features...
 1) total loan amount to income ratio 
 2) loan installment amount to income ratio 
 3) loan duration 
New features created.
Model with all features (numeric + categorical) AUC: 0.7453


In [34]:
print("\nTraining XGBoost model...")

xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', random_state=10)

xgb_model.fit(X_train_final, y_train)

val_preds_xgb = xgb_model.predict_proba(X_val_final)[:, 1]
xgb_auc = roc_auc_score(y_val, val_preds_xgb)

print(f"\n--- Model Performance Comparison ---")
print(f"Logistic Regression AUC: {final_auc:.4f}")
print(f"XGBoost Model AUC: {xgb_auc:.4f}")


Training XGBoost model...

--- Model Performance Comparison ---
Logistic Regression AUC: 0.7453
XGBoost Model AUC: 0.7552


In [35]:
def objective(trial):

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': 42
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train_final, y_train)
    
    val_preds = model.predict_proba(X_val_final)[:, 1]
    auc = roc_auc_score(y_val, val_preds)
    
    return auc


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  

print("\nBest trial:")
trial = study.best_trial
print(f"  Value: {trial.value:.4f}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-09-05 17:29:05,550] A new study created in memory with name: no-name-092c969f-ec4b-48a0-a599-3547e484f2e8
[I 2025-09-05 17:29:23,048] Trial 0 finished with value: 0.731361188797724 and parameters: {'n_estimators': 836, 'learning_rate': 0.15672701784082468, 'max_depth': 7, 'subsample': 0.8188112102415035, 'colsample_bytree': 0.7606318096972325}. Best is trial 0 with value: 0.731361188797724.
[I 2025-09-05 17:29:28,517] Trial 1 finished with value: 0.7265543619087192 and parameters: {'n_estimators': 151, 'learning_rate': 0.2399245586927746, 'max_depth': 10, 'subsample': 0.979545879624004, 'colsample_bytree': 0.982537455808878}. Best is trial 0 with value: 0.731361188797724.
[I 2025-09-05 17:29:38,028] Trial 2 finished with value: 0.7552617001311348 and parameters: {'n_estimators': 234, 'learning_rate': 0.014689741192402272, 'max_depth': 9, 'subsample': 0.9132697418661322, 'colsample_bytree': 0.9825892497255011}. Best is trial 2 with value: 0.7552617001311348.
[I 2025-09-05 17:29:


Best trial:
  Value: 0.7647
  Params: 
    n_estimators: 515
    learning_rate: 0.04222370821500124
    max_depth: 6
    subsample: 0.6790461917980959
    colsample_bytree: 0.6647103415536036


In [36]:
best_params = study.best_trial.params
best_params['objective'] = 'binary:logistic'
best_params['eval_metric'] = 'auc'
best_params['random_state'] = 10

print("\nTraining final model on the full training data with best parameters...")

final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train_final, y_train)

with open('../models/xgb_model.pkl', 'wb') as f:
    pickle.dump(final_model, f)

imputer = SimpleImputer(strategy='median').fit(X_train[numeric_cols])
scaler = StandardScaler().fit(imputer.transform(X_train[numeric_cols]))
train_cols = X_train_cat_encoded.columns # from the one-hot encoding step

with open('../models/imputer.pkl', 'wb') as f:
    pickle.dump(imputer, f)
    
with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
    
with open('../models/train_cols.pkl', 'wb') as f:
    pickle.dump(train_cols, f)

print("Final model and preprocessing objects saved to the 'models' directory.")


Training final model on the full training data with best parameters...
Final model and preprocessing objects saved to the 'models' directory.


In [38]:
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

with open('../models/categorical_cols.pkl', 'wb') as f:
    pickle.dump(categorical_cols, f)

print("Saved categorical columns list.")

final_model_columns = X_train_final.columns.tolist()

with open('../models/final_model_columns.pkl', 'wb') as f:
    pickle.dump(final_model_columns, f)

print("Saved final model columns list.")

Saved categorical columns list.
Saved final model columns list.
