## Imports and Paths

In [7]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Mac project root
BASE_PATH = "/Users/agarwalh/DSIP/Project/DSIP_Project"

PREPROC_DIR = os.path.join(BASE_PATH, "Preprocessing_FE_dataset")
TOPCORR_DIR = os.path.join("/Users/agarwalh/DSIP/Project", "Top_Correlated_dataset")

ID_COL = "SK_ID_CURR"
TARGET_COL = "TARGET"

pd.set_option("display.max_rows", 200)
os.makedirs(TOPCORR_DIR, exist_ok=True)

## Load Preprocessed train/test

In [8]:
train_path = os.path.join(PREPROC_DIR, "final_train.csv")
test_path  = os.path.join(PREPROC_DIR, "final_test.csv")

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

train.shape, test.shape, train[TARGET_COL].mean()

((307511, 1459), (48744, 1458), np.float64(0.08072881945686496))

## Correlation matrix
 Save to disk for future reusability

In [9]:
# Threshold for removing highly correlated variables
threshold = 0.9

corr_matrix = train.corr().abs()

# Save full correlation matrix (for documentation / reuse)
corr_csv_path = os.path.join(PREPROC_DIR, "corr_matrix_full_fs.csv")
corr_matrix.to_csv(corr_csv_path)

corr_matrix.shape, corr_csv_path

((1459, 1459),
 '/Users/agarwalh/DSIP/Project/DSIP_Project/Preprocessing_FE_dataset/corr_matrix_full_fs.csv')

## Drop highly correlated features

In [10]:
# Use upper triangle of correlation matrix
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
)

# Columns to drop: any with correlation > threshold
to_drop = [col for col in upper.columns if any(upper[col] > threshold)]
print("There are %d columns to remove due to high correlation." % len(to_drop))

train_red = train.drop(columns=to_drop)
test_red  = test.drop(columns=to_drop)

print("Training shape after corr filter:", train_red.shape)
print("Testing shape after corr filter :", test_red.shape)

There are 688 columns to remove due to high correlation.
Training shape after corr filter: (307511, 771)
Testing shape after corr filter : (48744, 770)


## Missing value analysis & filter (>75%)

In [11]:
train_missing = (train_red.isnull().sum() / len(train_red)).sort_values(ascending=False)
test_missing  = (test_red.isnull().sum() / len(test_red)).sort_values(ascending=False)

train_missing.head(), test_missing.head()

(client_credit_AMT_PAYMENT_CURRENT_min_mean            0.801438
 client_credit_AMT_PAYMENT_CURRENT_max_mean            0.801438
 client_credit_AMT_PAYMENT_CURRENT_mean_mean           0.801438
 client_credit_CNT_DRAWINGS_OTHER_CURRENT_mean_mean    0.801178
 client_credit_CNT_DRAWINGS_ATM_CURRENT_mean_mean      0.801178
 dtype: float64,
 client_credit_CNT_DRAWINGS_OTHER_CURRENT_max_mean    0.773223
 client_credit_AMT_DRAWINGS_POS_CURRENT_mean_mean     0.773223
 client_credit_AMT_DRAWINGS_ATM_CURRENT_max_mean      0.773223
 client_credit_CNT_DRAWINGS_POS_CURRENT_min_mean      0.773223
 client_credit_CNT_DRAWINGS_OTHER_CURRENT_min_mean    0.773223
 dtype: float64)

In [12]:
train_missing_cols = train_missing.index[train_missing > 0.75]
test_missing_cols  = test_missing.index[test_missing > 0.75]

all_missing = list(set(train_missing_cols) | set(test_missing_cols))
print("There are %d columns with more than 75%% missing values." % len(all_missing))

There are 19 columns with more than 75% missing values.


In [13]:
train_fs = train_red.drop(columns=all_missing)
test_fs  = test_red.drop(columns=all_missing)

print("Training set full shape after missing filter:", train_fs.shape)
print("Testing set full shape after missing filter :", test_fs.shape)

Training set full shape after missing filter: (307511, 752)
Testing set full shape after missing filter : (48744, 751)


## Save “Top_Correlated” dataset

In [14]:
train_fs_path = os.path.join(TOPCORR_DIR, "final_train.csv")
test_fs_path  = os.path.join(TOPCORR_DIR, "final_test.csv")

train_fs.to_csv(train_fs_path, index=False)
test_fs.to_csv(test_fs_path, index=False)

train_fs_path, test_fs_path

('/Users/agarwalh/DSIP/Project/Top_Correlated_dataset/final_train.csv',
 '/Users/agarwalh/DSIP/Project/Top_Correlated_dataset/final_test.csv')

## Load Top_Correlated train for modeling

In [15]:
df = pd.read_csv(train_fs_path)
df.shape, df[TARGET_COL].mean()

((307511, 752), np.float64(0.08072881945686496))

## Build X, y and train/valid split

In [16]:
X = df.drop(columns=[ID_COL, TARGET_COL]).astype("float32")
y = df[TARGET_COL].values

feature_names = X.columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

X_train.shape, X_valid.shape, y_train.mean(), y_valid.mean()

((246008, 750),
 (61503, 750),
 np.float64(0.08072908198107379),
 np.float64(0.08072776937710356))

## Eval helper

In [17]:
def eval_model_probs(y_true, y_pred, label):
    auc = roc_auc_score(y_true, y_pred)
    print(f"{label} AUC: {auc:.6f}")
    return auc

# Baseline models on all features

In [18]:
# Random Forest (all)
rf_all = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    min_samples_leaf=20,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced_subsample",
)
rf_all.fit(X_train, y_train)
p_rf_valid_all = rf_all.predict_proba(X_valid)[:, 1]
auc_rf_all = eval_model_probs(y_valid, p_rf_valid_all, "RF (all)")

RF (all) AUC: 0.753699


In [19]:
# XGBoost (all)
dtrain_all = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dvalid_all = xgb.DMatrix(X_valid, label=y_valid, feature_names=feature_names)

params_xgb_all = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": "hist",
    "max_depth": 8,
    "eta": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

bst_all = xgb.train(
    params_xgb_all,
    dtrain_all,
    num_boost_round=500,
    evals=[(dtrain_all, "train"), (dvalid_all, "valid")],
    early_stopping_rounds=50,
    verbose_eval=50,
)

p_xgb_valid_all = bst_all.predict(
    dvalid_all, iteration_range=(0, bst_all.best_iteration + 1)
)
auc_xgb_all = eval_model_probs(y_valid, p_xgb_valid_all, "XGB (all)")

[0]	train-auc:0.74383	valid-auc:0.71468
[50]	train-auc:0.84739	valid-auc:0.76991
[100]	train-auc:0.88335	valid-auc:0.77766
[150]	train-auc:0.90498	valid-auc:0.78060
[200]	train-auc:0.92095	valid-auc:0.78180
[250]	train-auc:0.93259	valid-auc:0.78286
[300]	train-auc:0.94290	valid-auc:0.78367
[350]	train-auc:0.95177	valid-auc:0.78425
[400]	train-auc:0.95968	valid-auc:0.78451
[450]	train-auc:0.96628	valid-auc:0.78426
[463]	train-auc:0.96769	valid-auc:0.78435
XGB (all) AUC: 0.784619


In [20]:
# LightGBM (all)
train_lgb_all = lgb.Dataset(X_train.values, label=y_train)
valid_lgb_all = lgb.Dataset(X_valid.values, label=y_valid, reference=train_lgb_all)

params_lgb_all = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "max_depth": -1,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l2": 1.0,
    "verbose": -1,
}

lgb_all = lgb.train(
    params_lgb_all,
    train_lgb_all,
    num_boost_round=1000,
    valid_sets=[train_lgb_all, valid_lgb_all],
    valid_names=["train", "valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50),
    ],
)

p_lgb_valid_all = lgb_all.predict(
    X_valid.values, num_iteration=lgb_all.best_iteration
)
auc_lgb_all = eval_model_probs(y_valid, p_lgb_valid_all, "LGBM (all)")

Training until validation scores don't improve for 50 rounds
[50]	train's auc: 0.796863	valid's auc: 0.767077
[100]	train's auc: 0.828207	valid's auc: 0.777479
[150]	train's auc: 0.85111	valid's auc: 0.781018
[200]	train's auc: 0.870475	valid's auc: 0.781921
[250]	train's auc: 0.887275	valid's auc: 0.782447
[300]	train's auc: 0.901345	valid's auc: 0.782726
[350]	train's auc: 0.913571	valid's auc: 0.782883
[400]	train's auc: 0.92376	valid's auc: 0.783169
[450]	train's auc: 0.933453	valid's auc: 0.783187
Early stopping, best iteration is:
[401]	train's auc: 0.923934	valid's auc: 0.783245
LGBM (all) AUC: 0.783245


## Feature importance (RF, XGB, LGBM)

In [21]:
# Random Forest importance (for reference)
rf_importances = rf_all.feature_importances_
rf_imp_df = (
    pd.DataFrame({"feature": feature_names, "importance": rf_importances})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)
rf_imp_df.head(20)

Unnamed: 0,feature,importance
0,EXT_SOURCE_2,0.06535
1,EXT_SOURCE_3,0.065171
2,EXT_SOURCE_1,0.029941
3,DAYS_EMPLOYED,0.023576
4,client_installments_AMT_PAYMENT_min_sum,0.020425
5,bureau_DAYS_CREDIT_mean,0.017015
6,DAYS_BIRTH,0.015383
7,bureau_DAYS_CREDIT_max,0.012943
8,bureau_DAYS_CREDIT_ENDDATE_mean,0.009957
9,bureau_CREDIT_ACTIVE_Active_count_norm,0.009766


In [22]:
# XGB importance (gain)
xgb_score_dict = bst_all.get_score(importance_type="gain")
xgb_imp_df = (
    pd.DataFrame(
        {"feature": list(xgb_score_dict.keys()),
         "importance": list(xgb_score_dict.values())}
    )
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)
xgb_imp_df.head(20)

Unnamed: 0,feature,importance
0,EXT_SOURCE_3,51.365173
1,EXT_SOURCE_2,47.93821
2,NAME_EDUCATION_TYPE_Higher education,33.765339
3,CODE_GENDER_F,26.399105
4,NAME_EDUCATION_TYPE_Secondary / secondary special,24.791094
5,bureau_CREDIT_TYPE_Microloan_count,23.954027
6,FLAG_DOCUMENT_3,21.321989
7,previous_CNT_PAYMENT_max,19.972528
8,client_installments_AMT_PAYMENT_min_sum,19.891823
9,client_credit_CNT_DRAWINGS_CURRENT_mean_mean,18.974361


In [23]:
# LGBM importance (gain)
lgb_importances = lgb_all.feature_importance(importance_type="gain")
lgb_imp_df = (
    pd.DataFrame({"feature": feature_names, "importance": lgb_importances})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)
lgb_imp_df.head(20)

Unnamed: 0,feature,importance
0,EXT_SOURCE_3,50470.955359
1,EXT_SOURCE_2,44089.389776
2,EXT_SOURCE_1,15807.805108
3,client_installments_AMT_PAYMENT_min_sum,12130.788414
4,DAYS_EMPLOYED,10069.637312
5,DAYS_BIRTH,6876.491731
6,AMT_CREDIT,6110.482536
7,AMT_ANNUITY,5433.618271
8,bureau_DAYS_CREDIT_max,4912.523885
9,previous_NAME_CONTRACT_STATUS_Refused_mean,4010.030403


## Non‑zero importance, union + intersection

In [24]:
# Non-zero importance features
xgb_nonzero_feats = xgb_imp_df[xgb_imp_df["importance"] > 0]["feature"].tolist()
lgb_nonzero_feats = lgb_imp_df[lgb_imp_df["importance"] > 0]["feature"].tolist()

len(xgb_nonzero_feats), len(lgb_nonzero_feats)

(625, 573)

In [25]:
# Union of non-zero features
selected_union = sorted(set(xgb_nonzero_feats) | set(lgb_nonzero_feats))
# Intersection of non-zero features
selected_int = sorted(set(xgb_nonzero_feats) & set(lgb_nonzero_feats))

len(feature_names), len(selected_union), len(selected_int)

(750, 629, 569)

In [26]:
X_train_union = X_train[selected_union]
X_valid_union = X_valid[selected_union]

X_train_int = X_train[selected_int]
X_valid_int = X_valid[selected_int]

X_train_union.shape, X_train_int.shape

((246008, 629), (246008, 569))

## Models on union‑selected features (+ blend)

In [27]:
# RF (union)
rf_union = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    min_samples_leaf=20,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced_subsample",
)
rf_union.fit(X_train_union, y_train)
p_rf_valid_union = rf_union.predict_proba(X_valid_union)[:, 1]
auc_rf_union = eval_model_probs(y_valid, p_rf_valid_union, "RF (union)")

RF (union) AUC: 0.754971


In [28]:
# XGB (union)
dtrain_union = xgb.DMatrix(X_train_union, label=y_train, feature_names=selected_union)
dvalid_union = xgb.DMatrix(X_valid_union, label=y_valid, feature_names=selected_union)

bst_union = xgb.train(
    params_xgb_all,
    dtrain_union,
    num_boost_round=500,
    evals=[(dtrain_union, "train"), (dvalid_union, "valid")],
    early_stopping_rounds=50,
    verbose_eval=50,
)

p_xgb_valid_union = bst_union.predict(
    dvalid_union, iteration_range=(0, bst_union.best_iteration + 1)
)
auc_xgb_union = eval_model_probs(y_valid, p_xgb_valid_union, "XGB (union)")

[0]	train-auc:0.72725	valid-auc:0.69760
[50]	train-auc:0.84884	valid-auc:0.77028
[100]	train-auc:0.88400	valid-auc:0.77890
[150]	train-auc:0.90529	valid-auc:0.78231
[200]	train-auc:0.92077	valid-auc:0.78374
[250]	train-auc:0.93283	valid-auc:0.78454
[300]	train-auc:0.94339	valid-auc:0.78530
[350]	train-auc:0.95184	valid-auc:0.78557
[400]	train-auc:0.95918	valid-auc:0.78553
[405]	train-auc:0.95966	valid-auc:0.78562
XGB (union) AUC: 0.785690


In [29]:
# LGBM (union)
train_lgb_union = lgb.Dataset(X_train_union.values, label=y_train)
valid_lgb_union = lgb.Dataset(X_valid_union.values, label=y_valid, reference=train_lgb_union)

lgb_union = lgb.train(
    params_lgb_all,
    train_lgb_union,
    num_boost_round=1000,
    valid_sets=[train_lgb_union, valid_lgb_union],
    valid_names=["train", "valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50),
    ],
)

p_lgb_valid_union = lgb_union.predict(
    X_valid_union.values, num_iteration=lgb_union.best_iteration
)
auc_lgb_union = eval_model_probs(y_valid, p_lgb_valid_union, "LGBM (union)")

Training until validation scores don't improve for 50 rounds
[50]	train's auc: 0.79836	valid's auc: 0.766224
[100]	train's auc: 0.828808	valid's auc: 0.777416
[150]	train's auc: 0.851958	valid's auc: 0.781099
[200]	train's auc: 0.871977	valid's auc: 0.782018
[250]	train's auc: 0.888119	valid's auc: 0.782573
[300]	train's auc: 0.902047	valid's auc: 0.782883
[350]	train's auc: 0.914126	valid's auc: 0.782524
Early stopping, best iteration is:
[307]	train's auc: 0.903961	valid's auc: 0.782987
LGBM (union) AUC: 0.782987


In [30]:
# 50/50 blend (union)
p_blend_union = 0.5 * p_xgb_valid_union + 0.5 * p_lgb_valid_union
auc_blend_union = eval_model_probs(y_valid, p_blend_union, "Blend 50/50 (union)")

Blend 50/50 (union) AUC: 0.787099


## Models on intersection‑selected features (+ blend)

In [31]:
# XGB (intersection)
dtrain_int = xgb.DMatrix(X_train_int, label=y_train, feature_names=selected_int)
dvalid_int = xgb.DMatrix(X_valid_int, label=y_valid, feature_names=selected_int)

bst_int = xgb.train(
    params_xgb_all,
    dtrain_int,
    num_boost_round=500,
    evals=[(dtrain_int, "train"), (dvalid_int, "valid")],
    early_stopping_rounds=50,
    verbose_eval=50,
)

p_xgb_valid_int = bst_int.predict(
    dvalid_int, iteration_range=(0, bst_int.best_iteration + 1)
)
auc_xgb_int = eval_model_probs(y_valid, p_xgb_valid_int, "XGB (intersection)")

[0]	train-auc:0.74317	valid-auc:0.71633
[50]	train-auc:0.84815	valid-auc:0.76863
[100]	train-auc:0.88383	valid-auc:0.77745
[150]	train-auc:0.90487	valid-auc:0.78092
[200]	train-auc:0.92038	valid-auc:0.78214
[250]	train-auc:0.93361	valid-auc:0.78308
[300]	train-auc:0.94422	valid-auc:0.78368
[350]	train-auc:0.95293	valid-auc:0.78372
[372]	train-auc:0.95629	valid-auc:0.78354
XGB (intersection) AUC: 0.784082


In [32]:
# LGBM (intersection)
train_lgb_int = lgb.Dataset(X_train_int.values, label=y_train)
valid_lgb_int = lgb.Dataset(X_valid_int.values, label=y_valid, reference=train_lgb_int)

lgb_int = lgb.train(
    params_lgb_all,
    train_lgb_int,
    num_boost_round=1000,
    valid_sets=[train_lgb_int, valid_lgb_int],
    valid_names=["train", "valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50),
    ],
)

p_lgb_valid_int = lgb_int.predict(
    X_valid_int.values, num_iteration=lgb_int.best_iteration
)
auc_lgb_int = eval_model_probs(y_valid, p_lgb_valid_int, "LGBM (intersection)")

Training until validation scores don't improve for 50 rounds
[50]	train's auc: 0.797399	valid's auc: 0.767392
[100]	train's auc: 0.829226	valid's auc: 0.778171
[150]	train's auc: 0.851945	valid's auc: 0.782163
[200]	train's auc: 0.870454	valid's auc: 0.782897
[250]	train's auc: 0.887109	valid's auc: 0.783703
[300]	train's auc: 0.901231	valid's auc: 0.784188
[350]	train's auc: 0.9136	valid's auc: 0.78431
[400]	train's auc: 0.924068	valid's auc: 0.784622
Early stopping, best iteration is:
[397]	train's auc: 0.923425	valid's auc: 0.784666
LGBM (intersection) AUC: 0.784666


In [33]:
# 50/50 blend (intersection)
p_blend_int = 0.5 * p_xgb_valid_int + 0.5 * p_lgb_valid_int
auc_blend_int = eval_model_probs(y_valid, p_blend_int, "Blend 50/50 (intersection)")

Blend 50/50 (intersection) AUC: 0.787173


## Summary comparison

In [34]:
results = pd.DataFrame(
    {
        "model": [
            "RF (all)",
            "RF (union)",
            "XGB (all)",
            "XGB (union)",
            "XGB (intersection)",
            "LGBM (all)",
            "LGBM (union)",
            "LGBM (intersection)",
            "Blend 50/50 (all)",
            "Blend 50/50 (union)",
            "Blend 50/50 (intersection)",
        ],
        "AUC": [
            auc_rf_all,
            auc_rf_union,
            auc_xgb_all,
            auc_xgb_union,
            auc_xgb_int,
            auc_lgb_all,
            auc_lgb_union,
            auc_lgb_int,
            0.5 * auc_xgb_all + 0.5 * auc_lgb_all,  # not exact but left for ref
            auc_blend_union,
            auc_blend_int,
        ],
        "num_features": [
            X_train.shape[1],
            X_train_union.shape[1],
            X_train.shape[1],
            X_train_union.shape[1],
            X_train_int.shape[1],
            X_train.shape[1],
            X_train_union.shape[1],
            X_train_int.shape[1],
            X_train.shape[1],
            X_train_union.shape[1],
            X_train_int.shape[1],
        ],
    }
)

results.sort_values("AUC", ascending=False).reset_index(drop=True)

Unnamed: 0,model,AUC,num_features
0,Blend 50/50 (intersection),0.787173,569
1,Blend 50/50 (union),0.787099,629
2,XGB (union),0.78569,629
3,LGBM (intersection),0.784666,569
4,XGB (all),0.784619,750
5,XGB (intersection),0.784082,569
6,Blend 50/50 (all),0.783932,750
7,LGBM (all),0.783245,750
8,LGBM (union),0.782987,629
9,RF (union),0.754971,629


## Save validation labels and probs for evaluation

In [35]:
import numpy as np
import os

EVAL_DIR = os.path.join(BASE_PATH, "Preprocessing_FE_dataset")
os.makedirs(EVAL_DIR, exist_ok=True)

eval_save_path = os.path.join(EVAL_DIR, "eval_blend_int_valid.npz")

np.savez_compressed(
    eval_save_path,
    y_valid=y_valid,
    p_blend_int=p_blend_int,
)

eval_save_path

'/Users/agarwalh/DSIP/Project/DSIP_Project/Preprocessing_FE_dataset/eval_blend_int_valid.npz'

In [36]:
import os

# Rebuild X, y from df if needed
X_full = df.drop(columns=[ID_COL, TARGET_COL]).astype("float32")
y_full = df[TARGET_COL].values

# Use intersection features
feature_cols_int = sorted(selected_int)

X_full_int = X_full[feature_cols_int].values

X_full_int.shape, y_full.shape

((307511, 569), (307511,))