In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data_cleaned.csv')

In [3]:
#encoding
le = LabelEncoder()
df['Type_encoded'] = le.fit_transform(df['Type'])  # L->0, M->1, H->2 

In [4]:
df = df.drop(columns=['Type','Unnamed: 0','Failure Type'])

In [5]:
#splitting
X = df.drop('Target', axis=1)  
y = df['Target'] 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
#before handling imbalance
y_train.value_counts()

Target
0    7729
1     271
Name: count, dtype: int64

In [8]:
#SMOTE 
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
y_train_res.value_counts()

Target
0    7729
1    7729
Name: count, dtype: int64

Model Experimentation

In [9]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

lr = LogisticRegression(
    max_iter=1000,
    class_weight = 'balanced',
    random_state = 42
)

lr.fit(X_train_res, y_train_res)

y_pred = lr.predict(X_test)
y_proba = lr.predict_proba(X_test)[:,1]

print("=== Logistic Regression Metrics ===")
print(classification_report(y_test, y_pred))


cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC Score:", roc_auc)

=== Logistic Regression Metrics ===
              precision    recall  f1-score   support

           0       0.99      0.85      0.91      1932
           1       0.15      0.79      0.26        68

    accuracy                           0.84      2000
   macro avg       0.57      0.82      0.58      2000
weighted avg       0.96      0.84      0.89      2000

Confusion Matrix:
 [[1634  298]
 [  14   54]]
ROC-AUC Score: 0.894782913165266


In [10]:
X_train_res.columns = X_train_res.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_test.columns = X_test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

In [11]:
#xgboost
from xgboost import XGBClassifier

scale_pos_weight = (y_train_res==0).sum() / (y_train_res==1).sum()


xgb = XGBClassifier(
    n_estimators=200,
    scale_pos_weight=scale_pos_weight,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)


xgb.fit(X_train_res, y_train_res)


y_pred = xgb.predict(X_test)
y_proba = xgb.predict_proba(X_test)[:,1]


print("=== XGBoost Metrics ===")
print(classification_report(y_test, y_pred))


cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC Score:", roc_auc)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== XGBoost Metrics ===
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1932
           1       0.56      0.81      0.66        68

    accuracy                           0.97      2000
   macro avg       0.78      0.89      0.82      2000
weighted avg       0.98      0.97      0.97      2000

Confusion Matrix:
 [[1889   43]
 [  13   55]]
ROC-AUC Score: 0.9626720253318719


In [12]:
#RansomForestClassfier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced'  
)

rf.fit(X_train_res, y_train_res)

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:,1]

print("=== Random Forest Metrics ===")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC Score:", roc_auc)

=== Random Forest Metrics ===
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1932
           1       0.43      0.75      0.55        68

    accuracy                           0.96      2000
   macro avg       0.71      0.86      0.76      2000
weighted avg       0.97      0.96      0.96      2000

Confusion Matrix:
 [[1864   68]
 [  17   51]]
ROC-AUC Score: 0.9619489099987821


In [13]:
#DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

dt = DecisionTreeClassifier(
    max_depth=6,                
    min_samples_leaf=20,        
    class_weight='balanced',
    random_state=42
)

dt.fit(X_train_res, y_train_res)

y_pred = dt.predict(X_test)
y_proba = dt.predict_proba(X_test)[:,1]

print("=== Decision Tree Metrics ===")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


=== Decision Tree Metrics ===
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      1932
           1       0.31      0.85      0.45        68

    accuracy                           0.93      2000
   macro avg       0.65      0.89      0.71      2000
weighted avg       0.97      0.93      0.95      2000

Confusion Matrix:
 [[1803  129]
 [  10   58]]
ROC-AUC: 0.9459528376568019


individually they seems fine with average recall, so lets experiment with ensembling

# ensembling

In [14]:
# Xg + RF with softvoting

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

ensemble_soft = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('xgb', xgb)
    ],
    voting='soft'
)

ensemble_soft.fit(X_train_res, y_train_res)

y_pred = ensemble_soft.predict(X_test)
y_proba = ensemble_soft.predict_proba(X_test)[:, 1]

print("=== Soft Voting Ensemble Metrics ===")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Soft Voting Ensemble Metrics ===
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1932
           1       0.50      0.79      0.62        68

    accuracy                           0.97      2000
   macro avg       0.75      0.88      0.80      2000
weighted avg       0.98      0.97      0.97      2000

Confusion Matrix:
 [[1879   53]
 [  14   54]]
ROC-AUC: 0.9683503836317136


In [15]:
# Xg + RF with weighted classifier
ensemble_weighted = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('xgb', xgb)
    ],
    voting='soft',
    weights=[1, 2]  )

ensemble_weighted.fit(X_train_res, y_train_res)

y_pred = ensemble_weighted.predict(X_test)
y_proba = ensemble_weighted.predict_proba(X_test)[:, 1]

print("=== Weighted Ensemble Metrics ===")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Weighted Ensemble Metrics ===
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1932
           1       0.52      0.81      0.63        68

    accuracy                           0.97      2000
   macro avg       0.76      0.89      0.81      2000
weighted avg       0.98      0.97      0.97      2000

Confusion Matrix:
 [[1881   51]
 [  13   55]]
ROC-AUC: 0.9690354402630617


In [16]:
# XG + RF with stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

stacking_clf = StackingClassifier(
    estimators=[
        ('rf', rf),
        ('xgb', xgb)
    ],
    final_estimator=LogisticRegression(class_weight='balanced'),
    stack_method='predict_proba',
    n_jobs=-1
)

stacking_clf.fit(X_train_res, y_train_res)

y_pred = stacking_clf.predict(X_test)
y_proba = stacking_clf.predict_proba(X_test)[:, 1]

print("=== Stacking Ensemble Metrics ===")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


=== Stacking Ensemble Metrics ===
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1932
           1       0.54      0.76      0.63        68

    accuracy                           0.97      2000
   macro avg       0.76      0.87      0.81      2000
weighted avg       0.98      0.97      0.97      2000

Confusion Matrix:
 [[1887   45]
 [  16   52]]
ROC-AUC: 0.9689136524174886


weighted classifying method gives better results

In [17]:
# XG + RF + DT with weighted voting
ensemble_final = VotingClassifier(
    estimators=[
        ('dt', dt),
        ('rf', rf),
        ('xgb', xgb)
    ],
    voting='soft',
    weights=[1, 2, 3]
)

ensemble_final.fit(X_train_res, y_train_res)

y_pred = ensemble_final.predict(X_test)
y_proba = ensemble_final.predict_proba(X_test)[:,1]

print("=== Final Ensemble (DT + RF + XGB) Metrics ===")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== Final Ensemble (DT + RF + XGB) Metrics ===
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1932
           1       0.51      0.81      0.62        68

    accuracy                           0.97      2000
   macro avg       0.75      0.89      0.80      2000
weighted avg       0.98      0.97      0.97      2000

Confusion Matrix:
 [[1879   53]
 [  13   55]]
ROC-AUC: 0.9682971014492754


3 models does not seem any better. Let's experiment with DT, RF, XG 2 at a time

#experimenting with multiple thresholds on the weihted ensemble model (XG + RF)

In [18]:
y_proba = ensemble_weighted.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC (Final Model):", roc_auc)

#multiple thresholds
thresholds = [0.25, 0.30, 0.35, 0.40, 0.50]

for t in thresholds:
    y_pred_t = (y_proba >= t).astype(int)

    print("\n" + "="*50)
    print(f"Threshold = {t}")
    print("="*50)

    print(classification_report(y_test, y_pred_t))

    cm = confusion_matrix(y_test, y_pred_t)
    print("Confusion Matrix:\n", cm)

ROC-AUC (Final Model): 0.9690354402630617

Threshold = 0.25
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1932
           1       0.43      0.84      0.56        68

    accuracy                           0.96      2000
   macro avg       0.71      0.90      0.77      2000
weighted avg       0.97      0.96      0.96      2000

Confusion Matrix:
 [[1855   77]
 [  11   57]]

Threshold = 0.3
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1932
           1       0.45      0.84      0.59        68

    accuracy                           0.96      2000
   macro avg       0.72      0.90      0.78      2000
weighted avg       0.98      0.96      0.97      2000

Confusion Matrix:
 [[1863   69]
 [  11   57]]

Threshold = 0.35
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1932
           1       0.48      0.84      0.61       

default is 0.5 taken and the performance seems a bit better at 0.35 . i'd like not to go lower than 0.35 tho

#XG + RF with ADASYN sampling at 0.5 nd 0.35

In [20]:
from imblearn.over_sampling import ADASYN

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
X_train.columns = X_train.columns.str.replace('[\\[\\]<]', '', regex=True)
X_test.columns = X_test.columns.str.replace('[\\[\\]<]', '', regex=True)

adasyn = ADASYN(random_state=42)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)

print("After ADASYN, class counts:", np.bincount(y_train_res))
# RF
rf_best = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

# XG
scale_pos_weight = (y_train_res == 0).sum() / (y_train_res == 1).sum()
xgb_best = XGBClassifier(
    n_estimators=200,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

#Ensemble
ensemble_best = VotingClassifier(
    estimators=[
        ('rf', rf_best),
        ('xgb', xgb_best)
    ],
    voting='soft',
    weights=[2, 3]
)


ensemble_best.fit(X_train_res, y_train_res)

y_proba = ensemble_best.predict_proba(X_test)[:, 1]
#0.5
y_pred_default = (y_proba >= 0.5).astype(int)
print("\n=== Weighted Ensemble with ADASYN, Threshold 0.5 ===")
print(classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

#0.35
threshold = 0.35
y_pred_thresh = (y_proba >= threshold).astype(int)
print("\n=== Weighted Ensemble with ADASYN, Threshold 0.35 ===")
print(classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


After ADASYN, class counts: [7729 7671]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Weighted Ensemble with ADASYN, Threshold 0.5 ===
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1932
           1       0.52      0.81      0.64        68

    accuracy                           0.97      2000
   macro avg       0.76      0.89      0.81      2000
weighted avg       0.98      0.97      0.97      2000

Confusion Matrix:
 [[1882   50]
 [  13   55]]
ROC-AUC: 0.9692409572524663

=== Weighted Ensemble with ADASYN, Threshold 0.35 ===
              precision    recall  f1-score   support

           0       0.99      0.96      0.98      1932
           1       0.43      0.82      0.57        68

    accuracy                           0.96      2000
   macro avg       0.71      0.89      0.77      2000
weighted avg       0.97      0.96      0.96      2000

Confusion Matrix:
 [[1858   74]
 [  12   56]]
ROC-AUC: 0.9692409572524663


#let's try XG + DT with SMOTE and ADASYn at 0.5 nd 0.35

In [21]:
adasyn = ADASYN(random_state=42)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)


dt = DecisionTreeClassifier(
    max_depth=5,  
    random_state=42
)

scale_pos_weight = (y_train_res == 0).sum() / (y_train_res == 1).sum()
xgb = XGBClassifier(
    n_estimators=200,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42
)

ensemble_dt_xgb = VotingClassifier(
    estimators=[('dt', dt), ('xgb', xgb)],
    voting='soft',
    weights=[3, 2]
)

ensemble_dt_xgb.fit(X_train_res, y_train_res)

y_proba = ensemble_dt_xgb.predict_proba(X_test)[:,1]


y_pred_default = (y_proba >= 0.5).astype(int)
print("=== DT+XGB Ensemble, Threshold 0.5 ===")
print(classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


threshold = 0.35
y_pred_thresh = (y_proba >= threshold).astype(int)
print("\n=== DT+XGB Ensemble, Threshold 0.35 ===")
print(classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


=== DT+XGB Ensemble, Threshold 0.5 ===
              precision    recall  f1-score   support

           0       0.99      0.94      0.97      1932
           1       0.34      0.85      0.48        68

    accuracy                           0.94      2000
   macro avg       0.66      0.90      0.72      2000
weighted avg       0.97      0.94      0.95      2000

Confusion Matrix:
 [[1817  115]
 [  10   58]]
ROC-AUC: 0.9359624284496407

=== DT+XGB Ensemble, Threshold 0.35 ===
              precision    recall  f1-score   support

           0       1.00      0.91      0.95      1932
           1       0.26      0.88      0.41        68

    accuracy                           0.91      2000
   macro avg       0.63      0.90      0.68      2000
weighted avg       0.97      0.91      0.93      2000

Confusion Matrix:
 [[1764  168]
 [   8   60]]
ROC-AUC: 0.9359624284496407


In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
X_train.columns = X_train.columns.str.replace('[\\[\\]<]', '', regex=True)
X_test.columns = X_test.columns.str.replace('[\\[\\]<]', '', regex=True)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("After SMOTE, class counts:", np.bincount(y_train_res))

#DT
dt = DecisionTreeClassifier(max_depth=5, random_state=42)

# XG
scale_pos_weight = (y_train_res == 0).sum() / (y_train_res == 1).sum()
xgb = XGBClassifier(
    n_estimators=200,
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    random_state=42
)


ensemble_dt_xgb = VotingClassifier(
    estimators=[('dt', dt), ('xgb', xgb)],
    voting='soft',
    weights=[3, 2]
)


ensemble_dt_xgb.fit(X_train_res, y_train_res)

y_proba = ensemble_dt_xgb.predict_proba(X_test)[:, 1]


y_pred_default = (y_proba >= 0.5).astype(int)
print("\n=== DT+XGB Ensemble with SMOTE, Threshold 0.5 ===")
print(classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


threshold = 0.35
y_pred_thresh = (y_proba >= threshold).astype(int)
print("\n=== DT+XGB Ensemble with SMOTE, Threshold 0.35 ===")
print(classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


After SMOTE, class counts: [7729 7729]

=== DT+XGB Ensemble with SMOTE, Threshold 0.5 ===
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      1932
           1       0.37      0.85      0.52        68

    accuracy                           0.95      2000
   macro avg       0.68      0.90      0.74      2000
weighted avg       0.97      0.95      0.96      2000

Confusion Matrix:
 [[1833   99]
 [  10   58]]
ROC-AUC: 0.9550222262818171

=== DT+XGB Ensemble with SMOTE, Threshold 0.35 ===
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      1932
           1       0.29      0.88      0.43        68

    accuracy                           0.92      2000
   macro avg       0.64      0.90      0.69      2000
weighted avg       0.97      0.92      0.94      2000

Confusion Matrix:
 [[1782  150]
 [   8   60]]
ROC-AUC: 0.9550222262818171


#let's try RF + DT with SMOTE and ADASYN at 0.5 and 0.35

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
X_train.columns = X_train.columns.str.replace('[\\[\\]<]', '', regex=True)
X_test.columns = X_test.columns.str.replace('[\\[\\]<]', '', regex=True)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("After SMOTE, class counts:", np.bincount(y_train_res))

#DT
dt = DecisionTreeClassifier(max_depth=5, random_state=42)

#RF
rf = RandomForestClassifier(n_estimators=200, random_state=42)

ensemble_dt_rf = VotingClassifier(
    estimators=[('dt', dt), ('rf', rf)],
    voting='soft',
    weights=[3, 2]
)

ensemble_dt_rf.fit(X_train_res, y_train_res)

y_proba = ensemble_dt_rf.predict_proba(X_test)[:, 1]

y_pred_default = (y_proba >= 0.5).astype(int)
print("\n=== DT+RF Ensemble with SMOTE, Threshold 0.5 ===")
print(classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

threshold = 0.35
y_pred_thresh = (y_proba >= threshold).astype(int)
print("\n=== DT+RF Ensemble with SMOTE, Threshold 0.35 ===")
print(classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


After SMOTE, class counts: [7729 7729]

=== DT+RF Ensemble with SMOTE, Threshold 0.5 ===
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      1932
           1       0.30      0.85      0.44        68

    accuracy                           0.93      2000
   macro avg       0.65      0.89      0.70      2000
weighted avg       0.97      0.93      0.94      2000

Confusion Matrix:
 [[1797  135]
 [  10   58]]
ROC-AUC: 0.9563352514919011

=== DT+RF Ensemble with SMOTE, Threshold 0.35 ===
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      1932
           1       0.28      0.88      0.43        68

    accuracy                           0.92      2000
   macro avg       0.64      0.90      0.69      2000
weighted avg       0.97      0.92      0.94      2000

Confusion Matrix:
 [[1778  154]
 [   8   60]]
ROC-AUC: 0.9563352514919011


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train.columns = X_train.columns.astype(str).str.replace('[\\[\\]<]', '', regex=True)
X_test.columns = X_test.columns.astype(str).str.replace('[\\[\\]<]', '', regex=True)

adasyn = ADASYN(random_state=42)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)
print("After ADASYN, class counts:", np.bincount(y_train_res))

dt = DecisionTreeClassifier(max_depth=5,  random_state=42)
rf = RandomForestClassifier(n_estimators=200,  random_state=42)

ensemble = VotingClassifier(
    estimators=[('dt', dt), ('rf', rf)],
    voting='soft',
    weights=[4, 2]  )

ensemble.fit(X_train_res, y_train_res)

y_proba = ensemble.predict_proba(X_test)[:,1]


y_pred_default = (y_proba >= 0.5).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.5 ===")
print(classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


threshold = 0.35
y_pred_thresh = (y_proba >= threshold).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.35 ===")
print(classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


threshold = 0.3
y_pred_thresh2 = (y_proba >= threshold).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.3 ===")
print(classification_report(y_test, y_pred_thresh2))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh2))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


After ADASYN, class counts: [7729 7671]

=== DT+RF Ensemble with ADASYN, Threshold 0.5 ===
              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1932
           1       0.27      0.87      0.41        68

    accuracy                           0.92      2000
   macro avg       0.63      0.89      0.68      2000
weighted avg       0.97      0.92      0.94      2000

Confusion Matrix:
 [[1773  159]
 [   9   59]]
ROC-AUC: 0.957770064547558

=== DT+RF Ensemble with ADASYN, Threshold 0.35 ===
              precision    recall  f1-score   support

           0       0.99      0.91      0.95      1932
           1       0.26      0.87      0.40        68

    accuracy                           0.91      2000
   macro avg       0.63      0.89      0.68      2000
weighted avg       0.97      0.91      0.93      2000

Confusion Matrix:
 [[1764  168]
 [   9   59]]
ROC-AUC: 0.957770064547558

=== DT+RF Ensemble with ADASYN, Threshold 0.3 ===
      

#getting better. Let's try playing with model weights

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train.columns = X_train.columns.astype(str).str.replace('[\\[\\]<]', '', regex=True)
X_test.columns = X_test.columns.astype(str).str.replace('[\\[\\]<]', '', regex=True)

adasyn = ADASYN(random_state=42)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)
print("After ADASYN, class counts:", np.bincount(y_train_res))

dt = DecisionTreeClassifier(max_depth=5, random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)

ensemble = VotingClassifier(
    estimators=[('dt', dt), ('rf', rf)],
    voting='soft',
    weights=[5, 2]  
)
ensemble.fit(X_train_res, y_train_res)


y_proba = ensemble.predict_proba(X_test)[:,1]


y_pred_default = (y_proba >= 0.5).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.5 ===")
print(classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

threshold = 0.35
y_pred_thresh = (y_proba >= threshold).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.35 ===")
print(classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

threshold = 0.3
y_pred_thresh2 = (y_proba >= threshold).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.3 ===")
print(classification_report(y_test, y_pred_thresh2))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh2))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


After ADASYN, class counts: [7729 7671]

=== DT+RF Ensemble with ADASYN, Threshold 0.5 ===
              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1932
           1       0.27      0.87      0.41        68

    accuracy                           0.92      2000
   macro avg       0.63      0.89      0.68      2000
weighted avg       0.97      0.92      0.94      2000

Confusion Matrix:
 [[1771  161]
 [   9   59]]
ROC-AUC: 0.9560269760077944

=== DT+RF Ensemble with ADASYN, Threshold 0.35 ===
              precision    recall  f1-score   support

           0       0.99      0.91      0.95      1932
           1       0.26      0.87      0.40        68

    accuracy                           0.91      2000
   macro avg       0.63      0.89      0.68      2000
weighted avg       0.97      0.91      0.93      2000

Confusion Matrix:
 [[1764  168]
 [   9   59]]
ROC-AUC: 0.9560269760077944

=== DT+RF Ensemble with ADASYN, Threshold 0.3 ===
    

In [27]:
#almost the same , let's do class weights too

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_train.columns = X_train.columns.astype(str).str.replace('[\\[\\]<]', '', regex=True)
X_test.columns = X_test.columns.astype(str).str.replace('[\\[\\]<]', '', regex=True)

adasyn = ADASYN(random_state=42)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)
print("After ADASYN, class counts:", np.bincount(y_train_res))

dt = DecisionTreeClassifier(max_depth=5, class_weight={0:1, 1:10}, random_state=42)
rf = RandomForestClassifier(n_estimators=200, class_weight={0:1, 1:10}, random_state=42)

ensemble = VotingClassifier(
    estimators=[('dt', dt), ('rf', rf)],
    voting='soft',
    weights=[5, 2]  
)
ensemble.fit(X_train_res, y_train_res)

y_proba = ensemble.predict_proba(X_test)[:,1]

y_pred_default = (y_proba >= 0.5).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.5 ===")
print(classification_report(y_test, y_pred_default))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_default))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


threshold = 0.35
y_pred_thresh = (y_proba >= threshold).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.35 ===")
print(classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


threshold = 0.3
y_pred_thresh2 = (y_proba >= threshold).astype(int)
print("\n=== DT+RF Ensemble with ADASYN, Threshold 0.3 ===")
print(classification_report(y_test, y_pred_thresh2))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_thresh2))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


After ADASYN, class counts: [7729 7671]

=== DT+RF Ensemble with ADASYN, Threshold 0.5 ===
              precision    recall  f1-score   support

           0       1.00      0.84      0.91      1932
           1       0.17      0.93      0.29        68

    accuracy                           0.85      2000
   macro avg       0.58      0.88      0.60      2000
weighted avg       0.97      0.85      0.89      2000

Confusion Matrix:
 [[1629  303]
 [   5   63]]
ROC-AUC: 0.9436502862014372

=== DT+RF Ensemble with ADASYN, Threshold 0.35 ===
              precision    recall  f1-score   support

           0       1.00      0.76      0.86      1932
           1       0.12      0.93      0.21        68

    accuracy                           0.76      2000
   macro avg       0.56      0.84      0.54      2000
weighted avg       0.97      0.76      0.84      2000

Confusion Matrix:
 [[1463  469]
 [   5   63]]
ROC-AUC: 0.9436502862014372

=== DT+RF Ensemble with ADASYN, Threshold 0.3 ===
    

This is much better. Recall 0.93 at 0.5, 0.93 at 0.35, 0.96 at 0.3 threshold

In [29]:
import joblib

saved_models = {
    'ensemble_model': ensemble,  
    'thresholds': {
        '0.5': 0.5,
        '0.35': 0.35,
        '0.3': 0.3
    }
}

joblib.dump({'model': ensemble, 'threshold': 0.5}, 'ensemble_thresh_0.5.pkl')
joblib.dump({'model': ensemble, 'threshold': 0.35}, 'ensemble_thresh_0.35.pkl')
joblib.dump({'model': ensemble, 'threshold': 0.3}, 'ensemble_thresh_0.3.pkl')


['ensemble_thresh_0.3.pkl']