In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv('updatedcreditcard.csv')
df.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,street,city,state,zip,lat,...,merch_lat,merch_long,is_fraud,age,year,month,hour,day_of_week,age_group,distance_km
0,2703190000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,...,36.011293,-82.048315,0,30,2019,1,0,1,25-34,78.597568
1,630423000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,...,49.159047,-118.186462,0,40,2019,1,0,1,35-44,30.212176
2,38859500000000.0,fraud_Lind-Buckridge,entertainment,220.11,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,...,43.150704,-112.154481,0,56,2019,1,0,1,55-64,108.206083
3,3534090000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,...,47.034331,-112.561071,0,51,2019,1,0,1,45-54,95.673231
4,375534000000000.0,fraud_Keeling-Crist,misc_pos,41.96,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,...,38.674999,-78.632459,0,32,2019,1,0,1,25-34,77.556744


In [2]:
# -----------------------------------------------------------
# 1) IMPORTS & SETUP
# -----------------------------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn / ML
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# XGBoost (CPU-based)
import xgboost as xgb

# RandomForest
from sklearn.ensemble import RandomForestClassifier

# Advanced encoding for high-cardinality or single-col cat
from category_encoders import TargetEncoder

# For synthetic sampling (SMOTE)
from imblearn.over_sampling import SMOTE

# Optional progress bar
from tqdm import tqdm

sns.set_style("whitegrid")

# -----------------------------------------------------------
# 2) CREATE A COPY OF DF -> comp_df AND SPLIT INTO TRAIN/VAL/TEST
# -----------------------------------------------------------
"""
We assume your original DataFrame is named 'df' and has columns:
   'category','amt','gender','city','state','city_pop','job',
   'is_fraud','age','year','month','hour','day_of_week','distance_km',
   etc. (adjust as needed).

We will:
   1) Make a copy: comp_df = df.copy()
   2) Use comp_df in all subsequent transformations and modeling.
"""

# 2.1) Copy the original DataFrame
comp_df = df.copy()

# 2.2) Define the columns we actually need
useful_cols = [
    "category", "amt", "gender", "city", "state", "city_pop", "job",
    "is_fraud", "age", "year", "month", "hour", "day_of_week", 
    "distance_km"
]

# 2.3) Clean: drop rows with missing data (if any)
comp_df.dropna(subset=useful_cols, inplace=True)

# 2.4) X / y
X = comp_df.drop(columns=["is_fraud"])
y = comp_df["is_fraud"].astype(int)

print("Initial shape of X:", X.shape)
print("Initial shape of y:", y.shape)
print("Class distribution:\n", y.value_counts(normalize=True)*100, "%")

# Split: 80% Train / 20% Temp
X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# From the 20% Temp, split 50/50 => 10% Validation, 10% Test (of original data)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp
)

print("\n=== FINAL SPLITS ===")
print("Train shape:", X_train_temp.shape, y_train_temp.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("Test shape:", X_test.shape, y_test.shape)

print("\nTrain class distribution:\n", y_train_temp.value_counts(normalize=True)*100, "%")
print("Val class distribution:\n", y_val.value_counts(normalize=True)*100, "%")
print("Test class distribution:\n", y_test.value_counts(normalize=True)*100, "%")

# -----------------------------------------------------------
# 3) COLUMN GROUPS FOR ENCODING
# -----------------------------------------------------------
numeric_features = [
    "amt", "distance_km", "age", "city_pop", 
    "year", "month", "hour", "day_of_week"
]
cat_small = ["gender"]                         # one-hot
cat_high  = ["city","state","job","category"]  # target-encode

# -----------------------------------------------------------
# 4) COLUMNTRANSFORMER (SCALING + ENCODING)
# -----------------------------------------------------------
numeric_transformer = Pipeline([
    ("scaler", StandardScaler())
])

ohe_transformer = Pipeline([
    ("ohe", OneHotEncoder(drop='first', handle_unknown='ignore'))
])

target_encoder = Pipeline([
    ("target_enc", TargetEncoder(smoothing=0.3))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("ohe", ohe_transformer, cat_small),
        ("te",  target_encoder,  cat_high),
    ],
    remainder="drop"
)

# -----------------------------------------------------------
# 5) FIT PREPROCESSOR & APPLY SMOTE (TRAIN ONLY)
# -----------------------------------------------------------
X_train_enc = preprocessor.fit_transform(X_train_temp, y_train_temp)
print("\n[INFO] Shape of X_train_enc BEFORE SMOTE:", X_train_enc.shape)
print("Class distribution in y_train_temp BEFORE SMOTE:", np.bincount(y_train_temp))

sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train_enc, y_train_temp)

print("\n[INFO] Shape of X_train_enc AFTER SMOTE:", X_train_sm.shape)
print("Class distribution in y_train_sm AFTER SMOTE:", np.bincount(y_train_sm))

# Encode validation & test sets (do NOT refit)
X_val_enc  = preprocessor.transform(X_val)
X_test_enc = preprocessor.transform(X_test)

# Build final feature name list
ohe_step = preprocessor.named_transformers_['ohe'].named_steps['ohe']
ohe_feature_names = ohe_step.get_feature_names_out(cat_small)
te_cols = [f"{col}_te" for col in cat_high]
final_feature_names = numeric_features + list(ohe_feature_names) + te_cols

print("\n[DEBUG] Final Encoded Feature List (Train/Val/Test share):")
print(final_feature_names)

Initial shape of X: (1048575, 24)
Initial shape of y: (1048575,)
Class distribution:
 is_fraud
0    99.427223
1     0.572777
Name: proportion, dtype: float64 %

=== FINAL SPLITS ===
Train shape: (838860, 24) (838860,)
Validation shape: (104857, 24) (104857,)
Test shape: (104858, 24) (104858,)

Train class distribution:
 is_fraud
0    99.427199
1     0.572801
Name: proportion, dtype: float64 %
Val class distribution:
 is_fraud
0    99.427792
1     0.572208
Name: proportion, dtype: float64 %
Test class distribution:
 is_fraud
0    99.426844
1     0.573156
Name: proportion, dtype: float64 %

[INFO] Shape of X_train_enc BEFORE SMOTE: (838860, 13)
Class distribution in y_train_temp BEFORE SMOTE: [834055   4805]

[INFO] Shape of X_train_enc AFTER SMOTE: (1668110, 13)
Class distribution in y_train_sm AFTER SMOTE: [834055 834055]

[DEBUG] Final Encoded Feature List (Train/Val/Test share):
['amt', 'distance_km', 'age', 'city_pop', 'year', 'month', 'hour', 'day_of_week', 'gender_M', 'city_te', '

In [3]:
# ===========================================================
# PART B: RANDOM FOREST (for comparison)
# ===========================================================
print("\n================= RANDOM FOREST PIPELINE =================\n")

# -----------------------------------------------------------
# B.1) RFE USING RANDOMFOREST ON TRAIN
# -----------------------------------------------------------
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1  # parallel
)

rf_rfe = RFE(
    estimator=rf_clf,
    n_features_to_select=10,
    step=1,
    verbose=1
)

print("\n[DEBUG] Starting RFE fit (RandomForest) on TRAIN ONLY... (this may take a while)")
rf_rfe.fit(X_train_sm, y_train_sm)

rf_support_mask = rf_rfe.support_
rf_ranking = rf_rfe.ranking_

print("\n[DEBUG] RandomForest RFE Feature Ranking Results:")
for i, feat_name in enumerate(final_feature_names):
    print(f"{i}: {feat_name} | Support={rf_support_mask[i]}, Rank={rf_ranking[i]}")

rf_selected_features = [f for f, s in zip(final_feature_names, rf_support_mask) if s]
print("\n=== SELECTED FEATURES by RandomForest RFE ===")
for feat in rf_selected_features:
    print("   -", feat)

X_train_sel_rf = rf_rfe.transform(X_train_sm)
X_val_sel_rf   = rf_rfe.transform(X_val_enc)
X_test_sel_rf  = rf_rfe.transform(X_test_enc)

# -----------------------------------------------------------
# B.2) TRAIN FINAL RANDOMFOREST ON SELECTED FEATURES (TRAIN ONLY)
# -----------------------------------------------------------
final_rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

print("\n[DEBUG] Training final RandomForest on selected features (TRAIN ONLY)...")
final_rf.fit(X_train_sel_rf, y_train_sm)

# -----------------------------------------------------------
# B.3) EVALUATE ON TRAIN SET FIRST (OVERFITTING CHECK)
# -----------------------------------------------------------
rf_train_pred = final_rf.predict(X_train_sel_rf)
rf_train_acc  = accuracy_score(y_train_sm, rf_train_pred)

print("\n=== [RandomForest] MODEL PERFORMANCE ON TRAIN SET (WITH SMOTE) ===")
print("Accuracy (Train):", rf_train_acc)
print("Classification Report (Train):\n", classification_report(y_train_sm, rf_train_pred))
print("Confusion Matrix (Train):\n", confusion_matrix(y_train_sm, rf_train_pred))

# -----------------------------------------------------------
# B.4) EVALUATE ON VALIDATION SET
# -----------------------------------------------------------
rf_val_pred = final_rf.predict(X_val_sel_rf)
rf_val_acc = accuracy_score(y_val, rf_val_pred)

print("\n=== [RandomForest] MODEL PERFORMANCE ON VALIDATION SET ===")
print("Accuracy (Val):", rf_val_acc)
print("Classification Report (Val):\n", classification_report(y_val, rf_val_pred))
print("Confusion Matrix (Val):\n", confusion_matrix(y_val, rf_val_pred))

# Decide if we want to proceed to the test set:
RF_VAL_THRESHOLD = 0.99  # Example threshold. Adjust as needed.

if rf_val_acc < RF_VAL_THRESHOLD:
    print(f"\n[WARNING] [RandomForest] Validation Accuracy {rf_val_acc:.4f} < {RF_VAL_THRESHOLD}.")
    print("We are NOT satisfied. Consider tuning hyperparameters, features, etc.")
    print("Skipping test evaluation for RandomForest.\n")
else:
    # -----------------------------------------------------------
    # B.5) EVALUATE ON TEST SET IF VALIDATION IS GOOD
    # -----------------------------------------------------------
    rf_test_pred = final_rf.predict(X_test_sel_rf)
    rf_test_acc = accuracy_score(y_test, rf_test_pred)
    print("\n=== [RandomForest] MODEL PERFORMANCE ON TEST SET ===")
    print("Accuracy (Test):", rf_test_acc)
    print("Classification Report (Test):\n", classification_report(y_test, rf_test_pred))
    print("Confusion Matrix (Test):\n", confusion_matrix(y_test, rf_test_pred))






[DEBUG] Starting RFE fit (RandomForest) on TRAIN ONLY... (this may take a while)
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.

[DEBUG] RandomForest RFE Feature Ranking Results:
0: amt | Support=True, Rank=1
1: distance_km | Support=False, Rank=3
2: age | Support=True, Rank=1
3: city_pop | Support=True, Rank=1
4: year | Support=False, Rank=4
5: month | Support=True, Rank=1
6: hour | Support=True, Rank=1
7: day_of_week | Support=True, Rank=1
8: gender_M | Support=False, Rank=2
9: city_te | Support=True, Rank=1
10: state_te | Support=True, Rank=1
11: job_te | Support=True, Rank=1
12: category_te | Support=True, Rank=1

=== SELECTED FEATURES by RandomForest RFE ===
   - amt
   - age
   - city_pop
   - month
   - hour
   - day_of_week
   - city_te
   - state_te
   - job_te
   - category_te

[DEBUG] Training final RandomForest on selected features (TRAIN ONLY)...

=== [RandomForest] MODEL PERFORMANCE ON TRAIN SET (WITH SMOTE) 

IMPROVED MODEL 

In [4]:
# ===========================================================
# PART B: RANDOM FOREST (for comparison) WITH GRIDSEARCHCV
# ===========================================================
print("\n================= RANDOM FOREST PIPELINE WITH GRIDSEARCHCV =================\n")

# -----------------------------------------------------------
# B.1) IMPORT NECESSARY MODULES
# -----------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# -----------------------------------------------------------
# B.2) RFE USING RANDOMFOREST ON TRAIN
# -----------------------------------------------------------
# Initialize the RandomForestClassifier for RFE
rf_clf = RandomForestClassifier(
    n_estimators=100,       # Number of trees in the forest
    random_state=42,        # Ensures reproducibility
    n_jobs=-1                # Utilize all available CPU cores for parallel processing
)

# Initialize RFE with the RandomForestClassifier
rf_rfe = RFE(
    estimator=rf_clf,
    n_features_to_select=10,  # Number of top features to select
    step=1,                    # Number of features to remove at each iteration
    verbose=1                  # Controls the verbosity: the higher, the more messages
)

print("\n[DEBUG] Starting RFE fit (RandomForest) on TRAIN ONLY... (this may take a while)")
rf_rfe.fit(X_train_sm, y_train_sm)

# Retrieve the support mask and ranking of features
rf_support_mask = rf_rfe.support_
rf_ranking = rf_rfe.ranking_

print("\n[DEBUG] RandomForest RFE Feature Ranking Results:")
for i, feat_name in enumerate(final_feature_names):
    print(f"{i}: {feat_name} | Support={rf_support_mask[i]}, Rank={rf_ranking[i]}")

# Extract the names of selected features
rf_selected_features = [f for f, s in zip(final_feature_names, rf_support_mask) if s]
print("\n=== SELECTED FEATURES by RandomForest RFE ===")
for feat in rf_selected_features:
    print("   -", feat)

# Transform the datasets to contain only the selected features
X_train_sel_rf = rf_rfe.transform(X_train_sm)
X_val_sel_rf   = rf_rfe.transform(X_val_enc)
X_test_sel_rf  = rf_rfe.transform(X_test_enc)

# -----------------------------------------------------------
# B.3) HYPERPARAMETER TUNING WITH GRIDSEARCHCV
# -----------------------------------------------------------
print("\n[DEBUG] Starting Grid Search for RandomForest...")

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 500],           # Number of trees in the forest
    'max_depth': [15, 30],           # Maximum depth of each tree
    'min_samples_split': [5, 10],           # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, ],             # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider when looking for the best split            s
}

# Initialize a new RandomForestClassifier for GridSearchCV
rf_clf_grid = RandomForestClassifier(
    random_state=42,  # Ensures reproducibility
    n_jobs=-1          # Utilize all available CPU cores
)

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(
    estimator=rf_clf_grid,
    param_grid=param_grid,
    scoring='accuracy',    # Evaluation metric to optimize; can be changed based on your objective
    cv=5,                  # 5-fold cross-validation
    verbose=2,             # Controls the verbosity: the higher, the more messages
    n_jobs=-1              # Utilize all available CPU cores
)

# Perform Grid Search on the selected training set
grid_search_rf.fit(X_train_sel_rf, y_train_sm)

# Best parameters and score from Grid Search
print("\n[INFO] Best Parameters from Grid Search:\n", grid_search_rf.best_params_)
print("\n[INFO] Best Cross-Validation Accuracy:\n", grid_search_rf.best_score_)

# -----------------------------------------------------------
# B.4) TRAIN FINAL RANDOMFOREST ON SELECTED FEATURES (TRAIN ONLY)
# -----------------------------------------------------------
print("\n[DEBUG] Training final RandomForest with best parameters on selected features (TRAIN ONLY)...")

# Retrieve the best estimator from Grid Search
best_rf = grid_search_rf.best_estimator_

# Train the best RandomForestClassifier on the full training data
best_rf.fit(X_train_sel_rf, y_train_sm)

# -----------------------------------------------------------
# B.5) EVALUATE ON TRAIN SET FIRST (OVERFITTING CHECK)
# -----------------------------------------------------------
rf_train_pred = best_rf.predict(X_train_sel_rf)
rf_train_acc  = accuracy_score(y_train_sm, rf_train_pred)

print("\n=== [RandomForest] MODEL PERFORMANCE ON TRAIN SET (WITH SMOTE) ===")
print("Accuracy (Train):", rf_train_acc)
print("Classification Report (Train):\n", classification_report(y_train_sm, rf_train_pred))
print("Confusion Matrix (Train):\n", confusion_matrix(y_train_sm, rf_train_pred))

# -----------------------------------------------------------
# B.6) EVALUATE ON VALIDATION SET
# -----------------------------------------------------------
rf_val_pred = best_rf.predict(X_val_sel_rf)
rf_val_acc = accuracy_score(y_val, rf_val_pred)

print("\n=== [RandomForest] MODEL PERFORMANCE ON VALIDATION SET ===")
print("Accuracy (Val):", rf_val_acc)
print("Classification Report (Val):\n", classification_report(y_val, rf_val_pred))
print("Confusion Matrix (Val):\n", confusion_matrix(y_val, rf_val_pred))

# Decide if we want to proceed to the test set:
RF_VAL_THRESHOLD = 0.99  # Example threshold. Adjust as needed.

if rf_val_acc < RF_VAL_THRESHOLD:
    print(f"\n[WARNING] [RandomForest] Validation Accuracy {rf_val_acc:.4f} < {RF_VAL_THRESHOLD}.")
    print("We are NOT satisfied. Consider tuning hyperparameters, features, etc.")
    print("Skipping test evaluation for RandomForest.\n")
else:
    # -----------------------------------------------------------
    # B.7) EVALUATE ON TEST SET IF VALIDATION IS GOOD
    # -----------------------------------------------------------
    rf_test_pred = best_rf.predict(X_test_sel_rf)
    rf_test_acc = accuracy_score(y_test, rf_test_pred)
    print("\n=== [RandomForest] MODEL PERFORMANCE ON TEST SET ===")
    print("Accuracy (Test):", rf_test_acc)
    print("Classification Report (Test):\n", classification_report(y_test, rf_test_pred))
    print("Confusion Matrix (Test):\n", confusion_matrix(y_test, rf_test_pred))





[DEBUG] Starting RFE fit (RandomForest) on TRAIN ONLY... (this may take a while)
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.

[DEBUG] RandomForest RFE Feature Ranking Results:
0: amt | Support=True, Rank=1
1: distance_km | Support=False, Rank=3
2: age | Support=True, Rank=1
3: city_pop | Support=True, Rank=1
4: year | Support=False, Rank=4
5: month | Support=True, Rank=1
6: hour | Support=True, Rank=1
7: day_of_week | Support=True, Rank=1
8: gender_M | Support=False, Rank=2
9: city_te | Support=True, Rank=1
10: state_te | Support=True, Rank=1
11: job_te | Support=True, Rank=1
12: category_te | Support=True, Rank=1

=== SELECTED FEATURES by RandomForest RFE ===
   - amt
   - age
   - city_pop
   - month
   - hour
   - day_of_week
   - city_te
   - state_te
   - job_te
   - category_te

[DEBUG] Starting Grid Search for RandomForest...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END max_depth=15, max

KeyboardInterrupt: 

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("\n================= RANDOM FOREST (RFE) =================\n")

# 1) Baseline RF for RFE
rf_clf_for_rfe = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# 2) RFE initialization: select top 10 features
rf_rfe = RFE(
    estimator=rf_clf_for_rfe,
    n_features_to_select=10,
    step=1,
    verbose=1
)

print("\n[DEBUG] Starting RFE fit (RandomForest) on TRAIN ONLY...")
rf_rfe.fit(X_train_sm, y_train_sm)

# 3) Inspect the feature rankings
rf_support_mask = rf_rfe.support_
rf_ranking = rf_rfe.ranking_

print("\n[DEBUG] RandomForest RFE Feature Ranking Results:")
for i, feat_name in enumerate(final_feature_names):
    print(f"{i}: {feat_name} | Support={rf_support_mask[i]}, Rank={rf_ranking[i]}")

# 4) Collect selected features
rf_selected_features = [f for f, s in zip(final_feature_names, rf_support_mask) if s]
print("\n=== SELECTED FEATURES by RandomForest RFE ===")
for feat in rf_selected_features:
    print("   -", feat)

# 5) Transform the datasets (train/val/test) to keep only selected features
X_train_sel_rf = rf_rfe.transform(X_train_sm)
X_val_sel_rf   = rf_rfe.transform(X_val_enc)
X_test_sel_rf  = rf_rfe.transform(X_test_enc)





[DEBUG] Starting RFE fit (RandomForest) on TRAIN ONLY...
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.

[DEBUG] RandomForest RFE Feature Ranking Results:
0: amt | Support=True, Rank=1
1: distance_km | Support=False, Rank=3
2: age | Support=True, Rank=1
3: city_pop | Support=True, Rank=1
4: year | Support=False, Rank=4
5: month | Support=True, Rank=1
6: hour | Support=True, Rank=1
7: day_of_week | Support=True, Rank=1
8: gender_M | Support=False, Rank=2
9: city_te | Support=True, Rank=1
10: state_te | Support=True, Rank=1
11: job_te | Support=True, Rank=1
12: category_te | Support=True, Rank=1

=== SELECTED FEATURES by RandomForest RFE ===
   - amt
   - age
   - city_pop
   - month
   - hour
   - day_of_week
   - city_te
   - state_te
   - job_te
   - category_te


In [None]:
from sklearn.model_selection import GridSearchCV
import time

print("\n================= GRID SEARCH FOR RANDOM FOREST =================\n")

# Define a small hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize the RF classifier (we'll rely on the param_grid to tune it)
rf_for_grid = RandomForestClassifier(random_state=42, n_jobs=-1)

# Start timer to measure GridSearch execution time
start_time = time.time()

# We will do 3-fold CV here for speed. Increase if you want more robust estimates.
grid_search = GridSearchCV(
    estimator=rf_for_grid,
    param_grid=param_grid,
    scoring='accuracy',      # or f1, roc_auc, etc.
    cv=3,                    # 3-fold cross-validation
    n_jobs=-1,               # parallelize
    verbose=3                # Enable verbosity to see progress
)

print("[DEBUG] Starting Grid Search on selected features (TRAIN ONLY)...")
grid_search.fit(X_train_sel_rf, y_train_sm)

# Measure the time taken for the grid search
end_time = time.time()
elapsed_time = end_time - start_time
print(f"\n[DEBUG] Grid Search Completed in {elapsed_time:.2f} seconds.")

print("\n[DEBUG] Best Grid Search Params:", grid_search.best_params_)
print("[DEBUG] Best Grid Search Score :", grid_search.best_score_)




[DEBUG] Starting Grid Search on selected features (TRAIN ONLY)...
Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV 2/3] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   0.1s
[CV 3/3] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=nan total time=   0.1s
[CV 2/3] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=nan total time=   0.1s
[CV 1/3] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=nan total time=   0.1s
[CV 1/3] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=nan total time=   0.1s
[CV 1/3] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=nan total time=   0.1s
[CV 3/3] END max_depth=None, max_features=auto, mi

In [7]:
best_rf = grid_search.best_estimator_

# Evaluate on TRAIN (SMOTE) to check for overfitting
rf_train_pred = best_rf.predict(X_train_sel_rf)
rf_train_acc  = accuracy_score(y_train_sm, rf_train_pred)

print("\n=== [Tuned RandomForest] MODEL PERFORMANCE ON TRAIN SET (WITH SMOTE) ===")
print("Accuracy (Train):", rf_train_acc)
print("Classification Report (Train):\n", classification_report(y_train_sm, rf_train_pred))
print("Confusion Matrix (Train):\n", confusion_matrix(y_train_sm, rf_train_pred))

# Evaluate on VALIDATION SET
rf_val_pred = best_rf.predict(X_val_sel_rf)
rf_val_acc  = accuracy_score(y_val, rf_val_pred)

print("\n=== [Tuned RandomForest] MODEL PERFORMANCE ON VALIDATION SET ===")
print("Accuracy (Val):", rf_val_acc)
print("Classification Report (Val):\n", classification_report(y_val, rf_val_pred))
print("Confusion Matrix (Val):\n", confusion_matrix(y_val, rf_val_pred))

# Decide if you want to check TEST set based on validation accuracy
RF_VAL_THRESHOLD = 0.90  # example threshold - adjust as needed
if rf_val_acc < RF_VAL_THRESHOLD:
    print(f"\n[WARNING] [RandomForest] Validation Accuracy {rf_val_acc:.4f} < {RF_VAL_THRESHOLD}.")
    print("We are NOT satisfied. Consider tuning again or adding more features.")
    print("Skipping test evaluation for RandomForest.\n")
else:
    # Evaluate on TEST SET
    rf_test_pred = best_rf.predict(X_test_sel_rf)
    rf_test_acc  = accuracy_score(y_test, rf_test_pred)
    print("\n=== [Tuned RandomForest] MODEL PERFORMANCE ON TEST SET ===")
    print("Accuracy (Test):", rf_test_acc)
    print("Classification Report (Test):\n", classification_report(y_test, rf_test_pred))
    print("Confusion Matrix (Test):\n", confusion_matrix(y_test, rf_test_pred))



=== [Tuned RandomForest] MODEL PERFORMANCE ON TRAIN SET (WITH SMOTE) ===
Accuracy (Train): 0.9795882765525056
Classification Report (Train):
               precision    recall  f1-score   support

           0       0.98      0.98      0.98    834055
           1       0.98      0.98      0.98    834055

    accuracy                           0.98   1668110
   macro avg       0.98      0.98      0.98   1668110
weighted avg       0.98      0.98      0.98   1668110

Confusion Matrix (Train):
 [[820322  13733]
 [ 20316 813739]]

=== [Tuned RandomForest] MODEL PERFORMANCE ON VALIDATION SET ===
Accuracy (Val): 0.9834727295268795
Classification Report (Val):
               precision    recall  f1-score   support

           0       1.00      0.98      0.99    104257
           1       0.24      0.89      0.38       600

    accuracy                           0.98    104857
   macro avg       0.62      0.94      0.69    104857
weighted avg       1.00      0.98      0.99    104857

Confusion 

In [7]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# We'll use the best random forest from the grid search
best_rf = grid_search.best_estimator_

# Evaluate learning curve on the training set (with selected features)
train_sizes, train_scores, val_scores = learning_curve(
    estimator=best_rf,
    X=X_train_sel_rf,
    y=y_train_sm,
    cv=3,               # same # of folds as GridSearch (can adjust)
    scoring='accuracy',
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 5)  # 5 points from 10% to 100% of training data
)

# Compute mean and std for plotting
train_mean = np.mean(train_scores, axis=1)
train_std  = np.std(train_scores, axis=1)
val_mean   = np.mean(val_scores, axis=1)
val_std    = np.std(val_scores, axis=1)

# Plot
plt.figure(figsize=(8, 5))
plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Score')
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')

plt.plot(train_sizes, val_mean, 'o-', color='red', label='Validation Score')
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')

plt.title('Learning Curve (Tuned RandomForest)')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.legend(loc='best')
plt.grid(True)
plt.show()


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'