In [None]:
# ==============================
# STEP 0: Import Libraries
# ==============================
import pandas as pd
import numpy as np

# ML & preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

In [None]:
# ==============================
# STEP 1: Load Training Data
# ==============================
# Path is relative to the notebook's working directory.
train_csv_path = 'mock_train (1).csv'
df = pd.read_csv(train_csv_path)

# Bare last expression -> rich display of the loaded frame
df

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15658852.0,Chidimma,651.0,France,Female,35.0,4.0,132271.30,1.0,1.0,0.0,86587.37,0.0
1,1,15615176.0,Chukwubuikem,642.0,France,Male,35.0,7.0,0.00,2.0,1.0,1.0,83917.49,0.0
2,2,15771543.0,Macleod,662.0,France,Female,42.0,4.0,0.00,3.0,0.0,0.0,52337.97,1.0
3,3,15776824.0,Hs?,659.0,France,Female,38.0,5.0,121702.73,1.0,0.0,0.0,73564.44,0.0
4,4,15676937.0,Ts'ui,584.0,Spain,Male,47.0,7.0,0.00,2.0,1.0,1.0,86619.77,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,14995,15730273.0,Yobanna,717.0,France,Female,30.0,7.0,0.00,2.0,1.0,1.0,173365.89,0.0
14996,14996,15696852.0,Ch'ien,706.0,France,Female,35.0,3.0,0.00,2.0,1.0,1.0,141896.74,0.0
14997,14997,15628059.0,Miah,683.0,Germany,Male,41.0,6.0,120599.38,1.0,0.0,0.0,181299.13,1.0
14998,14998,15771580.0,Ts'ui,833.0,France,Female,42.0,3.0,0.00,2.0,1.0,0.0,164083.72,0.0


In [None]:
# ==============================
# STEP 2: Feature–Target Split
# ==============================
# The label is taken to be whichever column comes last in the file.
target = df.columns[-1]
y = df[target]

# Identifier-like columns are excluded from the feature matrix along with the label.
X = df.drop(columns=[target, "id", "CustomerId", "Surname"])

# ==============================
# STEP 3: Column Identification
# ==============================
# Split the feature names by dtype so each group can get its own preprocessing.
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

# ==============================
# STEP 4: Outlier Capping Function (IQR)
# ==============================
def capping_outliers_iqr(data, col):
    """Cap ``data[col]`` to its IQR fences and return ``data``.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of the column. Values below the
    lower fence are replaced by it, values above the upper fence are replaced
    by the upper fence. Missing values are left as NaN.

    If the IQR is zero (for example a 0/1 flag where one value covers at least
    75% of the rows) both fences collapse onto the same point and capping
    would overwrite *every* value with that constant, erasing the feature.
    Such columns, and columns whose quantiles are undefined, are returned
    unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column. It is modified in place.
    col : str
        Name of the numeric column to cap.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1

    # `not IQR > 0` is also True for NaN (empty / all-missing column).
    if not IQR > 0:
        return data

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # NaN compares False on both tests, so missing values pass through untouched.
    data[col] = np.where(data[col] < lower_bound, lower_bound, data[col])
    data[col] = np.where(data[col] > upper_bound, upper_bound, data[col])
    return data

# Cap each numeric feature column in turn (column-wise, one IQR fence pair per
# column). Work on a copy so the raw `df` stays untouched.
# NOTE(review): the fences are computed on the full dataset before the
# train/test split, so held-out rows influence them — confirm this is acceptable.
df_capped = df.copy()
for numeric_col in num_cols:
    df_capped = capping_outliers_iqr(df_capped, numeric_col)

# Rebuild the target and feature matrix from the capped frame.
y = df_capped[target]
X = df_capped.drop(columns=[target, "id", "CustomerId", "Surname"])



In [None]:
# ==============================
# STEP 5: Train–Test Split
# ==============================

# Rows with a missing label are discarded. Features and target are joined
# first so the same rows are removed from both.
label_name = y.name
combined = pd.concat([X, y], axis=1).dropna(subset=[label_name])
y = combined[label_name]
X = combined.drop(columns=[label_name])

# 80/20 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# ==============================
# STEP 6: Preprocessing Pipelines
# ==============================
# Numeric features: fill gaps with the column mean, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are ignored at transform time rather than raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

In [None]:
# ==============================
# STEP 7: Handle Class Imbalance
# ==============================
# Seeded so the oversampling is repeatable between runs.
smote = SMOTE(random_state=42)

# ==============================
# STEP 8: Random Forest Model
# ==============================
rf = RandomForestClassifier(random_state=42)

# ==============================
# STEP 9: Full Pipeline
# ==============================
# The imblearn pipeline is used (instead of sklearn's) because it accepts a
# sampler step; the step names are what the "rf__" grid keys refer to.
pipeline_steps = [
    ("preprocess", preprocessor),
    ("smote", smote),
    ("rf", rf),
]
pipeline = ImbPipeline(steps=pipeline_steps)

In [None]:
# ==============================
# STEP 10: Hyperparameter Tuning
# ==============================
# The "rf__" prefix routes each setting to the "rf" step of the pipeline.
# 2 * 3 * 2 * 2 = 24 candidate combinations.
param_grid = dict(
    rf__n_estimators=[100, 200],
    rf__max_depth=[None, 10, 20],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
)

# 5-fold cross-validation scored by weighted F1, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1_weighted",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split only; the test split stays held out.
grid.fit(X=X_train, y=y_train)

In [None]:
# Pipeline with the best-scoring grid combination.
best_model = grid.best_estimator_

# ==============================
# STEP 11: Prediction
# ==============================
y_pred = best_model.predict(X_test)

# ==============================
# STEP 12: Evaluation Metrics
# ==============================
print("Accuracy:", accuracy_score(y_test, y_pred))

# The three class-sensitive scores share the same call shape, so loop over them.
weighted_scores = (
    ("Precision (weighted):", precision_score),
    ("Recall (weighted):", recall_score),
    ("F1 Score (weighted):", f1_score),
)
for score_label, score_fn in weighted_scores:
    print(score_label, score_fn(y_test, y_pred, average="weighted"))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ==============================
# STEP 13: Save Results to CSV
# ==============================
# Held-out features with the true and predicted labels appended as the last
# two columns; `.values` drops y_test's index so it is assigned positionally.
final_df = X_test.assign(Actual=y_test.values, Predicted=y_pred)

final_df.to_csv("random_forest_results.csv", index=False)


Accuracy: 0.882
Precision (weighted): 0.886059636387311
Recall (weighted): 0.882
F1 Score (weighted): 0.8837189069720717

Confusion Matrix:
 [[2193  202]
 [ 152  453]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.92      0.93      2395
         1.0       0.69      0.75      0.72       605

    accuracy                           0.88      3000
   macro avg       0.81      0.83      0.82      3000
weighted avg       0.89      0.88      0.88      3000



In [None]:
# ---------------------------------
# 1. Load test dataset
# ---------------------------------
test_csv_path = "mock_ test.csv"
test_df = pd.read_csv(test_csv_path)

# ---------------------------------
# 2. Predict using trained model
# ---------------------------------
# The whole frame is passed; the pipeline's preprocessor picks the feature
# columns it was configured with.
# NOTE(review): the training features were IQR-capped before fitting, but no
# capping is applied to this frame — confirm the mismatch is intended.
predictions = best_model.predict(test_df)

# ---------------------------------
# 3. Create submission DataFrame
# ---------------------------------
# NOTE(review): labels are written as floats (0.0 / 1.0) — confirm the
# expected submission format.
submission = test_df[["id"]].assign(Class=predictions)

# ---------------------------------
# 4. Save final CSV
# ---------------------------------
submission.to_csv("final_submission_final.csv", index=False)
print("final_submission_final.csv created successfully")

submission


final_submission_final.csv created successfully


Unnamed: 0,id,Class
0,15000,0.0
1,15001,0.0
2,15002,1.0
3,15003,1.0
4,15004,0.0
...,...,...
9995,24995,0.0
9996,24996,0.0
9997,24997,0.0
9998,24998,0.0
