In [None]:
# --- Step 1: Import Libraries ---
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_curve, auc
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# --- Step 2: Load Dataset ---
# The CSV location defaults to the path originally hard-coded in this cell,
# but can be overridden with the LOAN_DEFAULT_CSV environment variable so the
# notebook can run on another machine without editing the code.
DATA_PATH = Path(
    os.environ.get(
        "LOAN_DEFAULT_CSV",
        "C:/Users/DELL/Desktop/MSc/1st Sem/AML/Loan_Default - Copy.csv",
    )
)
df = pd.read_csv(DATA_PATH)

In [None]:
# --- Step 3: Data Exploration ---
# First rows, column dtypes / non-null counts, and summary statistics.
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())

In [None]:
# Missing values summary
# Null count per column (largest first) and the same figure as a share of rows.
missing_values = df.isna().sum().sort_values(ascending=False)
missing_percent = missing_values.div(len(df)).mul(100)
missing_table = pd.concat(
    [
        missing_values.rename("Missing Values"),
        missing_percent.rename("Percentage"),
    ],
    axis=1,
)
print("Missing Values Summary:")
# Only columns that actually contain nulls are listed.
print(missing_table.loc[missing_table["Missing Values"].gt(0)])

# Missing values heatmap
plt.figure(figsize=(8, 4))
sns.heatmap(df.isna(), cmap="viridis", cbar=False)
plt.title("Missing Values Heatmap")
plt.show()

In [None]:
# Target distribution
# Bar chart of how many rows fall into each value of the "Status" column.
target_ax = sns.countplot(data=df, x="Status", palette="coolwarm")
target_ax.set_title("Class Distribution (Target Variable)")
plt.show()

In [None]:
# --- Drop identifier columns ---
# The drop is guarded so re-running the cell after "ID" is gone is a no-op.
id_columns = [name for name in ("ID",) if name in df.columns]
if id_columns:
    print("Dropping ID column...")
    df = df.drop(id_columns, axis=1)

In [None]:
# Correlation heatmap (numerical features)
# Pairwise correlations between all int64/float64 columns, annotated per cell.
num_cols = df.select_dtypes(include=['int64','float64']).columns
correlation_matrix = df.loc[:, num_cols].corr()

corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# --- Step 4: Pre-Processing ---
# The target is kept out of the feature imputers/encoder: selecting it as a
# numeric column would median-impute missing labels, i.e. fabricate classes.
TARGET_COL = "Status"

# Rows without a label cannot be used for supervised training, so they are
# dropped (and reported) rather than given an invented label.
if TARGET_COL in df.columns:
    n_unlabelled = int(df[TARGET_COL].isna().sum())
    if n_unlabelled:
        print(f"Dropping {n_unlabelled} rows with missing {TARGET_COL}...")
        df = df.dropna(subset=[TARGET_COL]).copy()

# Separate numeric and categorical feature columns (target excluded)
feature_frame = df.drop(columns=[TARGET_COL], errors="ignore")
num_cols = feature_frame.select_dtypes(include=['int64','float64']).columns
cat_cols = feature_frame.select_dtypes(include=['object']).columns

# NOTE(review): the imputers and the encoder below are fit on the whole
# dataset, i.e. before the train/test split, so statistics from rows that later
# end up in the test set influence the training features. Fitting them on the
# training split only would require restructuring the following cells.

# Impute numeric features with the column median
num_imputer = SimpleImputer(strategy="median")
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute categorical features with the most frequent value
cat_imputer = SimpleImputer(strategy="most_frequent")
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# One-hot encode categorical features (first level of each column dropped);
# the encoded frame reuses df's index so it can be concatenated column-wise.
encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
encoded_array = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(cat_cols), index=df.index)

In [None]:
# Define X and y
# Features = numeric columns (minus the target, if present) + one-hot columns.
numeric_features = df[num_cols].drop(columns="Status", errors="ignore")
X = pd.concat([numeric_features, encoded_df], axis=1)
y = df["Status"]

In [None]:
# Sanity check: report whether the target column ended up among the features.
target_in_features = "Status" in X.columns
print("Status in X?", target_in_features)


In [None]:
# --- Step 5: Split Data ---
# 80/20 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Apply SMOTE on training set (no scaling for tree models)
# Only the training split is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Class distribution after SMOTE
smote_fig, smote_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y_train_res, palette="viridis", ax=smote_ax)
smote_ax.set_title("Class Distribution After SMOTE (Training Data)")
smote_ax.set_xlabel("Loan Status (0 = Non-default, 1 = Default)")
smote_ax.set_ylabel("Count")
plt.show()

In [None]:
# --- Step 6a: Baseline Model ---
# Untuned decision tree, trained on the SMOTE-resampled training split and
# evaluated on the untouched test split.
# The cell previously fitted on X_train_nm / y_train_nm, which are not defined
# anywhere in the visible notebook and raised NameError on a fresh kernel; the
# resampled training data produced by the SMOTE cell is used instead.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_res, y_train_res)

y_pred = dt_model.predict(X_test)

print("Baseline Decision Tree Results:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# --- Step 6b: Hyperparameter Tuning ---
# Exhaustive search over tree depth and split/leaf size limits, 3-fold CV,
# ranked by F1.
# NOTE(review): the search is fit on the already-oversampled training data, so
# validation folds may contain synthetic samples and CV scores may be
# optimistic - confirm whether resampling should happen inside each fold.
param_grid = dict(
    max_depth=[3, 5, 7, 9],
    min_samples_split=[5, 10, 20],
    min_samples_leaf=[2, 4, 6],
)

grid_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_dt.fit(X_train_res, y_train_res)

print("Best Parameters:", grid_dt.best_params_)

In [None]:
# Final model
# Take the refit best estimator from the grid search and score it on the
# held-out test split.
best_dt = grid_dt.best_estimator_
y_pred_best = best_dt.predict(X_test)

tuned_confusion = confusion_matrix(y_test, y_pred_best)
tuned_report = classification_report(y_test, y_pred_best)

print("Tuned Decision Tree Results:")
print(tuned_confusion)
print(tuned_report)

In [None]:
# --- Step 7: ROC Curve ---
# Probability of the second class (column 1 of predict_proba) drives the curve.
y_proba = best_dt.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
roc_ax.plot(fpr, tpr, label=f'Decision Tree (AUC = {roc_auc:.2f})')
# Dashed diagonal drawn as a reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], '--', color='gray')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve - Decision Tree")
roc_ax.legend()
plt.show()

In [None]:
# --- Step 8: Visualize Tree (Optional) ---
# Draw only the top three levels of the tuned tree so the figure stays readable.
plt.figure(figsize=(16,8))
plot_tree(
    best_dt,
    filled=True,
    # Pass a plain list rather than a pandas Index: a list is accepted by
    # every scikit-learn version, whereas some releases may reject other
    # sequence types for feature_names (TODO confirm against installed version).
    feature_names=list(X.columns),
    class_names=["Non-default","Default"],
    max_depth=3,
)
plt.title("Decision Tree (First 3 Levels)")
plt.show()

In [None]:
from sklearn.utils import shuffle

# Retrain a depth-5 tree on a permuted copy of the target and report its
# accuracy. Presumably a leakage check: the shuffled labels should no longer
# relate to the features.
y_shuffled = shuffle(y, random_state=42)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y_shuffled, test_size=0.2, random_state=42, stratify=y_shuffled
)

dt_test = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_test.fit(X_train_s, y_train_s)
shuffled_accuracy = dt_test.score(X_test_s, y_test_s)
print("Accuracy with shuffled target:", shuffled_accuracy)


In [None]:
# List every feature column name that goes into the model.
feature_names = list(X.columns)
print(feature_names)


In [None]:
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# --- Use the preprocessed X and y ---
# Shuffle the target only
y_shuf = shuffle(y, random_state=42).reset_index(drop=True)
X_shuf = X.reset_index(drop=True)

# Train-test split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_shuf, y_shuf, test_size=0.2, stratify=y_shuf, random_state=42
)

# Train a weak Decision Tree
dt_test = DecisionTreeClassifier(random_state=42, max_depth=5)
dt_test.fit(X_train_s, y_train_s)

print("Accuracy with shuffled target:", dt_test.score(X_test_s, y_test_s))


In [None]:
from sklearn.metrics import roc_auc_score

# Score every feature on its own as a predictor of the target and flag those
# whose single-column ROC AUC is extreme in either direction.
suspicious = []
skipped = []  # (column, error) pairs for columns whose AUC could not be computed
for col in X.columns:
    try:
        # Named col_auc so the sklearn.metrics.auc function imported at the
        # top of the notebook is not overwritten; shadowing it would break the
        # ROC cell when it is re-run after this one.
        col_auc = roc_auc_score(y, X[col])
    except Exception as exc:
        # Stay best-effort, but record what was skipped instead of hiding it.
        skipped.append((col, repr(exc)))
        continue
    if col_auc > 0.95 or col_auc < 0.05:  # very strong predictor
        suspicious.append((col, col_auc))

print("Suspicious columns:", suspicious)
if skipped:
    print("Columns skipped (AUC could not be computed):", skipped)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the resampled training data, scored on the
# untouched test split.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_res, y_train_res)
logreg_accuracy = lr.score(X_test, y_test)
print("LogReg test accuracy:", logreg_accuracy)


In [None]:
# Count rows that are exact duplicates of an earlier row in df.
duplicate_count = df.duplicated().sum()
print("Duplicate rows in dataset:", duplicate_count)


In [None]:
# Depth-2 tree trained on the resampled training data, scored on the test split.
dt_shallow = DecisionTreeClassifier(max_depth=2, random_state=42)
dt_shallow.fit(X_train_res, y_train_res)
shallow_accuracy = dt_shallow.score(X_test, y_test)
print("Shallow tree test accuracy:", shallow_accuracy)


In [None]:
from sklearn.model_selection import cross_val_score, learning_curve

# --- Cross-validation ---
# 5-fold accuracy of a depth-5 tree on the full feature matrix and target.
model = DecisionTreeClassifier(max_depth=5, random_state=42)   # change model here
scores = cross_val_score(estimator=model, X=X, y=y, scoring="accuracy", cv=5)
print("Cross-validation scores:", scores)
print("Mean accuracy:", np.mean(scores))

In [None]:
# --- Learning curve ---
# Train/validation accuracy of `model` at five training-set fractions
# (10% to 100%), each evaluated with 5-fold CV.
train_sizes, train_scores, val_scores = learning_curve(
    estimator=model,
    X=X,
    y=y,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)

# Average over the CV folds (axis 1) to get one point per training size.
train_mean = np.mean(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)

curve_fig, curve_ax = plt.subplots(figsize=(6, 4))
curve_ax.plot(train_sizes, train_mean, 'o-', label="Training Accuracy")
curve_ax.plot(train_sizes, val_mean, 'o-', label="Validation Accuracy")
curve_ax.set_xlabel("Training Set Size")
curve_ax.set_ylabel("Accuracy")
curve_ax.set_title("Learning Curve")
curve_ax.legend()
curve_ax.grid(True)
plt.show()