In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import warnings
# NOTE(review): with no category/module argument this hides every warning
# raised after this point (deprecation notices, solver convergence warnings,
# ...). Consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

KeyboardInterrupt: 

In [None]:
# Load the loan dataset; the relative path is resolved against the kernel's
# working directory.
df = pd.read_csv("loan_data.csv")
# Show the first rows as the cell output.
df.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive statistics of the frame (pandas defaults: numeric columns only
# when any are present).
df.describe()

In [None]:
# Show the last rows of the frame as the cell output.
df.tail()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Columns whose distributions are inspected for outliers.
num_cols = ["annual_income", "debt_to_income_ratio", "loan_amount"]

# One box plot per column, side by side in a single row, drawn from `df` as it
# stands when this cell runs.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, col in zip(axes, num_cols):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"{col} (Before Outlier Removal)")
plt.tight_layout()
plt.show()


In [None]:
def remove_outliers_iqr(data, col, factor=1.5):
    """Return the rows of ``data`` whose ``col`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[col]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    col : str
        Name of the column whose values are tested against the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. The default
        reproduces the previously hard-coded 1.5 rule.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` inside the fences, with their original index
        labels. Rows where ``col`` is missing are dropped, because a missing
        value never satisfies the range comparison.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the fences would be inverted).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = data[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    lower = q1 - factor * spread
    upper = q3 + factor * spread
    # between() is inclusive on both ends by default, matching `>=` / `<=`.
    return data[values.between(lower, upper)]

# Apply the IQR filter one column at a time, rebinding `df` to the filtered
# frame after each pass. Each pass computes its quartiles on the rows that
# survived the previous passes, so the result depends on the column order.
# NOTE(review): because `df` is overwritten, re-running this cell filters the
# already-filtered frame again.
outlier_cols = ["annual_income", "debt_to_income_ratio", "loan_amount"]
for col in outlier_cols:
    df = remove_outliers_iqr(df, col)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining columns, then hold out 25% of
# the rows for evaluation. The split is stratified on the target and seeded so
# it is repeatable.
target_col = "loan_paid_back"
y = df[target_col]
X = df.drop(columns=target_col)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)


In [None]:
# Box plots of the columns listed in `num_cols`, drawn from `df` after the IQR
# filter, one panel per column in a single row.
# The grid is sized from len(num_cols) rather than a fixed 1x3 layout: a later
# cell rebinds `num_cols` to six names, and re-running this cell after that one
# would make a fixed three-panel grid fail on the fourth column.
panel_count = len(num_cols)
fig, axes = plt.subplots(
    1, panel_count, figsize=(5 * panel_count, 5), squeeze=False
)
# squeeze=False keeps `axes` two-dimensional even for a single panel, so
# ravel() always yields one Axes per column.
for ax, col in zip(axes.ravel(), num_cols):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"{col} (After Outlier Removal)")
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation matrix of the numeric columns of `df`, rendered as an
# annotated heatmap.
corr = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Numeric columns of interest; "id" is listed here but excluded from the model
# features below.
num_cols = [
    "annual_income", "debt_to_income_ratio", "credit_score",
    "loan_amount", "interest_rate", "id",
]

# Categorical columns, one-hot encoded below.
cat_cols = [
    "gender", "marital_status", "education_level",
    "employment_status", "loan_purpose", "grade_subgrade",
]

# Model inputs: every numeric column except the identifier.
numeric_features = [name for name in num_cols if name != "id"]
categorical_features = cat_cols

# Standardise the numeric features and one-hot encode the categorical ones.
# Categories not seen during fit are ignored at transform time instead of
# raising.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [None]:
from sklearn.linear_model import LogisticRegression

# Chain the preprocessing step with a logistic-regression classifier so both
# are fitted together on the training split.
pipeline_steps = [
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
]
log_reg_model = Pipeline(steps=pipeline_steps)

# fit() returns the fitted pipeline, which is displayed as the cell output.
log_reg_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Predicted probabilities on the test split. Column 1 belongs to the model's
# second class -- presumably loan_paid_back == 1; confirm the label encoding.
test_probabilities = log_reg_model.predict_proba(X_test)
log_reg_prob = test_probabilities[:, 1]

# Area under the ROC curve on the test split.
log_auc = roc_auc_score(y_test, log_reg_prob)
print("Logistic Regression AUC:", log_auc)



In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Recompute the test-split probabilities (column 1 of predict_proba) and the
# AUC so this cell does not rely on values left over from another cell.
log_reg_prob = log_reg_model.predict_proba(X_test)[:, 1]
log_auc = roc_auc_score(y_test, log_reg_prob)

# ROC operating points; the thresholds are not needed for the plot.
fpr1, tpr1, _ = roc_curve(y_test, log_reg_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr1, tpr1, label=f"Logistic Regression AUC={log_auc:.3f}")
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_title("ROC Curve Comparison")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid(True)
plt.show()
