# Home Credit Indonesia — Credit Risk Scorecard
Author: Talitha Salsabila

In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:

# Load the two application tables; both CSV files are read from the current
# working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("application_train.csv", "application_test.csv")
)

# Show the (rows, columns) of each frame, then the relative frequency of each
# TARGET value in the training data.
print(train.shape, test.shape)
target_share = train["TARGET"].value_counts(normalize=True)
print(target_share)


In [None]:

# Pie chart of how many training rows carry each TARGET value.
target_counts = train["TARGET"].value_counts()

pie_fig, pie_ax = plt.subplots(figsize=(5, 5))
target_counts.plot.pie(
    ax=pie_ax,
    autopct="%.1f%%",
    colors=["#d62728", "#7f7f7f"],
)
pie_ax.set_title("Target Distribution")
pie_ax.set_ylabel("")  # blank out the y-axis label

# Write the image to disk, then display it.
pie_fig.savefig("target_pie.png")
plt.show()


In [None]:

# Engineered feature: credit amount relative to total income. The +1 in the
# denominator avoids a division by zero when the income is 0.
#
# The column is added to BOTH frames. The feature matrix is later built from
# every column of `train`, so a pipeline fitted on it looks this column up by
# name; adding it to `train` only would leave `test` without it.
# NOTE(review): assumes application_test.csv also contains AMT_CREDIT and
# AMT_INCOME_TOTAL — confirm against the data.
for frame in (train, test):
    frame["CREDIT_INCOME_RATIO"] = frame["AMT_CREDIT"] / (frame["AMT_INCOME_TOTAL"] + 1)

# Per-class density of the ratio; common_norm=False normalises each TARGET
# group on its own so the two groups are comparable despite their sizes.
plt.figure(figsize=(6, 4))
sns.histplot(
    train,
    x="CREDIT_INCOME_RATIO",
    hue="TARGET",
    stat="density",
    common_norm=False,
)
plt.title("Credit / Income Ratio vs Default")
plt.savefig("credit_income_ratio.png")
plt.show()


In [None]:

target = "TARGET"

# Columns that identify a row rather than describe the applicant; they are
# excluded so the models do not treat an identifier as a numeric predictor.
# NOTE(review): SK_ID_CURR is assumed from the public Home Credit schema —
# confirm it is the id column of application_train.csv. The membership check
# makes this a no-op when the column is absent.
id_columns = [col for col in ("SK_ID_CURR",) if col in train.columns]

# Feature matrix (everything except the label and identifiers) and label.
X = train.drop(columns=[target, *id_columns])
y = train[target]

# Route columns to a transformer by dtype: numeric vs. object (string-like).
num_features = X.select_dtypes(include=[np.number]).columns
cat_features = X.select_dtypes(include=["object"]).columns

# Numeric columns: fill missing values with the median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current keyword first
# and fall back to the legacy one so the notebook runs on either side of the
# rename (an unknown keyword raises TypeError at construction time).
try:
    onehot_encoder = OneHotEncoder(
        handle_unknown="ignore", sparse_output=False, max_categories=20
    )
except TypeError:
    onehot_encoder = OneHotEncoder(
        handle_unknown="ignore", sparse=False, max_categories=20
    )

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode into a dense array with at most 20 output categories per
# feature; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder),
])

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
])


In [None]:

# Hold out 20% of the rows for validation. The split is stratified on the
# label and seeded so it is reproducible.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)


In [None]:

# Logistic Regression
lr = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", LogisticRegression(max_iter=500, class_weight="balanced"))
])
lr.fit(X_train, y_train)
y_val_lr = lr.predict_proba(X_val)[:,1]

# Random Forest
rf = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(n_estimators=200, max_depth=10, class_weight="balanced", random_state=42))
])
rf.fit(X_train, y_train)
y_val_rf = rf.predict_proba(X_val)[:,1]


In [None]:

# Validation ROC AUC for both models.
roc_auc_lr = roc_auc_score(y_val, y_val_lr)
roc_auc_rf = roc_auc_score(y_val, y_val_rf)
print("ROC AUC Logistic:", roc_auc_lr)
print("ROC AUC RF:", roc_auc_rf)

# Curve coordinates; the threshold arrays returned last are not used.
fpr_lr, tpr_lr, _ = roc_curve(y_val, y_val_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_val, y_val_rf)
prec_lr, rec_lr, _ = precision_recall_curve(y_val, y_val_lr)
prec_rf, rec_rf, _ = precision_recall_curve(y_val, y_val_rf)

# ROC Curve: both models plus the dashed diagonal as a reference line.
roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr_lr, tpr_lr, label=f"LogReg AUC={roc_auc_lr:.2f}")
roc_ax.plot(fpr_rf, tpr_rf, label=f"RandomForest AUC={roc_auc_rf:.2f}")
roc_ax.plot([0, 1], [0, 1], "k--")
roc_ax.legend()
roc_ax.set_title("ROC Curve")
roc_fig.savefig("roc.png")
plt.show()

# PR Curve: recall on the x-axis, precision on the y-axis.
pr_fig, pr_ax = plt.subplots(figsize=(6, 5))
pr_ax.plot(rec_lr, prec_lr, label="LogReg")
pr_ax.plot(rec_rf, prec_rf, label="RandomForest")
pr_ax.legend()
pr_ax.set_title("Precision-Recall Curve")
pr_fig.savefig("pr.png")
plt.show()


In [None]:

# Simulated payoff per approved loan (the source does not state the units):
# a repaid loan earns 1000, a defaulted loan costs 2000. Rejected
# applications contribute nothing.
gain_per_good_loan = 1000
loss_per_defaulted_loan = 2000

# 20 evenly spaced score cut-offs between 0.1 and 0.9.
thresholds = np.linspace(0.1, 0.9, 20)

# Outcome masks do not depend on the cut-off, so build them once.
is_good = y_val == 0
is_default = y_val == 1

profits = []
for cutoff in thresholds:
    # A loan is approved when the random-forest score is NOT at or above the
    # cut-off (i.e. the thresholded prediction is 0).
    approved = ~(y_val_rf >= cutoff)
    n_approved_good = np.sum(approved & is_good)        # approve & good
    n_approved_default = np.sum(approved & is_default)  # approve & default
    profits.append(
        n_approved_good * gain_per_good_loan
        - n_approved_default * loss_per_defaulted_loan
    )

# Simulated profit as a function of the approval cut-off.
profit_fig, profit_ax = plt.subplots(figsize=(6, 4))
profit_ax.plot(thresholds, profits, marker="o")
profit_ax.set_title("Profit Curve (Random Forest)")
profit_ax.set_xlabel("Threshold")
profit_ax.set_ylabel("Profit (simulasi)")
profit_fig.savefig("profit.png")
plt.show()
