In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer, StandardScaler

In [None]:
data = pd.read_csv('loan_data_2007_2014.csv')

  data = pd.read_csv('loan_data_2007_2014.csv')


In [None]:
total_rows = len(data)

missing_summary = data.isnull().sum().reset_index()
missing_summary.columns = ["column", "missing_count"]
missing_summary["missing_pct"] = (missing_summary["missing_count"] / total_rows) * 100

full_missing_summary = missing_summary[missing_summary["missing_count"] == total_rows]
full_missing_cols = full_missing_summary["column"].tolist()

data = data.drop(columns=full_missing_cols)

print("Shape setelah drop:", data.shape)

Shape setelah drop: (466285, 58)


In [None]:
missing_frac = data.isnull().mean()
cols_to_drop = missing_frac[missing_frac > 0.6].index
data = data.drop(columns=cols_to_drop)
print("Shape setelah drop >60% missing:", data.shape)

Shape setelah drop >60% missing: (466285, 55)


In [None]:
# === 2. Imputasi missing ===
# Numerik → median
for col in data.select_dtypes(include=[np.number]).columns:
    if data[col].isnull().sum() > 0:
        data[col] = data[col].fillna(data[col].median())

# Kategorikal → "Unknown"
for col in data.select_dtypes(include=['object']).columns:
    if data[col].isnull().sum() > 0:
        data[col] = data[col].fillna("Unknown")

In [None]:
# === 3. Tangani outlier (winsorization 1%-99%) ===
outlier_cols = ["annual_inc", "dti", "revol_bal", "loan_amnt", "installment", "int_rate"]
for col in outlier_cols:
    if col in data.columns:
        lower, upper = data[col].quantile([0.01, 0.99])
        data[col] = np.clip(data[col], lower, upper)

In [None]:
# === 4. Tangani skewness ===
num_cols = data.select_dtypes(include=[np.number]).columns
skewness = data[num_cols].skew().sort_values(ascending=False)

# Pilih kolom dengan skew > 1
high_skew = skewness[skewness > 1].index

for col in high_skew:
    if (data[col] >= 0).all():
        # Gunakan log1p kalau semua nilai positif
        data[col+"_log"] = np.log1p(data[col])
    else:
        # Gunakan Yeo-Johnson kalau ada nilai negatif/0
        pt = PowerTransformer(method='yeo-johnson')
        data[col+"_trans"] = pt.fit_transform(data[[col]])

In [None]:
redundant_cols = ["funded_amnt", "funded_amnt_inv"]
data = data.drop(columns=[c for c in redundant_cols if c in data.columns])

In [None]:
data['target'] = data['loan_status'].apply(lambda x: 1 if x in ['Charged Off', 'Default'] else 0)

X = data.drop(columns=['loan_status', 'target'])
y = data['target']


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (326399, 74)
Test shape: (139886, 74)


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

num_cols = X_train.select_dtypes(include=[np.number]).columns
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])


In [None]:
# Konversi kolom datetime ke format datetime
datetime_cols = ["issue_d", "last_pymnt_d", "next_pymnt_d", "last_credit_pull_d", "earliest_cr_line"]

for col in datetime_cols:
    if col in data.columns: # Check if column exists
        data[col] = pd.to_datetime(data[col], errors="coerce")

        # ekstrak bulan (1-12), missing jadi 0
        data[col + "_month"] = data[col].dt.month.fillna(0).astype(int)
    else:
        print(f"Column '{col}' not found in DataFrame.") # Optional: print a message if column is not found

# Drop kolom datetime asli
data = data.drop(columns=[c for c in datetime_cols if c in data.columns]) # Drop columns only if they exist

print(data.filter(like="_month").head())

  data[col] = pd.to_datetime(data[col], errors="coerce")
  data[col] = pd.to_datetime(data[col], errors="coerce")
  data[col] = pd.to_datetime(data[col], errors="coerce")
  data[col] = pd.to_datetime(data[col], errors="coerce")
  data[col] = pd.to_datetime(data[col], errors="coerce")


   issue_d_month  last_pymnt_d_month  next_pymnt_d_month  \
0              0                   0                   0   
1              0                   0                   0   
2              0                   0                   0   
3              0                   0                   0   
4              0                   0                   0   

   last_credit_pull_d_month  earliest_cr_line_month  
0                         0                       1  
1                         0                       4  
2                         0                       0  
3                         0                       2  
4                         0                       1  


In [None]:
# fit dan transform data training
X_train_transformed = preprocessor.fit_transform(X_train)

# transform data testing (pakai fit dari train, bukan fit ulang!)
X_test_transformed = preprocessor.transform(X_test)

print("Shape sebelum preprocessing:", X_train.shape)
print("Shape sesudah preprocessing:", X_train_transformed.shape)


Shape sebelum preprocessing: (326399, 74)
Shape sesudah preprocessing: (326399, 526879)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# === Logistic Regression ===
log_reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(max_iter=500, solver='liblinear'))
])

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

print("=== Logistic Regression ===")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, log_reg.predict_proba(X_test)[:,1]))

=== Logistic Regression ===
[[126837     57]
 [   243  12749]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    126894
           1       1.00      0.98      0.99     12992

    accuracy                           1.00    139886
   macro avg       1.00      0.99      0.99    139886
weighted avg       1.00      1.00      1.00    139886

ROC-AUC: 0.9992841058488675


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score # Import roc_auc_score
from sklearn.pipeline import Pipeline

# Create a pipeline with the preprocessor and the Decision Tree model
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])

# parameter grid
param_grid = {
    'model__max_depth': [3, 5],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}

# RandomizedSearchCV (ringan)
grid_dt = RandomizedSearchCV(dt_pipeline, param_grid, n_iter=3, cv=2,
                             scoring='roc_auc', random_state=42)
grid_dt.fit(X_train, y_train)

# best estimator
best_model = grid_dt.best_estimator_

# prediksi
y_pred = best_model.predict(X_test)

# evaluasi
print("Best params:", grid_dt.best_params_)
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, grid_dt.best_estimator_.predict_proba(X_test)[:, 1])) # Use grid_dt.best_estimator_

Best params: {'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_depth': 5}
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    126894
           1       1.00      0.89      0.94     12992

    accuracy                           0.99    139886
   macro avg       0.99      0.95      0.97    139886
weighted avg       0.99      0.99      0.99    139886

ROC-AUC: 0.9936041800913349


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import randint

# --- Random Forest dengan class_weight balanced ---
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"   # 🔑 fokus ke kelas minoritas
)

# Pipeline (preprocessor + model)
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf)
])

# --- Parameter distributions ---
param_dist = {
    'model__n_estimators': randint(100, 200),
    'model__max_depth': [10],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2']
}

# --- Randomized Search ---
rand_rf = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=param_dist,
    n_iter=3,        # cukup 10 iterasi dulu biar cepat
    cv=2,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# --- Training ---
rand_rf.fit(X_train, y_train)

# --- Evaluasi ---
y_pred = rand_rf.predict(X_test)
y_proba = rand_rf.predict_proba(X_test)[:, 1]

print("Best params:", rand_rf.best_params_)
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Fitting 2 folds for each of 3 candidates, totalling 6 fits
Best params: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 10, 'model__n_estimators': 182}
              precision    recall  f1-score   support

           0       0.99      0.93      0.96    126894
           1       0.56      0.89      0.69     12992

    accuracy                           0.93    139886
   macro avg       0.77      0.91      0.82    139886
weighted avg       0.95      0.93      0.93    139886

ROC-AUC: 0.9685889094402209


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline

# --- Model dasar ---
xgb = XGBClassifier(
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss",
    use_label_encoder=False
)

# Pipeline (preprocessing + model)
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb)
])

# --- Parameter distribusi (dibatasi biar cepat) ---
param_dist = {
    'model__n_estimators': [100, 200],      # jumlah tree
    'model__max_depth': [3, 5, 7],          # kedalaman tree
    'model__learning_rate': [0.05, 0.1],    # step size
    'model__subsample': [0.8, 1.0],         # sampel data
    'model__colsample_bytree': [0.8, 1.0]   # fitur per tree
}

# --- Randomized Search ---
rand_xgb = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=5,
    cv=2,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# --- Training ---
rand_xgb.fit(X_train, y_train)

# --- Evaluasi ---
y_pred = rand_xgb.predict(X_test)
y_proba = rand_xgb.predict_proba(X_test)[:, 1]

print("Best params:", rand_xgb.best_params_)
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))


Fitting 2 folds for each of 5 candidates, totalling 10 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params: {'model__subsample': 1.0, 'model__n_estimators': 200, 'model__max_depth': 5, 'model__learning_rate': 0.1, 'model__colsample_bytree': 1.0}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    126894
           1       1.00      0.99      0.99     12992

    accuracy                           1.00    139886
   macro avg       1.00      0.99      1.00    139886
weighted avg       1.00      1.00      1.00    139886

ROC-AUC: 0.9998598653158088


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

# --- Prediksi train ---
y_train_pred = rand_xgb.predict(X_train)
y_train_proba = rand_xgb.predict_proba(X_train)[:, 1]

# --- Prediksi test ---
y_test_pred = rand_xgb.predict(X_test)
y_test_proba = rand_xgb.predict_proba(X_test)[:, 1]

# --- Evaluasi Train ---
print("=== Train Performance ===")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("ROC-AUC:", roc_auc_score(y_train, y_train_proba))

# --- Evaluasi Test ---
print("\n=== Test Performance ===")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba))


=== Train Performance ===
Accuracy: 0.9993903167595489
ROC-AUC: 0.9999886692937211

=== Test Performance ===
Accuracy: 0.9989991850506841
ROC-AUC: 0.9998598653158088


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    # Prediksi train
    y_train_pred = model.predict(X_train)
    y_train_proba = model.predict_proba(X_train)[:, 1]

    # Prediksi test
    y_test_pred = model.predict(X_test)
    y_test_proba = model.predict_proba(X_test)[:, 1]

    return {
        "Model": name,
        "Train Accuracy": accuracy_score(y_train, y_train_pred),
        "Test Accuracy": accuracy_score(y_test, y_test_pred),
        "Train ROC-AUC": roc_auc_score(y_train, y_train_proba),
        "Test ROC-AUC": roc_auc_score(y_test, y_test_proba),
    }

# --- Training models ---
results = []

# XGBoost
results.append(evaluate_model("XGBoost", rand_xgb, X_train, y_train, X_test, y_test))

# Random Forest
results.append(evaluate_model("Random Forest", rand_rf, X_train, y_train, X_test, y_test))

# Logistic Regression
results.append(evaluate_model("Logistic Regression", log_reg, X_train, y_train, X_test, y_test))

# Decision Tree (tambahan baru)
# Create a pipeline for the Decision Tree model
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])
dt_pipeline.fit(X_train, y_train) # Train the pipeline
results.append(evaluate_model("Decision Tree", dt_pipeline, X_train, y_train, X_test, y_test)) # Evaluate the pipeline

# --- DataFrame hasil evaluasi ---
df_results = pd.DataFrame(results)
print(df_results)

                 Model  Train Accuracy  Test Accuracy  Train ROC-AUC  \
0              XGBoost        0.999390       0.998999       0.999989   
1        Random Forest        0.926348       0.925182       0.970182   
2  Logistic Regression        0.998931       0.997855       0.999759   
3        Decision Tree        1.000000       0.997426       1.000000   

   Test ROC-AUC  
0      0.999860  
1      0.968589  
2      0.999284  
3      0.988805  
