# Training Model

Notebook ini berfungsi untuk melakukan proses pelatihan model machine learning guna meminimalkan False Negative untuk memastikan pelanggan yang layak mendapatkan pinjaman tidak salah ditolak. Adapun prosesnya meliputi hal berikut.
1. Membaca data hasil processing (train_processed.csv) sebagai input untuk pelatihan.
2. Melakukan scaling menggunakan RobustScaler agar distribusi fitur lebih stabil terhadap outlier.
3. Menyeimbangkan distribusi kelas dengan teknik SMOTE (Synthetic Minority Over-sampling Technique) untuk mengatasi ketidakseimbangan data.
4. Melatih tiga model utama, yaitu Logistic Regression, Random Forest, dan Gradient Boosting, dengan optimasi melalui GridSearchCV untuk memaksimalkan recall.
5. Melakukan threshold tuning pada data validasi, yaitu memilih ambang keputusan yang memaksimalkan F2-score (recall diberi bobot lebih besar daripada precision) sehingga jumlah False Negative dapat ditekan.
6. Menyimpan hasil pelatihan berupa model terbaik, nilai threshold optimal, dan metrik validasi agar dapat digunakan kembali pada tahap inference atau evaluasi lanjutan (`outputs/models/`).

## 1) Setup dan Paths

In [1]:
# === Setup Umum ===
import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, average_precision_score, recall_score, precision_score, f1_score,
                             precision_recall_curve, roc_curve)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# imbalanced-learn is an optional dependency: IMB records whether SMOTE and the
# imblearn Pipeline are importable so later code can decide whether to use them.
try:
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline as ImbPipeline
except ImportError:
    # Catch only a missing/broken package; a bare `except:` would also swallow
    # KeyboardInterrupt, SystemExit and unrelated bugs.
    print('imbalanced-learn belum terinstal. Jalankan: !pip install imbalanced-learn')
    IMB = False
else:
    IMB = True

# Seed shared by the splitter, the resampler and the estimators.
RANDOM_STATE = 42
# Desired recall level; not referenced by the cells visible in this notebook section.
RECALL_TARGET = 0.80

# Input data and output locations (relative to the current working directory).
DATA_PATH = Path('data/dataset_hasil_data_processing/train_processed.csv')
MODEL_DIR = Path('outputs/models')
METRIC_DIR = Path('outputs/metrics')
MODEL_DIR.mkdir(parents=True, exist_ok=True)
METRIC_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(DATA_PATH)

# Prefer the exact column name 'TARGET'; otherwise fall back to the first column
# whose lower-cased name is a known alias. Fail with a descriptive error instead
# of an opaque IndexError when no candidate column exists.
if 'TARGET' in df.columns:
    target_col = 'TARGET'
else:
    target_col = next((c for c in df.columns if c.lower() in ('target', 'label', 'y')), None)
    if target_col is None:
        raise KeyError(
            f"No target column found in {DATA_PATH}; expected 'TARGET' or one of "
            f"target/label/y (case-insensitive). Columns: {list(df.columns)}"
        )

y = df[target_col]
X = df.drop(columns=[target_col])

# Single stratified 80/20 hold-out split so both parts keep the class ratio of y.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, val_idx = next(sss.split(X, y))
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

print('Train shape:', X_train.shape, 'Validation shape:', X_val.shape)

Train shape: (246008, 18) Validation shape: (61503, 18)


## Logistic Regression

In [2]:
from joblib import dump

def tune_and_train(model, param_grid, model_name):
    """Grid-search a preprocessing + classifier pipeline, tune its threshold, and save it.

    The pipeline is median imputation -> RobustScaler -> (SMOTE, only when
    imbalanced-learn is available) -> ``model``. Hyper-parameters are selected
    with ``GridSearchCV`` (scoring='recall', 5-fold stratified CV repeated
    twice) on the module-level ``X_train`` / ``y_train``. The decision
    threshold is then chosen as the one maximising F2 on the module-level
    ``X_val`` / ``y_val``.

    Note: the threshold is tuned and the reported metrics are computed on the
    same validation split, so the metrics are optimistic estimates.

    Args:
        model: Unfitted classifier exposing ``predict_proba``; used as the
            final ``'clf'`` step of the pipeline.
        param_grid: Parameter grid for ``GridSearchCV``; keys must address
            pipeline steps (e.g. ``'clf__C'``).
        model_name: Name used in log output and as the sub-directory of
            ``MODEL_DIR`` that receives the artefacts.

    Returns:
        dict with float values under the keys ``recall``, ``precision``,
        ``f1``, ``roc_auc``, ``average_precision`` and ``threshold``.

    Side effects:
        Writes ``model.pkl``, ``threshold.json`` and ``model_meta.json`` into
        ``MODEL_DIR / model_name`` and prints progress messages.
    """
    print(f'=== Training {model_name} ===')

    steps = [
        ('imp', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
    if IMB:
        # Keeping SMOTE as a pipeline step means it is fitted together with the
        # other steps on each CV training fold rather than on the full data.
        steps.append(('smote', SMOTE(random_state=RANDOM_STATE)))
    steps.append(('clf', model))

    # A sampler step is only accepted by the imblearn Pipeline class.
    pipe = (ImbPipeline if IMB else Pipeline)(steps)

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RANDOM_STATE)
    grid = GridSearchCV(pipe, param_grid=param_grid, scoring='recall', cv=cv, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    print('Best params:', grid.best_params_)

    # Threshold tuning: precision/recall have one more entry than thresholds,
    # so drop the trailing point to align the three arrays index-by-index.
    y_score = best_model.predict_proba(X_val)[:, 1]
    prec, rec, thres = precision_recall_curve(y_val, y_score)
    prec_, rec_ = prec[:-1], rec[:-1]
    # F-beta with beta=2; the small epsilon avoids 0/0 when precision and recall are both 0.
    f2 = (5 * prec_ * rec_) / (4 * prec_ + rec_ + 1e-9)
    thr = float(thres[int(np.argmax(f2))])

    y_pred = (y_score >= thr).astype(int)
    metrics = {
        'recall': float(recall_score(y_val, y_pred)),
        'precision': float(precision_score(y_val, y_pred, zero_division=0)),
        # zero_division=0 matches precision_score above and avoids a warning
        # when no positive is predicted.
        'f1': float(f1_score(y_val, y_pred, zero_division=0)),
        'roc_auc': float(roc_auc_score(y_val, y_score)),
        'average_precision': float(average_precision_score(y_val, y_score)),
        'threshold': thr
    }

    # Persist artefacts; context managers guarantee the JSON files are flushed
    # and closed even if serialisation raises.
    out_dir = MODEL_DIR / model_name
    out_dir.mkdir(parents=True, exist_ok=True)
    dump(best_model, out_dir / 'model.pkl')
    with open(out_dir / 'threshold.json', 'w', encoding='utf-8') as fh:
        json.dump({'threshold': thr}, fh)
    with open(out_dir / 'model_meta.json', 'w', encoding='utf-8') as fh:
        json.dump({'params': grid.best_params_, 'metrics': metrics}, fh, indent=2)
    print('Saved model to', out_dir)
    return metrics

# --- Logistic Regression: search over the inverse regularisation strength C ---
# NOTE(review): when the SMOTE step is active the training folds are presumably
# already rebalanced, so class_weight='balanced' may be redundant - confirm.
params_lr = {'clf__C': [0.01, 0.1, 1, 10]}
lr = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=RANDOM_STATE,
)
metrics_lr = tune_and_train(
    model=lr,
    param_grid=params_lr,
    model_name='logistic_regression',
)
print(metrics_lr)

=== Training logistic_regression ===
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best params: {'clf__C': 0.1}
Saved model to outputs\models\logistic_regression
{'recall': 0.6870090634441087, 'precision': 0.14836885602435843, 'f1': 0.2440350563405473, 'roc_auc': 0.7268325286093889, 'average_precision': 0.20326281221062686, 'threshold': 0.4889681203942051}


## Random Forest

In [3]:
# --- Random Forest: search over forest size, tree depth and split size ---
# 2 x 4 x 3 = 24 parameter combinations are evaluated.
# NOTE(review): n_jobs=-1 is set here and also in the grid search inside
# tune_and_train - confirm the nested parallelism is intended.
params_rf = {
    'clf__n_estimators': [100, 300],
    'clf__max_depth': [5, 10, 20, None],
    'clf__min_samples_split': [2, 5, 10],
}
rf = RandomForestClassifier(
    n_jobs=-1,
    class_weight='balanced',
    random_state=RANDOM_STATE,
)
metrics_rf = tune_and_train(
    model=rf,
    param_grid=params_rf,
    model_name='random_forest',
)
print(metrics_rf)

=== Training random_forest ===
Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best params: {'clf__max_depth': 5, 'clf__min_samples_split': 10, 'clf__n_estimators': 100}
Saved model to outputs\models\random_forest
{'recall': 0.7256797583081571, 'precision': 0.13564490625705897, 'f1': 0.22856599105528594, 'roc_auc': 0.7138943081602347, 'average_precision': 0.18478401824221657, 'threshold': 0.44096216937142707}


## Gradient Boosting

In [4]:
# --- Gradient Boosting: search over boosting rounds, learning rate and depth ---
# 2 x 3 x 3 = 18 parameter combinations are evaluated.
params_gb = {
    'clf__n_estimators': [100, 200],
    'clf__learning_rate': [0.05, 0.1, 0.2],
    'clf__max_depth': [2, 3, 4],
}
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
metrics_gb = tune_and_train(
    model=gb,
    param_grid=params_gb,
    model_name='gradient_boosting',
)
print(metrics_gb)

=== Training gradient_boosting ===
Fitting 10 folds for each of 18 candidates, totalling 180 fits
Best params: {'clf__learning_rate': 0.05, 'clf__max_depth': 2, 'clf__n_estimators': 100}
Saved model to outputs\models\gradient_boosting
{'recall': 0.675730110775428, 'precision': 0.14652574573088178, 'f1': 0.2408298040341684, 'roc_auc': 0.7223143453821236, 'average_precision': 0.19630379365265743, 'threshold': 0.43982455677979604}
