In [4]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import pandas as pd, numpy as np

# ❶ 데이터 로드 & 전처리 ----------------------------
df = pd.read_csv('../data/Churn_Modelling.csv')
df = pd.get_dummies(df, columns=['Geography','Gender'], dtype=int)
df['LogAge']      = np.log1p(df['Age'])
df['LogBalance']  = np.log1p(df['Balance'])
df.drop(columns=['RowNumber','CustomerId','Surname','Age','Balance'], inplace=True)

X = df.drop('Exited', axis=1)
y = df['Exited']

num_cols = X.select_dtypes('number').columns

preprocess = ColumnTransformer([
    ('num', StandardScaler(), num_cols)
], remainder='passthrough')



# ❷ 파이프라인 (SMOTE → 모델) ----------------------
pipe = Pipeline([
    ('prep', preprocess),
    ('smote', SMOTE(random_state=42)),
    ('clf', LGBMClassifier(
        is_unbalance=True,      # 내부 가중치
        random_state=42
    ))
])




# ❸ 교차검증 ---------------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipe, X, y,
                         cv=cv,
                         scoring=make_scorer(roc_auc_score))
print(f'ROC-AUC (5-fold): {scores.mean():.3f} ± {scores.std():.3f}')

# ❹ 최종 학습 & 평가 -------------------------------
pipe.fit(X, y)                      # 또는 train/test 분할 후 fit
y_hat = pipe.predict(X)             # 예시용

print(classification_report(y, y_hat))

[LightGBM] [Info] Number of positive: 6371, number of negative: 6371
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000284 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1695
[LightGBM] [Info] Number of data points in the train set: 12742, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 6371, number of negative: 6371
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1708
[LightGBM] [Info] Number of data points in the train set: 12742, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 6370, number of negative: 6370
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1697
[LightGBM] [Info] Number of data points in the train set: 12740, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 6370, number of negative: 6370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1648
[LightGBM] [Info] Number of data points in the train set: 12740, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 6370, number of negative: 6370
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1701
[LightGBM] [Info] Number of data points in the train set: 12740, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




ROC-AUC (5-fold): 0.747 ± 0.017
[LightGBM] [Info] Number of positive: 7963, number of negative: 7963
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1705
[LightGBM] [Info] Number of data points in the train set: 15926, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.91      0.96      0.93      7963
           1       0.79      0.64      0.71      2037

    accuracy                           0.89     10000
   macro avg       0.85      0.80      0.82     10000
weighted avg       0.89      0.89      0.89     10000





In [8]:
# -----------------------------------------------------------
# 1. 라이브러리
# -----------------------------------------------------------
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (precision_recall_curve, f1_score,
                             classification_report, confusion_matrix,
                             roc_auc_score)
from imblearn.over_sampling import SMOTENC
from lightgbm import LGBMClassifier

# -----------------------------------------------------------
# 2. 데이터 로드 & 1차 전처리
# -----------------------------------------------------------
df = pd.read_csv('../data/Churn_Modelling.csv')

# 범주형 → 원핫, 연속형 로그 변환
df = pd.get_dummies(df, columns=['Geography', 'Gender'], dtype=int)
df['LogAge']      = np.log1p(df['Age'])
df['LogBalance']  = np.log1p(df['Balance'])
df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Age', 'Balance'],
        inplace=True)

X = df.drop('Exited', axis=1)
y = df['Exited']

# -----------------------------------------------------------
# 3. Train / Test 분리
# -----------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# -----------------------------------------------------------
# 4. SMOTENC (train 세트만; 범주형 인덱스 지정)
# -----------------------------------------------------------
cat_idx = np.where(X_train.dtypes == int)[0].tolist()          # 원핫된 열이 int
sm = SMOTENC(categorical_features=cat_idx, random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# -----------------------------------------------------------
# 5. 스케일링 – 수치형만
# -----------------------------------------------------------
num_cols = X_train.select_dtypes('float').columns
scaler = StandardScaler().fit(X_train_sm[num_cols])

X_train_sm[num_cols] = scaler.transform(X_train_sm[num_cols])
X_test[num_cols]     = scaler.transform(X_test[num_cols])

# -----------------------------------------------------------
# 6. LightGBM 학습  (비용 민감 설정)
#    class_0 : class_1 ≈ 4 : 1  →  scale_pos_weight ≈ 4
# -----------------------------------------------------------
lgbm = LGBMClassifier(
    is_unbalance=True,        # 내부 가중치 자동
    # scale_pos_weight=4,       # 수동 가중치 보강
    num_leaves=31,
    learning_rate=0.01,
    n_estimators=5000,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
lgbm.fit(X_train_sm, y_train_sm)

# -----------------------------------------------------------
# 7. 검증-셋(= test)에서 최적 threshold 탐색
# -----------------------------------------------------------
proba = lgbm.predict_proba(X_test)[:, 1]
prec, rec, thr = precision_recall_curve(y_test, proba)
f1s = 2 * prec * rec / (prec + rec + 1e-9)
best_thr = thr[np.argmax(f1s)]
print(f'Best threshold (F1 기준) = {best_thr:.2f}  →  F1 = {f1s.max():.3f}')

# -----------------------------------------------------------
# 8. 최종 예측 & 리포트
# -----------------------------------------------------------
y_pred = (proba >= best_thr).astype(int)

print('\n★ Final Evaluation (threshold tuned)')
print('ROC-AUC :', roc_auc_score(y_test, proba).round(3))
print(classification_report(y_test, y_pred, digits=3))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

[LightGBM] [Info] Number of positive: 6370, number of negative: 6370
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1049
[LightGBM] [Info] Number of data points in the train set: 12740, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best threshold (F1 기준) = 0.33  →  F1 = 0.604

★ Final Evaluation (threshold tuned)
ROC-AUC : 0.835
              precision    recall  f1-score   support

           0      0.906     0.874     0.890      1593
           1      0.567     0.646     0.604       407

    accuracy                          0.828      2000
   macro avg      0.737     0.760     0.747      2000
weighted avg      0.837     0.828     0.832      2000

Confusion Matrix:
 [[1392  201]
 [ 144  263]]


In [12]:
# -----------------------------------------------------------
# 0. 라이브러리 & 데이터
# -----------------------------------------------------------
import warnings, itertools, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTENC

warnings.filterwarnings('ignore')

df = pd.read_csv('../data/Churn_Modelling.csv')

# 기본 전처리 ------------------------------------------------
df = pd.get_dummies(df, columns=['Geography','Gender'], dtype=int)
df['LogAge']     = np.log1p(df['Age'])
df['LogBalance'] = np.log1p(df['Balance'])
df.drop(columns=['RowNumber','CustomerId','Surname','Age','Balance'], inplace=True)

X = df.drop('Exited', axis=1)
y = df['Exited']

# Train / Test ----------------------------------------------
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# 범주형 인덱스
cat_idx = [i for i,d in enumerate(X_tr.dtypes) if d==int]

# 수치형 스케일러 fit (원본 train 기준)
num_cols = X_tr.select_dtypes('float').columns
scaler = StandardScaler().fit(X_tr[num_cols])

# 탐색 파라미터 ---------------------------------------------
smote_ratios   = [0.6, 0.8, 1.0]      # 소수 클래스/다수 클래스 비율
pos_weights    = [2, 3, 4]            # scale_pos_weight
thr_grid       = np.linspace(0.30,0.60,16)

best = {'F1':0}

# -----------------------------------------------------------
# 1. Grid Search
# -----------------------------------------------------------
for sr, pw in itertools.product(smote_ratios, pos_weights):
    # 1) SMOTENC
    sm = SMOTENC(categorical_features=cat_idx,
                 sampling_strategy=sr, random_state=42)
    X_sm, y_sm = sm.fit_resample(X_tr, y_tr)

    # 2) 스케일링
    X_sm[num_cols] = scaler.transform(X_sm[num_cols])
    X_te[num_cols] = scaler.transform(X_te[num_cols])

    # 3) LightGBM 학습
    lgbm = LGBMClassifier(
        scale_pos_weight=pw,
        num_leaves=31,
        learning_rate=0.05,
        n_estimators=800,
        min_child_samples=30,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    lgbm.fit(X_sm, y_sm)

    # 4) Threshold Sweep
    proba = lgbm.predict_proba(X_te)[:,1]
    for thr in thr_grid:
        pred = (proba >= thr).astype(int)
        f1   = f1_score(y_te, pred)
        if f1 > best['F1']:
            best.update({
                'F1'      : f1,
                'Recall'  : (y_te & pred).sum()/y_te.sum(),
                'Prec'    : f1_score(y_te, pred, zero_division=0, average=None)[1],
                'AUC'     : roc_auc_score(y_te, proba),
                'thr'     : thr,
                'smote'   : sr,
                'pos_w'   : pw,
                'model'   : lgbm,          # 저장
                'pred'    : pred,
                'proba'   : proba
            })

# -----------------------------------------------------------
# 2. 결과 출력
# -----------------------------------------------------------
print(f"★ Best Setting → SMOTE ratio={best['smote']}, "
      f"scale_pos_weight={best['pos_w']}, threshold={best['thr']:.2f}")
print(f"ROC-AUC={best['AUC']:.3f}  F1={best['F1']:.3f}  "
      f"Recall(1)={best['Recall']:.3f}  Precision(1)={best['Prec']:.3f}\n")

print(classification_report(y_te, best['pred'], digits=3))
print('Confusion Matrix:\n', confusion_matrix(y_te, best['pred']))

[LightGBM] [Info] Number of positive: 3822, number of negative: 6370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000713 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1047
[LightGBM] [Info] Number of data points in the train set: 10192, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.375000 -> initscore=-0.510826
[LightGBM] [Info] Start training from score -0.510826
[LightGBM] [Info] Number of positive: 3822, number of negative: 6370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1047
[LightGBM] [Info] Number of data points in the train set: 10192, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.375000 -> initscore=-0.510826
[LightGBM] [Info] Start training from score -0.510826
[LightGBM] [Info

In [10]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import precision_recall_curve, f1_score, classification_report, confusion_matrix

# 1. 모델 학습 (train set: SMOTENC or 오리지널 둘 다 가능)
brf = BalancedRandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        random_state=42)
brf.fit(X_train_sm, y_train_sm)   # ← 이전에 만든 SMOTE train 사용

# 2. Threshold sweep
proba = brf.predict_proba(X_test)[:,1]
prec, rec, thr = precision_recall_curve(y_test, proba)
f1s  = 2 * prec * rec / (prec + rec + 1e-9)
best_thr = thr[f1s.argmax()]
print(f'Best thr={best_thr:.2f}, best F1={f1s.max():.3f}')

# 3. 최종 평가
y_pred = (proba >= best_thr).astype(int)
print(classification_report(y_test, y_pred, digits=3))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

Best thr=0.46, best F1=0.606
              precision    recall  f1-score   support

           0      0.912     0.860     0.885      1593
           1      0.551     0.673     0.606       407

    accuracy                          0.822      2000
   macro avg      0.731     0.767     0.746      2000
weighted avg      0.838     0.822     0.828      2000

Confusion Matrix:
 [[1370  223]
 [ 133  274]]


In [14]:
# -----------------------------------------------------------
# 0. 라이브러리
# -----------------------------------------------------------
import warnings, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (precision_recall_curve, f1_score,
                             classification_report, confusion_matrix,
                             roc_auc_score)
from imblearn.ensemble   import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTENC

warnings.filterwarnings('ignore')

# -----------------------------------------------------------
# 1. 데이터 로드 & 1차 전처리
# -----------------------------------------------------------
df = pd.read_csv('../data/Churn_Modelling.csv')

df = pd.get_dummies(df, columns=['Geography', 'Gender'], dtype=int)
df['LogAge']      = np.log1p(df['Age'])
df['LogBalance']  = np.log1p(df['Balance'])
df.drop(columns=['RowNumber','CustomerId','Surname','Age','Balance'], inplace=True)

X = df.drop('Exited', axis=1)
y = df['Exited']

# -----------------------------------------------------------
# 2. Train / Test 분할
# -----------------------------------------------------------
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# -----------------------------------------------------------
# 3. SMOTENC (Train 전용, 비율 1.0 → 0·1 동수)
# -----------------------------------------------------------
cat_idx = [i for i,d in enumerate(X_tr.dtypes) if d == int]
sm = SMOTENC(categorical_features=cat_idx, sampling_strategy=1.0, random_state=42)
X_sm, y_sm = sm.fit_resample(X_tr, y_tr)

# -----------------------------------------------------------
# 4. 수치형 스케일링
# -----------------------------------------------------------
num_cols = X_tr.select_dtypes('float').columns
scaler = StandardScaler().fit(X_sm[num_cols])
X_sm[num_cols] = scaler.transform(X_sm[num_cols])
X_te[num_cols] = scaler.transform(X_te[num_cols])

# -----------------------------------------------------------
# 5. BalancedRandomForest 학습
# -----------------------------------------------------------
brf = BalancedRandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        random_state=42)
brf.fit(X_sm, y_sm)

# -----------------------------------------------------------
# 6. Threshold 스윕 (0.25–0.60)
# -----------------------------------------------------------
proba = brf.predict_proba(X_te)[:,1]
thr_grid = np.linspace(0.25, 0.60, 15)
best_thr, best_f1 = 0.5, 0

for thr in thr_grid:
    pred = (proba >= thr).astype(int)
    f1   = f1_score(y_te, pred)
    if f1 > best_f1:
        best_thr, best_f1, best_pred = thr, f1, pred

# -----------------------------------------------------------
# 7. 결과 출력
# -----------------------------------------------------------
print(f'★ Best threshold = {best_thr:.2f}   F1 = {best_f1:.3f}')
print('ROC-AUC          =', round(roc_auc_score(y_te, proba), 3))
print('\nClassification Report:\n', classification_report(y_te, best_pred, digits=3))
print('Confusion Matrix:\n', confusion_matrix(y_te, best_pred))

★ Best threshold = 0.45   F1 = 0.602
ROC-AUC          = 0.843

Classification Report:
               precision    recall  f1-score   support

           0      0.912     0.853     0.882      1593
           1      0.541     0.678     0.602       407

    accuracy                          0.818      2000
   macro avg      0.727     0.766     0.742      2000
weighted avg      0.837     0.818     0.825      2000

Confusion Matrix:
 [[1359  234]
 [ 131  276]]
