In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix, f1_score, recall_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler

import miceforest as mf

from xgboost import XGBClassifier

from collections import Counter

import warnings
warnings.filterwarnings("ignore")


In [2]:
import platform, sklearn, numpy as np, pandas as pd, scipy, joblib

# Report the interpreter and library versions this run was executed with,
# so the recorded results can be tied to a concrete environment.
for _label, _version in (
    ("python :", platform.python_version()),
    ("sklearn:", sklearn.__version__),
    ("numpy  :", np.__version__),
    ("scipy  :", scipy.__version__),
    ("pandas :", pd.__version__),
    ("joblib :", joblib.__version__),
):
    print(_label, _version)

python : 3.11.7
sklearn: 1.2.2
numpy  : 1.26.4
scipy  : 1.11.4
pandas : 2.1.4
joblib : 1.5.2


In [3]:
# Load the training split of the German credit data set.
# The path uses a forward slash: the previous backslash-separated literal only resolved
# on Windows and depended on "\g" not being a recognised string escape sequence.
data = pd.read_csv("data/german_credit_train.csv")

# "Unnamed: 0" is presumably the row index that was saved into the CSV as a column;
# it is dropped so it cannot be used as a feature.
data = data.drop(columns=["Unnamed: 0"])

data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,28,male,2,own,,moderate,1887,18,business,good
1,35,male,2,own,,,1979,15,radio/TV,good
2,20,female,2,own,rich,moderate,1577,11,furniture/equipment,good
3,24,female,1,own,little,little,626,12,radio/TV,bad
4,55,male,2,own,little,little,6872,24,furniture/equipment,bad


## 파생변수

In [4]:
# Derived feature (optional experiment): credit amount divided by duration,
# i.e. the repayment amount per unit of Duration.
data["Monthly payment"] = data["Credit amount"].div(data["Duration"])

In [5]:
# Remove the raw Duration column from the working frame.
data = data.drop(columns=["Duration"])

## 결측치 unknown

In [6]:
# Variant in which missing values become their own "Unknown" category.
# The filled column is assigned back explicitly: calling fillna(..., inplace=True) on a
# column selected with data[...] is chained assignment, which newer pandas versions
# deprecate and which does not update `data` when Copy-on-Write is enabled.
data["Saving accounts"] = data["Saving accounts"].fillna("Unknown")
data["Checking account"] = data["Checking account"].fillna("Unknown")

## 결측치 miceforest

In [7]:
# NOTE: this whole cell is commented out (disabled). It is the alternative to the
# "Unknown" fill: imputing the missing values with miceforest (MICE + random forest).

# # Copy the original frame
# df2 = data.copy()

# for col in df2.select_dtypes(include="object").columns:
#     df2[col] = df2[col].astype("category")

# # -------------------------
# # MICE with Random Forest
# # -------------------------
# # 1) Create the imputation kernel
# kernel = mf.ImputationKernel(
#     data=df2,
#     datasets=1,             # number of imputed datasets to build (one is enough for modelling)
#     save_all_iterations=True,
#     random_state=42
# )

# # 2) Iterate (each iteration cycles through the variables, predicting their missing values)
# kernel.mice(iterations=5)   # usually about 5-10 iterations

# # 3) Retrieve the completed data
# # data["Saving accounts"] = kernel.complete_data(dataset=0)["Saving accounts"]
# data["Checking account"] = kernel.complete_data(dataset=0)["Checking account"]

# # -------------------------
# # Check the result
# # -------------------------
# print("(after imputation):")
# # print(data["Saving accounts"].value_counts(), "\n")
# print(data["Checking account"].value_counts(), "\n")

## saving 원핫인코딩

In [8]:
# NOTE: this whole cell is commented out (disabled). It is the encoding variant in which
# "Saving accounts" is one-hot encoded instead of being mapped to ordinal codes.

# # Encoding
# categoric_vars_list = ["Sex", "Job", "Housing", "Saving accounts", "Checking account", "Purpose", "Risk"]
# numeric_vars_list = ["Age", "Credit amount"]

# df = data.copy()

# # Job is always label-encoded
# # Version with "Saving accounts" one-hot encoded

# le = LabelEncoder()
# df["Job"] = le.fit_transform(df["Job"])

# categorical_for_ohe = [col for col in categoric_vars_list if col not in ["Job", "Risk"]]

# df = pd.get_dummies(df, columns=categorical_for_ohe, drop_first=True)

# df.head()


## saving 라벨인코딩

In [9]:
# Encoding variant in which "Saving accounts" becomes an ordinal code and the remaining
# categorical features are one-hot encoded.
categoric_vars_list = ["Sex", "Job", "Housing", "Saving accounts", "Checking account", "Purpose", "Risk"]
numeric_vars_list = ["Age", "Credit amount"]

# Work on a copy so `data` keeps the un-encoded values.
df = data.copy()

# Job: label encoding.
le_job = LabelEncoder()
df["Job"] = le_job.fit_transform(df["Job"])

# Saving accounts: ordinal code given by the position in `saving_order`.
# NOTE(review): "Unknown" is placed last and therefore ranks above "rich" - confirm this
# ordering is intended.
saving_order = ["little", "moderate", "quite rich", "rich", "Unknown"]
saving_map = {val: idx for idx, val in enumerate(saving_order)}

saving_codes = df["Saving accounts"].map(saving_map)

# Series.map returns NaN for every value missing from the mapping (including values that
# are still NaN because the "Unknown" fill was skipped). Fail here with a clear message
# instead of passing NaN on to the models.
if saving_codes.isna().any():
    unexpected = sorted(df.loc[saving_codes.isna(), "Saving accounts"].astype(str).unique())
    raise ValueError(f"Unmapped 'Saving accounts' values: {unexpected}")

df["Saving accounts"] = saving_codes



# Disabled experiment: ordinal encoding of "Checking account".
# checking_order = ["little", "moderate", "Unknown", "rich"]
# checking_map = {val: idx for idx, val in enumerate(checking_order)}

# df["Checking account"] = df["Checking account"].map(checking_map)



# Remaining categorical features are one-hot encoded; Job and Saving accounts were encoded
# above and Risk is the target, so those three are excluded.
categorical_for_ohe = [col for col in categoric_vars_list if col not in ["Job", "Saving accounts", "Risk"]]

df = pd.get_dummies(df, columns=categorical_for_ohe, drop_first=True)

df.head()


Unnamed: 0,Age,Job,Saving accounts,Credit amount,Risk,Monthly payment,Sex_male,Housing_own,Housing_rent,Checking account_little,Checking account_moderate,Checking account_rich,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,28,2,4,1887,good,104.833333,True,True,False,False,True,False,False,False,False,False,False,False,False
1,35,2,4,1979,good,131.933333,True,True,False,False,False,False,False,False,False,False,True,False,False
2,20,2,3,1577,good,143.363636,False,True,False,False,True,False,False,False,False,True,False,False,False
3,24,1,0,626,bad,52.166667,False,True,False,True,False,False,False,False,False,False,True,False,False
4,55,2,0,6872,bad,286.333333,True,True,False,True,False,False,False,False,False,True,False,False,False


In [10]:
# Number of missing values per column after encoding.
df.isna().sum()

Age                            0
Job                            0
Saving accounts                0
Credit amount                  0
Risk                           0
Monthly payment                0
Sex_male                       0
Housing_own                    0
Housing_rent                   0
Checking account_little        0
Checking account_moderate      0
Checking account_rich          0
Purpose_car                    0
Purpose_domestic appliances    0
Purpose_education              0
Purpose_furniture/equipment    0
Purpose_radio/TV               0
Purpose_repairs                0
Purpose_vacation/others        0
dtype: int64

In [11]:
# Keep the columns listed in numeric_vars_list in their own frame, then show summary
# statistics for the encoded frame.
data_numeric = df[numeric_vars_list]
df.describe()

Unnamed: 0,Age,Job,Saving accounts,Credit amount,Monthly payment
count,800.0,800.0,800.0,800.0,800.0
mean,35.385,1.895,1.11,3311.0675,169.478822
std,11.487242,0.651541,1.572734,2818.209049,161.022087
min,19.0,0.0,0.0,338.0,24.055556
25%,26.75,2.0,0.0,1385.0,91.5
50%,33.0,2.0,0.0,2349.5,131.828869
75%,41.0,2.0,2.0,4070.25,208.125
max,75.0,3.0,4.0,18424.0,2482.666667


## 데이터 분할 비율

## Duration robust 스케일링

In [12]:
# Build the binary target: 1 when Risk is "good", 0 otherwise.
y = df["Risk"].eq("good").astype(int)
X = df.drop(columns=["Risk"])

# Stratified hold-out split. test_size=0.3 keeps 70% of the rows for training and 30%
# for testing (the earlier comment described this as an 8:2 split, which the code is not).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Scaling helper.
# Age and Monthly payment are standardised; Credit amount is log1p-transformed and then
# standardised. Duration is not scaled here (the column is dropped earlier in the notebook).

def scale_numeric(X_train, X_test):
    """Scale the numeric feature columns using statistics from the training split only.

    Columns handled:
      * "Age"             -> StandardScaler
      * "Credit amount"   -> log1p, then StandardScaler
      * "Monthly payment" -> StandardScaler (only when the column is present)

    Every scaler is fitted on ``X_train`` and then applied to both frames, so no
    information from ``X_test`` enters the fitted statistics.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing at least "Age" and "Credit amount".

    Returns
    -------
    (X_train_scaled, X_test_scaled) : tuple of pandas.DataFrame
        Scaled copies with the same index and column order as the inputs. The scaled
        columns are float64. The input frames are left unmodified.

    Raises
    ------
    KeyError
        If "Age" or "Credit amount" is missing from either frame.
    """
    # Work on copies so the caller's frames are never mutated.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # (column name, apply log1p before standardising?)
    column_plan = [("Age", False), ("Credit amount", True)]
    if "Monthly payment" in X_train.columns:
        column_plan.append(("Monthly payment", False))

    for column, use_log in column_plan:
        # Explicit float arrays: the scaled values are floats, and writing them into an
        # integer column through .loc depends on an implicit upcast that pandas deprecates.
        train_values = X_train[column].to_numpy(dtype="float64").reshape(-1, 1)
        test_values = X_test[column].to_numpy(dtype="float64").reshape(-1, 1)
        if use_log:
            train_values = np.log1p(train_values)
            test_values = np.log1p(test_values)

        # Fit on the training values only, then transform both splits.
        scaler = StandardScaler().fit(train_values)
        X_train[column] = np.asarray(scaler.transform(train_values)).ravel()
        X_test[column] = np.asarray(scaler.transform(test_values)).ravel()

    return X_train, X_test



# Apply the scaling.
# Copies are passed so the split frames themselves are never modified by the scaling step.
# NOTE(review): X_train / X_test are rebound to the scaled frames, so re-running only this
# line would scale already-scaled data; re-run the split first.
X_train, X_test = scale_numeric(X_train.copy(), X_test.copy())


-------------------------------------------

여기부터 모델

-------------------------------------------

로지스틱 회귀

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_recall_curve, accuracy_score, confusion_matrix
import numpy as np

# Logistic regression: grid-search the regularisation settings, tune the decision
# threshold on out-of-fold training predictions, then score the hold-out set.
log_reg = LogisticRegression(solver="liblinear", random_state=42, max_iter=200)

param_grid = {
    "C": [0.01, 0.05, 0.07, 0.1, 0.12, 0.15, 0.2, 1, 3],  # widened search range
    "penalty": ["l1", "l2"],
    "class_weight": [None, "balanced"],
}

# Three metrics are computed for every candidate; `refit` below picks the one used for selection.
scoring = {
    "roc_auc": "roc_auc",
    "f1": make_scorer(f1_score),
    "recall": make_scorer(recall_score),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring=scoring,
    refit="f1",        # final model selection criterion: F1
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

print("Best params (by F1):", grid_search.best_params_)
# NOTE(review): `best_model` (and `best_th`, `y_pred_prob` below) are reassigned by later
# model cells; only `y_pred_log` is specific to this cell.
best_model = grid_search.best_estimator_

# -------------------------
# Threshold tuning
# -------------------------
# Out-of-fold probabilities of class 1 on the training set, using the same CV splitter.
oof_probs = cross_val_predict(best_model, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
p, r, th = precision_recall_curve(y_train, oof_probs)
# p and r hold one more element than th, so the last point is dropped to align them;
# the 1e-12 term guards against division by zero.
f1s = 2 * p[:-1] * r[:-1] / (p[:-1] + r[:-1] + 1e-12)
best_th = th[np.argmax(f1s)]
print("Best threshold (train, by F1):", best_th)

# -------------------------
# Evaluation (tuned threshold applied)
# -------------------------
y_pred_prob = best_model.predict_proba(X_test)[:, 1]
y_pred_log = (y_pred_prob >= best_th).astype(int)

print("Logistic Regression (F1-refit + tuned threshold) Results")
print(f"Accuracy: {accuracy_score(y_test, y_pred_log):.3f}")
print(f"Recall:   {recall_score(y_test, y_pred_log):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred_log):.3f}\n")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best params (by F1): {'C': 0.2, 'class_weight': None, 'penalty': 'l2'}
Best threshold (train, by F1): 0.5165665804804357
Logistic Regression (F1-refit + tuned threshold) Results
Accuracy: 0.738
Recall:   0.940
F1 Score: 0.834



결정트리

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import make_scorer, f1_score, recall_score, accuracy_score, confusion_matrix, precision_recall_curve
import numpy as np

# Decision tree: grid-search the tree shape, tune the decision threshold on out-of-fold
# training predictions, then score the hold-out set.
dt = DecisionTreeClassifier(random_state=42)

param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, 6],
    "min_samples_split": [2, 3, 4, 5],
    "min_samples_leaf": [1, 2, 3],
    "class_weight": [None, "balanced"],
}

# Three metrics are computed for every candidate; `refit` below picks the one used for selection.
scoring = {
    "roc_auc": "roc_auc",
    "f1": make_scorer(f1_score),
    "recall": make_scorer(recall_score),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring=scoring,
    refit="f1",      # select the final model by F1
    cv=cv,
    verbose=1,
    n_jobs=-1,
)
grid_search_dt.fit(X_train, y_train)

print("Best params (by F1):", grid_search_dt.best_params_)
best_dt = grid_search_dt.best_estimator_

# -------------------------
# Threshold tuning (based on out-of-fold training predictions)
# -------------------------
oof_probs = cross_val_predict(best_dt, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
p, r, th = precision_recall_curve(y_train, oof_probs)
# p and r hold one more element than th, so the last point is dropped to align them;
# the 1e-12 term guards against division by zero.
f1s = 2 * p[:-1] * r[:-1] / (p[:-1] + r[:-1] + 1e-12)
best_th = th[np.argmax(f1s)]
print("Best threshold (train, by F1):", best_th)

# -------------------------
# Evaluation (tuned threshold applied)
# -------------------------
y_pred_prob_dt = best_dt.predict_proba(X_test)[:, 1]
y_pred_dt = (y_pred_prob_dt >= best_th).astype(int)


# NOTE(review): `accuracy`, `recall` and `f1` are overwritten by every later model cell.
accuracy = accuracy_score(y_test, y_pred_dt)
recall   = recall_score(y_test, y_pred_dt)
f1       = f1_score(y_test, y_pred_dt)

print("Decision Tree Results (threshold tuned)")
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall:   {recall:.3f}")
print(f"F1 Score: {f1:.3f}\n")



Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best params (by F1): {'class_weight': None, 'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best threshold (train, by F1): 0.25
Decision Tree Results (threshold tuned)
Accuracy: 0.758
Recall:   0.970
F1 Score: 0.849



랜덤포레스트

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_recall_curve
import numpy as np

# Random forest: grid-search by F1, tune the decision threshold on out-of-fold training
# predictions, then score the hold-out set.
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [31, 32, 33, 34],
    'max_depth': [6, 7, 8],
    'min_samples_split': [6, 7, 8, 9],
    'min_samples_leaf': [4, 5, 6]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search_rf.fit(X_train, y_train)

print("Best hyperparams:", grid_search_rf.best_params_)

best_rf = grid_search_rf.best_estimator_

# Out-of-fold probabilities of class 1 on the training set, using the same CV splitter.
oof_probs = cross_val_predict(best_rf, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
p, r, th = precision_recall_curve(y_train, oof_probs)
# p and r hold one more element than th, so the last point is dropped to align them;
# the 1e-12 term guards against division by zero.
f1s = 2 * p[:-1] * r[:-1] / (p[:-1] + r[:-1] + 1e-12)
best_th = th[np.argmax(f1s)]
print(f"Best threshold (train, by F1): {best_th:.6f}")

# Fit on the full training set before predicting. NOTE(review): GridSearchCV's default
# refit should already have fitted best_estimator_ on X_train, making this a repeat fit - confirm.
best_rf.fit(X_train, y_train)
y_pred_prob_rf = best_rf.predict_proba(X_test)[:, 1]
y_pred_rf = (y_pred_prob_rf >= best_th).astype(int)


# NOTE(review): `accuracy`, `recall`, `f1` and `best_th` are overwritten by later model cells.
accuracy = accuracy_score(y_test, y_pred_rf)
recall   = recall_score(y_test, y_pred_rf)
f1       = f1_score(y_test, y_pred_rf)

print("Random Forest Results (threshold-optimized)")
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall:   {recall:.3f}")
print(f"F1 Score: {f1:.3f}\n")


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best hyperparams: {'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 9, 'n_estimators': 32}
Best threshold (train, by F1): 0.543868
Random Forest Results (threshold-optimized)
Accuracy: 0.792
Recall:   0.958
F1 Score: 0.866



서포트 벡터 머신

In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_recall_curve
import numpy as np

# Support vector machine: grid-search by F1, tune the decision threshold on out-of-fold
# training predictions, then score the hold-out set.
# probability=True is needed because predict_proba is used below.
svm = SVC(probability=True, random_state=42)
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search_svm.fit(X_train, y_train)
print("Best hyperparams:", grid_search_svm.best_params_)

best_svm = grid_search_svm.best_estimator_

# Out-of-fold probabilities of class 1 on the training set, using the same CV splitter.
oof_probs = cross_val_predict(best_svm, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
p, r, th = precision_recall_curve(y_train, oof_probs)
# p and r hold one more element than th, so the last point is dropped to align them;
# the 1e-12 term guards against division by zero.
f1s = 2 * p[:-1] * r[:-1] / (p[:-1] + r[:-1] + 1e-12)
best_th = th[np.argmax(f1s)]
print(f"Best threshold (train, by F1): {best_th:.6f}")

# Fit on the full training set, then apply the tuned threshold to the hold-out probabilities.
best_svm.fit(X_train, y_train)
y_pred_prob_svm = best_svm.predict_proba(X_test)[:, 1]
y_pred_svm = (y_pred_prob_svm >= best_th).astype(int)

# NOTE(review): `accuracy`, `recall`, `f1` and `best_th` are overwritten by later model cells.
accuracy = accuracy_score(y_test, y_pred_svm)
recall   = recall_score(y_test, y_pred_svm)
f1       = f1_score(y_test, y_pred_svm)

print("Support Vector Machine Results (threshold-optimized)")
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall:   {recall:.3f}")
print(f"F1 Score: {f1:.3f}\n")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best hyperparams: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Best threshold (train, by F1): 0.433955
Support Vector Machine Results (threshold-optimized)
Accuracy: 0.767
Recall:   0.988
F1 Score: 0.856



K 근접 이웃 KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_recall_curve
import numpy as np

# Two KNN variants are trained and evaluated in this cell:
#   (A) KNN directly on the scaled features,
#   (B) KNN on the union of PCA components and SelectKBest features.
# Their hold-out F1 scores are kept in `f1_plain` and `f1_pipe`.

# =========================
# (A) Plain KNN (F1 + threshold)
# =========================
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 10, 11, 12, 13],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search_knn.fit(X_train, y_train)
print("Best hyperparams (KNN):", grid_search_knn.best_params_)

best_knn = grid_search_knn.best_estimator_

# Search for the F1-maximising threshold on out-of-fold probabilities
oof_probs_knn = cross_val_predict(best_knn, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
p, r, th = precision_recall_curve(y_train, oof_probs_knn)
# p and r hold one more element than th, so the last point is dropped to align them;
# the 1e-12 term guards against division by zero.
f1s = 2 * p[:-1] * r[:-1] / (p[:-1] + r[:-1] + 1e-12)
best_th_knn = th[np.argmax(f1s)]
print(f"Best threshold (train, by F1, KNN): {best_th_knn:.6f}")

# Fit + evaluate
best_knn.fit(X_train, y_train)
y_pred_prob_knn = best_knn.predict_proba(X_test)[:, 1]
y_pred_knn = (y_pred_prob_knn >= best_th_knn).astype(int)

accuracy = accuracy_score(y_test, y_pred_knn)
recall   = recall_score(y_test, y_pred_knn)
f1_plain  = f1_score(y_test, y_pred_knn)

print("K-Nearest Neighbors Results (threshold-optimized)")
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall:   {recall:.3f}")
print(f"F1_plain Score: {f1_plain:.3f}\n")

# ==========================================
# (B) Pipeline: PCA + SelectKBest + KNN (F1)
# ==========================================
# The FeatureUnion concatenates the PCA components with the SelectKBest columns.
# The n_components / k values given here are starting values; the grid below overrides them.
features = []
features.append(('pca', PCA(n_components=4)))
features.append(('select_best', SelectKBest(k=6)))
feature_union = FeatureUnion(features)

model = Pipeline([
    ('feature_union', feature_union),
    ('knn', KNeighborsClassifier())
])

# Keys use the <step>__<sub-step>__<parameter> convention to reach nested estimators.
param_grid_pipeline = {
    'feature_union__pca__n_components': [2, 3, 4],
    'feature_union__select_best__k': [4, 6, 8],
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

grid_search_pipeline = GridSearchCV(
    estimator=model,
    param_grid=param_grid_pipeline,
    cv=cv,
    scoring='f1',
    verbose=1,
    n_jobs=-1
)
grid_search_pipeline.fit(X_train, y_train)
print("Best Hyperparameters (Pipeline+KNN):", grid_search_pipeline.best_params_)

# NOTE(review): `best_model` and (below) `y_pred_prob` reuse names first assigned in the
# logistic-regression cell, so after this cell they refer to the KNN pipeline.
best_model = grid_search_pipeline.best_estimator_

# Search for the F1-maximising threshold on out-of-fold probabilities
oof_probs_pipe = cross_val_predict(best_model, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
p2, r2, th2 = precision_recall_curve(y_train, oof_probs_pipe)
f1s2 = 2 * p2[:-1] * r2[:-1] / (p2[:-1] + r2[:-1] + 1e-12)
best_th_pipe = th2[np.argmax(f1s2)]
print(f"Best threshold (train, by F1, Pipeline): {best_th_pipe:.6f}")

# Fit + evaluate
best_model.fit(X_train, y_train)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_prob >= best_th_pipe).astype(int)

accuracy = accuracy_score(y_test, y_pred)
recall   = recall_score(y_test, y_pred)
f1_pipe  = f1_score(y_test, y_pred)

print("K-Nearest Neighbors (Pipeline) Results (threshold-optimized)")
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall:   {recall:.3f}")
print(f"F1_pipe Score: {f1_pipe:.3f}\n")



Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best hyperparams (KNN): {'metric': 'euclidean', 'n_neighbors': 13, 'weights': 'uniform'}
Best threshold (train, by F1, KNN): 0.538462
K-Nearest Neighbors Results (threshold-optimized)
Accuracy: 0.750
Recall:   0.964
F1_plain Score: 0.844

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters (Pipeline+KNN): {'feature_union__pca__n_components': 3, 'feature_union__select_best__k': 8, 'knn__metric': 'manhattan', 'knn__n_neighbors': 7, 'knn__weights': 'uniform'}
Best threshold (train, by F1, Pipeline): 0.285714
K-Nearest Neighbors (Pipeline) Results (threshold-optimized)
Accuracy: 0.729
Recall:   1.000
F1_pipe Score: 0.838



xgboost

In [18]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_recall_curve
import numpy as np

# XGBoost: grid-search by F1, tune the decision threshold on out-of-fold training
# predictions, then score the hold-out set and print its confusion matrix.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent xgboost releases -
# confirm against the installed version before relying on it.
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42,
    use_label_encoder=False
)

param_grid = {
    'n_estimators': [30, 50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    verbose=1,
    n_jobs=-1
)
grid_search_xgb.fit(X_train, y_train)
print("Best hyperparams:", grid_search_xgb.best_params_)

best_xgb = grid_search_xgb.best_estimator_

# Out-of-fold probabilities of class 1 on the training set, using the same CV splitter.
oof_probs = cross_val_predict(best_xgb, X_train, y_train, cv=cv, method="predict_proba")[:, 1]
p, r, th = precision_recall_curve(y_train, oof_probs)
# p and r hold one more element than th, so the last point is dropped to align them;
# the 1e-12 term guards against division by zero.
f1s = 2 * p[:-1] * r[:-1] / (p[:-1] + r[:-1] + 1e-12)
best_th = th[np.argmax(f1s)]
print(f"Best threshold (train, by F1): {best_th:.6f}")

# Fit on the full training set, then apply the tuned threshold to the hold-out probabilities.
best_xgb.fit(X_train, y_train)
y_pred_prob_xgb = best_xgb.predict_proba(X_test)[:, 1]
y_pred_xgb = (y_pred_prob_xgb >= best_th).astype(int)

accuracy = accuracy_score(y_test, y_pred_xgb)
recall   = recall_score(y_test, y_pred_xgb)
f1       = f1_score(y_test, y_pred_xgb)

print("XGBoost Results (threshold-optimized)")
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall:   {recall:.3f}")
print(f"F1 Score: {f1:.3f}\n")
# Rows are the true classes and columns the predicted classes, in label order 0, 1.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best hyperparams: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Best threshold (train, by F1): 0.511748
XGBoost Results (threshold-optimized)
Accuracy: 0.746
Recall:   0.958
F1 Score: 0.841

Confusion Matrix:
[[ 18  54]
 [  7 161]]


In [19]:
# Collect the hold-out F1 score of every tuned model for a side-by-side comparison.
results = {
    "Logistic Regression": f1_score(y_test, y_pred_log),
    "Decision Tree": f1_score(y_test, y_pred_dt),
    "Random Forest": f1_score(y_test, y_pred_rf),
    "SVM": f1_score(y_test, y_pred_svm),
    # KNN reports the better of its two variants (plain vs. PCA/SelectKBest pipeline).
    # NOTE(review): that choice is made on the hold-out scores themselves, so the KNN
    # figure is optimistic compared with the other rows.
    "KNN": max(f1_plain, f1_pipe),
    "XGBoost": f1_score(y_test, y_pred_xgb),
}

# Print the summary. The loop variables are deliberately not called `model` / `f1`:
# those names already hold the KNN pipeline and the last model's F1 score, and a
# top-level loop in a notebook would overwrite them.
for model_name, model_f1 in results.items():
    print(f"{model_name:20s}: F1 Score = {model_f1:.3f}")


Logistic Regression : F1 Score = 0.834
Decision Tree       : F1 Score = 0.849
Random Forest       : F1 Score = 0.866
SVM                 : F1 Score = 0.856
KNN                 : F1 Score = 0.844
XGBoost             : F1 Score = 0.841


In [20]:
# Export step (the part that would live in something like train.py).
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Persist the forest selected by the grid search rather than a default one: a bare
# RandomForestClassifier() ignores the tuned hyper-parameters and, with no random_state,
# yields a different model on every run.
model = RandomForestClassifier(random_state=42, **grid_search_rf.best_params_)
model.fit(X_train, y_train)

# Save the trained model.
# NOTE(review): the pickle contains only the classifier. The scalers fitted inside
# scale_numeric and the tuned decision threshold are not saved, so a consumer of
# model.pkl has to reproduce the preprocessing and gets the default cut-off from predict().
joblib.dump(model, "model.pkl")


['model.pkl']