In [7]:
import numpy as np
import pandas as pd

# from plotly.offline import init_notebook_mode, iplot, plot
# init_notebook_mode(connected=True)


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix, f1_score, recall_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler


import warnings
warnings.filterwarnings("ignore")


In [8]:
import platform, sklearn, numpy as np, pandas as pd, scipy, joblib

# Report the interpreter and library versions this notebook is executed with.
version_report = [
    ("python :", platform.python_version()),
    ("sklearn:", sklearn.__version__),
    ("numpy  :", np.__version__),
    ("scipy  :", scipy.__version__),
    ("pandas :", pd.__version__),
    ("joblib :", joblib.__version__),
]
for label, version in version_report:
    print(label, version)

python : 3.11.13
sklearn: 1.2.2
numpy  : 1.26.4
scipy  : 1.11.4
pandas : 2.1.4
joblib : 1.5.2


In [9]:
# Load the training split. The forward slash keeps the relative path portable and
# avoids the invalid "\g" escape sequence that the backslash form contained.
data = pd.read_csv("data/german_credit_train.csv")
# "Unnamed: 0" is presumably a row index that was written into the CSV; it is
# removed so that it is not used as a feature.
data = data.drop(columns=["Unnamed: 0"])

data.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,28,male,2,own,,moderate,1887,18,business,good
1,35,male,2,own,,,1979,15,radio/TV,good
2,20,female,2,own,rich,moderate,1577,11,furniture/equipment,good
3,24,female,1,own,little,little,626,12,radio/TV,bad
4,55,male,2,own,little,little,6872,24,furniture/equipment,bad


## 파생변수

In [10]:
# Derived feature: credit amount divided by the loan duration.
data["Monthly payment"] = data["Credit amount"].div(data["Duration"])

In [11]:
# Duration is only needed to derive "Monthly payment", so it is removed afterwards.
# Rebinding instead of inplace=True, together with errors="ignore", lets this cell
# be re-run without raising KeyError once the column is already gone.
data = data.drop(columns=["Duration"], errors="ignore")

## 결측치 Unknown 처리

In [12]:
# Variant that keeps missing account information as its own "Unknown" category.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which newer pandas versions deprecate and
# which does not modify `data` under Copy-on-Write.
for account_col in ("Saving accounts", "Checking account"):
    data[account_col] = data[account_col].fillna("Unknown")

## saving 라벨인코딩

In [13]:
categoric_vars_list = ["Sex", "Job", "Housing", "Saving accounts", "Checking account", "Purpose", "Risk"]
numeric_vars_list = ["Age", "Credit amount"]

# Encode on a copy so that `data` keeps the original category labels.
df = data.copy()

# Job: integer codes from LabelEncoder.
le_job = LabelEncoder()
df["Job"] = le_job.fit_transform(df["Job"])

# Saving accounts: explicit ordinal codes following the order of this list.
# NOTE(review): "Unknown" receives the highest code (above "rich") — confirm that
# this ordering is intended.
saving_order = ["little", "moderate", "quite rich", "rich", "Unknown"]
saving_map = {val: idx for idx, val in enumerate(saving_order)}

saving_codes = df["Saving accounts"].map(saving_map)
# Series.map yields NaN for any value that is not a key of saving_map (including
# missing values that were never filled). Fail here with a clear message instead of
# letting NaN flow into the model.
unmapped_rows = saving_codes.isna()
if unmapped_rows.any():
    unexpected_values = df.loc[unmapped_rows, "Saving accounts"].unique().tolist()
    raise ValueError(
        f"Unexpected 'Saving accounts' values not covered by saving_order: {unexpected_values}"
    )
df["Saving accounts"] = saving_codes

# Remaining categorical columns are one-hot encoded. Job and Saving accounts are
# already numeric and Risk is the target, so they are excluded. "Checking account"
# is one-hot encoded as well rather than mapped to ordinal codes.
categorical_for_ohe = [col for col in categoric_vars_list if col not in ["Job", "Saving accounts", "Risk"]]

# drop_first=True removes the first level of every encoded column.
df = pd.get_dummies(df, columns=categorical_for_ohe, drop_first=True)

df.head()


Unnamed: 0,Age,Job,Saving accounts,Credit amount,Risk,Monthly payment,Sex_male,Housing_own,Housing_rent,Checking account_little,Checking account_moderate,Checking account_rich,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,28,2,4,1887,good,104.833333,True,True,False,False,True,False,False,False,False,False,False,False,False
1,35,2,4,1979,good,131.933333,True,True,False,False,False,False,False,False,False,False,True,False,False
2,20,2,3,1577,good,143.363636,False,True,False,False,True,False,False,False,False,True,False,False,False
3,24,1,0,626,bad,52.166667,False,True,False,True,False,False,False,False,False,False,True,False,False
4,55,2,0,6872,bad,286.333333,True,True,False,True,False,False,False,False,False,True,False,False,False


## 데이터 분할 비율

## 수치형 변수 스케일링 (Duration은 제거되어 robust 스케일링 미적용)

In [14]:
# Binary target: 1 when Risk is "good", 0 otherwise.
y = df["Risk"].eq("good").astype(int)
X = df.drop(columns=["Risk"])

# Stratified hold-out split: test_size=0.3 keeps 70% of the rows for training and
# 30% for testing.
# NOTE(review): the earlier comment described this as an 8:2 split, which does not
# match test_size=0.3 — confirm which ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# Scaling helper for the numeric feature columns.
# Duration is not handled here: no Duration scaling is performed by this function.

def scale_numeric(X_train, X_test):
    """Standardise the numeric columns using statistics from the training split only.

    Every scaler is fitted on ``X_train`` and then applied to both splits, so no
    information from ``X_test`` enters the scaling parameters.

    * ``Age``: StandardScaler.
    * ``Credit amount``: ``np.log1p`` followed by StandardScaler.
    * ``Monthly payment``: StandardScaler, only when the column is present in
      ``X_train``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames that contain at least ``Age`` and ``Credit amount``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_test_scaled)``. These are new frames; the inputs are
        left unmodified.

    Raises
    ------
    KeyError
        If ``Age`` or ``Credit amount`` is missing from either frame.
    """
    # Work on copies so the caller's frames are never mutated.
    X_train = X_train.copy()
    X_test = X_test.copy()

    def _standardize(column, transform=None):
        """Fit a StandardScaler on the training column and apply it to both splits."""
        # Cast to float first: the scaled values are floats, and replacing the whole
        # column avoids writing floats into an integer column in place.
        train_col = X_train[[column]].astype("float64")
        test_col = X_test[[column]].astype("float64")
        if transform is not None:
            train_col = transform(train_col)
            test_col = transform(test_col)
        scaler = StandardScaler().fit(train_col)
        X_train[column] = np.asarray(scaler.transform(train_col)).ravel()
        X_test[column] = np.asarray(scaler.transform(test_col)).ravel()

    _standardize("Age")
    # The log transform is applied before standardising the credit amount.
    _standardize("Credit amount", transform=np.log1p)
    if "Monthly payment" in X_train.columns:
        _standardize("Monthly payment")

    return X_train, X_test



# Apply the scaling helper to copies of the split frames and keep the scaled
# results under the same names.
scaled_train, scaled_test = scale_numeric(X_train.copy(), X_test.copy())
X_train, X_test = scaled_train, scaled_test


-------------------------------------------

여기부터 모델

-------------------------------------------

랜덤포레스트

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, precision_recall_curve
import numpy as np

# Base estimator with a fixed seed.
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid explored by the search.
param_grid = dict(
    n_estimators=[31, 32, 33, 34],
    max_depth=[6, 7, 8],
    min_samples_split=[6, 7, 8, 9],
    min_samples_leaf=[4, 5, 6],
)

# The same stratified, shuffled 5-fold splitter is used for the grid search and for
# the out-of-fold probabilities below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search_rf = GridSearchCV(rf, param_grid, scoring='f1', cv=cv, n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)

best_rf = grid_search_rf.best_estimator_
print("Best hyperparams:", grid_search_rf.best_params_)

# Out-of-fold probabilities of class 1 on the training data, used to choose the
# decision threshold that maximises F1.
oof_proba = cross_val_predict(best_rf, X_train, y_train, cv=cv, method="predict_proba")
oof_probs = oof_proba[:, 1]
p, r, th = precision_recall_curve(y_train, oof_probs)
# The last precision/recall entry is dropped so both arrays line up with `th`;
# the small constant guards against division by zero.
precision_at_th, recall_at_th = p[:-1], r[:-1]
f1s = 2 * precision_at_th * recall_at_th / (precision_at_th + recall_at_th + 1e-12)
best_th = th[f1s.argmax()]
print(f"Best threshold (train, by F1): {best_th:.6f}")

# Fit on the full training split, then classify the test split with the chosen threshold.
best_rf.fit(X_train, y_train)
y_pred_prob_rf = best_rf.predict_proba(X_test)[:, 1]
above_threshold = y_pred_prob_rf >= best_th
y_pred_rf = above_threshold.astype(int)

accuracy = accuracy_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)

print("Random Forest Results (threshold-optimized)")
print(f"Accuracy: {accuracy:.3f}")
print(f"Recall:   {recall:.3f}")
print(f"F1 Score: {f1:.3f}\n")


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best hyperparams: {'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 9, 'n_estimators': 32}
Best threshold (train, by F1): 0.543868
Random Forest Results (threshold-optimized)
Accuracy: 0.792
Recall:   0.958
F1 Score: 0.866



In [16]:
# train.py 같은 파일
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Persist the tuned forest that was fitted and evaluated in the previous cell.
# Fitting a fresh RandomForestClassifier() here would save a model with default
# hyper-parameters and no random_state, i.e. neither the evaluated model nor a
# reproducible one.
model = best_rf

# Save the trained model.
# NOTE: the reported metrics were computed with the custom cut-off `best_th` on
# predict_proba; model.predict() on the loaded object does not apply that threshold.
joblib.dump(model, "model.pkl")


['model.pkl']