In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline

In [2]:
accepted = pd.read_csv('accepted.csv')

selected_cols = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
    'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'purpose', 'dti', 'fico_range_low', 'fico_range_high',
    'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'loan_status'
]

accepted_selected = accepted[selected_cols]


# =====================[ IQR 이상치 제거 함수 ]=====================
def remove_outliers_iqr(accepted_selected, cols):
    for col in cols:
        Q1 = accepted_selected[col].quantile(0.25)
        Q3 = accepted_selected[col].quantile(0.75)
        IQR = Q3 - Q1
        mask = (accepted_selected[col] >= Q1 - 1.5*IQR) & (accepted_selected[col] <= Q3 + 1.5*IQR)
        accepted_selected = accepted_selected[mask]
    return accepted_selected.reset_index(drop=True)


iqr_cols = ['loan_amnt', 'installment', 'annual_inc', 'dti',
            'open_acc', 'revol_bal', 'revol_util', 'total_acc']


accepted_filtered = remove_outliers_iqr(accepted_selected, iqr_cols)

# 컬럼별 허용 값 범위로 필터링
accepted_filtered = accepted_filtered[
    (accepted_filtered['int_rate'] >= 6) & (accepted_filtered['int_rate'] <= 31) &
    (accepted_filtered['dti'] >= 0) & (accepted_filtered['dti'] <= 40) &
    (accepted_filtered['annual_inc'] > 0) &
    (accepted_filtered['revol_util'] >= 0) & (accepted_filtered['revol_util'] <= 100) &
    (accepted_filtered['annual_inc'] > 0)
]

# 'emp_length' 결측값을 'Unknown'으로 대체
accepted_filtered['emp_length'] = accepted_filtered['emp_length'].fillna('Unknown')

# 'term' 컬럼에서 숫자만 추출하여 정수형으로 변환 ('36 months' → 36)
accepted_filtered['term'] = accepted_filtered['term'].astype(str).str.extract(r'(\d+)').astype(int)

# 'grade' 등급을 문자에서 숫자 코드(0~6)로 변환 ('grade' ML 시 졔외)
grade_map = {'A':6, 'B':5, 'C':4, 'D':3, 'E':2, 'F':1, 'G':0}
accepted_filtered['grade_map'] = accepted_filtered['grade'].map(grade_map)

# 'sub_grade' 등급을 문자에서 숫자 코드(0~34)로 변환 ('sub_grade' ML 시 졔외)
import re

accepted_filtered['sub_grade'] = accepted_filtered['sub_grade'].astype(str)

def map_subgrade(sg):
   try:
       sg = str(sg).strip().upper()  
       match = re.match(r'^([A-G])([1-5])$', sg) 
       if match:
           letter = match.group(1)
           number = int(match.group(2))
           return grade_map[letter] * 5 + (5 - number)
       else:
           return None 
   except:
       return None

accepted_filtered['sub_grade_map'] = accepted_filtered['sub_grade'].apply(map_subgrade)

# emp_length 문자에서 숫자 코드(0~10)로 변환 ('emp_length' ML 시 졔외)
emp_length_map = {
   '10+ years': 10,
   '9 years': 9,
   '8 years': 8,
   '7 years': 7,
   '6 years': 6,
   '5 years': 5,
   '4 years': 4,
   '3 years': 3,
   '2 years': 2,
   '1 year': 1,
   '< 1 year': 0.5,
   'n/a': 0
}


accepted_filtered['emp_length_map'] = accepted_filtered['emp_length'].map(emp_length_map)

# =====================[ log 변환 (skewed 수치형) ]=====================

# 왜도(skew)가 큰 수치형 변수에 로그 변환 적용('annual_inc', 'revol_bal' ML 시 제외)
accepted_filtered['annual_inc_log'] = np.log1p(accepted_filtered['annual_inc'])
accepted_filtered['revol_bal_log'] = np.log1p(accepted_filtered['revol_bal'])



accepted_filtered['fico_mean'] = (accepted_filtered['fico_range_low'] + accepted_filtered['fico_range_high']) / 2

# 레이블 인코딩 딕셔너리 예시 (이진 분류)
loan_status_map = {
   'Fully Paid': 1,
   'Current': 1,   # 또는 np.nan으로 두고 제거할 수도 있음
   'Charged Off': 0,
   'Late (31-120 days)': 0,
   'Late (16-30 days)': 0,
   'In Grace Period': 0,
   'Does not meet the credit policy. Status:Fully Paid': 1,
   'Does not meet the credit policy. Status:Charged Off': 0
}
# 'accepted_filtered'을 이용할 때
accepted_filtered['loan_status_f'] = accepted_filtered['loan_status'].map(loan_status_map)


  accepted = pd.read_csv('accepted.csv')


In [31]:
cols_ml = [
   'int_rate', 'installment', 'purpose', 'dti', 'open_acc', 'revol_util',
   'total_acc', 'emp_length_map', 'annual_inc_log', 'revol_bal_log', 'fico_mean',
   'term', 'home_ownership', 'verification_status', 'purpose', 'loan_status_f'
]

accepted_ml = accepted_filtered[cols_ml]

In [32]:
# 결측값이 포함된 행 전체 제거
accepted_ml = accepted_ml.dropna().reset_index(drop=True)

In [33]:
# 반드시 한번만 실행(여러번 실행 했을 시 accepted_ml 등장할 때부터 다시 돌리기)
accepted_ml = pd.get_dummies(
   accepted_ml,
   columns=['term', 'home_ownership', 'verification_status', 'purpose'],
   drop_first=True,
   dtype=int
)

accepted_ml = accepted_ml.loc[:, ~accepted_ml.columns.duplicated()]

In [34]:
# 데이터 분할
df = accepted_ml.copy()  
X = df.drop(columns='loan_status_f')
y = df['loan_status_f']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state=42)

# dtype 정제
y_train = y_train.astype(int).reset_index(drop=True)
X_train = X_train.reset_index(drop=True)
X_valid = X_valid.reset_index(drop=True)

In [35]:
# SMOTE 적용
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_res.value_counts())

Before SMOTE: loan_status_f
1    1145449
0     182891
Name: count, dtype: int64
After SMOTE: loan_status_f
1    1145449
0    1145449
Name: count, dtype: int64


In [8]:
# 2-1. SMOTE 적용 로지스틱 회귀

# StandardScaler로 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_valid_scaled = scaler.transform(X_valid)

# 1. 모델 정의
logistic_model = LogisticRegression(max_iter=1000, random_state=42)

# 2. 모델 학습
logistic_model.fit(X_train_scaled, y_train_res)

# 3. 검증 데이터 예측
y_pred = logistic_model.predict(X_valid_scaled)
y_proba = logistic_model.predict_proba(X_valid_scaled)[:, 1]  # 클래스 1에 대한 확률

# 4. 평가 지표 출력
print("✅ Accuracy:", accuracy_score(y_valid, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_valid, y_pred))
print("🎯 ROC AUC Score:", roc_auc_score(y_valid, y_proba))

✅ Accuracy: 0.8247858229067859

📊 Confusion Matrix:
 [[  5703  39821]
 [ 18365 268196]]

📄 Classification Report:
               precision    recall  f1-score   support

         0.0       0.24      0.13      0.16     45524
         1.0       0.87      0.94      0.90    286561

    accuracy                           0.82    332085
   macro avg       0.55      0.53      0.53    332085
weighted avg       0.78      0.82      0.80    332085

🎯 ROC AUC Score: 0.6331357439699553


In [9]:
# 2-2. SMOTE 적용 랜덤포레스트

# 1. 모델 정의
randomforest_model = RandomForestClassifier(
    n_estimators=100,     
    max_depth=None,    
    random_state=42,     
    n_jobs=-1           
)

# 2. 모델 학습
randomforest_model.fit(X_train_res, y_train_res)

# 3. 검증 데이터 예측
y_pred = randomforest_model.predict(X_valid)
y_proba = randomforest_model.predict_proba(X_valid_scaled)[:, 1]  # 클래스 1에 대한 확률

# 4. 성능 평가
print("✅ Accuracy:", accuracy_score(y_valid, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_valid, y_pred))
print("🎯 ROC AUC Score:", roc_auc_score(y_valid, y_proba))



✅ Accuracy: 0.8492584729813151

📊 Confusion Matrix:
 [[  2080  43444]
 [  6615 279946]]

📄 Classification Report:
               precision    recall  f1-score   support

         0.0       0.24      0.05      0.08     45524
         1.0       0.87      0.98      0.92    286561

    accuracy                           0.85    332085
   macro avg       0.55      0.51      0.50    332085
weighted avg       0.78      0.85      0.80    332085

🎯 ROC AUC Score: 0.5266414845106044


In [10]:
# 2-3. SMOTE 적용 XGBOOST

# 1. 모델 정의
xgb_model = XGBClassifier(
    n_estimators=100,        
    learning_rate=0.1,      
    max_depth=6,              
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1                
)


# 2. 모델 학습
xgb_model.fit(X_train_res, y_train_res)


# 3. 예측
y_pred = xgb_model.predict(X_valid)
y_proba = xgb_model.predict_proba(X_valid_scaled)[:, 1]  # 클래스 1에 대한 확률

# 4. 평가 결과 출력
print("✅ Accuracy:", accuracy_score(y_valid, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_valid, y_pred))
print("🎯 ROC AUC Score:", roc_auc_score(y_valid, y_proba))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Accuracy: 0.8483550898113434

📊 Confusion Matrix:
 [[  1611  43913]
 [  6446 280115]]

📄 Classification Report:
               precision    recall  f1-score   support

         0.0       0.20      0.04      0.06     45524
         1.0       0.86      0.98      0.92    286561

    accuracy                           0.85    332085
   macro avg       0.53      0.51      0.49    332085
weighted avg       0.77      0.85      0.80    332085

🎯 ROC AUC Score: 0.530892824515436


In [11]:
# 2-4. SMOTE 적용 LIGHT GBM

# 1. 모델 정의
lgbm_model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)

# 2. 모델 학습
lgbm_model.fit(X_train_res, y_train_res)

# 3. 예측
y_pred = lgbm_model.predict(X_valid)
y_proba = lgbm_model.predict_proba(X_valid_scaled)[:, 1]  # 클래스 1에 대한 확률

# 4. 평가
print("✅ Accuracy:", accuracy_score(y_valid, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_valid, y_pred))
print("🎯 ROC AUC Score:", roc_auc_score(y_valid, y_proba))

[LightGBM] [Info] Number of positive: 1145449, number of negative: 1145449
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020778 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2584
[LightGBM] [Info] Number of data points in the train set: 2290898, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




✅ Accuracy: 0.8516313594411069

📊 Confusion Matrix:
 [[  1313  44211]
 [  5060 281501]]

📄 Classification Report:
               precision    recall  f1-score   support

         0.0       0.21      0.03      0.05     45524
         1.0       0.86      0.98      0.92    286561

    accuracy                           0.85    332085
   macro avg       0.54      0.51      0.49    332085
weighted avg       0.77      0.85      0.80    332085

🎯 ROC AUC Score: 0.5141773599106432


In [12]:
# LR 하이퍼파라미터 튜닝

# 파이프라인 구성
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

# GridSearch에 적용
lr_grid = GridSearchCV(
    lr_pipeline,
    {
        'clf__C': [0.01, 0.1, 1, 10],
        'clf__penalty': ['l2'],
        'clf__class_weight': ['balanced', None],
        'clf__solver': ['liblinear', 'lbfgs']
    },
    scoring='f1',
    cv=3,
    n_jobs=-1
)

lr_grid.fit(X_train_res, y_train_res)

# 예측 및 평가
y_pred = lr_grid.best_estimator_.predict(X_valid)
y_proba = lr_grid.best_estimator_.predict_proba(X_valid)[:, 1]

from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(y_valid, y_pred))
print("ROC AUC Score:", roc_auc_score(y_valid, y_proba))


              precision    recall  f1-score   support

         0.0       0.24      0.13      0.16     45524
         1.0       0.87      0.94      0.90    286561

    accuracy                           0.82    332085
   macro avg       0.55      0.53      0.53    332085
weighted avg       0.78      0.82      0.80    332085

ROC AUC Score: 0.6331058485346761


In [13]:
# RandomForest 하이퍼파라미터 튜닝

rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [6, 10, None],
    'class_weight': ['balanced', 'balanced_subsample']
}


rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    rf_param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1
)
rf_grid.fit(X_train_res, y_train_res)


y_pred = rf_grid.best_estimator_.predict(X_valid)
y_proba = rf_grid.best_estimator_.predict_proba(X_valid)[:, 1]
print(classification_report(y_valid, y_pred))
print("ROC AUC Score:", roc_auc_score(y_valid, y_proba))


KeyboardInterrupt: 

In [14]:
# XGBOOST 하이퍼파라미터 튜닝

neg, pos = np.bincount(y_train_res)
scale_pos_weight = neg / pos


xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1],
    'scale_pos_weight': [scale_pos_weight, scale_pos_weight*1.5]
}


xgb_grid = GridSearchCV(
    XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
    xgb_param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1
)
xgb_grid.fit(X_train_res, y_train_res)


y_pred = xgb_grid.best_estimator_.predict(X_valid)
y_proba = xgb_grid.best_estimator_.predict_proba(X_valid)[:, 1]
print(classification_report(y_valid, y_pred))
print("ROC AUC Score:", roc_auc_score(y_valid, y_proba))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


              precision    recall  f1-score   support

         0.0       0.16      0.01      0.01     45524
         1.0       0.86      0.99      0.92    286561

    accuracy                           0.86    332085
   macro avg       0.51      0.50      0.47    332085
weighted avg       0.77      0.86      0.80    332085

ROC AUC Score: 0.6915839849023464


In [16]:
# LightGBM 하이퍼파라미터 튜닝
# is_unbalance와 scale_pos_weight를 동시에 설정하면 안 됨 - 일반 실무선 scale_pos_weight 추천

neg, pos = np.bincount(y_train_res)
scale_pos_weight = neg / pos


lgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [6, 8, 10],
    'num_leaves': [15, 31, 63],
    'learning_rate': [0.01, 0.1],
    # 'is_unbalance': [True],   # <= 이 부분 제거!
    'scale_pos_weight': [scale_pos_weight, scale_pos_weight*1.5]
}


lgb_grid = GridSearchCV(
    LGBMClassifier(random_state=42, n_jobs=-1),
    lgb_param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1
)
lgb_grid.fit(X_train_res, y_train_res)


y_pred = lgb_grid.best_estimator_.predict(X_valid)
y_proba = lgb_grid.best_estimator_.predict_proba(X_valid)[:, 1]
print(classification_report(y_valid, y_pred))
print("ROC AUC Score:", roc_auc_score(y_valid, y_proba))

KeyboardInterrupt: 

CatBoost, Decision Tree, SVM (Support Vector Machine)

In [29]:
# catboost이므로 명목형 변수 숫자형 변수로 변환하지 않는 accepted_cat 만들기
cols_cat = [
  'int_rate', 'installment', 'purpose', 'dti', 'open_acc', 'revol_util',
  'total_acc', 'emp_length_map', 'annual_inc_log', 'revol_bal_log', 'fico_mean',
  'term', 'home_ownership', 'verification_status', 'loan_status_f'
]

accepted_cat = accepted_filtered[cols_cat]

# 결측값이 포함된 행 전체 제거
accepted_cat = accepted_cat.dropna().reset_index(drop=True)

# 데이터 분할
df = accepted_cat.copy()  
X = df.drop(columns='loan_status_f')
y = df['loan_status_f']

X_train2, X_valid2, y_train2, y_valid2 = train_test_split(X, y, test_size = 0.2, random_state=42)

# dtype 정제
y_train2 = y_train2.astype(int).reset_index(drop=True)
X_train2 = X_train2.reset_index(drop=True)
X_valid2 = X_valid2.reset_index(drop=True)


In [30]:
# 1. CatBoost
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# 범주형 컬럼
cat_cols = ['purpose', 'home_ownership', 'verification_status', 'term']

# 클래스 가중치 계산
classes = np.unique(y_train2)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train2)
class_weights = dict(zip(classes, weights))

# CatBoost 모델 정의
cat_model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    class_weights=class_weights,
    random_state=42,
    verbose=0
)

# 모델 학습
cat_model.fit(X_train2, y_train2, cat_features=cat_cols)

# 예측 및 평가
y_pred2 = cat_model.predict(X_valid2)
y_proba2 = cat_model.predict_proba(X_valid2)[:, 1]

print("CatBoost")
print(classification_report(y_valid2, y_pred2))
print("ROC AUC Score:", roc_auc_score(y_valid2, y_proba2))

CatBoost
              precision    recall  f1-score   support

         0.0       0.23      0.69      0.34     45524
         1.0       0.93      0.62      0.74    286561

    accuracy                           0.63    332085
   macro avg       0.58      0.66      0.54    332085
weighted avg       0.83      0.63      0.69    332085

ROC AUC Score: 0.7165524217071628


In [36]:
# 2. Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Decision Tree 모델 정의 및 학습
dt_model = DecisionTreeClassifier(
    max_depth=6,
    class_weight='balanced',  # 불균형 데이터일 경우
    random_state=42
)
dt_model.fit(X_train_res, y_train_res)

# 예측 및 평가
y_pred = dt_model.predict(X_valid)
y_proba = dt_model.predict_proba(X_valid)[:, 1]
print("Decision Tree")
print(classification_report(y_valid, y_pred))
print("ROC AUC Score:", roc_auc_score(y_valid, y_proba))


Decision Tree
              precision    recall  f1-score   support

         0.0       0.20      0.25      0.22     45524
         1.0       0.88      0.84      0.86    286561

    accuracy                           0.76    332085
   macro avg       0.54      0.54      0.54    332085
weighted avg       0.78      0.76      0.77    332085

ROC AUC Score: 0.6332740664506775


In [None]:
# 3. SVM (Support Vector Machine)
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

# 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_valid_scaled = scaler.transform(X_valid)

# SVM 모델 정의 및 학습
svm_model = SVC(
    C=1.0,           # 정규화 파라미터
    kernel='rbf',    # RBF(가우시안) 커널
    class_weight='balanced',
    probability=True, # predict_proba 사용을 위해 필요
    random_state=42
)
svm_model.fit(X_train_scaled, y_train_res)

# 예측 및 평가
y_pred = svm_model.predict(X_valid_scaled)
y_proba = svm_model.predict_proba(X_valid_scaled)[:, 1]
print("SVM")
print(classification_report(y_valid, y_pred))
print("ROC AUC Score:", roc_auc_score(y_valid, y_proba))


In [12]:
from imblearn.under_sampling import RandomUnderSampler


rus = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)