In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
accepted = pd.read_csv('accepted.csv')

selected_cols = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade',
    'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'purpose', 'dti', 'fico_range_low', 'fico_range_high',
    'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'loan_status'
]

accepted_selected = accepted[selected_cols]


# =====================[ IQR 이상치 제거 함수 ]=====================
def remove_outliers_iqr(accepted_selected, cols):
    for col in cols:
        Q1 = accepted_selected[col].quantile(0.25)
        Q3 = accepted_selected[col].quantile(0.75)
        IQR = Q3 - Q1
        mask = (accepted_selected[col] >= Q1 - 1.5*IQR) & (accepted_selected[col] <= Q3 + 1.5*IQR)
        accepted_selected = accepted_selected[mask]
    return accepted_selected.reset_index(drop=True)


iqr_cols = ['loan_amnt', 'installment', 'annual_inc', 'dti',
            'open_acc', 'revol_bal', 'revol_util', 'total_acc']


accepted_filtered = remove_outliers_iqr(accepted_selected, iqr_cols)

# 컬럼별 허용 값 범위로 필터링
accepted_filtered = accepted_filtered[
    (accepted_filtered['int_rate'] >= 6) & (accepted_filtered['int_rate'] <= 31) &
    (accepted_filtered['dti'] >= 0) & (accepted_filtered['dti'] <= 40) &
    (accepted_filtered['annual_inc'] > 0) &
    (accepted_filtered['revol_util'] >= 0) & (accepted_filtered['revol_util'] <= 100) &
    (accepted_filtered['annual_inc'] > 0)
]

# 'emp_length' 결측값을 'Unknown'으로 대체
accepted_filtered['emp_length'] = accepted_filtered['emp_length'].fillna('Unknown')

# 'term' 컬럼에서 숫자만 추출하여 정수형으로 변환 ('36 months' → 36)
accepted_filtered['term'] = accepted_filtered['term'].astype(str).str.extract(r'(\d+)').astype(int)

# 'grade' 등급을 문자에서 숫자 코드(0~6)로 변환 ('grade' ML 시 졔외)
grade_map = {'A':6, 'B':5, 'C':4, 'D':3, 'E':2, 'F':1, 'G':0}
accepted_filtered['grade_map'] = accepted_filtered['grade'].map(grade_map)

# 'sub_grade' 등급을 문자에서 숫자 코드(0~34)로 변환 ('sub_grade' ML 시 졔외)
import re

accepted_filtered['sub_grade'] = accepted_filtered['sub_grade'].astype(str)

def map_subgrade(sg):
   try:
       sg = str(sg).strip().upper()  
       match = re.match(r'^([A-G])([1-5])$', sg) 
       if match:
           letter = match.group(1)
           number = int(match.group(2))
           return grade_map[letter] * 5 + (5 - number)
       else:
           return None 
   except:
       return None

accepted_filtered['sub_grade_map'] = accepted_filtered['sub_grade'].apply(map_subgrade)

# emp_length 문자에서 숫자 코드(0~10)로 변환 ('emp_length' ML 시 졔외)
emp_length_map = {
   '10+ years': 10,
   '9 years': 9,
   '8 years': 8,
   '7 years': 7,
   '6 years': 6,
   '5 years': 5,
   '4 years': 4,
   '3 years': 3,
   '2 years': 2,
   '1 year': 1,
   '< 1 year': 0.5,
   'n/a': 0
}


accepted_filtered['emp_length_map'] = accepted_filtered['emp_length'].map(emp_length_map)

# =====================[ log 변환 (skewed 수치형) ]=====================

# 왜도(skew)가 큰 수치형 변수에 로그 변환 적용('annual_inc', 'revol_bal' ML 시 제외)
accepted_filtered['annual_inc_log'] = np.log1p(accepted_filtered['annual_inc'])
accepted_filtered['revol_bal_log'] = np.log1p(accepted_filtered['revol_bal'])



accepted_filtered['fico_mean'] = (accepted_filtered['fico_range_low'] + accepted_filtered['fico_range_high']) / 2

# 레이블 인코딩 딕셔너리 예시 (이진 분류)
loan_status_map = {
   'Fully Paid': 1,
   'Current': 1,   # 또는 np.nan으로 두고 제거할 수도 있음
   'Charged Off': 0,
   'Late (31-120 days)': 0,
   'Late (16-30 days)': 0,
   'In Grace Period': 0,
   'Does not meet the credit policy. Status:Fully Paid': 1,
   'Does not meet the credit policy. Status:Charged Off': 0
}
# 'accepted_filtered'을 이용할 때
accepted_filtered['loan_status_f'] = accepted_filtered['loan_status'].map(loan_status_map)


  accepted = pd.read_csv('accepted.csv')


In [6]:
cols_ml = [
   'int_rate', 'installment', 'purpose', 'dti', 'open_acc', 'revol_util',
   'total_acc', 'emp_length_map', 'annual_inc_log', 'revol_bal_log', 'fico_mean',
   'term', 'home_ownership', 'verification_status', 'purpose', 'loan_status_f'
]

accepted_ml = accepted_filtered[cols_ml]

# 결측값이 포함된 행 전체 제거
accepted_ml = accepted_ml.dropna().reset_index(drop=True)

In [7]:
# 반드시 한번만 실행(여러번 실행 했을 시 accepted_ml 등장할 때부터 다시 돌리기)
accepted_ml = pd.get_dummies(
   accepted_ml,
   columns=['term', 'home_ownership', 'verification_status', 'purpose'],
   drop_first=True,
   dtype=int
)

accepted_ml = accepted_ml.loc[:, ~accepted_ml.columns.duplicated()]

In [10]:
# 데이터 분할
df = accepted_ml.copy()  
X = df.drop(columns='loan_status_f')
y = df['loan_status_f']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state=42)

# dtype 정제
y_train = y_train.astype(int).reset_index(drop=True)
X_train = X_train.reset_index(drop=True)
X_valid = X_valid.reset_index(drop=True)

In [11]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_res.value_counts())

Before SMOTE: loan_status_f
1    1145449
0     182891
Name: count, dtype: int64
After SMOTE: loan_status_f
1    1145449
0    1145449
Name: count, dtype: int64


In [12]:
from imblearn.under_sampling import RandomUnderSampler


rus = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)