# 0. 패키지 임포트

In [225]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.initializers import Constant

# 1. 데이터 준비

In [226]:
df = pd.read_csv('cleaned_scf_data.csv')

In [227]:
target_cols = ['CDS', 'NMMF', 'STOCKS', 'RETQLIQ','MMMF']
for col in target_cols:
    df[col] = (df[col] > 0).astype(int)

y = df[target_cols]

In [228]:
numerical_cols = ['연령','자녀수','급여소득','사업농업소득','자본이득소득']
categorical_cols = ['교육수준','결혼상태','직업분류1','금융위험감수']

X = df[numerical_cols + categorical_cols]

In [229]:
preprocessor = ColumnTransformer(transformers=[('num', StandardScaler(), numerical_cols),
                                               ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)])

In [230]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [231]:
initial_biases = []
for col in target_cols:
    pos = np.sum(y_train[col] == 1)
    neg = np.sum(y_train[col] == 0)
    initial_bias = np.log([pos / (neg + 1e-7)])
    initial_biases.append(initial_bias[0])

print("\n--- 각 상품별 초기 편향 설정값 ---")
for i, col in enumerate(target_cols):
    print(f"{col:<15}: {initial_biases[i]:.4f}")


--- 각 상품별 초기 편향 설정값 ---
CDS            : -2.4741
NMMF           : -1.3704
STOCKS         : -0.8888
RETQLIQ        : 0.3847
MMMF           : -3.1217


In [232]:
custom_class_weights = {}

for col in target_cols:
    pos_ratio = (y_train[col] == 1).sum() / len(y_train[col])
    
    if pos_ratio < 0.05:  # MMMF
        weights = compute_class_weight('balanced', 
                                      classes=np.unique(y_train[col]), 
                                      y=y_train[col])
        weights = np.sqrt(weights)  # 극단값 완화
        
    elif pos_ratio < 0.2:  # CDS, NMMF
        weights = compute_class_weight('balanced', 
                                      classes=np.unique(y_train[col]), 
                                      y=y_train[col])
        weights = weights * 0.7  # 약간 완화
        
    else:  # STOCKS, RETQLIQ
        weights = compute_class_weight('balanced', 
                                      classes=np.unique(y_train[col]), 
                                      y=y_train[col])
    
    custom_class_weights[col] = dict(zip(np.unique(y_train[col]), weights))
    print(f"{col}: ratio={pos_ratio:.3f}, weights={custom_class_weights[col]}")  # 전체 출력

CDS: ratio=0.078, weights={0: 0.37948324681453516, 1: 4.504901960784314}
NMMF: ratio=0.203, weights={0: 0.627004161833936, 1: 2.468439430566747}
STOCKS: ratio=0.291, weights={0: 0.7055662188099808, 1: 1.7161531279178337}
RETQLIQ: ratio=0.595, weights={0: 1.2345513164965072, 1: 0.840343818580834}
MMMF: ratio=0.042, weights={0: 0.7225236642168399, 1: 3.4413345529248733}


# 2. 모델 학습

In [233]:
from keras.layers import BatchNormalization

model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_processed.shape[1],)),
    BatchNormalization(),  # 추가
    Dropout(0.25),  # 0.3 -> 0.25
    Dense(128, activation='relu'),
    BatchNormalization(),  # 추가
    Dropout(0.25),  # 0.3 -> 0.25
    Dense(64, activation='relu'),
    Dropout(0.2),  # 0.3 -> 0.2
    Dense(len(target_cols), activation='sigmoid', bias_initializer=Constant(initial_biases))
])

In [234]:
sample_weights = np.ones((len(y_train), len(target_cols)))
for i, col in enumerate(target_cols):
    for j, class_val in enumerate(y_train[col]):
        sample_weights[j, i] = custom_class_weights[col][class_val]

# 평균 대신 조화평균 사용
epsilon = 1e-10
final_sample_weights = len(target_cols) / np.sum(1.0 / (sample_weights + epsilon), axis=1)

# 극단값 제한
final_sample_weights = np.clip(final_sample_weights, 0.5, 3.0)

In [235]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_30 (Dense)            (None, 256)               7424      
                                                                 
 batch_normalization (BatchN  (None, 256)              1024      
 ormalization)                                                   
                                                                 
 dropout_22 (Dropout)        (None, 256)               0         
                                                                 
 dense_31 (Dense)            (None, 128)               32896     
                                                                 
 batch_normalization_1 (Batc  (None, 128)              512       
 hNormalization)                                                 
                                                                 
 dropout_23 (Dropout)        (None, 128)              

In [236]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(X_train_processed, 
                    y_train.values, 
                    epochs=50, 
                    batch_size=32, 
                    validation_split=0.2, 
                    verbose=1, 
                    callbacks=[early_stopping],
                    sample_weight=final_sample_weights)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# 3. 모델 성능 평가

In [349]:
loss, accuracy, auc = model.evaluate(X_test_processed, y_test)



In [350]:
y_pred_proba = model.predict(X_test_processed)



In [351]:
y_pred = (y_pred_proba > 0.5).astype(int)

In [352]:
# 각 상품별 최적 임계값 설정
optimal_thresholds = {
    'CDS': 0.25,     # 0.25 -> 0.20
    'NMMF': 0.4,    # 0.50 -> 0.45  
    'STOCKS': 0.45,  # 0.50 -> 0.45
    'RETQLIQ': 0.50, # 그대로
    'MMMF': 0.3     # 0.50 -> 0.15 (희귀 클래스)
}

for i, col in enumerate(target_cols):
    y_pred[:, i] = (y_pred_proba[:, i] >= optimal_thresholds[col]).astype(int)

In [353]:
report = classification_report(y_test, y_pred, target_names=target_cols)
print(report)

              precision    recall  f1-score   support

         CDS       0.40      0.31      0.35       364
        NMMF       0.62      0.63      0.63       881
      STOCKS       0.69      0.63      0.66      1334
     RETQLIQ       0.80      0.88      0.84      2641
        MMMF       0.51      0.27      0.35       187

   micro avg       0.72      0.72      0.72      5407
   macro avg       0.60      0.54      0.56      5407
weighted avg       0.71      0.72      0.71      5407
 samples avg       0.47      0.46      0.44      5407



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [354]:
print("각 금융상품별 ROC AUC 점수")
for i, col in enumerate(target_cols):
    auc_score = roc_auc_score(y_test.iloc[:, i], y_pred_proba[:, i])
    print(f"{col:<15}: {auc_score:.4f}")

각 금융상품별 ROC AUC 점수
CDS            : 0.8274
NMMF           : 0.8814
STOCKS         : 0.8708
RETQLIQ        : 0.8895
MMMF           : 0.8647


In [355]:
print("샘플 데이터 예측 테스트")
# 첫 번째 테스트 데이터로 예측 수행
for j in range(10):
    sample_input = X_test_processed[j:j+1]
    sample_prediction_proba = model.predict(sample_input)[0]

    print("\n예측 결과 (각 상품별 가입 확률):")

    for i, col in enumerate(target_cols):
        print(f"{col:<15}: {sample_prediction_proba[i]*100:.2f}%")

샘플 데이터 예측 테스트

예측 결과 (각 상품별 가입 확률):
CDS            : 1.69%
NMMF           : 6.14%
STOCKS         : 28.44%
RETQLIQ        : 72.89%
MMMF           : 0.32%

예측 결과 (각 상품별 가입 확률):
CDS            : 16.14%
NMMF           : 12.97%
STOCKS         : 30.94%
RETQLIQ        : 56.38%
MMMF           : 4.01%

예측 결과 (각 상품별 가입 확률):
CDS            : 2.89%
NMMF           : 92.66%
STOCKS         : 68.02%
RETQLIQ        : 90.59%
MMMF           : 2.76%

예측 결과 (각 상품별 가입 확률):
CDS            : 4.70%
NMMF           : 15.20%
STOCKS         : 40.15%
RETQLIQ        : 83.91%
MMMF           : 6.82%

예측 결과 (각 상품별 가입 확률):
CDS            : 25.56%
NMMF           : 38.45%
STOCKS         : 75.24%
RETQLIQ        : 96.18%
MMMF           : 6.87%

예측 결과 (각 상품별 가입 확률):
CDS            : 3.21%
NMMF           : 7.95%
STOCKS         : 40.22%
RETQLIQ        : 48.68%
MMMF           : 3.56%

예측 결과 (각 상품별 가입 확률):
CDS            : 4.18%
NMMF           : 4.72%
STOCKS         : 15.42%
RETQLIQ        : 51.64%
MMMF           : 0.15%

예측 결과 