In [2]:
# !pip install interpret

# 1) 라이브러리 임포트 및 설정
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Explainable Boosting Machine
from interpret.glassbox import ExplainableBoostingClassifier

# 2) Load the data
# Read the training table, then separate the target from the predictors.
df = pd.read_csv('train.csv')

# 'y' is the label to predict. 'id' and 'shares' are excluded from the
# feature matrix along with the label itself.
# NOTE(review): 'shares' is presumably the raw quantity that 'y' was derived
# from (which would make it a leak) -- confirm against the data description.
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])

# 3) Split into a train/validation portion and a hold-out test portion
# 20% of the rows are held out for the final check. Stratifying on y keeps
# the class ratio the same in both portions, and the fixed seed makes the
# split repeatable.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)

# 4) Preprocessing pipeline
# Columns that are one-hot encoded rather than scaled.
cat_cols = ['data_channel', 'weekday']

# Every numeric column except the categorical ones.
# - Selecting on the generic 'number' dtype (instead of only int64/float64)
#   also picks up other numeric dtypes such as int32 or float32, which would
#   otherwise be silently dropped by the ColumnTransformer below.
# - Excluding cat_cols keeps an integer-coded categorical column from being
#   routed through both the scaling branch and the one-hot branch.
num_cols = [
    col for col in X.select_dtypes(include='number').columns
    if col not in cat_cols
]

# Numeric branch: fill gaps with the column mean, then standardize.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode with the first level dropped as the reference.
# handle_unknown='ignore' makes a level that was absent from the fitting data
# (possible for rare levels inside a CV fold) encode as all zeros -- i.e. the
# same as the reference level -- instead of raising at transform time.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(
        drop='first',
        handle_unknown='ignore',
        sparse_output=False,
    )),
])

# Columns listed in neither group are not passed on to the model
# (ColumnTransformer's default is remainder='drop').
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols),
])

# 5) Model pipeline (Explainable Boosting Machine)
# The classifier is configured on its own, then chained behind the
# preprocessing step so that both are fitted together on every fit call.
ebm = ExplainableBoostingClassifier(
    n_jobs=-1,
    random_state=42,
    learning_rate=0.01,
    interactions=10,
)

pipe = Pipeline(steps=[('pre', preprocessor), ('clf', ebm)])

# 6) 5-fold cross-validation
# Stratified, shuffled folds with a fixed seed. Every fold is scored on
# accuracy, F1 and ROC AUC.
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): both cross_validate and the classifier inside `pipe` request
# n_jobs=-1; worth checking that this nesting does not oversubscribe the CPU.
cv_results = cross_validate(
    pipe, X_trainval, y_trainval, scoring=scoring, cv=cv, n_jobs=-1
)

# Per-fold score arrays. The composite is the unweighted mean of the three.
acc = cv_results['test_accuracy']
f1 = cv_results['test_f1']
auc = cv_results['test_roc_auc']
comp = (acc + f1 + auc) / 3

print("5-Fold CV 결과")
# Walk the four arrays in lockstep so each fold's numbers print on one line.
for i, (a, f, r, c) in enumerate(zip(acc, f1, auc, comp), start=1):
    print(f"[Fold {i}] Accuracy: {a:.4f}, F1: {f:.4f}, AUC: {r:.4f}, Composite: {c:.4f}")
print(f"\n평균 Composite Score: {comp.mean():.4f}")

# 7) Hold-out test
# Refit the whole pipeline on the full train/validation portion, then score
# the rows that were set aside by the earlier split.
pipe.fit(X_trainval, y_trainval)

# Column 1 of predict_proba is used as the score for ROC AUC
# (assumes the positive class is listed second -- confirm via classes_).
y_prob = pipe.predict_proba(X_test)[:, 1]
y_pred = pipe.predict(X_test)

# Same three metrics and unweighted composite as in cross-validation.
auc_te = roc_auc_score(y_test, y_prob)
f1_te = f1_score(y_test, y_pred)
acc_te = accuracy_score(y_test, y_pred)
comp_te = (acc_te + f1_te + auc_te) / 3

print("\nHold-out Test 결과")
print(f"Accuracy : {acc_te:.4f}")
print(f"F1 Score : {f1_te:.4f}")
print(f"ROC AUC  : {auc_te:.4f}")
print(f"Composite: {comp_te:.4f}")


5-Fold CV 결과
[Fold 1] Accuracy: 0.6633, F1: 0.6545, AUC: 0.7211, Composite: 0.6796
[Fold 2] Accuracy: 0.6532, F1: 0.6482, AUC: 0.7081, Composite: 0.6698
[Fold 3] Accuracy: 0.6405, F1: 0.6363, AUC: 0.7017, Composite: 0.6595
[Fold 4] Accuracy: 0.6579, F1: 0.6573, AUC: 0.7196, Composite: 0.6783
[Fold 5] Accuracy: 0.6447, F1: 0.6400, AUC: 0.7002, Composite: 0.6617

평균 Composite Score: 0.6698

Hold-out Test 결과
Accuracy : 0.6617
F1 Score : 0.6565
ROC AUC  : 0.7210
Composite: 0.6797
