In [4]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.neural_network import MLPClassifier

# Load the data, then carve off a stratified 20% hold-out test set.
df = pd.read_csv('train.csv')
y = df['y']
# Features exclude the row identifier, 'shares', and the target itself.
# NOTE(review): 'shares' is presumably dropped because y is derived from it
# (target leakage) - confirm against the dataset description.
X = df.drop(columns=['id', 'shares', 'y'])
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing pipeline
cat_cols = ['data_channel', 'weekday']

# Select every numeric dtype rather than only int64/float64: columns not
# listed in the ColumnTransformer are dropped (remainder='drop' is the
# default), so an int32/float32 column would otherwise vanish silently.
# Categorical columns are excluded explicitly so that a numerically coded
# category is never both scaled and one-hot encoded.
num_cols = [
    col
    for col in X.select_dtypes(include='number').columns
    if col not in cat_cols
]

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# Categorical branch: fill missing values with the mode, then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros (i.e. like the dropped first category) instead of raising, which
# keeps cross-validation folds and the hold-out set from crashing on a rare
# level that is absent from the training portion.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', sparse_output=False,
                              handle_unknown='ignore'))
])

# Route each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# Model pipeline: preprocessing followed by an MLP classifier.
# Two hidden layers (100 -> 50 units), ReLU activations, Adam optimizer.
# early_stopping=True holds out part of the training data internally and
# stops when the validation score stops improving, so max_iter is an upper
# bound rather than the expected number of epochs.
mlp = MLPClassifier(
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(100, 50),
    learning_rate_init=0.001,
    early_stopping=True,
    max_iter=800,
    random_state=42,
)
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', mlp),
])

# 5-fold stratified cross-validation on the train/validation portion.
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    pipe, X_trainval, y_trainval, scoring=scoring, cv=cv
)

# Composite score per fold: unweighted mean of accuracy, F1 and ROC AUC.
fold_acc = cv_results['test_accuracy']
fold_f1 = cv_results['test_f1']
fold_auc = cv_results['test_roc_auc']
fold_comp = (fold_acc + fold_f1 + fold_auc) / 3

print("5-Fold CV (MLP)")
for fold, (acc, f1, auc, comp) in enumerate(
        zip(fold_acc, fold_f1, fold_auc, fold_comp), start=1):
    print(f"[Fold {fold}] Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Composite: {comp:.4f}")
print("\n평균 Composite Score:", fold_comp.mean())

# Final hold-out evaluation: refit on the full train/validation data and
# score once on the untouched test split.
pipe.fit(X_trainval, y_trainval)
# Column 1 of predict_proba is the probability of the second class in
# pipe.classes_, which roc_auc_score needs as the positive-class score.
y_prob = pipe.predict_proba(X_test)[:, 1]
y_pred = pipe.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
# Same composite as in cross-validation: unweighted mean of the three metrics.
comp = (acc + f1 + auc) / 3

report = [
    "\n최종 Holdout Test (MLP)",
    f"Accuracy : {acc:.4f}",
    f"F1 Score : {f1:.4f}",
    f"ROC AUC  : {auc:.4f}",
    f"Composite: {comp:.4f}",
]
print("\n".join(report))


5-Fold CV (MLP)
[Fold 1] Accuracy: 0.6501, F1: 0.6460, AUC: 0.7077, Composite: 0.6679
[Fold 2] Accuracy: 0.6399, F1: 0.6454, AUC: 0.6920, Composite: 0.6591
[Fold 3] Accuracy: 0.6219, F1: 0.6429, AUC: 0.6747, Composite: 0.6465
[Fold 4] Accuracy: 0.6380, F1: 0.6311, AUC: 0.6957, Composite: 0.6549
[Fold 5] Accuracy: 0.6312, F1: 0.6001, AUC: 0.6810, Composite: 0.6374

평균 Composite Score: 0.6531739872245914

최종 Holdout Test (MLP)
Accuracy : 0.6444
F1 Score : 0.6558
ROC AUC  : 0.7027
Composite: 0.6676
