In [4]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

# 1) Load the data and carve out a stratified 20% hold-out set.
df = pd.read_csv('train.csv')
y = df['y']
# Every column except 'id', 'shares' and the label 'y' is used as a feature.
X = df.drop(columns=['id', 'shares', 'y'])
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, **split_options)

# 2) Preprocessing pipeline
cat_cols = ['data_channel', 'weekday']
# Numeric features are the int64/float64 columns minus anything listed as
# categorical: a column routed to both branches would be scaled *and*
# one-hot encoded, feeding the same signal to the models twice.
num_cols = [
    col
    for col in X.select_dtypes(include=['int64', 'float64']).columns
    if col not in cat_cols
]

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])
# Categorical branch: fill missing values with the mode, then one-hot encode.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes a category that was absent from the
    # fitting data as all zeros instead of raising at transform time, which
    # could otherwise abort a CV fold or the hold-out evaluation.
    ('ohe',     OneHotEncoder(drop='first', handle_unknown='ignore',
                              sparse_output=False))
])
# Columns not named in num_cols or cat_cols are dropped (ColumnTransformer's
# default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# 3) Soft-voting ensemble: CatBoost, L1-regularized logistic regression, MLP
estimators = [
    ('cb',  CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        random_state=42,
        verbose=0,
        # The ensemble is fitted in parallel (n_jobs=-1 here and again in
        # cross-validation), so several CatBoost fits can run at once.
        # Disabling file output keeps them from all writing into the same
        # default training directory.
        allow_writing_files=False
    )),
    ('lr_l1',  LogisticRegression(
        penalty='l1',
        solver='saga',
        max_iter=1000,
        C=1.0,
        random_state=42
    )),
    ('mlp', MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation='relu',
        solver='adam',
        learning_rate_init=0.001,
        max_iter=800,
        early_stopping=True,
        random_state=42
    ))
]
# Soft voting averages the predicted class probabilities; the weights give
# CatBoost twice the influence of each of the other two models.
voting_clf = VotingClassifier(
    estimators=estimators,
    voting='soft',
    weights=[2, 1, 1],
    n_jobs=-1
)

# 4) Full pipeline: preprocessing followed by the voting ensemble.
pipeline_steps = [('pre', preprocessor), ('clf', voting_clf)]
pipe = Pipeline(steps=pipeline_steps)

# 5) 5-fold stratified cross-validation on the train/validation portion.
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    pipe,
    X_trainval,
    y_trainval,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
)

# Composite score = unweighted mean of the three metrics, one value per fold.
fold_composites = (
    cv_results['test_accuracy']
    + cv_results['test_f1']
    + cv_results['test_roc_auc']
) / 3

print("5-Fold CV (Voting: CatBoost, L1-LR, MLP)")
fold_rows = zip(
    cv_results['test_accuracy'],
    cv_results['test_f1'],
    cv_results['test_roc_auc'],
    fold_composites,
)
for i, (acc, f1, auc, comp) in enumerate(fold_rows, start=1):
    print(f"[Fold {i}] Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Composite: {comp:.4f}")
print("\n평균 Composite Score:", fold_composites.mean())

# 6) Final hold-out evaluation: refit on the full train/validation data,
#    then score once on the untouched test split.
pipe.fit(X_trainval, y_trainval)
# Second column of predict_proba is used as the score for ROC AUC.
y_prob = pipe.predict_proba(X_test)[:, 1]
y_pred = pipe.predict(X_test)

acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
# Same composite as in cross-validation: unweighted mean of the three metrics.
comp = (acc + f1 + auc) / 3

print("\n최종 Holdout Test (Voting: CatBoost, L1-LR, MLP)")
for label, value in (
    ("Accuracy :", acc),
    ("F1 Score :", f1),
    ("ROC AUC  :", auc),
    ("Composite:", comp),
):
    print(f"{label} {value:.4f}")


5-Fold CV (Voting: CatBoost, L1-LR, MLP)
[Fold 1] Accuracy: 0.6698, F1: 0.6634, AUC: 0.7267, Composite: 0.6866
[Fold 2] Accuracy: 0.6560, F1: 0.6548, AUC: 0.7141, Composite: 0.6749
[Fold 3] Accuracy: 0.6523, F1: 0.6547, AUC: 0.7031, Composite: 0.6701
[Fold 4] Accuracy: 0.6639, F1: 0.6606, AUC: 0.7203, Composite: 0.6816
[Fold 5] Accuracy: 0.6441, F1: 0.6300, AUC: 0.7041, Composite: 0.6594

평균 Composite Score: 0.6745208116839306

최종 Holdout Test (Voting: CatBoost, L1-LR, MLP)
Accuracy : 0.6696
F1 Score : 0.6677
ROC AUC  : 0.7266
Composite: 0.6880
