In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [6]:
from catboost import CatBoostClassifier
# Load the training data and carve out a stratified 20% hold-out test set.
df = pd.read_csv('train.csv')

# Target first, then the feature matrix: the `id`, `shares` and `y` columns
# are removed so that only the remaining columns are used as predictors.
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])

# Stratifying on y keeps the class proportions equal in both partitions;
# the fixed seed makes the split reproducible.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [7]:
# Preprocessing
# Categorical features are imputed and one-hot encoded; numeric features are
# imputed and standardised.
cat_cols = ['data_channel', 'weekday']

# Pick up numeric columns of any width (not only int64/float64) and keep the
# categorical columns out of the numeric branch, so a numerically coded
# categorical is never both scaled and one-hot encoded.
num_cols = [
    col
    for col in X.select_dtypes(include='number').columns
    if col not in cat_cols
]

numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # fill gaps with the column mean
    ('scaler', StandardScaler())
])
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' stops transform from raising when a CV fold or
    # the hold-out set contains a category that was absent during fitting.
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Columns listed in neither group are dropped (ColumnTransformer's default
# remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

In [9]:
# Model definition, 5-fold cross-validation and final hold-out evaluation.
def _composite(accuracy, f1_value, roc_auc):
    """Return the unweighted mean of accuracy, F1 and ROC AUC.

    Works element-wise, so it accepts scalars as well as per-fold arrays.
    """
    return (accuracy + f1_value + roc_auc) / 3


# Preprocessing and the CatBoost classifier are chained so that the
# preprocessing is re-fitted inside every CV fold.
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', CatBoostClassifier(
        depth=6,
        iterations=1000,
        learning_rate=0.05,
        random_state=42,
        verbose=0,
    )),
])

# Stratified 5-fold CV on the train/validation partition only.
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(pipe, X_trainval, y_trainval, cv=cv, scoring=scoring)

fold_acc = cv_results['test_accuracy']
fold_f1 = cv_results['test_f1']
fold_auc = cv_results['test_roc_auc']
fold_comp = _composite(fold_acc, fold_f1, fold_auc)

print("5-Fold CV (CatBoost)")
for fold, (acc, f1, auc, comp) in enumerate(zip(fold_acc, fold_f1, fold_auc, fold_comp), 1):
    print(f"[Fold {fold}] Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Composite: {comp:.4f}")
print("\n평균 Composite Score:", fold_comp.mean())

# Final hold-out test: refit on the whole train/validation partition and
# score once on the untouched test set.
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]  # probability of the second class (column 1)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
comp = _composite(acc, f1, auc)

print(
    "\n최종 Holdout Test (CatBoost)",
    f"Accuracy : {acc:.4f}",
    f"F1 Score : {f1:.4f}",
    f"ROC AUC  : {auc:.4f}",
    f"Composite: {comp:.4f}",
    sep="\n",
)

5-Fold CV (CatBoost)
[Fold 1] Accuracy: 0.6523, F1: 0.6423, AUC: 0.7201, Composite: 0.6716
[Fold 2] Accuracy: 0.6543, F1: 0.6513, AUC: 0.7160, Composite: 0.6739
[Fold 3] Accuracy: 0.6456, F1: 0.6442, AUC: 0.7005, Composite: 0.6634
[Fold 4] Accuracy: 0.6655, F1: 0.6625, AUC: 0.7216, Composite: 0.6832
[Fold 5] Accuracy: 0.6436, F1: 0.6366, AUC: 0.7070, Composite: 0.6624

평균 Composite Score: 0.6709066417040125

최종 Holdout Test (CatBoost)
Accuracy : 0.6703
F1 Score : 0.6665
ROC AUC  : 0.7262
Composite: 0.6877
