In [3]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [4]:
# Colab helper used to reach files kept on Google Drive.
from google.colab import drive
# Gradient-boosting classifier used by the modelling cell.
from catboost import CatBoostClassifier

# Mount Google Drive; the training CSV is read from under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the training data from the mounted Drive folder.
data_path = '/content/drive/MyDrive/Colab Notebooks/패턴인식/'
train_csv_path = data_path + 'train.csv'
df = pd.read_csv(train_csv_path)

# Target label.
y = df['y']

# Features: everything except the row id, the label, and `shares`.
# NOTE(review): `shares` is presumably the raw quantity the label was derived
# from, which would make it a leak -- confirm against the dataset description.
X = df.drop(columns=['id', 'shares', 'y'])

# Disabled experiment: prune a handful of features before splitting.
# drop_cols = [
#     'n_non_stop_words',
#     'global_rate_positive_words', 'global_rate_negative_words',
#     'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04'
# ]
# X = X.drop(columns=[col for col in drop_cols if col in X.columns])

# Stratified 80/20 split: the 80% part feeds cross-validation and the final
# fit, the 20% part is kept aside as the holdout set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Preprocessing
# Columns that are one-hot encoded instead of scaled.
cat_cols = ['data_channel', 'weekday']
# Numeric columns are imputed and scaled. Anything listed in cat_cols is
# excluded so that a numerically coded categorical column cannot be routed
# through both branches of the ColumnTransformer.
num_cols = [
    col
    for col in X.select_dtypes(include=['int64', 'float64']).columns
    if col not in cat_cols
]

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Categorical branch: fill missing values with the mode, then one-hot encode.
# handle_unknown='ignore' keeps transform from raising when a category was not
# present in the data the encoder was fitted on (e.g. a CV training fold); such
# rows are encoded as all zeros, the same encoding as the dropped first category.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])
# Columns named in neither list are not passed on (ColumnTransformer default).
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

In [7]:
# Model definition: the shared preprocessor followed by a CatBoost classifier.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_state=42,
    verbose=0,
)
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', CatBoostClassifier(**catboost_params)),
])

# 5-fold stratified cross-validation on the train/validation portion.
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(pipe, X_trainval, y_trainval, scoring=scoring, cv=cv)

# Composite score per fold: unweighted mean of accuracy, F1 and ROC AUC.
composite_by_fold = (
    cv_results['test_accuracy'] + cv_results['test_f1'] + cv_results['test_roc_auc']
) / 3

print("5-Fold CV (CatBoost)")
fold_scores = zip(
    cv_results['test_accuracy'],
    cv_results['test_f1'],
    cv_results['test_roc_auc'],
    composite_by_fold,
)
for fold, (acc, f1, auc, comp) in enumerate(fold_scores, start=1):
    print(f"[Fold {fold}] Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Composite: {comp:.4f}")
print("\n평균 Composite Score:", composite_by_fold.mean())

# Final holdout evaluation: refit on all train/validation rows, score once on
# the held-out test split.
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
# Probability of the positive class (second column), needed for ROC AUC.
y_prob = pipe.predict_proba(X_test)[:, 1]

acc, f1, auc = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_prob),
)
comp = (acc + f1 + auc) / 3

print("\n최종 Holdout Test (CatBoost)")
print(f"Accuracy : {acc:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"ROC AUC  : {auc:.4f}")
print(f"Composite: {comp:.4f}")

5-Fold CV (CatBoost)
[Fold 1] Accuracy: 0.6636, F1: 0.6561, AUC: 0.7260, Composite: 0.6819
[Fold 2] Accuracy: 0.6481, F1: 0.6459, AUC: 0.7118, Composite: 0.6686
[Fold 3] Accuracy: 0.6394, F1: 0.6384, AUC: 0.7022, Composite: 0.6600
[Fold 4] Accuracy: 0.6608, F1: 0.6566, AUC: 0.7183, Composite: 0.6785
[Fold 5] Accuracy: 0.6501, F1: 0.6462, AUC: 0.7057, Composite: 0.6673

평균 Composite Score: 0.6712667965566722

최종 Holdout Test (CatBoost)
Accuracy : 0.6631
F1 Score : 0.6586
ROC AUC  : 0.7208
Composite: 0.6808
