In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

# 1) Load the data and split off a hold-out set
# `id`, `shares` and the label `y` are excluded from the feature matrix;
# 20% of the rows are held out, stratified on the label.
df = pd.read_csv('train.csv')
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 2) Preprocessing pipeline
# The categorical columns are named explicitly; every remaining int64/float64
# column is treated as numeric. The categorical names are filtered out of the
# numeric list so that an integer-coded `data_channel` / `weekday` cannot be
# routed through both branches (scaled AND one-hot encoded).
cat_cols = ['data_channel', 'weekday']
num_cols = [
    col
    for col in X.select_dtypes(include=['int64', 'float64']).columns
    if col not in cat_cols
]

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode with the first level dropped.
# handle_unknown='ignore' makes a level that was absent from the fitted data
# encode as all zeros (scikit-learn emits a warning) instead of raising during
# transform on a CV validation fold or on the hold-out set.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Columns named in neither list are dropped (ColumnTransformer's default
# remainder='drop').
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# 3) Voting ensemble: LR + CatBoost
lr_model = LogisticRegression(max_iter=200, random_state=42)
cb_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_state=42,
    verbose=0,
)
estimators = [('lr', lr_model), ('cb', cb_model)]

# Soft voting combines predicted class probabilities; with weights [1, 2]
# the CatBoost probabilities count twice as much as the LR ones.
voting_clf = VotingClassifier(
    estimators=estimators,
    voting='soft',
    weights=[1, 2],
    n_jobs=-1,
)

# 4) Full pipeline: preprocessing followed by the voting classifier
pipe = Pipeline(steps=[('pre', preprocessor), ('clf', voting_clf)])

# 5) 5-fold stratified cross-validation on the train/validation portion
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'roc_auc']
cv_results = cross_validate(
    pipe,
    X_trainval,
    y_trainval,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
)

# Composite score per fold = unweighted mean of accuracy, F1 and ROC AUC.
fold_acc = cv_results['test_accuracy']
fold_f1 = cv_results['test_f1']
fold_auc = cv_results['test_roc_auc']
fold_comp = (fold_acc + fold_f1 + fold_auc) / 3

print("5-Fold CV (Voting LR + CatBoost)")
for i, (acc, f1, auc, comp) in enumerate(zip(fold_acc, fold_f1, fold_auc, fold_comp), start=1):
    print(f"[Fold {i}] Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Composite: {comp:.4f}")
print("\n평균 Composite Score:", fold_comp.mean())

# 6) Final hold-out test: refit on the full train/validation portion and
#    score once on the held-out split
pipe.fit(X_trainval, y_trainval)
y_prob = pipe.predict_proba(X_test)[:, 1]  # second column of predict_proba
y_pred = pipe.predict(X_test)

acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
# Same composite as in the CV report: mean of the three metrics.
comp = (acc + f1 + auc) / 3

print(
    "\n최종 Holdout Test (Voting LR + CatBoost)",
    f"Accuracy : {acc:.4f}",
    f"F1 Score : {f1:.4f}",
    f"ROC AUC  : {auc:.4f}",
    f"Composite: {comp:.4f}",
    sep="\n",
)


5-Fold CV (Voting LR + CatBoost)
[Fold 1] Accuracy: 0.6667, F1: 0.6594, AUC: 0.7265, Composite: 0.6842
[Fold 2] Accuracy: 0.6557, F1: 0.6503, AUC: 0.7149, Composite: 0.6736
[Fold 3] Accuracy: 0.6520, F1: 0.6499, AUC: 0.7059, Composite: 0.6693
[Fold 4] Accuracy: 0.6661, F1: 0.6623, AUC: 0.7212, Composite: 0.6832
[Fold 5] Accuracy: 0.6464, F1: 0.6397, AUC: 0.7072, Composite: 0.6644

평균 Composite Score: 0.6749413226906857

최종 Holdout Test (Voting LR + CatBoost)
Accuracy : 0.6673
F1 Score : 0.6604
ROC AUC  : 0.7286
Composite: 0.6854


In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# 모델 및 스태킹
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier

# 1) Load the data and split off a hold-out set
# Features are every column except `id`, `shares` and the label `y`;
# the 80/20 split is stratified on the label with a fixed seed.
df = pd.read_csv('train.csv')
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 2) Preprocessing definition
# The categorical columns are named explicitly; every remaining int64/float64
# column is treated as numeric. The categorical names are filtered out of the
# numeric list so that an integer-coded `data_channel` / `weekday` cannot be
# routed through both branches (scaled AND one-hot encoded).
cat_cols = ['data_channel', 'weekday']
num_cols = [
    col
    for col in X.select_dtypes(include=['int64', 'float64']).columns
    if col not in cat_cols
]

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode with the first level dropped.
# handle_unknown='ignore' makes a level that was absent from the fitted data
# encode as all zeros (scikit-learn emits a warning) instead of raising during
# transform on a CV validation fold or on the hold-out set.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Columns named in neither list are dropped (ColumnTransformer's default
# remainder='drop').
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# 3) Stacking model definition: LR and CatBoost as base learners,
#    a logistic regression as the final (meta) estimator
base_estimators = [
    ('lr', LogisticRegression(max_iter=200, random_state=42)),
    ('cb', CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        random_state=42,
        verbose=0,
    )),
]
# NOTE(review): the meta-learner uses default settings, unlike the base LR
# above (max_iter=200, random_state=42) — confirm this is intentional.
meta_learner = LogisticRegression()
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_learner,
    cv=5,
    # passthrough=True: the final estimator is trained on the input features
    # as well as on the base models' predictions
    passthrough=True,
    n_jobs=-1,
)

# 4) Full pipeline: preprocessing followed by the stacking classifier
pipe = Pipeline(steps=[('pre', preprocessor), ('clf', stacking_clf)])

# 5) 5-fold stratified cross-validation on the train/validation portion
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy','f1','roc_auc']
cv_results = cross_validate(
    pipe,
    X_trainval,
    y_trainval,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
)

# Composite score per fold = unweighted mean of accuracy, F1 and ROC AUC.
fold_acc = cv_results['test_accuracy']
fold_f1 = cv_results['test_f1']
fold_auc = cv_results['test_roc_auc']
fold_comp = (fold_acc + fold_f1 + fold_auc) / 3

print("5-Fold CV (Stacked LR + CatBoost)")
for i, (acc, f1, auc, comp) in enumerate(zip(fold_acc, fold_f1, fold_auc, fold_comp), start=1):
    print(f"[Fold {i}] Acc: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Comp: {comp:.4f}")
print("\nMean Composite:", fold_comp.mean())

# 6) Final hold-out test: refit on the full train/validation portion and
#    score once on the held-out split
pipe.fit(X_trainval, y_trainval)
y_prob = pipe.predict_proba(X_test)[:,1]  # second column of predict_proba
y_pred = pipe.predict(X_test)

acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
# Same composite as in the CV report: mean of the three metrics.
comp = (acc + f1 + auc) / 3

print(
    "\nHoldout Test (Stacked LR + CatBoost)",
    f"Accuracy : {acc:.4f}",
    f"F1 Score : {f1:.4f}",
    f"ROC AUC  : {auc:.4f}",
    f"Composite: {comp:.4f}",
    sep="\n",
)


5-Fold CV (Stacked LR + CatBoost)
[Fold 1] Acc: 0.6650, F1: 0.6571, AUC: 0.7238, Comp: 0.6820
[Fold 2] Acc: 0.6532, F1: 0.6472, AUC: 0.7151, Comp: 0.6718
[Fold 3] Acc: 0.6523, F1: 0.6488, AUC: 0.7056, Comp: 0.6689
[Fold 4] Acc: 0.6627, F1: 0.6589, AUC: 0.7231, Comp: 0.6816
[Fold 5] Acc: 0.6450, F1: 0.6390, AUC: 0.7075, Comp: 0.6638

Mean Composite: 0.6736176178595361

Holdout Test (Stacked LR + CatBoost)
Accuracy : 0.6698
F1 Score : 0.6613
ROC AUC  : 0.7273
Composite: 0.6861
