In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
# 스태킹 앙상블
from sklearn.ensemble import StackingClassifier

# 1) Load the data and carve out a hold-out test set
df = pd.read_csv('train.csv')

# Label is the `y` column; features are every column except `id`, `shares` and `y`.
# NOTE(review): `shares` is presumably the raw quantity `y` was derived from
# (dropping it would then avoid target leakage) — confirm against the data dictionary.
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])

# 80/20 split, stratified on the label; the fixed seed keeps the split reproducible.
split_opts = dict(test_size=0.2, random_state=42, stratify=y)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, **split_opts)

# 2) Preprocessing pipeline
# Numeric features: every int64/float64 column of X.
# Categorical features: the two columns listed explicitly below.
# NOTE(review): assumes 'data_channel' and 'weekday' are stored as non-numeric
# columns; if they were integer-coded they would also land in num_cols — confirm.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = ['data_channel', 'weekday']

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode with the first level dropped.
# handle_unknown='ignore' makes the encoder robust to a level that is absent
# from the data it was fitted on (e.g. a rare level missing from one CV fold):
# such a value is encoded as all zeros instead of raising at transform time.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its branch. Columns in neither list are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# 3) Stacking ensemble definition
# Hyper-parameters of the MLP base learner, kept together for readability.
mlp_params = dict(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=800,
    early_stopping=True,
    random_state=42,
)

# Level-0 learners: a logistic regression and a two-hidden-layer MLP.
lr_base = LogisticRegression(max_iter=200, random_state=42)
mlp_base = MLPClassifier(**mlp_params)
base_estimators = [('lr', lr_base), ('mlp', mlp_base)]

# Level-1 learner: a default logistic regression. With passthrough=False it is
# trained only on the base learners' predictions, not on the original features.
stack_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    passthrough=False,
    n_jobs=-1,
)

# 4) Full pipeline: preprocessing followed by the stacked classifier.
# Keeping the preprocessor inside the pipeline means it is re-fitted on the
# training portion of every CV fold.
pipeline_steps = [('pre', preprocessor), ('clf', stack_clf)]
pipe = Pipeline(steps=pipeline_steps)

# 5) 5-fold stratified cross-validation on the train/validation portion
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    pipe, X_trainval, y_trainval,
    cv=cv, scoring=scoring, n_jobs=-1,
)

# Per-fold score arrays, plus the composite score: the unweighted mean of
# accuracy, F1 and ROC AUC for each fold.
fold_acc = cv_results['test_accuracy']
fold_f1 = cv_results['test_f1']
fold_auc = cv_results['test_roc_auc']
fold_comp = (fold_acc + fold_f1 + fold_auc) / 3

print("5-Fold CV (Stacked LR + MLP)")
for i, (acc, f1, auc, comp) in enumerate(zip(fold_acc, fold_f1, fold_auc, fold_comp), 1):
    print(f"[Fold {i}] Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Composite: {comp:.4f}")

# Mean composite score across the folds.
print("\n평균 Composite Score:", fold_comp.mean())

# 6) Final hold-out evaluation
# Refit on the whole train/validation portion, then score once on the test split.
pipe.fit(X_trainval, y_trainval)

# Column 1 of predict_proba is the second class in the fitted classes_.
# NOTE(review): presumably that is the positive label 1 — confirm the label encoding.
test_proba = pipe.predict_proba(X_test)
y_prob = test_proba[:, 1]
y_pred = pipe.predict(X_test)

acc, f1, auc = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_prob),
)
# Composite = unweighted mean of the three metrics, as in the CV report.
comp = (acc + f1 + auc) / 3

print("\n최종 Holdout Test (Stacked LR + MLP)")
print(
    f"Accuracy : {acc:.4f}",
    f"F1 Score : {f1:.4f}",
    f"ROC AUC  : {auc:.4f}",
    f"Composite: {comp:.4f}",
    sep="\n",
)


5-Fold CV (Stacked LR + MLP)
[Fold 1] Accuracy: 0.6548, F1: 0.6398, AUC: 0.7116, Composite: 0.6688
[Fold 2] Accuracy: 0.6405, F1: 0.6381, AUC: 0.6962, Composite: 0.6583
[Fold 3] Accuracy: 0.6320, F1: 0.6389, AUC: 0.6860, Composite: 0.6523
[Fold 4] Accuracy: 0.6470, F1: 0.6359, AUC: 0.6999, Composite: 0.6609
[Fold 5] Accuracy: 0.6278, F1: 0.5962, AUC: 0.6828, Composite: 0.6356

평균 Composite Score: 0.6551706607235743

최종 Holdout Test (Stacked LR + MLP)
Accuracy : 0.6579
F1 Score : 0.6598
ROC AUC  : 0.7088
Composite: 0.6755


In [4]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# 개별 모델
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
# 보팅 앙상블
from sklearn.ensemble import VotingClassifier

# 1) Load the data and carve out a hold-out test set
# Re-reads the same file with the same split settings as the stacking cell, so
# this cell runs on its own and both ensembles are scored on the same split.
df = pd.read_csv('train.csv')

# Label is the `y` column; features are every column except `id`, `shares` and `y`.
# NOTE(review): `shares` is presumably the raw quantity `y` was derived from — confirm.
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])

# 80/20 split, stratified on the label, with a fixed seed.
split_opts = dict(test_size=0.2, random_state=42, stratify=y)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, **split_opts)

# 2) Preprocessing pipeline (same design as in the stacking cell)
# Numeric features: every int64/float64 column of X.
# Categorical features: the two columns listed explicitly below.
# NOTE(review): assumes 'data_channel' and 'weekday' are stored as non-numeric
# columns; if they were integer-coded they would also land in num_cols — confirm.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = ['data_channel', 'weekday']

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode with the first level dropped.
# handle_unknown='ignore' makes the encoder robust to a level that is absent
# from the data it was fitted on (e.g. a rare level missing from one CV fold):
# such a value is encoded as all zeros instead of raising at transform time.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its branch. Columns in neither list are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# 3) Soft-voting ensemble definition
# Hyper-parameters of the MLP member, kept together for readability.
mlp_params = dict(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    max_iter=800,
    early_stopping=True,
    random_state=42,
)

# Ensemble members: a logistic regression and a two-hidden-layer MLP.
estimators = [
    ('lr', LogisticRegression(max_iter=200, random_state=42)),
    ('mlp', MLPClassifier(**mlp_params)),
]

# voting='soft' averages the members' predicted probabilities;
# weights=[1, 1] gives both members equal weight.
voting_opts = dict(voting='soft', weights=[1, 1], n_jobs=-1)
voting_clf = VotingClassifier(estimators=estimators, **voting_opts)

# 4) Full pipeline: preprocessing followed by the voting classifier.
# Keeping the preprocessor inside the pipeline means it is re-fitted on the
# training portion of every CV fold.
pipeline_steps = [('pre', preprocessor), ('clf', voting_clf)]
pipe = Pipeline(steps=pipeline_steps)

# 5) 5-fold stratified cross-validation on the train/validation portion
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    pipe, X_trainval, y_trainval,
    cv=cv, scoring=scoring, n_jobs=-1,
)

# Per-fold score arrays, plus the composite score: the unweighted mean of
# accuracy, F1 and ROC AUC for each fold.
fold_acc = cv_results['test_accuracy']
fold_f1 = cv_results['test_f1']
fold_auc = cv_results['test_roc_auc']
fold_comp = (fold_acc + fold_f1 + fold_auc) / 3

print("5-Fold CV (Voting LR + MLP)")
for i, (acc, f1, auc, comp) in enumerate(zip(fold_acc, fold_f1, fold_auc, fold_comp), 1):
    print(f"[Fold {i}] Accuracy: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, Composite: {comp:.4f}")

# Mean composite score across the folds.
print("\n평균 Composite Score:", fold_comp.mean())

# 6) Final hold-out evaluation
# Refit on the whole train/validation portion, then score once on the test split.
pipe.fit(X_trainval, y_trainval)

# Column 1 of predict_proba is the second class in the fitted classes_.
# NOTE(review): presumably that is the positive label 1 — confirm the label encoding.
test_proba = pipe.predict_proba(X_test)
y_prob = test_proba[:, 1]
y_pred = pipe.predict(X_test)

acc, f1, auc = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_prob),
)
# Composite = unweighted mean of the three metrics, as in the CV report.
comp = (acc + f1 + auc) / 3

print("\n최종 Holdout Test (Voting LR + MLP)")
print(
    f"Accuracy : {acc:.4f}",
    f"F1 Score : {f1:.4f}",
    f"ROC AUC  : {auc:.4f}",
    f"Composite: {comp:.4f}",
    sep="\n",
)


5-Fold CV (Voting LR + MLP)
[Fold 1] Accuracy: 0.6562, F1: 0.6458, AUC: 0.7114, Composite: 0.6711
[Fold 2] Accuracy: 0.6399, F1: 0.6388, AUC: 0.6960, Composite: 0.6583
[Fold 3] Accuracy: 0.6301, F1: 0.6360, AUC: 0.6858, Composite: 0.6506
[Fold 4] Accuracy: 0.6492, F1: 0.6382, AUC: 0.6994, Composite: 0.6623
[Fold 5] Accuracy: 0.6301, F1: 0.6056, AUC: 0.6825, Composite: 0.6394

평균 Composite Score: 0.6563370583333976

최종 Holdout Test (Voting LR + MLP)
Accuracy : 0.6586
F1 Score : 0.6592
ROC AUC  : 0.7088
Composite: 0.6755
