In [1]:
import warnings
# Ignore FutureWarning messages
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training data from Google Drive (Colab environment)
from google.colab import drive
drive.mount('/content/drive')
train_src = '/content/drive/MyDrive/Colab Notebooks/패턴인식/train.csv'
df = pd.read_csv(train_src)

# Target is the 'y' column; features are everything except 'id', 'shares' and 'y'
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])

# Hold out 20% of the rows as a stratified test set; the remaining 80%
# (train_val) is what the later cells use for cross-validation and fitting.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Mounted at /content/drive


In [3]:
# Column groups routed to the two preprocessing branches.
# cat_cols are one-hot encoded. They are explicitly excluded from num_cols so
# that an integer-coded categorical column is never *also* imputed/scaled as a
# numeric feature (ColumnTransformer would otherwise emit it twice).
cat_cols = ['data_channel', 'weekday']
num_cols = [
    col
    for col in X.select_dtypes(include=['int64', 'float64']).columns
    if col not in cat_cols
]

# Preprocessing pipelines
# Numeric branch: mean imputation followed by standardisation.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Categorical branch: most-frequent imputation followed by one-hot encoding.
# drop='first' removes one dummy column per feature.
# handle_unknown='ignore' keeps transform() from raising when a validation
# fold / test split contains a category that was absent during fit; such rows
# are encoded as all zeros (i.e. the same as the dropped first category).
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])
# Columns that are in neither list are dropped (ColumnTransformer's default
# remainder='drop').
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

In [4]:
# Full model: shared preprocessing followed by L1-penalised logistic regression
# (liblinear is used as the solver for the L1 penalty).
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    (
        'clf',
        LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000, random_state=42),
    ),
])

In [5]:
# 5-fold stratified cross-validation, run on the train_val split only
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'roc_auc']

cv_results = cross_validate(
    estimator=pipe,
    X=X_trainval,
    y=y_trainval,
    cv=cv,
    scoring=scoring,
)

# Per-fold score arrays (one entry per fold); the composite is the plain
# average of the three metrics.
cv_acc = cv_results['test_accuracy']
cv_f1 = cv_results['test_f1']
cv_auc = cv_results['test_roc_auc']
cv_comp = (cv_acc + cv_f1 + cv_auc) / 3

print("5-Fold CV (only on train_val)")
for fold, (fold_acc, fold_f1, fold_auc, fold_comp) in enumerate(
    zip(cv_acc, cv_f1, cv_auc, cv_comp), start=1
):
    print(f"[Fold {fold}] Accuracy: {fold_acc:.4f}, F1: {fold_f1:.4f}, AUC: {fold_auc:.4f}, Composite: {fold_comp:.4f}")
print("\n평균 Composite Score:", cv_comp.mean())

# Refit on the whole train_val split, then score once on the held-out test set
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
# Second predict_proba column is used as the score for ROC AUC
y_prob = pipe.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
comp = (acc + f1 + auc) / 3

print("\n최종 Holdout Test 성능")
for label, value in (
    ("Accuracy ", acc),
    ("F1 Score ", f1),
    ("ROC AUC  ", auc),
    ("Composite", comp),
):
    print(f"{label}: {value:.4f}")

5-Fold CV (only on train_val)
[Fold 1] Accuracy: 0.6489, F1: 0.6342, AUC: 0.6994, Composite: 0.6609
[Fold 2] Accuracy: 0.6320, F1: 0.6206, AUC: 0.6822, Composite: 0.6450
[Fold 3] Accuracy: 0.6343, F1: 0.6264, AUC: 0.6792, Composite: 0.6466
[Fold 4] Accuracy: 0.6320, F1: 0.6202, AUC: 0.6831, Composite: 0.6451
[Fold 5] Accuracy: 0.6174, F1: 0.6044, AUC: 0.6702, Composite: 0.6307

평균 Composite Score: 0.6456456039745565

최종 Holdout Test 성능
Accuracy : 0.6563
F1 Score : 0.6454
ROC AUC  : 0.6995
Composite: 0.6671
