In [1]:
# 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Mount Google Drive at /content/drive so files stored there can be read from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Read the training table from the mounted Drive folder.
train_src = '/content/drive/MyDrive/Colab Notebooks/패턴인식/train.csv'
train_df = pd.read_csv(train_src)

# The target is the `y` column; the feature matrix is every column except
# `id`, `shares` and `y`.
y = train_df['y']
X = train_df.drop(columns=['id', 'shares', 'y'])

Mounted at /content/drive


In [2]:
# Columns handled as nominal categories (imputed, then one-hot encoded).
cat_cols = ['data_channel', 'weekday']

# Numeric feature columns, in their original order.
# - Selecting on the generic 'number' dtype also keeps narrower widths (int32, float32, ...)
#   that an exact ['int64', 'float64'] filter would leave out, and which the
#   ColumnTransformer below would then drop.
# - Columns listed in `cat_cols` are excluded so that a numerically coded category is not
#   scaled as a continuous feature in addition to being one-hot encoded.
#   NOTE(review): has no effect if the categorical columns are stored as strings.
num_cols = [
    col for col in X.select_dtypes(include='number').columns
    if col not in cat_cols
]

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# Categorical branch: fill missing values with the most frequent level, then one-hot encode.
# handle_unknown='ignore' encodes a level that was not present at fit time (e.g. a rare
# category missing from one training fold, or a new level in unseen data) as all zeros
# instead of raising an error during transform.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Apply each branch to its own column subset and concatenate the results.
# Columns in neither list are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols),
])

In [3]:
# Full model: the shared preprocessing step followed by an L1-penalised
# logistic regression. Solver, iteration cap and random seed are set explicitly.
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    (
        'clf',
        LogisticRegression(
            solver='liblinear',
            penalty='l1',
            random_state=42,
            max_iter=1000,
        ),
    ),
])

In [4]:
# Stratified 5-fold cross-validation with shuffled, seeded folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy','f1','roc_auc']

# Evaluate the full pipeline on every fold; only test-fold scores are collected.
cv_results = cross_validate(
    pipe, X, y,
    cv=cv,
    scoring=scoring,
    return_train_score=False
)

# Per-fold score arrays (one entry per fold). They carry a `cv_` prefix so they stay
# available after the cell has run: the scalar hold-out metrics computed further down
# use the names `acc`, `f1`, `auc` and `comp`, and would otherwise overwrite these
# arrays with plain floats.
cv_acc = cv_results['test_accuracy']
cv_f1 = cv_results['test_f1']
cv_auc = cv_results['test_roc_auc']
# Composite score: unweighted mean of the three metrics, computed per fold.
cv_comp = (cv_acc + cv_f1 + cv_auc) / 3

# Per-fold report.
print("Fold별 Accuracy  :", np.round(cv_acc, 4))
print("Fold별 F1 Score  :", np.round(cv_f1, 4))
print("Fold별 ROC AUC   :", np.round(cv_auc, 4))
print("Fold별 Composite :", np.round(cv_comp, 4), "\n")

# Averages over the folds.
print("Stratified k-fold cross-validation")
print(f"평균 Accuracy  : {cv_acc.mean():.4f}")
print(f"평균 F1 Score  : {cv_f1.mean():.4f}")
print(f"평균 ROC AUC   : {cv_auc.mean():.4f}")
print(f"평균 Composite : {cv_comp.mean():.4f}")

# Hold-out evaluation: a single stratified 80/20 split of the same data,
# with the pipeline fitted on the 80% part and scored on the remaining 20%.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
pipe.fit(X_tr, y_tr)

# Column 1 of predict_proba is used as the positive-class score for ROC AUC;
# the hard labels feed accuracy and F1.
y_prob = pipe.predict_proba(X_te)[:, 1]
y_pred = pipe.predict(X_te)

acc = accuracy_score(y_true=y_te, y_pred=y_pred)
f1 = f1_score(y_true=y_te, y_pred=y_pred)
auc = roc_auc_score(y_true=y_te, y_score=y_prob)
# Composite score: unweighted mean of the three metrics.
comp = (acc + f1 + auc) / 3

# One print call; sep="\n" puts each entry on its own line.
print(
    "\nHoldout cross-validation",
    f"Accuracy : {acc:.4f}",
    f"F1 Score : {f1:.4f}",
    f"ROC AUC  : {auc:.4f}",
    f"Composite: {comp:.4f}",
    sep="\n",
)

Fold별 Accuracy  : [0.6367 0.6486 0.6288 0.6426 0.634 ]
Fold별 F1 Score  : [0.6267 0.6396 0.618  0.6334 0.6168]
Fold별 ROC AUC   : [0.685  0.6952 0.6773 0.6913 0.6844]
Fold별 Composite : [0.6495 0.6611 0.6414 0.6558 0.6451] 

Stratified k-fold cross-validation
평균 Accuracy  : 0.6382
평균 F1 Score  : 0.6269
평균 ROC AUC   : 0.6866
평균 Composite : 0.6506

Holdout cross-validation
Accuracy : 0.6563
F1 Score : 0.6454
ROC AUC  : 0.6995
Composite: 0.6671
