In [2]:
# 필수 라이브러리 설치
!pip install lightgbm

# 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier

# Mount Google Drive so the dataset stored under MyDrive can be read
from google.colab import drive
drive.mount('/content/drive')

# Load the training data
train_src = '/content/drive/MyDrive/Colab Notebooks/패턴인식/train.csv'
train_df = pd.read_csv(train_src)

# Target is the 'y' column; features are every column except 'id', 'shares' and 'y'
y = train_df['y']
X = train_df.drop(columns=['id', 'shares', 'y'])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Categorical feature columns (imputed with the mode, then one-hot encoded)
cat_cols = ['data_channel', 'weekday']

# Numeric feature columns: every int64/float64 column that is NOT declared
# categorical above. Without the exclusion, a numerically coded categorical
# column would be routed to both branches (scaled and one-hot encoded).
num_cols = [
    col
    for col in X.select_dtypes(include=['int64','float64']).columns
    if col not in cat_cols
]

# Numeric branch: fill missing values with the column mean, then standardize
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes a category that was not seen
# during fit (e.g. one present only in a validation fold or in the test set)
# encode as all zeros instead of raising at transform time.
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Combine both branches; columns not listed in num_cols/cat_cols are dropped
# (ColumnTransformer's default remainder behaviour)
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols),
])

In [6]:
import warnings
from lightgbm import LGBMClassifier

# Silence FutureWarning messages for the rest of the session
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Baseline model: shared preprocessing followed by a default LightGBM
# classifier (fixed seed, logging suppressed via verbose=-1)
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', LGBMClassifier(verbose=-1, random_state=42)),
])

In [8]:
# --- Stratified 5-fold cross-validation (shuffled, fixed seed) ---
scoring = ['accuracy','f1','roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_validate(pipe, X, y, scoring=scoring, cv=cv, return_train_score=False)

# Per-fold score arrays, in the same order as `scoring`
acc, f1, auc = (cv_results[f'test_{metric}'] for metric in scoring)
# Composite = unweighted mean of the three metrics, computed per fold
comp = (acc + f1 + auc) / 3

print("Fold별 Accuracy  :", acc.round(4))
print("Fold별 F1 Score  :", f1.round(4))
print("Fold별 ROC AUC   :", auc.round(4))
print("Fold별 Composite :", comp.round(4), "\n")

print("Stratified k-fold cross-validation")
print(f"평균 Accuracy  : {acc.mean():.4f}")
print(f"평균 F1 Score  : {f1.mean():.4f}")
print(f"평균 ROC AUC   : {auc.mean():.4f}")
print(f"평균 Composite : {comp.mean():.4f}")

# --- Single stratified 80/20 hold-out split ---
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
pipe.fit(X_tr, y_tr)

y_prob = pipe.predict_proba(X_te)[:, 1]  # probability of the positive class
y_pred = pipe.predict(X_te)

# NOTE: acc / f1 / auc / comp are rebound here from per-fold arrays to
# hold-out scalars
acc, f1, auc = (
    accuracy_score(y_te, y_pred),
    f1_score(y_te, y_pred),
    roc_auc_score(y_te, y_prob),
)
comp = (acc + f1 + auc) / 3

print("\nHoldout cross-validation")
print(f"Accuracy : {acc:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"ROC AUC  : {auc:.4f}")
print(f"Composite: {comp:.4f}")

Fold별 Accuracy  : [0.6574 0.6563 0.6477 0.6477 0.6559]
Fold별 F1 Score  : [0.6564 0.6526 0.6485 0.6437 0.6438]
Fold별 ROC AUC   : [0.7125 0.7194 0.7034 0.7136 0.7137]
Fold별 Composite : [0.6755 0.6761 0.6665 0.6684 0.6711] 

Stratified k-fold cross-validation
평균 Accuracy  : 0.6530
평균 F1 Score  : 0.6490
평균 ROC AUC   : 0.7125
평균 Composite : 0.6715

Holdout cross-validation
Accuracy : 0.6588
F1 Score : 0.6539
ROC AUC  : 0.7206
Composite: 0.6778




---



하이퍼파라미터 튜닝

In [9]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Rebuild the preprocessing + LightGBM pipeline that the search will tune
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', LGBMClassifier(verbose=-1, random_state=42)),
])

# Search space for the 'clf' step: 3 * 3 * 2 * 2 = 36 candidates
param_grid = {
    'clf__num_leaves': [31, 50, 70],
    'clf__max_depth': [5, 7, 9],
    'clf__learning_rate': [0.05, 0.1],
    'clf__n_estimators': [100, 300],
}

# Exhaustive search scored by F1 with 3-fold CV, using all available cores
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid.fit(X, y)

print("최적 파라미터:", grid.best_params_)
print("최고 F1 Score:", grid.best_score_)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
최적 파라미터: {'clf__learning_rate': 0.05, 'clf__max_depth': 7, 'clf__n_estimators': 100, 'clf__num_leaves': 50}
최고 F1 Score: 0.6573535312890225


In [None]:
# Load the test set
test_src = '/content/drive/MyDrive/Colab Notebooks/패턴인식/test.csv'
test_df = pd.read_csv(test_src)

# Features are every column except 'id'; the ids are kept separately so they
# can be attached to the predictions later
X_test = test_df.drop(columns=['id'])
test_ids = test_df['id']

In [10]:
# Final model: the shared preprocessing plus LightGBM with explicitly fixed
# hyperparameters
final_model = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', LGBMClassifier(
        num_leaves=50,
        max_depth=7,
        n_estimators=100,
        learning_rate=0.05,
        random_state=42,
        verbose=-1,
    )),
])

# Train on the full training set
final_model.fit(X, y)

# Predict on the test features: positive-class probability and class label
y_prob = final_model.predict_proba(X_test)[:, 1]
y_pred = final_model.predict(X_test)

# Preview frame pairing each test id with its prediction
result_df = pd.DataFrame({'id': test_ids, 'y_predict': y_pred, 'y_prob': y_prob})

result_df.head(30)  # show the first 30 rows


Unnamed: 0,id,y_predict,y_prob
0,4979,0,0.393036
1,15552,0,0.462425
2,29370,1,0.681289
3,37272,0,0.248697
4,6836,0,0.41327
5,2699,0,0.217879
6,27093,1,0.598465
7,19112,1,0.56746
8,11841,1,0.901398
9,9616,0,0.454108


In [None]:
# Submission frame.
# Reuse result_df, where the test predictions were already paired with their
# ids, instead of rebuilding from the globals y_pred / y_prob: those names are
# also assigned by the hold-out evaluation cell, so after out-of-order
# execution they may no longer hold the test-set predictions.
submission = result_df[['id', 'y_predict', 'y_prob']].copy()

# Guard against a stale result_df that does not match the loaded test set
assert len(submission) == len(test_ids), "submission row count does not match test ids"

# Save to Drive without the DataFrame index column
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/패턴인식/prediction.csv', index=False)
print("저장 완료")