In [1]:
! pip install feature_engine
! pip install CatBoost
! pip install optuna

Collecting feature_engine
  Downloading feature_engine-1.8.3-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading feature_engine-1.8.3-py2.py3-none-any.whl (378 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.6/378.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature_engine
Successfully installed feature_engine-1.8.3
Collecting CatBoost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CatBoost
Successfully installed CatBoost-1.2.8
Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6

In [2]:
# Load data: mount Google Drive (Colab only) and record where the training CSV lives.
from google.colab import drive
drive.mount('/content/drive')
train_src = '/content/drive/MyDrive/Colab Notebooks/패턴인식/train.csv'

Mounted at /content/drive


In [3]:
# ── ② Derived feature engineering ──
# Caution: if features based on dataset-wide statistics are ever added here, beware of data leakage.

def create_features(df):
    """Return a copy of ``df`` extended with ten derived feature columns.

    The input frame is left untouched, so calling this repeatedly on the same
    raw frame always gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: ``n_tokens_title``,
        ``n_tokens_content``, ``num_keywords``, ``n_non_stop_words``,
        ``num_self_hrefs``, ``num_hrefs``, ``num_imgs``, ``num_videos``,
        ``kw_max_max``, ``kw_min_min``, ``max_positive_polarity``,
        ``min_negative_polarity``, ``abs_title_subjectivity``,
        ``abs_title_sentiment_polarity``, ``weekday`` and ``data_channel``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by
        ``title_content_ratio``, ``keyword_density``, ``nonstop_ratio``,
        ``self_href_ratio``, ``img_video_ratio``, ``kw_spread``,
        ``sentiment_range``, ``title_sent_interact``, ``is_weekend`` and
        ``channel_weekend`` (in that order).
    """
    # Work on a copy so the caller's frame is never mutated in place.
    out = df.copy()

    # A zero denominator is turned into NaN so the ratio is NaN instead of inf.
    content_len = out['n_tokens_content'].replace(0, np.nan)
    href_count = out['num_hrefs'].replace(0, np.nan)

    # 1) Title length relative to content length
    out['title_content_ratio'] = out['n_tokens_title'] / content_len

    # 2) Keyword density: number of keywords relative to content tokens
    out['keyword_density'] = out['num_keywords'] / content_len

    # 3) Non-stop-word ratio relative to content tokens
    out['nonstop_ratio'] = out['n_non_stop_words'] / content_len

    # 4) Share of links in the content that are self-links
    out['self_href_ratio'] = out['num_self_hrefs'] / href_count

    # 5) Image-to-video ratio (+1 keeps the denominator non-zero)
    out['img_video_ratio'] = out['num_imgs'] / (out['num_videos'] + 1)

    # 6) Keyword spread: kw_max_max minus kw_min_min
    out['kw_spread'] = out['kw_max_max'] - out['kw_min_min']

    # 7) Sentiment range: max positive polarity minus min negative polarity
    out['sentiment_range'] = out['max_positive_polarity'] - out['min_negative_polarity']

    # 8) Title sentiment interaction: subjectivity × absolute polarity
    out['title_sent_interact'] = out['abs_title_subjectivity'] * out['abs_title_sentiment_polarity']

    # 9) Weekend flag (1 when weekday is 'Saturday' or 'Sunday', else 0)
    out['is_weekend'] = out['weekday'].isin(['Saturday', 'Sunday']).astype(int)

    # 10) Channel × weekend cross category (one-hot encode downstream if needed)
    out['channel_weekend'] = out['data_channel'] + '_' + out['is_weekend'].astype(str)

    return out


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.impute import SimpleImputer
from feature_engine.outliers import Winsorizer
from sklearn.feature_selection import SelectFromModel
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import optuna
from sklearn.model_selection import cross_val_score

# 1) Load the data, add derived features, and split into train / holdout
df = pd.read_csv(train_src)

df = create_features(df)

X  = df.drop(['id','shares','y'], axis=1)
y  = df['y']
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 2) Column groups
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = ['data_channel','weekday','channel_weekend']  # actual categorical column names

# 3) Numeric preprocessing: median imputation → Winsorizer
#    Both transformers are fitted on the training split only and then applied to the holdout.
num_imputer = SimpleImputer(strategy='median')
winsorizer  = Winsorizer(capping_method='gaussian', tail='both', fold=3)  # fold is tunable

X_tr_num = num_imputer.fit_transform( X_tr[num_cols] )
X_tr_num = winsorizer.fit_transform(pd.DataFrame(X_tr_num, columns=num_cols)).values

X_te_num = num_imputer.transform( X_te[num_cols] )
X_te_num = winsorizer.transform(pd.DataFrame(X_te_num, columns=num_cols)).values

# 4) Categorical preprocessing: most-frequent imputation → category dtype
cat_imputer = SimpleImputer(strategy='most_frequent')

X_tr_cat = pd.DataFrame(
    cat_imputer.fit_transform(X_tr[cat_cols]),
    columns=cat_cols, index=X_tr.index
).astype('category')

X_te_cat = pd.DataFrame(
    cat_imputer.transform(X_te[cat_cols]),
    columns=cat_cols, index=X_te.index
).astype('category')

# 5) Assemble the final frames (numeric columns first, then categorical)
X_tr_final = pd.concat([
    pd.DataFrame(X_tr_num, columns=num_cols, index=X_tr.index),
    X_tr_cat
], axis=1)

X_te_final = pd.concat([
    pd.DataFrame(X_te_num, columns=num_cols, index=X_te.index),
    X_te_cat
], axis=1)

# 6) Feature selection with SelectFromModel
base_model = CatBoostClassifier(
    iterations=500,             # candidate for later grid / Bayesian search
    learning_rate=0.05,         # tune from here
    depth=6,
    eval_metric='AUC',
    random_seed=42,
    thread_count=1,
    verbose=False,
    cat_features=cat_cols
)

selector = SelectFromModel(
    estimator=base_model,
    threshold='median'            # try 'mean', 'median' or a float value
)
# With the default prefit=False, SelectFromModel clones the estimator and fits the
# clone itself, so a separate base_model.fit() beforehand would train the same
# model twice. Fit once here instead.
selector.fit(X_tr_final, y_tr)
# Rebind to the fitted clone so `base_model` still refers to a trained model.
base_model = selector.estimator_

selected_feats = X_tr_final.columns[ selector.get_support() ].tolist()
print("▶ 선택된 피처:", selected_feats)

X_tr_sel = X_tr_final[selected_feats]
X_te_sel = X_te_final[selected_feats]


# 7) Hyperparameter optimisation with Optuna
#    NOTE(review): preprocessing and feature selection above were fitted on the whole
#    training split, so the inner-CV scores below are likely somewhat optimistic.
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of a CatBoost classifier.

    Reads the module-level ``X_tr_sel``, ``y_tr``, ``cat_cols`` and ``inner_cv``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``depth``, ``learning_rate``, ``l2_leaf_reg`` and
        ``iterations``.

    Returns
    -------
    float
        Mean ``roc_auc`` over the folds of ``inner_cv``.
    """
    # 7-1) Suggest hyperparameters
    params = {
        'depth':          trial.suggest_int('depth', 4, 10),
        'learning_rate':  trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),    # log-uniform
        'l2_leaf_reg':    trial.suggest_float('l2_leaf_reg',   0.1,   10,   log=True),    # log-uniform
        'iterations':     trial.suggest_int('iterations', 200, 1000),
        'random_seed':    42,
        'eval_metric':    'AUC',
        'verbose':        False,
    }

    # Feature selection may have dropped a categorical column; only declare the
    # ones that are actually present in the training frame handed to CatBoost.
    active_cat_cols = [c for c in cat_cols if c in X_tr_sel.columns]

    # 7-2) Build the model and evaluate it with cross-validation
    model = CatBoostClassifier(**params, cat_features=active_cat_cols, thread_count=1)
    aucs = cross_val_score(
        model,
        X_tr_sel, y_tr,
        cv=inner_cv,
        scoring='roc_auc',
        n_jobs=-1
    )
    # 7-3) Return the mean AUC
    return aucs.mean()

# 7-4) Create the study and run the optimisation
#      A seeded TPE sampler (TPE is Optuna's default for single-objective studies)
#      makes the sequence of suggested hyperparameters repeatable between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# 7-5) Report the search result
print("▶ Best params:", study.best_trial.params)
print("▶ Best 3-fold CV ROC AUC:", study.best_value)

# 8) Retrain with the best parameters and evaluate on the holdout split
best_params = study.best_trial.params
# Only declare categorical columns that survived feature selection.
final_cat_cols = [c for c in cat_cols if c in X_tr_sel.columns]
best_model = CatBoostClassifier(
    **best_params,
    random_seed=42,
    thread_count=1,
    # NOTE(review): no eval_set is passed to fit() below, so this early-stopping
    # setting presumably has nothing to monitor — confirm, and do not use the
    # holdout split as the eval_set.
    early_stopping_rounds = 30,
    verbose=False,
    cat_features=final_cat_cols,
)
best_model.fit(X_tr_sel, y_tr)
y_pred = best_model.predict(X_te_sel)
y_prob = best_model.predict_proba(X_te_sel)[:,1]

acc  = accuracy_score(y_te, y_pred)
f1   = f1_score(y_te, y_pred)
auc  = roc_auc_score(y_te, y_prob)
comp = (acc + f1 + auc) / 3   # unweighted mean of the three holdout metrics

# Detailed summary of the chosen hyperparameters
print("▶ Best params:", best_params)
print("  - depth         :", best_params['depth'])
print("  - learning_rate:", best_params['learning_rate'])
print("  - l2_leaf_reg  :", best_params['l2_leaf_reg'])
print("  - iterations   :", best_params['iterations'])
print("▶ Best 3-fold CV ROC AUC:", study.best_value)


print("\n▶ Holdout Test Performance (best model)")
print(f"Accuracy : {acc:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"ROC AUC  : {auc:.4f}")
print(f"Composite: {comp:.4f}")

# # 7) 최종 CatBoost 모델 학습 & 평가
# model = CatBoostClassifier(
#     iterations=1000,            # ← 최종 탐색 범위
#     learning_rate=0.05,
#     depth=4,
#     eval_metric='AUC',
#     random_seed=42,
#     thread_count=1,
#     early_stopping_rounds = 30,
#     verbose=False,
#     cat_features=[c for c in selected_feats if c in cat_cols]
# )

# # 7-1) 5-Fold CV (train_val)
# cv      = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# scoring = ['accuracy','f1','roc_auc']
# cv_res  = cross_validate(model, X_tr_sel, y_tr, cv=cv, scoring=scoring, error_score='raise')

# acc  = cv_res['test_accuracy']
# f1   = cv_res['test_f1']
# auc  = cv_res['test_roc_auc']
# comp = (acc + f1 + auc) / 3

# print("\n5-Fold CV (train_val)")
# for i,(a,f,u,c) in enumerate(zip(acc,f1,auc,comp), 1):
#     print(f"[Fold {i}] Acc:{a:.4f}, F1:{f:.4f}, AUC:{u:.4f}, Comp:{c:.4f}")
# print("평균 Composite:", comp.mean())

# # 7-2) Holdout Test
# model.fit(X_tr_sel, y_tr)
# y_pred = model.predict(X_te_sel)
# y_prob = model.predict_proba(X_te_sel)[:,1]

# acc  = accuracy_score(y_te, y_pred)
# f1   = f1_score(y_te, y_pred)
# auc  = roc_auc_score(y_te, y_prob)
# comp = (acc + f1 + auc) / 3

# print("\nHoldout Test")
# print(f"Accuracy : {acc:.4f}")
# print(f"F1 Score : {f1:.4f}")
# print(f"ROC AUC  : {auc:.4f}")
# print(f"Composite: {comp:.4f}")


[I 2025-05-17 02:09:20,215] A new study created in memory with name: no-name-0575cab3-19b0-4d94-a3ed-84d9856fae86


▶ 선택된 피처: ['n_unique_tokens', 'n_non_stop_unique_tokens', 'average_token_length', 'kw_max_min', 'kw_avg_min', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess', 'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity', 'global_rate_positive_words', 'global_rate_negative_words', 'avg_positive_polarity', 'title_content_ratio', 'keyword_density', 'self_href_ratio', 'img_video_ratio', 'data_channel', 'weekday', 'channel_weekend']


[I 2025-05-17 02:13:46,634] Trial 0 finished with value: 0.7099063220431535 and parameters: {'depth': 10, 'learning_rate': 0.010482260789173897, 'l2_leaf_reg': 0.1455509331160738, 'iterations': 619}. Best is trial 0 with value: 0.7099063220431535.
[I 2025-05-17 02:13:57,189] Trial 1 finished with value: 0.6826745487358922 and parameters: {'depth': 4, 'learning_rate': 0.005378646586870534, 'l2_leaf_reg': 0.35295055322652047, 'iterations': 230}. Best is trial 0 with value: 0.7099063220431535.
[I 2025-05-17 02:17:13,223] Trial 2 finished with value: 0.6990700109520475 and parameters: {'depth': 10, 'learning_rate': 0.07243259768948074, 'l2_leaf_reg': 0.7106464878254433, 'iterations': 449}. Best is trial 0 with value: 0.7099063220431535.
[I 2025-05-17 02:18:17,323] Trial 3 finished with value: 0.6933368723158658 and parameters: {'depth': 7, 'learning_rate': 0.001959105214477111, 'l2_leaf_reg': 2.528578799269137, 'iterations': 590}. Best is trial 0 with value: 0.7099063220431535.
[I 2025-05-

▶ Best params: {'depth': 6, 'learning_rate': 0.01887143254640496, 'l2_leaf_reg': 5.612912910084233, 'iterations': 897}
▶ Best 3-fold CV ROC AUC: 0.7149144214772792


SyntaxError: keyword argument repeated: thread_count (<ipython-input-6-7b63e49c261c>, line 129)