In [None]:
! pip install feature_engine
! pip install CatBoost

Collecting feature_engine
  Downloading feature_engine-1.8.3-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading feature_engine-1.8.3-py2.py3-none-any.whl (378 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.6/378.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature_engine
Successfully installed feature_engine-1.8.3
Collecting CatBoost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CatBoost
Successfully installed CatBoost-1.2.8


In [None]:
# Load data: mount Google Drive and point at the training CSV.
# This cell only works inside Google Colab (it imports google.colab), and the
# path below is specific to the author's Drive layout.
from google.colab import drive
drive.mount('/content/drive')
train_src = '/content/drive/MyDrive/Colab Notebooks/패턴인식/train.csv'

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) # 경고문 무시

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from scipy.stats import skew
from feature_engine.outliers import Winsorizer
from catboost import CatBoostClassifier
from sklearn.feature_selection import SelectFromModel

In [None]:
# Derived-feature construction.
# Caution: if features based on dataset-wide statistics of df are ever added
# here, watch out for data leakage (every feature below is computed per row).

def create_features(df):
    """Return a copy of ``df`` extended with ten engineered columns.

    The input frame is not modified: all new columns are written to a copy,
    so calling this function twice on the same raw frame is safe and a frame
    that was displayed earlier never changes behind the reader's back.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below: ``n_tokens_title``,
        ``n_tokens_content``, ``num_keywords``, ``n_non_stop_words``,
        ``num_self_hrefs``, ``num_hrefs``, ``num_imgs``, ``num_videos``,
        ``kw_max_max``, ``kw_min_min``, ``max_positive_polarity``,
        ``min_negative_polarity``, ``abs_title_subjectivity``,
        ``abs_title_sentiment_polarity``, ``weekday`` and ``data_channel``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus ``title_content_ratio``,
        ``keyword_density``, ``nonstop_ratio``, ``self_href_ratio``,
        ``img_video_ratio``, ``kw_spread``, ``sentiment_range``,
        ``title_sent_interact``, ``is_weekend`` and ``channel_weekend``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    out = df.copy()

    # Zero denominators are turned into NaN so the ratios come out as missing
    # values instead of inf.
    content_len = out['n_tokens_content'].replace(0, np.nan)
    link_count = out['num_hrefs'].replace(0, np.nan)

    # 1) Title length relative to content length
    out['title_content_ratio'] = out['n_tokens_title'] / content_len

    # 2) Keyword density: number of keywords relative to content tokens
    out['keyword_density'] = out['num_keywords'] / content_len

    # 3) n_non_stop_words relative to content tokens
    out['nonstop_ratio'] = out['n_non_stop_words'] / content_len

    # 4) Self-links as a share of all links
    out['self_href_ratio'] = out['num_self_hrefs'] / link_count

    # 5) Image-to-video ratio (+1 keeps the denominator non-zero)
    out['img_video_ratio'] = out['num_imgs'] / (out['num_videos'] + 1)

    # 6) Keyword spread: kw_max_max minus kw_min_min
    out['kw_spread'] = out['kw_max_max'] - out['kw_min_min']

    # 7) Sentiment range: max positive polarity minus min negative polarity
    out['sentiment_range'] = out['max_positive_polarity'] - out['min_negative_polarity']

    # 8) Title sentiment interaction: subjectivity x absolute polarity
    out['title_sent_interact'] = out['abs_title_subjectivity'] * out['abs_title_sentiment_polarity']

    # 9) Weekend flag (1 for Saturday/Sunday, otherwise 0)
    out['is_weekend'] = out['weekday'].isin(['Saturday', 'Sunday']).astype(int)

    # 10) Channel x weekend cross category (one-hot encode downstream if needed)
    out['channel_weekend'] = out['data_channel'] + '_' + out['is_weekend'].astype(str)

    return out

In [None]:
# Read the training CSV and append the engineered columns in one step.
df = create_features(pd.read_csv(train_src))

# Target is the `y` column; predictors are everything except `id`, `shares`
# and `y` itself.
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])

# Stratified 80/20 hold-out split with a fixed seed.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Column groups: the categorical list is fixed by hand, the numeric list is
# taken from the int64/float64 columns of the train/validation split.
cat_cols = ['data_channel', 'weekday', 'channel_weekend']
num_cols = list(X_trainval.select_dtypes(include=['int64', 'float64']).columns)

In [None]:
# 1) Top-k features by skewness
def get_skew_feats(df, k=7):
    """Return the names of the ``k`` most skewed numeric columns of ``df``.

    Only int64/float64 columns are considered. Each column is ranked by the
    absolute value of its skewness, computed after dropping missing values,
    and the names are returned in descending order of that value.
    """
    def _abs_skew(column):
        # The sign is irrelevant here: left- and right-skew count the same.
        return abs(skew(column.dropna()))

    numeric_part = df.select_dtypes(include=['int64', 'float64'])
    ranked = numeric_part.apply(_abs_skew).sort_values(ascending=False)
    return ranked.index[:k].tolist()

# 2) Top-k features by outlier rate (IQR rule)
def get_outlier_feats(df, k=8):
    """Return the names of the ``k`` numeric columns with the most IQR outliers.

    Only int64/float64 columns are considered. For each column the outlier
    rate is the fraction of *observed* values lying outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``. Missing values are excluded before the
    rate is computed; otherwise ``between`` evaluates to False for NaN and
    every missing entry would be counted as an outlier.

    Parameters
    ----------
    df : pandas.DataFrame
    k : int, default 8
        Number of column names to return.

    Returns
    -------
    list of str
        Column names ordered by descending outlier rate. Empty when ``df``
        has no int64/float64 columns.
    """
    num = df.select_dtypes(include=['int64', 'float64'])
    if num.shape[1] == 0:
        return []

    def outlier_rate(col):
        observed = col.dropna()
        if observed.empty:
            # Nothing observed means nothing can be an outlier.
            return 0.0
        q1, q3 = observed.quantile([.25, .75])
        iqr = q3 - q1
        inside = observed.between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        return (~inside).mean()

    rates = num.apply(outlier_rate).sort_values(ascending=False)
    return rates.head(k).index.tolist()

def safe_log1p(X):
    """Apply ``log1p`` after clipping negative values to zero.

    Because the input is clipped to ``>= 0`` first, ``log1p`` can never
    produce ``-inf``, so no post-hoc replacement of infinities is needed.
    Dropping that in-place assignment also lets the function accept plain
    scalars, which do not support item assignment.

    Parameters
    ----------
    X : array-like or scalar
        Values to transform; anything ``np.clip`` accepts.

    Returns
    -------
    Same container kind that ``np.log1p`` returns for the clipped input,
    holding ``log(1 + max(x, 0))`` element-wise.
    """
    return np.log1p(np.clip(X, a_min=0, a_max=None))

In [None]:
# Numeric preprocessing: median imputation -> Winsorizer (Gaussian capping at
# 3 standard deviations on both tails).
num_imputer = SimpleImputer(strategy='median')
winsorizer = Winsorizer(capping_method='gaussian', tail='both', fold=3)

# Both steps are fitted on the train/validation split only; the test split is
# transformed with the already-fitted objects.
trainval_num_imputed = pd.DataFrame(
    num_imputer.fit_transform(X_trainval[num_cols]), columns=num_cols
)
X_trainval_num = winsorizer.fit_transform(trainval_num_imputed).values

test_num_imputed = pd.DataFrame(
    num_imputer.transform(X_test[num_cols]), columns=num_cols
)
X_test_num = winsorizer.transform(test_num_imputed).values

# Categorical preprocessing: most-frequent imputation -> pandas category dtype.
cat_imputer = SimpleImputer(strategy='most_frequent')

trainval_cat_imputed = cat_imputer.fit_transform(X_trainval[cat_cols])
X_trainval_cat = pd.DataFrame(
    trainval_cat_imputed, columns=cat_cols, index=X_trainval.index
).astype('category')

test_cat_imputed = cat_imputer.transform(X_test[cat_cols])
X_test_cat = pd.DataFrame(
    test_cat_imputed, columns=cat_cols, index=X_test.index
).astype('category')

# Assemble the final frames: numeric block (re-attached to the original row
# index) side by side with the categorical block.
trainval_num_frame = pd.DataFrame(X_trainval_num, columns=num_cols, index=X_trainval.index)
X_trainval_final = pd.concat([trainval_num_frame, X_trainval_cat], axis=1)

test_num_frame = pd.DataFrame(X_test_num, columns=num_cols, index=X_test.index)
X_test_final = pd.concat([test_num_frame, X_test_cat], axis=1)



In [None]:
# Feature selection with SelectFromModel
base_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    random_seed=42,
    thread_count=1,
    verbose=False,
    cat_features=cat_cols
)

# SelectFromModel (prefit=False, the default) clones `base_model` and fits the
# clone itself, so a separate base_model.fit(...) beforehand would only train
# the same 500-iteration model a second time. The fitted model is available as
# `selector.estimator_`; `base_model` stays an unfitted template.
# Features whose importance is at or above the median importance are kept.
selector = SelectFromModel(estimator=base_model, threshold='median')
selector.fit(X_trainval_final, y_trainval)

# NOTE(review): the selector sees the whole train/validation split, so any
# cross-validation run afterwards on the selected columns is slightly
# optimistic (selection happened outside the folds).
selected_feats = X_trainval_final.columns[selector.get_support()].tolist()
print("▶ 선택된 피처:", selected_feats)

X_trainval_sel = X_trainval_final[selected_feats]
X_test_sel = X_test_final[selected_feats]

In [None]:
# Final CatBoost model (training and evaluation happen in the next cell)

# Only the categorical columns that survived feature selection are declared
# as categorical to CatBoost.
selected_cat_feats = [name for name in selected_feats if name in cat_cols]

# NOTE(review): fit() is called without an eval_set elsewhere in this
# notebook, so early_stopping_rounds presumably has no effect - confirm
# against the CatBoost documentation.
final_model_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=4,
    eval_metric='AUC',
    random_seed=42,
    thread_count=1,
    early_stopping_rounds=30,
    verbose=False,
    cat_features=selected_cat_feats,
)
model = CatBoostClassifier(**final_model_params)

In [None]:
# Performance evaluation
# 5-Fold CV (only on train_val)
# NOTE(review): X_trainval_sel was produced by imputers, a winsorizer and a
# feature selector that were fitted on the whole train_val split, so the
# per-fold scores below are not fully out-of-sample.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'roc_auc']
# error_score='raise' makes a failing fold abort the run instead of yielding NaN.
cv_res = cross_validate(model, X_trainval_sel, y_trainval, cv=cv, scoring=scoring, error_score='raise')

# Per-fold score arrays (one entry per fold).
acc = cv_res['test_accuracy']
f1 = cv_res['test_f1']
auc = cv_res['test_roc_auc']
# Composite score: unweighted mean of accuracy, F1 and ROC AUC.
comp = (acc + f1 + auc) / 3

print("\n5-Fold CV (train_val)")
for i, (a, f, u, c) in enumerate(zip(acc, f1, auc, comp), 1):
    print(f"[Fold {i}] Acc:{a:.4f}, F1:{f:.4f}, AUC:{u:.4f}, Comp:{c:.4f}")
print("평균 Composite:", comp.mean())

# Refit on the full train_val split, then score once on the hold-out test split
model.fit(X_trainval_sel, y_trainval)
y_pred = model.predict(X_test_sel)
# Column 1 of predict_proba is used as the score for ROC AUC.
y_prob = model.predict_proba(X_test_sel)[:, 1]

# From here on acc / f1 / auc / comp are rebound to scalar hold-out scores
# (they held per-fold arrays above).
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
comp = (acc + f1 + auc) / 3

print("\nHoldout Test")
print(f"Accuracy : {acc:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"ROC AUC  : {auc:.4f}")
print(f"Composite: {comp:.4f}")

▶ 선택된 피처: ['n_unique_tokens', 'n_non_stop_unique_tokens', 'average_token_length', 'kw_max_min', 'kw_avg_min', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess', 'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity', 'global_rate_positive_words', 'global_rate_negative_words', 'avg_positive_polarity', 'title_content_ratio', 'keyword_density', 'self_href_ratio', 'img_video_ratio', 'data_channel', 'weekday', 'channel_weekend']

5-Fold CV (train_val)
[Fold 1] Acc:0.6695, F1:0.6599, AUC:0.7263, Comp:0.6852
[Fold 2] Acc:0.6470, F1:0.6472, AUC:0.7085, Comp:0.6675
[Fold 3] Acc:0.6534, F1:0.6518, AUC:0.7074, Comp:0.6709
[Fold 4] Acc:0.6554, F1:0.6570, AUC:0.7157, Comp:0.6760
[Fold 5] Acc:0.6394, F1:0.6378, AUC:0.6976, Comp:0.6582
평균 Composite: 0.6715705749852122

Holdout Test
Accuracy : 0.6682
F1 Score : 0.6656
ROC AUC  : 0.7256
Composite: 0.6865
