In [4]:
! pip install feature_engine
! pip install CatBoost



In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from feature_engine.outliers import Winsorizer
from scipy.stats import skew
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [None]:
def get_skew_feats(df, k=8):
    """Return the names of the ``k`` most skewed numeric columns of ``df``.

    Only ``int64``/``float64`` columns are considered. Skewness is computed
    per column with ``scipy.stats.skew`` after dropping missing values, and
    columns are ranked by the absolute value of that skewness, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    k : int, default 8
        Number of column names to return.

    Returns
    -------
    list
        Column names ordered from most to least skewed.
    """
    numeric = df.select_dtypes(include=['int64', 'float64'])

    def abs_skew(series):
        # Missing values are removed first so they do not turn the result into NaN.
        return abs(skew(series.dropna()))

    ranked = numeric.apply(abs_skew).sort_values(ascending=False)
    return list(ranked.iloc[:k].index)

def get_outlier_feats(df, k=7):
    """Return the names of the ``k`` numeric columns with the highest IQR outlier rate.

    Only ``int64``/``float64`` columns are considered. A value is an outlier
    when it lies outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``. Missing values are
    excluded before the rate is computed, so a column is not ranked as
    outlier-heavy merely because it has many NaNs.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    k : int, default 7
        Number of column names to return.

    Returns
    -------
    list
        Column names ordered from highest to lowest outlier rate; empty when
        ``df`` has no ``int64``/``float64`` columns.
    """
    num = df.select_dtypes(include=['int64','float64'])
    if num.shape[1] == 0:
        return []

    def outlier_rate(col):
        # `between` is False for NaN, so without this dropna every missing
        # value would be counted as an outlier after the negation below.
        col = col.dropna()
        if col.empty:
            return 0.0
        q1, q3 = col.quantile([.25, .75])
        iqr = q3 - q1
        return (~col.between(q1 - 1.5*iqr, q3 + 1.5*iqr)).mean()

    rates = num.apply(outlier_rate).sort_values(ascending=False)
    return rates.head(k).index.tolist()

def safe_log1p(X):
    """Apply ``log(1 + x)`` element-wise after clamping negative values to 0.

    Because the input is clipped to ``>= 0`` first, the result is always
    ``>= 0`` (or NaN where the input is NaN), so no ``-inf`` clean-up is
    needed. Works on scalars as well as arrays; the input is not modified.

    Parameters
    ----------
    X : array-like or scalar
        Values to transform.

    Returns
    -------
    Same container type that ``numpy.log1p`` returns for the clipped input.
    """
    return np.log1p(np.clip(X, a_min=0, a_max=None))

class DFTransformer(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that wraps its input in a named ``DataFrame``.

    Inheriting from ``BaseEstimator`` provides ``get_params``/``set_params``,
    which scikit-learn relies on to clone the step.

    Parameters
    ----------
    columns : list
        Column names assigned to the output frame.
    """

    def __init__(self, columns):
        # Stored unchanged under the same name as the constructor argument.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` as a ``pandas.DataFrame`` with ``self.columns`` as column labels."""
        return pd.DataFrame(X, columns=self.columns)


In [None]:
# Load the training table.
# NOTE(review): `train_src` is not defined in the visible cells - confirm it is
# set before this cell runs on a fresh kernel.
df = pd.read_csv(train_src)

# Target is the `y` column; features are everything except `id`, `shares` and `y`.
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])

In [None]:
# Numeric columns are picked by dtype; the categorical columns are listed by name.
num_cols = X.select_dtypes(['int64','float64']).columns.tolist()
cat_cols = ['data_channel','weekday']

# NOTE(review): both rankings are computed on the full `X` - confirm that using
# rows which later land in the hold-out split is intended.
skew_feats    = get_skew_feats(X, k=7)
outlier_feats = get_outlier_feats(X, k=8)

# Ordered intersection: iterating a set would give a hash-dependent order, so
# the column order fed to the model could differ between kernel restarts.
_outlier_set  = set(outlier_feats)
both_feats    = [c for c in skew_feats if c in _outlier_set]

# Columns that need only one of the two treatments.
_both_set     = set(both_feats)
log_only      = [c for c in skew_feats    if c not in _both_set]
winsor_only   = [c for c in outlier_feats if c not in _both_set]

# Every remaining numeric column is only imputed.
_treated      = _both_set | set(log_only) | set(winsor_only)
base_num_feats= [c for c in num_cols if c not in _treated]

In [None]:
# Shared building blocks for the per-group pipelines.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
win_tf = Winsorizer(capping_method='gaussian', tail='both', fold=3)
safe_log_tf = FunctionTransformer(safe_log1p, validate=False)

# Skewed and outlier-heavy columns: impute, cap, then log-transform.
both_pipe = Pipeline(steps=[
    ('imputer', num_imputer),
    ('winsor', win_tf),
    ('log', safe_log_tf),
])
# Skewed only: impute, then log-transform.
log_pipe = Pipeline(steps=[
    ('imputer', num_imputer),
    ('log', safe_log_tf),
])
# Outlier-heavy only: impute, then cap.
winsor_pipe = Pipeline(steps=[
    ('imputer', num_imputer),
    ('winsor', win_tf),
])
# Remaining numeric and categorical columns: imputation only.
base_pipe = Pipeline(steps=[('imputer', num_imputer)])
cat_pipe = Pipeline(steps=[('imputer', cat_imputer)])

preprocessor_orig = ColumnTransformer(transformers=[
    ('both',   both_pipe,   both_feats),
    ('log',    log_pipe,    log_only),
    ('winsor', winsor_pipe, winsor_only),
    ('base',   base_pipe,   base_num_feats),
    ('cat',    cat_pipe,    cat_cols),
])

# Column names listed in the same order as the ColumnTransformer branches above.
all_cols = [*both_feats, *log_only, *winsor_only, *base_num_feats, *cat_cols]

In [None]:
# Full-feature model: preprocess -> restore column names -> CatBoost.
# NOTE(review): no eval_set is passed to fit anywhere in the visible cells, so
# confirm that `early_stopping_rounds` actually has an effect here.
pipe_orig = Pipeline(steps=[
    ('pre', preprocessor_orig),
    ('to_df', DFTransformer(all_cols)),
    (
        'clf',
        CatBoostClassifier(
            cat_features=cat_cols,
            verbose=False,
            thread_count=1,
            early_stopping_rounds=30,
        ),
    ),
])

In [None]:
# Hold out 20% of the rows as a test set, stratified on the target, with a fixed seed.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# First fit: train the full-feature pipeline and extract feature importances.
pipe_orig.fit(X_trainval, y_trainval)
importances = pipe_orig.named_steps['clf'].get_feature_importance()
feat_imp = (
    pd.DataFrame({'feature': all_cols, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

# Features at or below the 30th percentile of importance are dropped.
threshold = feat_imp['importance'].quantile(0.3)
_is_low = feat_imp['importance'].le(threshold)
low_feats = feat_imp.loc[_is_low, 'feature'].tolist()


def _drop_low(cols):
    """Return ``cols`` without the low-importance features, keeping their order."""
    return [c for c in cols if c not in low_feats]


# Selected feature lists per group, and the matching preprocessor.
both_sel = _drop_low(both_feats)
log_sel = _drop_low(log_only)
winsor_sel = _drop_low(winsor_only)
base_sel = _drop_low(base_num_feats)
cat_sel = _drop_low(cat_cols)

preprocessor_sel = ColumnTransformer(transformers=[
    ('both',   both_pipe,   both_sel),
    ('log',    log_pipe,    log_sel),
    ('winsor', winsor_pipe, winsor_sel),
    ('base',   base_pipe,   base_sel),
    ('cat',    cat_pipe,    cat_sel),
])

# Column names listed in the same order as the ColumnTransformer branches above.
all_cols_sel = [*both_sel, *log_sel, *winsor_sel, *base_sel, *cat_sel]

In [None]:
# Reduced-feature model: same steps as the full model, restricted to the selected columns.
# NOTE(review): no eval_set is passed to fit anywhere in the visible cells, so
# confirm that `early_stopping_rounds` actually has an effect here.
pipe_sel = Pipeline(steps=[
    ('pre', preprocessor_sel),
    ('to_df', DFTransformer(all_cols_sel)),
    (
        'clf',
        CatBoostClassifier(
            cat_features=cat_sel,
            verbose=False,
            thread_count=1,
            early_stopping_rounds=30,
        ),
    ),
])

In [22]:
def evaluate(name, pipe, X_tr, X_te, y_tr=None, y_te=None):
    """Print 5-fold CV scores and hold-out scores for ``pipe``.

    The pipeline is first scored with stratified 5-fold cross-validation on
    the training data, then fitted on all of it and scored on the hold-out
    data. Accuracy, F1, ROC AUC and their mean ("Composite") are printed.

    Parameters
    ----------
    name : str
        Label printed in the section header.
    pipe : estimator
        Pipeline to evaluate. It is fitted in place on ``X_tr``.
    X_tr, X_te : feature tables
        Training and hold-out features.
    y_tr, y_te : label vectors, optional
        Labels matching ``X_tr`` / ``X_te``. When omitted, the notebook-level
        ``y_trainval`` / ``y_test`` are used, which keeps existing calls working.

    Returns
    -------
    None
    """
    # Fall back to the notebook globals only when labels are not passed explicitly.
    if y_tr is None:
        y_tr = y_trainval
    if y_te is None:
        y_te = y_test

    print(f"\n===== {name} =====")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = ['accuracy','f1','roc_auc']
    cvr = cross_validate(pipe, X_tr, y_tr,
                         cv=cv, scoring=scoring, error_score='raise')
    acc, f1, auc = cvr['test_accuracy'], cvr['test_f1'], cvr['test_roc_auc']
    # Composite is the unweighted mean of the three metrics, per fold.
    comp = (acc + f1 + auc) / 3

    print("5-Fold CV (train_val):")
    for i,(a,f,u,c) in enumerate(zip(acc,f1,auc,comp), start=1):
        print(f"[Fold {i:>2}] Accuracy: {a:.4f}, F1: {f:.4f}, AUC: {u:.4f}, Composite: {c:.4f}")
    print(f"평균 Composite: {comp.mean():.4f}")

    # Refit on the full training data before scoring the hold-out set.
    pipe.fit(X_tr, y_tr)
    y_p  = pipe.predict(X_te)
    y_pr = pipe.predict_proba(X_te)[:,1]
    a = accuracy_score(y_te, y_p)
    f = f1_score(y_te,  y_p)
    u = roc_auc_score(y_te, y_pr)
    c = (a + f + u) / 3

    print("\nHoldout Test:")
    print(f"Accuracy : {a:.4f}")
    print(f"F1 Score : {f:.4f}")
    print(f"ROC AUC  : {u:.4f}")
    print(f"Composite: {c:.4f}")

# Run: evaluate the full-feature pipeline, then the reduced-feature pipeline.
evaluate(name="원본 피처", pipe=pipe_orig, X_tr=X_trainval, X_te=X_test)
evaluate(name="선택 피처", pipe=pipe_sel, X_tr=X_trainval, X_te=X_test)



===== 원본 피처 =====
5-Fold CV (train_val):
[Fold  1] Accuracy: 0.6619, F1: 0.6570, AUC: 0.7287, Composite: 0.6825
[Fold  2] Accuracy: 0.6529, F1: 0.6506, AUC: 0.7181, Composite: 0.6739
[Fold  3] Accuracy: 0.6515, F1: 0.6501, AUC: 0.7064, Composite: 0.6693
[Fold  4] Accuracy: 0.6622, F1: 0.6610, AUC: 0.7211, Composite: 0.6814
[Fold  5] Accuracy: 0.6512, F1: 0.6483, AUC: 0.7070, Composite: 0.6688
평균 Composite: 0.6752

Holdout Test:
Accuracy : 0.6633
F1 Score : 0.6591
ROC AUC  : 0.7280
Composite: 0.6834

===== 선택 피처 =====
5-Fold CV (train_val):
[Fold  1] Accuracy: 0.6667, F1: 0.6609, AUC: 0.7304, Composite: 0.6860
[Fold  2] Accuracy: 0.6478, F1: 0.6453, AUC: 0.7138, Composite: 0.6690
[Fold  3] Accuracy: 0.6456, F1: 0.6455, AUC: 0.7072, Composite: 0.6661
[Fold  4] Accuracy: 0.6548, F1: 0.6525, AUC: 0.7184, Composite: 0.6753
[Fold  5] Accuracy: 0.6416, F1: 0.6393, AUC: 0.7043, Composite: 0.6617
평균 Composite: 0.6716

Holdout Test:
Accuracy : 0.6633
F1 Score : 0.6595
ROC AUC  : 0.7249
Composit