In [1]:
! pip install feature_engine
! pip install CatBoost

Collecting feature_engine
  Downloading feature_engine-1.8.3-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading feature_engine-1.8.3-py2.py3-none-any.whl (378 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.6/378.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: feature_engine
Successfully installed feature_engine-1.8.3
Collecting CatBoost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: CatBoost
Successfully installed CatBoost-1.2.8


In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) # 경고문 무시

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from scipy.stats import skew
from feature_engine.outliers import Winsorizer as FEWinsorizer
from catboost import CatBoostClassifier

In [3]:
# 데이터 불러오기
from google.colab import drive
drive.mount('/content/drive')
train_src = '/content/drive/MyDrive/Colab Notebooks/패턴인식/train.csv'

Mounted at /content/drive


In [4]:
df = pd.read_csv(train_src)

X = df.drop(['id', 'shares', 'y', 'n_tokens_content', 'n_unique_tokens',  'n_non_stop_unique_tokens',], axis=1)
y = df['y']

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = ['data_channel', 'weekday']

In [5]:
class DFTransformer(TransformerMixin):
    """NumPy → pandas.DataFrame으로 바꿔주는 Transformer."""
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return pd.DataFrame(X, columns=self.columns)

In [6]:
# 1) 왜도 상위 k개
def get_skew_feats(df, k=8):
    num = df.select_dtypes(include=['int64','float64'])
    skews = num.apply(lambda x: abs(skew(x.dropna()))).sort_values(ascending=False)
    return skews.head(k).index.tolist()

# 2) 이상치 비율 상위 k개 (IQR 기준)
def get_outlier_feats(df, k=7):
    num = df.select_dtypes(include=['int64','float64'])
    def outlier_rate(col):
        q1, q3 = col.quantile([.25, .75])
        iqr = q3 - q1
        return (~col.between(q1 - 1.5*iqr, q3 + 1.5*iqr)).mean()
    rates = num.apply(outlier_rate).sort_values(ascending=False)
    return rates.head(k).index.tolist()

def safe_log1p(X):
    X_clipped = np.clip(X, a_min=0, a_max=None)
    with np.errstate(divide='ignore'):
        Y = np.log1p(X_clipped)
    Y[np.isneginf(Y)] = 0
    return Y

# 3) 대상 피처 리스트 뽑기
skew_k, outlier_k = 7, 8
skew_feats    = get_skew_feats(X, skew_k)
outlier_feats = get_outlier_feats(X, outlier_k)
both_feats    = list(set(skew_feats) & set(outlier_feats))
log_only      = [f for f in skew_feats    if f not in both_feats]
winsor_only   = [f for f in outlier_feats if f not in both_feats]
base_num_feats = [c for c in num_cols if c not in (both_feats + log_only + winsor_only)]


# 결측치
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# 변환기
log_tf = FunctionTransformer(np.log1p, validate=False)
safe_log_tf = FunctionTransformer(safe_log1p, validate=False)
win_tf = FEWinsorizer(capping_method='gaussian', tail='both', fold=3)


# 수치형 변수 파이프라인
both_pipeline = Pipeline([('imputer', num_imputer), ('winsor', win_tf),('log', safe_log_tf)])
log_pipeline = Pipeline([('imputer', num_imputer), ('log', safe_log_tf)])
winsor_pipeline = Pipeline([('imputer', num_imputer),('winsor',   win_tf)])
base_num_pipeline = Pipeline([('imputer', num_imputer)])                                 # ('scaler', StandardScaler()),  # 필요 시 추가

# 범주형 변수 파이프라인
cat_pipeline = Pipeline([('imputer', cat_imputer)])                                       # ('onehot',  OneHotEncoder(handle_unknown='ignore'))

# 전체 전처리 파이프라인
preprocessor = ColumnTransformer([
    ('both',    both_pipeline,    both_feats),
    ('log',     log_pipeline,     log_only),
    ('winsor',  winsor_pipeline,  winsor_only),
    ('num',     base_num_pipeline, base_num_feats),
    ('cat',     cat_pipeline,     cat_cols),
])

all_cols = (
    both_feats + log_only + winsor_only +
    base_num_feats + cat_cols
)



In [7]:
# 최종 모델 파이프라인
pipe = Pipeline([
    ('pre', preprocessor),
    ('to_df', DFTransformer(columns=all_cols)),
    ('model', CatBoostClassifier(cat_features=cat_cols, verbose=False,thread_count=1, early_stopping_rounds=30)),
]) #     thread_count=1 지우면 속도가 빨라지지만 매번 결과가 약간 다름

In [8]:
# 성능 평가
# 5-Fold CV (only on train_val)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'roc_auc']

cv_results = cross_validate(pipe, X_trainval, y_trainval, cv=cv, scoring=scoring, error_score='raise')

acc = cv_results['test_accuracy']
f1 = cv_results['test_f1']
auc = cv_results['test_roc_auc']
comp = (acc + f1 + auc) / 3

print("5-Fold CV (only on train_val)")
for i in range(len(acc)):
    print(f"[Fold  {i+1}] Accuracy: {acc[i]:.4f}, F1: {f1[i]:.4f}, AUC: {auc[i]:.4f}, Composite: {comp[i]:.4f}")
print("\n평균 Composite Score:", comp.mean())

# 전체 train_val로 학습 후 test로 최종 성능 평가
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
comp = (acc + f1 + auc) / 3

print("\n최종 Holdout Test 성능")
print(f"Accuracy : {acc:.4f}")
print(f"F1 Score : {f1:.4f}")
print(f"ROC AUC  : {auc:.4f}")
print(f"Composite: {comp:.4f}")

5-Fold CV (only on train_val)
[Fold  1] Accuracy: 0.6681, F1: 0.6619, AUC: 0.7235, Composite: 0.6845
[Fold  2] Accuracy: 0.6565, F1: 0.6556, AUC: 0.7172, Composite: 0.6764
[Fold  3] Accuracy: 0.6472, F1: 0.6469, AUC: 0.7057, Composite: 0.6666
[Fold  4] Accuracy: 0.6624, F1: 0.6614, AUC: 0.7211, Composite: 0.6816
[Fold  5] Accuracy: 0.6425, F1: 0.6408, AUC: 0.7087, Composite: 0.6640

평균 Composite Score: 0.674643664179878

최종 Holdout Test 성능
Accuracy : 0.6588
F1 Score : 0.6569
ROC AUC  : 0.7244
Composite: 0.6800
