In [1]:
!pip install --upgrade --force-reinstall interpret-core


Collecting interpret-core
  Using cached interpret_core-0.6.10-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.25 (from interpret-core)
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas>=0.19.2 (from interpret-core)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>=0.18.1 (from interpret-core)
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting joblib>=0.11 (from interpret-core)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting python-dateutil>=2.8.2 (from pandas>=0.19.2->interpret-core)
  Downloa

In [2]:
# 1) Library imports and configuration
import warnings
# Suppress FutureWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import numpy as np

# scikit-learn building blocks: pipelines, preprocessing, splitting/CV, metrics.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Explainable Boosting Machine
from interpret.glassbox import ExplainableBoostingClassifier

In [None]:
# 2) Load data
# Mount Google Drive at /content/drive (Colab only); the CSV read by the
# experiment cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [4]:
# Experiment: drop a set of redundant columns (including LDA_00..LDA_04),
# standard-scale the numeric features, one-hot encode the categoricals and
# evaluate an EBM with 5-fold CV plus a hold-out split.
data_path = '/content/drive/MyDrive/Colab Notebooks/패턴인식/'
df = pd.read_csv(data_path + 'train.csv')

# Feature frame = every column except `id`, `shares` and the target `y`.
X = df.drop(['id', 'shares', 'y'], axis=1)
y = df['y']

# Remove unwanted columns BEFORE splitting so that X_trainval / X_test carry
# exactly the modelled columns, instead of relying on the ColumnTransformer
# to silently discard columns that are not listed in num_cols / cat_cols.
drop_cols = [
    'n_non_stop_words',
    'global_rate_positive_words', 'global_rate_negative_words',
    'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04'
]
# Names missing from the frame are skipped rather than raising.
X = X.drop(columns=[col for col in drop_cols if col in X.columns])

# 3) Train/validation vs. hold-out split (stratified on the target, 80/20)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 4) Preprocessing pipeline
# Numeric columns are picked by dtype; the two categorical columns are named explicitly.
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = ['data_channel', 'weekday']

numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler',  StandardScaler())
])
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', sparse_output=False))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# 5) Model pipeline (EBM classifier on top of the preprocessor)
pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', ExplainableBoostingClassifier(
        interactions=10,
        learning_rate=0.01,
        random_state=42,
        n_jobs=-1
    ))
])

# 6) 5-fold stratified cross-validation on the train/validation portion
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'roc_auc']

cv_results = cross_validate(
    pipe,
    X_trainval, y_trainval,
    cv=cv,
    scoring=scoring,
    n_jobs=-1
)

acc = cv_results['test_accuracy']
f1  = cv_results['test_f1']
auc = cv_results['test_roc_auc']
# Composite score = unweighted mean of accuracy, F1 and ROC AUC (per fold).
comp = (acc + f1 + auc) / 3

print("5-Fold CV 결과")
for i, (a, f, r) in enumerate(zip(acc, f1, auc), start=1):
    print(f"[Fold {i}] Accuracy: {a:.4f}, F1: {f:.4f}, AUC: {r:.4f}, Composite: {comp[i-1]:.4f}")
print(f"\n평균 Composite Score: {comp.mean():.4f}")

# 7) Hold-out test: refit on the full train/validation set, score on X_test
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
# Second predict_proba column is used as the score for ROC AUC.
y_prob = pipe.predict_proba(X_test)[:,1]

acc_te = accuracy_score(y_test, y_pred)
f1_te  = f1_score(y_test, y_pred)
auc_te = roc_auc_score(y_test, y_prob)
comp_te = (acc_te + f1_te + auc_te) / 3

print("\nHold-out Test 결과")
print(f"Accuracy : {acc_te:.4f}")
print(f"F1 Score : {f1_te:.4f}")
print(f"ROC AUC  : {auc_te:.4f}")
print(f"Composite: {comp_te:.4f}")


5-Fold CV 결과
[Fold 1] Accuracy: 0.6619, F1: 0.6526, AUC: 0.7208, Composite: 0.6784
[Fold 2] Accuracy: 0.6410, F1: 0.6395, AUC: 0.7031, Composite: 0.6612
[Fold 3] Accuracy: 0.6470, F1: 0.6415, AUC: 0.6964, Composite: 0.6616
[Fold 4] Accuracy: 0.6529, F1: 0.6534, AUC: 0.7119, Composite: 0.6727
[Fold 5] Accuracy: 0.6357, F1: 0.6284, AUC: 0.6977, Composite: 0.6539

평균 Composite Score: 0.6656

Hold-out Test 결과
Accuracy : 0.6613
F1 Score : 0.6546
ROC AUC  : 0.7204
Composite: 0.6787


In [5]:
# Experiment: drop three redundant columns, keep the LDA_* columns, and feed
# the numeric features to the EBM without scaling (mean imputation only).
data_path = '/content/drive/MyDrive/Colab Notebooks/패턴인식/'
df = pd.read_csv(data_path + 'train.csv')

# Feature frame = every column except `id`, `shares` and the target `y`.
X = df.drop(['id', 'shares', 'y'], axis=1)
y = df['y']

# Remove unwanted columns BEFORE splitting so that X_trainval / X_test carry
# exactly the modelled columns, instead of relying on the ColumnTransformer
# to silently discard columns that are not listed in num_cols / cat_cols.
drop_cols = [
    'n_non_stop_words',
    'global_rate_positive_words', 'global_rate_negative_words'
]
# Names missing from the frame are skipped rather than raising.
X = X.drop(columns=[col for col in drop_cols if col in X.columns])

# 3) Train/validation vs. hold-out split (stratified on the target, 80/20)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# 4) Preprocessing pipeline
# Numeric columns are picked by dtype; the two categorical columns are named explicitly.
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = ['data_channel', 'weekday']

numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')) # no scaling in this variant
])
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe',     OneHotEncoder(drop='first', sparse_output=False))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])

# 5) Model pipeline (EBM classifier on top of the preprocessor)
pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', ExplainableBoostingClassifier(
        interactions=10,
        learning_rate=0.01,
        random_state=42,
        n_jobs=-1
    ))
])

# 6) 5-fold stratified cross-validation on the train/validation portion
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1', 'roc_auc']

cv_results = cross_validate(
    pipe,
    X_trainval, y_trainval,
    cv=cv,
    scoring=scoring,
    n_jobs=-1
)

acc = cv_results['test_accuracy']
f1  = cv_results['test_f1']
auc = cv_results['test_roc_auc']
# Composite score = unweighted mean of accuracy, F1 and ROC AUC (per fold).
comp = (acc + f1 + auc) / 3

print("5-Fold CV 결과")
for i, (a, f, r) in enumerate(zip(acc, f1, auc), start=1):
    print(f"[Fold {i}] Accuracy: {a:.4f}, F1: {f:.4f}, AUC: {r:.4f}, Composite: {comp[i-1]:.4f}")
print(f"\n평균 Composite Score: {comp.mean():.4f}")

# 7) Hold-out test: refit on the full train/validation set, score on X_test
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
# Second predict_proba column is used as the score for ROC AUC.
y_prob = pipe.predict_proba(X_test)[:,1]

acc_te = accuracy_score(y_test, y_pred)
f1_te  = f1_score(y_test, y_pred)
auc_te = roc_auc_score(y_test, y_prob)
comp_te = (acc_te + f1_te + auc_te) / 3

print("\nHold-out Test 결과")
print(f"Accuracy : {acc_te:.4f}")
print(f"F1 Score : {f1_te:.4f}")
print(f"ROC AUC  : {auc_te:.4f}")
print(f"Composite: {comp_te:.4f}")


5-Fold CV 결과
[Fold 1] Accuracy: 0.6602, F1: 0.6515, AUC: 0.7196, Composite: 0.6771
[Fold 2] Accuracy: 0.6501, F1: 0.6456, AUC: 0.7079, Composite: 0.6678
[Fold 3] Accuracy: 0.6436, F1: 0.6383, AUC: 0.7016, Composite: 0.6612
[Fold 4] Accuracy: 0.6577, F1: 0.6569, AUC: 0.7196, Composite: 0.6780
[Fold 5] Accuracy: 0.6408, F1: 0.6358, AUC: 0.7009, Composite: 0.6592

평균 Composite Score: 0.6687

Hold-out Test 결과
Accuracy : 0.6592
F1 Score : 0.6543
ROC AUC  : 0.7206
Composite: 0.6781


In [10]:
# Experiment: reduced column set plus two derived difference features
# (sentiment_balance, polarity_spread), scaled numerics, EBM classifier.
data_path = '/content/drive/MyDrive/Colab Notebooks/패턴인식/'
df = pd.read_csv(data_path + 'train.csv')

y = df['y']

# Column removal: strip id/shares/target, then the redundant columns.
# errors='ignore' skips any listed name that is absent from the frame.
drop_cols = ['n_non_stop_words', 'global_rate_positive_words', 'global_rate_negative_words']
X = (
    df.drop(columns=['id', 'shares', 'y'])
      .drop(columns=drop_cols, errors='ignore')
)

# Derived features: differences between the positive and negative columns.
X = X.assign(
    sentiment_balance=X['rate_positive_words'] - X['rate_negative_words'],
    polarity_spread=X['avg_positive_polarity'] - X['avg_negative_polarity'],
)

# Train/validation vs. hold-out split (stratified, 80/20, fixed seed)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4) Preprocessing pipeline
cat_cols = ['data_channel', 'weekday']
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

numeric_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols),
])

# 5) Model pipeline: preprocessor followed by the EBM classifier
ebm_params = dict(interactions=10, learning_rate=0.01, random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', ExplainableBoostingClassifier(**ebm_params)),
])

# 6) 5-fold stratified cross-validation
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    pipe, X_trainval, y_trainval, scoring=scoring, cv=cv, n_jobs=-1
)

acc, f1, auc = (
    cv_results['test_accuracy'],
    cv_results['test_f1'],
    cv_results['test_roc_auc'],
)
# Per-fold composite: plain average of the three metrics.
comp = (acc + f1 + auc) / 3

print("5-Fold CV 결과")
for i in range(1, len(comp) + 1):
    a, f, r = acc[i-1], f1[i-1], auc[i-1]
    print(f"[Fold {i}] Accuracy: {a:.4f}, F1: {f:.4f}, AUC: {r:.4f}, Composite: {comp[i-1]:.4f}")
print(f"\n평균 Composite Score: {comp.mean():.4f}")

# 7) Hold-out test: refit on all train/validation rows, then score X_test
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]  # second probability column

acc_te, f1_te, auc_te = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_prob),
)
comp_te = (acc_te + f1_te + auc_te) / 3

print(
    "\nHold-out Test 결과",
    f"Accuracy : {acc_te:.4f}",
    f"F1 Score : {f1_te:.4f}",
    f"ROC AUC  : {auc_te:.4f}",
    f"Composite: {comp_te:.4f}",
    sep="\n",
)


5-Fold CV 결과
[Fold 1] Accuracy: 0.6622, F1: 0.6532, AUC: 0.7194, Composite: 0.6782
[Fold 2] Accuracy: 0.6489, F1: 0.6442, AUC: 0.7078, Composite: 0.6670
[Fold 3] Accuracy: 0.6436, F1: 0.6387, AUC: 0.7014, Composite: 0.6612
[Fold 4] Accuracy: 0.6579, F1: 0.6573, AUC: 0.7196, Composite: 0.6783
[Fold 5] Accuracy: 0.6427, F1: 0.6373, AUC: 0.7009, Composite: 0.6603

평균 Composite Score: 0.6690

Hold-out Test 결과
Accuracy : 0.6581
F1 Score : 0.6533
ROC AUC  : 0.7206
Composite: 0.6773


In [12]:
# Experiment: reduced column set where, of all kw_* columns, only
# kw_min_min, kw_max_max and kw_avg_avg are kept; scaled numerics, EBM.
data_path = '/content/drive/MyDrive/Colab Notebooks/패턴인식/'
df = pd.read_csv(data_path + 'train.csv')

y = df['y']

# Column removal: strip id/shares/target, then the redundant columns.
# errors='ignore' skips any listed name that is absent from the frame.
drop_cols = ['n_non_stop_words', 'global_rate_positive_words', 'global_rate_negative_words']
X = (
    df.drop(columns=['id', 'shares', 'y'])
      .drop(columns=drop_cols, errors='ignore')
)

# kw_ simplification: drop every kw_* column that is not on the keep list.
kw_cols = [name for name in X.columns if name.startswith('kw_')]
drop_kw = [
    name for name in kw_cols
    if name not in ['kw_min_min', 'kw_max_max', 'kw_avg_avg']
]
X = X.drop(columns=drop_kw)

# Train/validation vs. hold-out split (stratified, 80/20, fixed seed)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4) Preprocessing pipeline
cat_cols = ['data_channel', 'weekday']
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

numeric_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols),
])

# 5) Model pipeline: preprocessor followed by the EBM classifier
ebm_params = dict(interactions=10, learning_rate=0.01, random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', ExplainableBoostingClassifier(**ebm_params)),
])

# 6) 5-fold stratified cross-validation
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    pipe, X_trainval, y_trainval, scoring=scoring, cv=cv, n_jobs=-1
)

acc, f1, auc = (
    cv_results['test_accuracy'],
    cv_results['test_f1'],
    cv_results['test_roc_auc'],
)
# Per-fold composite: plain average of the three metrics.
comp = (acc + f1 + auc) / 3

print("5-Fold CV 결과")
for i in range(1, len(comp) + 1):
    a, f, r = acc[i-1], f1[i-1], auc[i-1]
    print(f"[Fold {i}] Accuracy: {a:.4f}, F1: {f:.4f}, AUC: {r:.4f}, Composite: {comp[i-1]:.4f}")
print(f"\n평균 Composite Score: {comp.mean():.4f}")

# 7) Hold-out test: refit on all train/validation rows, then score X_test
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]  # second probability column

acc_te, f1_te, auc_te = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_prob),
)
comp_te = (acc_te + f1_te + auc_te) / 3

print(
    "\nHold-out Test 결과",
    f"Accuracy : {acc_te:.4f}",
    f"F1 Score : {f1_te:.4f}",
    f"ROC AUC  : {auc_te:.4f}",
    f"Composite: {comp_te:.4f}",
    sep="\n",
)


5-Fold CV 결과
[Fold 1] Accuracy: 0.6610, F1: 0.6530, AUC: 0.7157, Composite: 0.6766
[Fold 2] Accuracy: 0.6399, F1: 0.6345, AUC: 0.6978, Composite: 0.6574
[Fold 3] Accuracy: 0.6425, F1: 0.6369, AUC: 0.6972, Composite: 0.6589
[Fold 4] Accuracy: 0.6613, F1: 0.6587, AUC: 0.7123, Composite: 0.6774
[Fold 5] Accuracy: 0.6433, F1: 0.6375, AUC: 0.6978, Composite: 0.6595

평균 Composite Score: 0.6660

Hold-out Test 결과
Accuracy : 0.6599
F1 Score : 0.6546
ROC AUC  : 0.7196
Composite: 0.6780


In [14]:
# Experiment: reduced column set plus three derived features
# (ref_link_ratio, media_density, selfref_dispersion), scaled numerics, EBM.
data_path = '/content/drive/MyDrive/Colab Notebooks/패턴인식/'
df = pd.read_csv(data_path + 'train.csv')

y = df['y']

# Column removal: strip id/shares/target, then the redundant columns.
# errors='ignore' skips any listed name that is absent from the frame.
drop_cols = ['n_non_stop_words', 'global_rate_positive_words', 'global_rate_negative_words']
X = (
    df.drop(columns=['id', 'shares', 'y'])
      .drop(columns=drop_cols, errors='ignore')
)

# Derived features (set 2). The +1 in each denominator avoids division by
# zero when the underlying count is 0.
X = X.assign(
    ref_link_ratio=X['num_self_hrefs'] / (X['num_hrefs'] + 1),
    media_density=(X['num_imgs'] + X['num_videos']) / (X['n_tokens_content'] + 1),
    selfref_dispersion=X['self_reference_max_shares'] - X['self_reference_min_shares'],
)

# Train/validation vs. hold-out split (stratified, 80/20, fixed seed)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4) Preprocessing pipeline
cat_cols = ['data_channel', 'weekday']
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

numeric_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols),
])

# 5) Model pipeline: preprocessor followed by the EBM classifier
ebm_params = dict(interactions=10, learning_rate=0.01, random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', ExplainableBoostingClassifier(**ebm_params)),
])

# 6) 5-fold stratified cross-validation
scoring = ['accuracy', 'f1', 'roc_auc']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    pipe, X_trainval, y_trainval, scoring=scoring, cv=cv, n_jobs=-1
)

acc, f1, auc = (
    cv_results['test_accuracy'],
    cv_results['test_f1'],
    cv_results['test_roc_auc'],
)
# Per-fold composite: plain average of the three metrics.
comp = (acc + f1 + auc) / 3

print("5-Fold CV 결과")
for i in range(1, len(comp) + 1):
    a, f, r = acc[i-1], f1[i-1], auc[i-1]
    print(f"[Fold {i}] Accuracy: {a:.4f}, F1: {f:.4f}, AUC: {r:.4f}, Composite: {comp[i-1]:.4f}")
print(f"\n평균 Composite Score: {comp.mean():.4f}")

# 7) Hold-out test: refit on all train/validation rows, then score X_test
pipe.fit(X_trainval, y_trainval)
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]  # second probability column

acc_te, f1_te, auc_te = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_prob),
)
comp_te = (acc_te + f1_te + auc_te) / 3

print(
    "\nHold-out Test 결과",
    f"Accuracy : {acc_te:.4f}",
    f"F1 Score : {f1_te:.4f}",
    f"ROC AUC  : {auc_te:.4f}",
    f"Composite: {comp_te:.4f}",
    sep="\n",
)


5-Fold CV 결과
[Fold 1] Accuracy: 0.6622, F1: 0.6544, AUC: 0.7221, Composite: 0.6796
[Fold 2] Accuracy: 0.6506, F1: 0.6461, AUC: 0.7078, Composite: 0.6682
[Fold 3] Accuracy: 0.6433, F1: 0.6381, AUC: 0.7011, Composite: 0.6608
[Fold 4] Accuracy: 0.6616, F1: 0.6608, AUC: 0.7206, Composite: 0.6810
[Fold 5] Accuracy: 0.6464, F1: 0.6413, AUC: 0.7019, Composite: 0.6632

평균 Composite Score: 0.6706

Hold-out Test 결과
Accuracy : 0.6644
F1 Score : 0.6570
ROC AUC  : 0.7239
Composite: 0.6818
