
# 화합물 독성 예측 (최종 버전) — 전체 데이터로 학습 + 예측 CSV 생성

이 노트북은 **train.csv** 전체로 학습하고, **predict_input.csv**에 대해 독성 예측 결과를 생성합니다.

- NaN/Inf 안전 처리 (라벨/피처)
- SimpleImputer(median) + StandardScaler + (옵션) 다항식 특성: 피처가 2개 이상일 때만 2차 상호작용 항(interaction-only)을 추가
- 모델 후보: LogisticRegression, RandomForest, HistGradientBoosting, SVC
- 성능 비교 후 F1 기준 최적 모델 선택
- `predict_output.csv`로 예측 저장


In [None]:

# !pip install -q scikit-learn pandas numpy matplotlib joblib

import os, joblib, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, classification_report,
                             confusion_matrix)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC

# Seed value passed as `random_state` to the estimators, the CV splitter and
# the randomized searches in this notebook.
RANDOM_STATE = 42
# Also seed NumPy's global RNG for any code that draws from it without an
# explicit random_state.
np.random.seed(RANDOM_STATE)


## 1) 데이터 로드 및 전처리

In [None]:

# Locate the input files: candidate locations are probed in order and the
# first one that exists on disk is used.
def _first_existing(candidates):
    """Return the first path in *candidates* that exists, or None if none do."""
    return next((p for p in candidates if Path(p).exists()), None)


train_path = _first_existing(["/mnt/data/train.csv", "./train.csv", "/content/train.csv"])
pred_path = _first_existing(["/mnt/data/predict_input.csv", "./predict_input.csv", "/content/predict_input.csv"])

# Raise explicitly rather than `assert`: assertions are stripped when Python
# runs with -O, which would let a missing file slip through to pd.read_csv.
if not train_path:
    raise FileNotFoundError("train.csv 파일이 필요합니다.")
if not pred_path:
    raise FileNotFoundError("predict_input.csv 파일이 필요합니다.")

print(f"[train 파일] {train_path}")
print(f"[predict 파일] {pred_path}")

train_df = pd.read_csv(train_path)
pred_df = pd.read_csv(pred_path)

# Label clean-up: coerce to numeric (unparseable values become NaN) and keep
# only rows whose label is exactly 0 or 1.
train_df['label_num'] = pd.to_numeric(train_df['label'], errors='coerce')
valid_mask = train_df['label_num'].isin([0, 1])
print("유효하지 않은 라벨 제거:", (~valid_mask).sum())
train_df = train_df[valid_mask].copy()
y = train_df['label_num'].astype(int)

# Stratified CV and every classifier below need both classes to be present;
# fail here with a clear message instead of deep inside scikit-learn.
if y.nunique() < 2:
    raise ValueError(
        "Training labels must contain both classes 0 and 1 after cleaning; "
        f"found: {sorted(y.unique().tolist())}"
    )

# Feature selection: use whichever of the four descriptor columns exist, plus
# the MolWt * clogp product when both of its source columns are available.
base_cols = [c for c in ['MolWt', 'clogp', 'sa_score', 'qed'] if c in train_df.columns]
if all(c in train_df.columns for c in ['MolWt', 'clogp']):
    train_df['MW_x_clogP'] = train_df['MolWt'] * train_df['clogp']
    base_cols.append('MW_x_clogP')

# An empty feature list would otherwise produce a zero-column matrix and an
# obscure error at fit time.
if not base_cols:
    raise ValueError(
        "No usable feature columns found in train.csv; expected at least one of "
        "MolWt, clogp, sa_score, qed."
    )

# +/-inf are turned into NaN so that the median imputer downstream handles them.
X = train_df[base_cols].replace([np.inf, -np.inf], np.nan)
print("사용 피처:", base_cols)
print("결측치 개수:\n", X.isna().sum())


## 2) 모델 학습 및 성능 평가 (CV 기반)

In [None]:

# Import retained so that any other cell relying on it keeps working; this
# cell itself reads the CV score straight from the search object.
from sklearn.model_selection import cross_val_score


def _make_preprocess():
    """Build a fresh, unfitted preprocessing pipeline.

    Steps: median imputation -> standardisation -> (only when there are at
    least two features) degree-2 interaction-only terms without a bias column.
    """
    steps = [('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]
    if len(base_cols) > 1:
        steps.append(('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)))
    return Pipeline(steps=steps)


# Standalone instance kept under the original name for any cell that uses it.
preprocess = _make_preprocess()

# Candidate classifiers. Each one is paired with its OWN preprocessing
# pipeline so that fitting one candidate can never mutate state shared with
# another (a single shared instance only works as long as every consumer
# clones it first).
classifiers = {
    "LogReg": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "RF": RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=RANDOM_STATE),
    "HGB": HistGradientBoostingClassifier(learning_rate=0.1, random_state=RANDOM_STATE),
    # probability=True is required because predict_proba is called on the
    # selected model later in the notebook.
    "SVC": SVC(C=1.0, kernel='rbf', probability=True, random_state=RANDOM_STATE),
}
pipelines = {
    name: Pipeline([("prep", _make_preprocess()), ("clf", clf)])
    for name, clf in classifiers.items()
}

# Hyper-parameter search spaces, keyed by the same names as `pipelines`.
param_spaces = {
    "LogReg": {"clf__C": np.logspace(-2, 2, 10)},
    "RF": {"clf__n_estimators": [200, 300, 400], "clf__max_depth": [None, 5, 10, 15], "clf__min_samples_leaf": [1, 2, 4]},
    "HGB": {"clf__learning_rate": [0.05, 0.1, 0.2], "clf__max_depth": [None, 5, 10], "clf__l2_regularization": [0.0, 0.01, 0.1]},
    "SVC": {"clf__C": np.logspace(-2, 2, 6), "clf__gamma": ["scale", "auto"]},
}

results, best_models = {}, {}
# Fixed random_state => the same five stratified folds for every candidate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for name, pipe in pipelines.items():
    print(f"\n=== {name} RandomizedSearchCV (CV=5) ===")
    search = RandomizedSearchCV(pipe, param_spaces[name], n_iter=10, scoring='f1', n_jobs=-1, cv=cv, random_state=RANDOM_STATE, verbose=1)
    search.fit(X, y)
    best_models[name] = search.best_estimator_
    # best_score_ is the mean F1 of the winning setting over the `cv` folds.
    # Re-running cross_val_score on the same deterministic splitter would
    # repeat those five fits only to arrive at the same number.
    # NOTE: this score also selected the hyper-parameters, so treat it as a
    # relative ranking between candidates, not an unbiased estimate.
    cv_f1 = float(search.best_score_)
    print(f"[{name}] best params:", search.best_params_)
    print(f"[{name}] mean CV F1: {cv_f1:.4f}")
    results[name] = {"cv_f1": cv_f1}

# One row per candidate, best mean CV F1 first.
summary = pd.DataFrame(results).T.sort_values("cv_f1", ascending=False)
summary


## 3) 최고 모델 선택 및 전체 데이터 재학습

In [None]:

# `summary` is sorted by cv_f1 in descending order, so its first index label
# names the top-scoring candidate.
best_name = summary.index[0]
best_model = best_models[best_name]
print(f"[BEST MODEL] {best_name}")

# Final fit on the complete training set, persisted in the same step
# (Pipeline.fit returns the fitted pipeline itself).
joblib.dump(best_model.fit(X, y), "best_toxicity_model.joblib")
print("모델 저장 완료: best_toxicity_model.joblib")


## 4) 예측 수행 및 저장

In [None]:

def prepare_features(df_in: pd.DataFrame, base_cols=None) -> pd.DataFrame:
    """Build the model's feature matrix from a raw input frame.

    Args:
        df_in: Raw input rows. It is copied, never modified.
        base_cols: Exact, ordered list of feature columns to return (normally
            the list used at training time). When None, the columns are
            auto-detected: whichever of MolWt / clogp / sa_score / qed are
            present, plus the derived ``MW_x_clogP`` if both MolWt and clogp
            exist.

    Returns:
        A new DataFrame holding exactly the selected columns, with +/-inf
        replaced by NaN.

    Raises:
        KeyError: If a requested column is neither present in ``df_in`` nor
            derivable from it.
    """
    derived = 'MW_x_clogP'
    df_out = df_in.copy()
    can_derive = all(c in df_out.columns for c in ['MolWt', 'clogp'])

    if base_cols is None:
        cols = [c for c in ['MolWt', 'clogp', 'sa_score', 'qed'] if c in df_out.columns]
        if can_derive:
            cols.append(derived)
    else:
        # An explicit list is honoured as-is. Silently appending the derived
        # column here could hand the model a different set of features from
        # the one it was fitted on.
        cols = list(base_cols)

    if derived in cols and can_derive:
        df_out[derived] = df_out['MolWt'] * df_out['clogp']

    missing = [c for c in cols if c not in df_out.columns]
    if missing:
        raise KeyError(f"Input data is missing required feature columns: {missing}")

    return df_out[cols].replace([np.inf, -np.inf], np.nan)

# Build the prediction matrix from the feature list in `base_cols`.
X_pred = prepare_features(pred_df, base_cols)

# Hard labels, plus the second column of predict_proba as the score.
pred_label = best_model.predict(X_pred)
pred_proba = best_model.predict_proba(X_pred)[:, 1]

# Attach both prediction columns to a copy of the input rows and write the
# result without the index column.
out_df = pred_df.assign(pred_label=pred_label, pred_proba=pred_proba)
out_df.to_csv("predict_output.csv", index=False)
print("✅ 예측 결과 저장 완료: predict_output.csv")
out_df.head()
