# Toxicity Prediction Ensemble Notebook
This notebook reproduces the full functionality of `tox_pred.py`.
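It loads four drug feature datasets (chemical, fingerprint, target, and SAGES features), trains one random-forest base model per dataset, and stacks the base models' predicted probabilities in a TPOT-optimized ensemble classifier to predict drug withdrawal risk.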

In [None]:
!pip install pandas numpy scikit-learn tpot

In [1]:
import numpy as np
import pandas as pd
import os, glob, math, random, statistics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score,
    precision_score, recall_score, matthews_corrcoef
)
from sklearn.feature_selection import VarianceThreshold, GenericUnivariateSelect, chi2
from tpot import TPOTClassifier

AttributeError: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
# Read the label table and the four feature tables from the working directory.
# The tuple on the right is consumed in order, so each file is bound to the
# name in the matching position on the left.
labels, drug_features, fp, targets, sages = (
    pd.read_csv(csv_path)
    for csv_path in (
        'tox_labels.csv',
        'drug_features.csv',
        'fp.csv',
        'targetsall.csv',
        'sages.csv',
    )
)
print('Datasets loaded successfully.')

In [None]:
def prepare_data(features, labels, label_col=None, test_size=0.3, random_state=42):
    """Split one feature table and its labels into stratified train/test parts.

    Args:
        features: DataFrame of features; a 'Drug_ID' column, if present, is
            dropped before splitting.
        labels: DataFrame holding the target column.
            NOTE(review): rows are paired with `features` purely by position,
            not by 'Drug_ID' -- confirm both tables share the same row order.
        label_col: Name of the target column in `labels`; defaults to the
            last column of `labels`.
        test_size: Passed through to `train_test_split`.
        random_state: Seed passed through to `train_test_split`.

    Returns:
        Tuple `(X_train, X_test, y_train, y_test)`.
    """
    target_name = labels.columns[-1] if label_col is None else label_col
    y = labels[target_name]

    # Either branch yields a new frame, so the caller's `features` is untouched.
    if 'Drug_ID' in features.columns:
        X = features.drop(columns=['Drug_ID'])
    else:
        X = features.copy()

    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    return tuple(parts)

def select_features(X_train, X_test, threshold=0.1):
    """Filter features with a `VarianceThreshold` fitted on the training split.

    Args:
        X_train: Training features; the selector is fitted on these only.
        X_test: Test features; reduced to the same columns chosen from X_train.
        threshold: Variance threshold handed to `VarianceThreshold`.

    Returns:
        Tuple `(X_train_sel, X_test_sel)` of the transformed train and test
        features.
    """
    variance_filter = VarianceThreshold(threshold=threshold).fit(X_train)
    return variance_filter.transform(X_train), variance_filter.transform(X_test)

def train_rf(X_train, X_test, y_train, y_test, model_id, output_dir='ensemble_train_603010'):
    """Fit a random-forest base model and save its level-2 (stacking) features.

    Two single-column ('pred') CSV files are written to `output_dir`:
    `{model_id}-level2_train.csv` and `{model_id}-level2_test.csv`, each
    holding the predicted probability of class index 1, one row per sample in
    the order of `X_train` / `X_test`.

    The training file holds out-of-bag probabilities rather than in-sample
    ones. In-sample forest predictions come from trees that have already seen
    each row's label, so a meta-model trained on them learns from leaked
    labels and from scores unlike those it will see at test time.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels (binary; column 1 of the probability output
            is used).
        y_test: Unused; kept so existing callers keep working.
        model_id: Identifier used as the output file-name prefix.
        output_dir: Directory for the CSV files, created if missing. The
            default is the directory the ensemble cell reads from.

    Returns:
        The fitted `RandomForestClassifier`.
    """
    # oob_score=True only adds the out-of-bag estimates after fitting; it
    # requires bootstrap sampling, which is the classifier's default.
    rf = RandomForestClassifier(n_estimators=200, random_state=42, oob_score=True)
    rf.fit(X_train, y_train)

    oob_pred = rf.oob_decision_function_[:, 1]
    # A row that was never out-of-bag has NaN here (very unlikely with 200
    # trees); fall back to the in-sample probability for those rows only.
    in_sample_pred = rf.predict_proba(X_train)[:, 1]
    pred_train = np.where(np.isnan(oob_pred), in_sample_pred, oob_pred)
    pred_test = rf.predict_proba(X_test)[:, 1]

    os.makedirs(output_dir, exist_ok=True)
    pd.DataFrame(pred_train, columns=['pred']).to_csv(
        os.path.join(output_dir, f'{model_id}-level2_train.csv'), index=False)
    pd.DataFrame(pred_test, columns=['pred']).to_csv(
        os.path.join(output_dir, f'{model_id}-level2_test.csv'), index=False)
    print(f'Model {model_id} trained and saved.')
    return rf

In [None]:
# Feature tables and their display names, paired by position.
datasets = [drug_features, fp, targets, sages]
names = ['Chemical', 'Fingerprint', 'Targets', 'SAGES']
models = []

# Each table's position doubles as the model id passed to train_rf.
for i, data, name in zip(range(len(datasets)), datasets, names):
    print(f'\nTraining base model for {name} features...')
    # prepare_data is called with its defaults for every table.
    X_train, X_test, y_train, y_test = prepare_data(data, labels)
    X_train_sel, X_test_sel = select_features(X_train, X_test)
    model = train_rf(X_train_sel, X_test_sel, y_train, y_test, i)
    models.append(model)

print('All base models trained.')

In [None]:
# Level-2 (stacking) features: one column of predicted probabilities per base
# model, read back from the CSV files written by train_rf.
train_files = sorted(glob.glob('ensemble_train_603010/*train.csv'))
test_files = sorted(glob.glob('ensemble_train_603010/*test.csv'))
if not train_files or len(train_files) != len(test_files):
    raise ValueError(
        f'Expected matching level-2 train/test files, found '
        f'{len(train_files)} train and {len(test_files)} test file(s).')
X_train_ens = np.column_stack([pd.read_csv(f)['pred'].values for f in train_files])
X_test_ens = np.column_stack([pd.read_csv(f)['pred'].values for f in test_files])

# The prediction files follow the row order of the shuffled, stratified split
# made in prepare_data, so slicing `labels` by position would pair predictions
# with the wrong drugs. The split's row selection depends only on the number of
# rows, the stratification labels, test_size and random_state, so repeating it
# with prepare_data's defaults (test_size=0.3, random_state=42, last label
# column) yields labels in exactly the order of the saved predictions.
y_all = labels.iloc[:, -1]
y_train, y_test = train_test_split(
    y_all, test_size=0.3, random_state=42, stratify=y_all)
if len(y_train) != X_train_ens.shape[0] or len(y_test) != X_test_ens.shape[0]:
    raise ValueError(
        'Label split does not match the saved base-model predictions: '
        f'{len(y_train)}/{len(y_test)} labels vs '
        f'{X_train_ens.shape[0]}/{X_test_ens.shape[0]} prediction rows.')

tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, random_state=42)
tpot.fit(X_train_ens, y_train)
ensemble_preds = tpot.predict(X_test_ens)

# ROC-AUC is a ranking metric and needs continuous scores; hard class labels
# collapse the curve to a single operating point. Fall back to the hard
# predictions only if the selected pipeline cannot produce probabilities.
try:
    ensemble_scores = tpot.predict_proba(X_test_ens)[:, 1]
except (AttributeError, RuntimeError):
    ensemble_scores = ensemble_preds

print('\n=== Ensemble Evaluation ===')
print('Accuracy:', accuracy_score(y_test, ensemble_preds))
print('ROC-AUC:', roc_auc_score(y_test, ensemble_scores))
print('F1:', f1_score(y_test, ensemble_preds))
print('Precision:', precision_score(y_test, ensemble_preds))
print('Recall:', recall_score(y_test, ensemble_preds))
print('MCC:', matthews_corrcoef(y_test, ensemble_preds))
tpot.export('best_ensemble_pipeline.py')
print('Best ensemble pipeline exported.')