In [7]:
# --- Imports ---
import pandas as pd
import numpy as np
import os
import warnings
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import plotly.figure_factory as ff



# --- Suppress Warnings ---
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# --- Drug-side effect mapping ---
drug_side_effects = {
    'Amiodarone': ['Bradycardia', 'Hypotension', 'Hypothyroidism'],
    'Aspirin': ['Hematochezia'],
    'Atenolol': ['Palpitations', 'Asthma', 'Vertigo', 'Cold extremities'],
    'Captopril': ['Hypotension'],
    'Dexmedetomidine': ['Bradycardia'],
    'Sotalol': ['Bradycardia'],
    'Flecainide': ['Ventricular dysfunction'],
    'Furosemide': ['Hypokalemia', 'Hypovolemia', 'Bone fractures'],
    'Heparin': ['Thrombocytopenia', 'Postoperative bleeding'],
    'Indomethacin': ['Necrotizing enterocolitis', 'Gastrointestinal perforation', 'Oliguria',
                     'Anuria', 'Gastrointestinal hemorrhage', 'Intracerebral hemorrhage',
                     'Elevation of serum creatinine', 'Thrombocytopenia'],
    'Ibuprofen': ['Necrotizing enterocolitis', 'Tachypnoea', 'Retinopathy of prematurity',
                  'Intraventricular hemorrhage', 'Gastrointestinal hemorrhage'],
    'Iloprost': ['Facial flushing'],
    'Sildenafil': ['Facial flushing'],
    'Tadalafil': ['Headache'],
    'Prostaglandin E1': ['Apnea', 'Hypoventilation', 'Fever', 'Hyperthermia', 'Facial flushing']
}

# --- Feature combinations ---
feature_combinations = {
    'Clinical Only': ['clinical'],
    'Clinical+Chemical': ['clinical', 'chemical'],
    'Clinical+Target': ['clinical', 'target'],
    'Clinical+Enzyme': ['clinical', 'enzyme'],
    'Clinical+Pathway': ['clinical', 'pathway'],
    'Clinical+Target+Enzyme': ['clinical', 'target', 'enzyme'],
    'Clinical+Target+Pathway': ['clinical', 'target', 'pathway'],
    'Clinical+Enzyme+Pathway': ['clinical', 'enzyme', 'pathway'],
    'Clinical+Therapeutic': ['clinical', 'therapeutic'],
    'Clinical+Phenotype': ['clinical', 'phenotype'],
    'Clinical+Therapeutic+Phenotype': ['clinical', 'therapeutic', 'phenotype']
}

def load_data():
    filepath =  'balanced_dataset2.xlsx'
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Data file not found at: {filepath}")
    
    df = pd.read_excel(filepath)
    df = df.dropna(subset=['Drug name'])

    if 'Age' in df.columns and 'Weight' in df.columns:
        df['Age_Weight_Ratio'] = df['Age'] / (df['Weight'].replace(0, 0.1))
    
    return df

def preprocess_data(df, drug_name, target_col, feature_groups):
    drug_data = df[df['Drug name'] == drug_name].copy()
    available_features = {
        'clinical': ['Age', 'Sex', 'Weight', 'Comorbidity', 'Polypharmacy', 'Age_Weight_Ratio'],
        'chemical': ['Chemical Class'],
        'target': ['Target Protein'],
        'enzyme': ['Enzyme'],
        'pathway': ['Pathway Category'],
        'therapeutic': ['Therapeutic Class'],
        'phenotype': ['Phenotype']
    }

    selected_features = []
    for group in feature_groups:
        if group in available_features:
            selected_features.extend([f for f in available_features[group] if f in drug_data.columns])
    
    if not selected_features:
        raise ValueError(f"No valid features found for groups: {feature_groups}")
    
    X = drug_data[selected_features]
    y = drug_data[target_col].astype(int)
    
    if len(y.unique()) < 2:
        raise ValueError(f"Insufficient classes in target '{target_col}' for drug '{drug_name}'")
    
    return X, y

def create_model_pipeline(model, numeric_features, categorical_features):
    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
    return ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

def evaluate_models_and_plot(X, y, drug, side_effect, feature_name):
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

     
    models = {
        'SVM': SVC(probability=True, random_state=42),
        'MLP': MLPClassifier(random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Logistic': LogisticRegression(random_state=42, class_weight='balanced'),
        'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    }

    fig = go.Figure()
    colors = px.colors.qualitative.Set1
    aucs = []
    metrics_table = []

    for i, (name, model) in enumerate(models.items()):
        try:
            param_grid = {
                'SVM': {'classifier__C': [0.1, 1, 10], 'classifier__gamma': ['scale', 'auto']},
                'MLP': {'classifier__hidden_layer_sizes': [(150,), (100,)], 'classifier__alpha': [0.0001, 0.001]},
                'KNN': {'classifier__n_neighbors': [3, 5, 7]},
                'Logistic': {'classifier__C': [0.1, 1, 10]},
                'Random Forest': {'classifier__n_estimators': [100, 200], 'classifier__max_depth': [10, 20]},
            }
       
            base_pipeline = create_model_pipeline(model, numeric_features, categorical_features)
            search = GridSearchCV(base_pipeline, param_grid[name], cv=5, scoring='roc_auc', n_jobs=-1)
            search.fit(X, y)
            pipeline = search.best_estimator_
            best_model = pipeline.named_steps['classifier']

            y_score = pipeline.predict_proba(X)[:, 1]
            fpr, tpr, _ = roc_curve(y, y_score)
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)

            y_pred = pipeline.predict(X)
            tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
            sensitivity = tp / (tp + fn)
            specificity = tn / (tn + fp)
            accuracy = (tp + tn) / (tp + tn + fp + fn)
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn)
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            metrics_table.append([name, sensitivity, specificity, accuracy, recall, f1, roc_auc])

            fig.add_trace(go.Scatter(
                x=fpr, y=tpr, mode='lines',
                name=f'{name} (AUC={roc_auc:.2f})',
                line=dict(color=colors[i % len(colors)], width=2)
            ))
            print(f"        {name}: AUC = {roc_auc:.3f}")
        except Exception as e:
            print(f"        Error with {name}: {str(e)}")

    fig.add_trace(go.Scatter(
        x=[0, 1], y=[0, 1], mode='lines',
        name='Random (AUC=0.50)', line=dict(color='black', dash='dash')
    ))
    fig.update_layout(
        title=f'{drug} - {side_effect} [{feature_name}]',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        legend=dict(orientation='h', y=-0.2),
        template='plotly_white',
        height=500
    )

    out_name = f'AUROC_{drug}_{side_effect}_{feature_name}.html'.replace(" ", "_")
    fig.write_html(out_name)
    print(f"        Saved plot to: {out_name}")

    metrics_df = pd.DataFrame(metrics_table, columns=['Model', 'Sensitivity', 'Specificity', 'Accuracy', 'Recall', 'F1 Score', 'AUROC'])
    print("\nMetrics Table:")
    print(metrics_df)

    # Heatmap of AUCs
    heatmap_df = pd.DataFrame([aucs], columns=models.keys(), index=[side_effect])
    heatmap_fig = ff.create_annotated_heatmap(
        z=heatmap_df.values.tolist(),
        x=heatmap_df.columns.tolist(),
        y=heatmap_df.index.tolist(),
        colorscale='Blues',
        showscale=True
    )
    heatmap_fig.update_layout(
        title_text=f'AUC Heatmap: {drug} - {side_effect}',
        margin=dict(t=50, l=100)
    )
    heatmap_fig.write_html(f"Heatmap_{drug}_{side_effect}_{feature_name}.html".replace(" ", "_"))
    
    return metrics_df

def main():
    df = load_data()
    print("✅ Data loaded. Shape:", df.shape)

    for drug, side_effects in drug_side_effects.items():
        print(f"\n🔬 Drug: {drug}")
        for side_effect in side_effects:
            print(f"  ▶ Side effect: {side_effect}")
            for feature_name, feature_groups in feature_combinations.items():
                print(f"    🔹 Feature Set: {feature_name}")
                try:
                    X, y = preprocess_data(df, drug, side_effect, feature_groups)
                    evaluate_models_and_plot(X, y, drug, side_effect, feature_name)
                except Exception as e:
                    print(f"    ⚠ Skipping: {e}")

if __name__ == '__main__':
    main()

✅ Data loaded. Shape: (3592, 45)

🔬 Drug: Amiodarone
  ▶ Side effect: Bradycardia
    🔹 Feature Set: Clinical Only
        SVM: AUC = 0.844


KeyboardInterrupt: 