In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import lightgbm as lgb

# Load the descriptor table (path is relative to the current working directory).
df = pd.read_csv("Desktop/shireen/finaldescriptors.csv")

# Target is the 'Label' column; features are everything except 'SMILES' and 'Label'.
y = df['Label']
X = df.drop(['SMILES', 'Label'], axis=1)

# Train-Test Split
# Hold out 20% of the rows, keeping the class proportions of `y` in both parts;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize Data for SVM
# The scaler learns its statistics from the training rows only and the same
# transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Define Base Classifiers
# Untuned estimators with fixed seeds; their hyper-parameters are filled in by
# the grid searches further down.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
# Use the scikit-learn style `n_estimators` rather than its LightGBM alias
# `num_iterations`: an explicitly passed alias takes precedence over
# `n_estimators`, so any `n_estimators` value chosen by a grid search would be
# ignored (LightGBM only emits a warning about it).
lightgbm = lgb.LGBMClassifier(random_state=42, n_estimators=100)
# probability=True so the SVM exposes predict_proba.
svm = SVC(probability=True, random_state=42)

# Grid Search Parameters (Reduced Search Space)
# Each mapping goes from a hyper-parameter name to the list of candidate values.

# LightGBM: a single candidate per parameter.
param_grid_lightgbm = dict(
    num_leaves=[31],
    learning_rate=[0.1],
    n_estimators=[100],
)

# SVM: RBF kernel only, 2 x 2 combinations of C and gamma.
param_grid_svm = dict(
    C=[1, 10],
    gamma=[0.01, 0.1],
    kernel=['rbf'],
)

# Random forest: only max_depth varies (10 or unlimited).
param_grid_rf = dict(
    n_estimators=[100],
    max_depth=[10, None],
    min_samples_split=[2],
    min_samples_leaf=[1],
)

# Perform Grid Search (Parallel Processing Enabled)
# The three searches share the same settings: 5-fold cross-validation,
# accuracy as the selection metric, and n_jobs=-1 for parallel execution.
search_settings = dict(cv=5, scoring='accuracy', n_jobs=-1)

grid_rf = GridSearchCV(rf, param_grid_rf, **search_settings)
grid_lightgbm = GridSearchCV(lightgbm, param_grid_lightgbm, **search_settings)
grid_svm = GridSearchCV(svm, param_grid_svm, **search_settings)

# Train Models
# Fit each search on the training split, one after another.
for search in (grid_rf, grid_lightgbm, grid_svm):
    search.fit(X_train, y_train)

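# Early stopping for LightGBM is currently disabled (the call below is commented out).
# NOTE: as written it would early-stop against the held-out test set, which leaks
# test data into training; use a validation split taken from the training data
# if this is ever re-enabled.
#lightgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False)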

# Stacking Classifier
# Base learners are the best estimators found by the three grid searches.
base_learners = [
    ('lightgbm', grid_lightgbm.best_estimator_),
    ('svm', grid_svm.best_estimator_),
    ('rf', grid_rf.best_estimator_),
]
# A logistic regression combines the base learners' outputs.
meta_learner = LogisticRegression(max_iter=500)
# Shuffled, stratified 5-fold splitter with a fixed seed for reproducibility.
stacking_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=stacking_folds,
)

# Train Stacking Model
stacking_clf.fit(X_train, y_train)

# Evaluate Models
# Score every tuned base model and the stacked ensemble on the held-out test
# split, printing accuracy, ROC AUC and the per-class report for each.
models = {
    'Random Forest': grid_rf.best_estimator_,
    'LightGBM': grid_lightgbm.best_estimator_,
    'SVM': grid_svm.best_estimator_,
    'Stacking Classifier': stacking_clf
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # ROC AUC needs a continuous score rather than hard labels.
    # Prefer the probability in column 1 (this indexing assumes a binary
    # target); if a model exposes no predict_proba, fall back to its
    # decision_function values, which roc_auc_score also accepts, instead of
    # dropping the metric. Only report "N/A" when neither is available.
    if hasattr(model, "predict_proba"):
        auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    elif hasattr(model, "decision_function"):
        auc = roc_auc_score(y_test, model.decision_function(X_test))
    else:
        auc = "N/A"

    print(f'\n{name} Accuracy: {accuracy:.2f}')
    print(f'{name} AUC-ROC: {auc}')
    print(f'{name} Classification Report:\n{classification_report(y_test, y_pred)}')
