In [None]:

---

### 6. **`molecular_activity_prediction.ipynb`**

```python
# Cell 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from src.data_preprocessing import prepare_data
from src.model_training import train_random_forest, train_xgboost, evaluate_model
from src.utils import balance_data

# Cell 2: Load data
df = prepare_data("data/fingerprints.csv")

# Cell 3: Features and target
X = df.drop(columns=["NR-AhR"])
y = df["NR-AhR"]

# Cell 4: Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Cell 5: Balance training data with SMOTE
X_train_bal, y_train_bal = balance_data(X_train, y_train)

# Cell 6: Train Random Forest
print("Starting Random Forest Grid Search...")
rf_model, rf_params, rf_score = train_random_forest(X_train_bal, y_train_bal)
print(f"Best RF parameters: {rf_params}")
print(f"Best RF CV accuracy: {rf_score:.4f}")

# Cell 7: Evaluate Random Forest
print("Random Forest Classification Report:")
evaluate_model(rf_model, X_test, y_test)

# Cell 8: Train XGBoost
print("Starting XGBoost Grid Search...")
xgb_model, xgb_params, xgb_score = train_xgboost(X_train_bal, y_train_bal)
print(f"Best XGB parameters: {xgb_params}")
print(f"Best XGB CV accuracy: {xgb_score:.4f}")

# Cell 9: Evaluate XGBoost
print("XGBoost Classification Report:")
evaluate_model(xgb_model, X_test, y_test)


In [1]:
# Install SHAP if not already installed
# pip install shap

import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Load your data
df = pd.read_csv("data/fingerprints.csv")  # Update the path if needed
X = df.drop(columns=["NR-AhR"])            # Feature columns
y = df["NR-AhR"]                           # Target column

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
model = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=5, min_samples_leaf=1, random_state=42)
model.fit(X_train, y_train)

# SHAP analysis
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Plot SHAP summary for class 0 (non-toxic by default)
plt.title("SHAP Summary Plot for Class 0 (Non-toxic)")
shap.summary_plot(shap_values[0], X_test)


ValueError: Input y contains NaN.