In [5]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from time import time
import re
import seaborn as sns
import torch
import pickle

# Display settings: never truncate DataFrame rows or columns when printing,
# and show numpy floats with 15 digits of precision.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
np.set_printoptions(precision=15)

# Master seed; the per-run seeds are drawn from a generator seeded with it.
BASE_SEED = 280_903_571

In [6]:

class ChemistryAwareAnalysis:
    """
    Analyze chemical reaction data with machine learning.

    The class loads per-molecule embeddings and a reaction table, builds one
    feature vector per reaction by concatenating the embeddings of its four
    molecules, splits the reactions with a stratified split over quantile bins
    of the scaled ddG target, and fits an XGBoost regressor through a
    randomized hyper-parameter search. Results from several random seeds can
    be aggregated, plotted, written to a text report and cached with pickle.
    """

    # Reaction-table columns holding molecule IDs. Their order defines the
    # layout of the concatenated feature vector built in prepare_data().
    MOLECULE_ID_COLUMNS = ('catalyst_id', 'imine_id', 'thiol_id', 'product_id')

    # Scalar metrics stored per seed by train_and_evaluate() and aggregated
    # (mean / standard deviation) for plots and reports.
    METRIC_KEYS = ('train_r2', 'test_r2', 'train_mae', 'test_mae')

    def __init__(self, embeddings_path, data_path):
        """
        Load embeddings and reaction data and fit the target scalers.

        Args:
            embeddings_path (str): Path to the JSON file containing molecular embeddings
            data_path (str): Path to the CSV file containing reaction data
        """
        print("Initializing ChemistryAwareAnalysis...")
        self.embeddings = self.load_embeddings(embeddings_path)
        print(f"Loaded embeddings for {len(self.embeddings)} molecules")

        # Read the ID columns as strings so they compare equal to the
        # (string) embedding keys even when they look numeric.
        self.Y_df = pd.read_csv(
            data_path,
            dtype={column: str for column in self.MOLECULE_ID_COLUMNS}
        )
        print(f"Loaded data with {len(self.Y_df)} entries")

        # The scalers are fitted on every row of the table (including rows
        # that prepare_data() later drops for lack of embeddings). Metrics
        # are always computed after inverse-transforming to original units.
        self.ee_scaler = MinMaxScaler()
        self.ddg_scaler = MinMaxScaler()

        self.ee_scaler.fit(self.Y_df['selectivity_ee_percent'].values.reshape(-1, 1))
        self.ddg_scaler.fit(self.Y_df['selectivity_ddGact_kcal'].values.reshape(-1, 1))

        # Print data ranges
        print("\nData Ranges (Original Scale):")
        print(f"ddG range: [{self.Y_df['selectivity_ddGact_kcal'].min():.2f}, "
              f"{self.Y_df['selectivity_ddGact_kcal'].max():.2f}] kcal/mol")
        print(f"ee% range: [{self.Y_df['selectivity_ee_percent'].min():.2f}, "
              f"{self.Y_df['selectivity_ee_percent'].max():.2f}]%")

        # List of per-seed result dicts; filled by the caller or load_results().
        self.stored_results = None

    @staticmethod
    def load_embeddings(file_path):
        """
        Load embeddings from a JSON file and normalize their keys.

        A leading ``family<digits>_`` prefix is removed from every key. If two
        raw keys map to the same stripped key, the later entry wins (as
        before) and a single warning line reports how many keys collided.

        Args:
            file_path (str): Path to a JSON object mapping key -> list of numbers.

        Returns:
            dict: stripped key -> numpy array.
        """
        print(f"Loading embeddings from {file_path}")
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_embeddings = json.load(f)

        # Clean embedding keys by removing family prefix
        embeddings = {}
        collisions = 0
        family_pattern = re.compile(r'^family\d+_')
        for key, value in raw_embeddings.items():
            stripped_key = family_pattern.sub('', key)
            if stripped_key in embeddings:
                collisions += 1
            embeddings[stripped_key] = np.array(value)

        if collisions:
            print(f"Warning: {collisions} embedding keys collided after removing "
                  f"the family prefix; later entries overwrote earlier ones")

        return embeddings

    @staticmethod
    def _require_results(all_results):
        """Raise ValueError when there are no per-seed results to aggregate."""
        if not all_results:
            raise ValueError(
                "No results available: run the training or load a results file first."
            )

    @staticmethod
    def _aggregate_metrics(all_results):
        """Return {metric: (mean, std)} over the per-seed results for METRIC_KEYS."""
        summary = {}
        for key in ChemistryAwareAnalysis.METRIC_KEYS:
            values = np.array([r[key] for r in all_results], dtype=float)
            summary[key] = (float(np.mean(values)), float(np.std(values)))
        return summary

    def prepare_data(self):
        """
        Build the feature matrix and scaled targets for all usable reactions.

        A reaction is usable when all four of its molecule IDs have an
        embedding; its feature vector is the concatenation of those
        embeddings in MOLECULE_ID_COLUMNS order. Reactions with a missing
        embedding are skipped and the missing IDs are reported.

        Returns:
            tuple: (X, Y, ee_values, reaction_handles) as numpy arrays, where
            Y is the scaled ddG and ee_values the scaled ee%.

        Raises:
            ValueError: If no reaction has embeddings for all its molecules.
        """
        print("\nPreparing data...")
        X_data = []
        kept_positions = []
        missing_molecules = set()

        id_rows = self.Y_df[list(self.MOLECULE_ID_COLUMNS)].itertuples(index=False, name=None)
        for position, molecule_ids in enumerate(id_rows):
            missing_ids = [mol_id for mol_id in molecule_ids if mol_id not in self.embeddings]
            if missing_ids:
                missing_molecules.update(missing_ids)
                continue
            X_data.append(np.concatenate([self.embeddings[mol_id] for mol_id in molecule_ids]))
            kept_positions.append(position)

        print(f"Prepared {len(X_data)} samples")
        if missing_molecules:
            print(f"Missing embeddings for {len(missing_molecules)} molecules: {missing_molecules}")
        if not X_data:
            # Previously this surfaced as an IndexError on X_data[0].
            raise ValueError(
                "No reaction has embeddings for all of its molecules; "
                "check that the embedding keys match the ID columns."
            )
        print(f"Embedding dimension: {X_data[0].shape}")

        # Scale the targets once for all kept rows instead of once per row.
        kept = self.Y_df.iloc[kept_positions]
        Y_data = self.ddg_scaler.transform(
            kept['selectivity_ddGact_kcal'].to_numpy(dtype=float).reshape(-1, 1)).ravel()
        ee_values = self.ee_scaler.transform(
            kept['selectivity_ee_percent'].to_numpy(dtype=float).reshape(-1, 1)).ravel()
        reaction_handles = np.array(kept['reaction_handle'].tolist())

        return np.array(X_data), Y_data, ee_values, reaction_handles

    def split_data_stratified(self, X, Y, ee_values, reaction_handles, 
                            test_fraction=0.25, n_bins=15, random_seed=None):
        """
        Stratified train/test split over quantile bins of the scaled target.

        Y is cut into up to ``n_bins`` quantile bins; inside each occupied bin
        ``int(bin_size * test_fraction)`` samples are drawn without
        replacement for the test set and the rest go to the training set.
        Tied quantile edges are merged, so fewer than ``n_bins`` bins may be
        used, and bins that end up empty are skipped.

        Args:
            X, Y, ee_values, reaction_handles: Aligned numpy arrays.
            test_fraction (float): Fraction of each bin assigned to the test set
                (rounded down, so very small bins contribute no test samples).
            n_bins (int): Requested number of quantile bins.
            random_seed (int | None): If given, seeds numpy's global generator.

        Returns:
            dict: 'X_train', 'X_test', 'Y_train', 'Y_test', 'train_handles',
            'test_handles', 'train_ee', 'test_ee'.
        """
        if random_seed is not None:
            np.random.seed(random_seed)

        print("\nPerforming uniform stratified split...")

        # duplicates='drop' merges tied quantile edges instead of raising.
        Y_bins = np.asarray(pd.qcut(Y, n_bins, labels=False, duplicates='drop'))
        # Only visit bins that actually contain samples (NaN codes excluded).
        occupied_bins = np.unique(Y_bins[~pd.isna(Y_bins)]).astype(int)

        train_indices = []
        test_indices = []

        print("\nBin statistics:")
        for bin_idx in occupied_bins:
            bin_indices = np.flatnonzero(Y_bins == bin_idx)

            # Get original scale energy values for this bin
            bin_energies = self.ddg_scaler.inverse_transform(
                Y[bin_indices].reshape(-1, 1)).flatten()

            print(f"\nBin {bin_idx}:")
            print(f"Energy range: {bin_energies.min():.2f} to {bin_energies.max():.2f} kcal/mol")
            print(f"Number of samples: {len(bin_indices)}")

            n_test = int(len(bin_indices) * test_fraction)
            test_idx = np.random.choice(bin_indices, size=n_test, replace=False)
            train_idx = np.setdiff1d(bin_indices, test_idx)

            test_indices.extend(test_idx)
            train_indices.extend(train_idx)

            print(f"Train samples: {len(train_idx)}")
            print(f"Test samples: {len(test_idx)}")

        # An explicit integer dtype keeps empty index lists usable for indexing.
        train_indices = np.array(train_indices, dtype=int)
        test_indices = np.array(test_indices, dtype=int)

        # Print energy ranges of the final subsets
        for subset_name, subset_indices in (("Train", train_indices), ("Test", test_indices)):
            if subset_indices.size:
                subset_energies = self.ddg_scaler.inverse_transform(
                    Y[subset_indices].reshape(-1, 1)).flatten()
                print(f"\n{subset_name} energy range: {subset_energies.min():.2f} "
                      f"to {subset_energies.max():.2f} kcal/mol")

        return {
            'X_train': X[train_indices],
            'X_test': X[test_indices],
            'Y_train': Y[train_indices],
            'Y_test': Y[test_indices],
            'train_handles': reaction_handles[train_indices],
            'test_handles': reaction_handles[test_indices],
            'train_ee': ee_values[train_indices],
            'test_ee': ee_values[test_indices]
        }

    def train_and_evaluate(self, split_data, split_name="", random_seed=None, n_jobs=-1):
        """
        Tune and fit the XGBoost pipeline, then score it on train and test data.

        The pipeline is: variance threshold -> random-forest based feature
        selection (at most 30 features) -> XGBoost regressor. Hyper-parameters
        of the regressor are tuned with a 100-iteration randomized search and
        5-fold cross-validation, refitting on the best mean absolute error.

        Args:
            split_data (dict): Output of split_data_stratified().
            split_name (str): Label used in the progress message.
            random_seed (int | None): Seed for the forest, XGBoost and the search.
            n_jobs (int): Parallel workers for the randomized search. The inner
                estimators run single-threaded so that parallelism is not
                nested (search workers x forest threads x XGBoost threads).

        Returns:
            dict: R² / MAE on train and test (original ddG units), the fitted
            best pipeline, predictions and targets in original units, the
            wall-clock search time, the seed and the pipeline parameters.
        """
        if random_seed is not None:
            np.random.seed(random_seed)

        pipe_random = Pipeline(steps=[
            ('preprocess', VarianceThreshold(1e-3)),
            ('feature_selection', SelectFromModel(
                RandomForestRegressor(
                    n_estimators=1000,
                    n_jobs=1,
                    random_state=random_seed
                ),
                max_features=30
            )),
            ('model', XGBRegressor(
                objective='reg:squarederror',
                random_state=random_seed,
                n_jobs=1,
                tree_method='hist'
            ))
        ])

        param_dict = {
            'model__learning_rate': np.logspace(-4, -1, 20),
            'model__n_estimators': [100, 200, 300, 400, 500, 750, 1000],
            'model__max_depth': [3, 4, 5, 6, 7, 8],
            'model__subsample': np.linspace(0.6, 1.0, 5),
            'model__colsample_bytree': np.linspace(0.6, 1.0, 5),
            'model__reg_alpha': np.logspace(-4, 1, 10),
            'model__reg_lambda': np.logspace(-4, 1, 10),
            'model__min_child_weight': [1, 2, 3, 4, 5],
            'model__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
        }

        search = RandomizedSearchCV(
            estimator=pipe_random,
            param_distributions=param_dict,
            n_iter=100,
            cv=5,
            n_jobs=n_jobs,
            verbose=3,
            scoring=['neg_mean_absolute_error', 'r2'],
            refit='neg_mean_absolute_error',
            random_state=random_seed
        )

        print(f"\nTraining XGBoost model with {split_name} split...")
        t0 = time()
        search.fit(split_data['X_train'], split_data['Y_train'])
        training_time = time() - t0

        best_model = search.best_estimator_

        def to_original_units(scaled_values):
            # Undo the MinMax scaling so metrics are reported in kcal/mol.
            return self.ddg_scaler.inverse_transform(
                np.asarray(scaled_values).reshape(-1, 1)).flatten()

        Y_train_unscaled = to_original_units(split_data['Y_train'])
        Y_test_unscaled = to_original_units(split_data['Y_test'])
        Y_pred_train_unscaled = to_original_units(best_model.predict(split_data['X_train']))
        Y_pred_test_unscaled = to_original_units(best_model.predict(split_data['X_test']))

        results = {
            'train_r2': r2_score(Y_train_unscaled, Y_pred_train_unscaled),
            'test_r2': r2_score(Y_test_unscaled, Y_pred_test_unscaled),
            'train_mae': mean_absolute_error(Y_train_unscaled, Y_pred_train_unscaled),
            'test_mae': mean_absolute_error(Y_test_unscaled, Y_pred_test_unscaled),
            'model': best_model,
            'Y_pred_train': Y_pred_train_unscaled,
            'Y_pred_test': Y_pred_test_unscaled,
            'Y_train': Y_train_unscaled,
            'Y_test': Y_test_unscaled,
            'training_time': training_time,
            'random_seed': random_seed,
            'best_params': best_model.get_params()
        }

        return results

    @staticmethod
    def _decorate_axis(ax, title, xlabel, ylabel):
        """Apply the shared title / label / legend / grid / tick styling to one axis."""
        ax.set_title(title, fontsize=28)
        ax.set_xlabel(xlabel, fontsize=26)
        ax.set_ylabel(ylabel, fontsize=26)
        ax.legend(fontsize=19)
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='both', which='major', labelsize=22)

    def plot_aggregated_results(self, all_results, split_name, dpi=800):
        """
        Plot aggregated results from multiple seeds with error bars.

        Produces a 2x2 figure (prediction vs. observation, target
        distribution, residuals, error distribution) and saves it as
        ``results_<split_name>_averaged.png``.

        Args:
            all_results (list[dict]): Per-seed outputs of train_and_evaluate().
            split_name (str): Used in the plot title and the file name.
            dpi (int): Resolution of the saved image.

        Raises:
            ValueError: If there are no results, or if the per-seed arrays do
                not all have the same length.
        """
        self._require_results(all_results)

        def stack(key):
            arrays = [np.asarray(r[key]) for r in all_results]
            if len({a.shape for a in arrays}) != 1:
                raise ValueError(
                    f"Cannot average '{key}' across seeds: the per-seed arrays differ in shape."
                )
            return np.stack(arrays)

        # NOTE(review): values are averaged position-by-position across seeds.
        # When each seed uses a different split, position i need not be the
        # same reaction in every seed -- confirm this is the intended summary.
        Y_train_mean = stack('Y_train').mean(axis=0)
        Y_test_mean = stack('Y_test').mean(axis=0)
        pred_train = stack('Y_pred_train')
        pred_test = stack('Y_pred_test')
        Y_pred_train_mean = pred_train.mean(axis=0)
        Y_pred_test_mean = pred_test.mean(axis=0)
        Y_pred_train_std = pred_train.std(axis=0)
        Y_pred_test_std = pred_test.std(axis=0)

        metrics = self._aggregate_metrics(all_results)
        train_r2_mean, train_r2_std = metrics['train_r2']
        test_r2_mean, test_r2_std = metrics['test_r2']
        train_mae_mean, train_mae_std = metrics['train_mae']
        test_mae_mean, test_mae_std = metrics['test_mae']

        # Create plots
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 20))

        # 1. Predictions vs Actual with error bars
        ax1.errorbar(Y_train_mean, Y_pred_train_mean,
                    yerr=Y_pred_train_std,
                    fmt='o', alpha=0.6, 
                    label=f"Train (R²={train_r2_mean:.3f}±{train_r2_std:.3f},\n       MAE={train_mae_mean:.3f}±{train_mae_std:.3f})",
                    color="gray", ecolor='lightgray')
        ax1.errorbar(Y_test_mean, Y_pred_test_mean,
                    yerr=Y_pred_test_std,
                    fmt='o', alpha=0.6, 
                    label=f"Test (R²={test_r2_mean:.3f}±{test_r2_std:.3f},\n      MAE={test_mae_mean:.3f}±{test_mae_std:.3f})",
                    color="blue", ecolor='lightblue')

        # Identity line spanning the observed range
        min_val = min(Y_train_mean.min(), Y_test_mean.min())
        max_val = max(Y_train_mean.max(), Y_test_mean.max())
        ax1.plot([min_val, max_val], [min_val, max_val], 'k--', alpha=0.5)

        self._decorate_axis(
            ax1,
            f"EquiCat Prediction ({split_name})\nAveraged over {len(all_results)} seeds",
            "ΔΔG (Observed)",
            "ΔΔG (Predicted)"
        )

        # 2. Energy Distribution (faint per-seed histograms under the averaged one)
        for results in all_results:
            ax2.hist(results['Y_train'], bins=30, alpha=0.1, color='gray', density=True)
            ax2.hist(results['Y_test'], bins=30, alpha=0.1, color='blue', density=True)

        ax2.hist(Y_train_mean, bins=30, alpha=0.5, label='Train', color='gray', density=True)
        ax2.hist(Y_test_mean, bins=30, alpha=0.5, label='Test', color='blue', density=True)
        self._decorate_axis(ax2, "Energy Distribution", "ΔΔG", "Density")

        # 3. Residuals Plot (prediction minus observation)
        train_residuals = Y_pred_train_mean - Y_train_mean
        test_residuals = Y_pred_test_mean - Y_test_mean

        ax3.scatter(Y_train_mean, train_residuals, 
                color='gray', alpha=0.6, label='Train',
                marker='o', s=50)
        ax3.scatter(Y_test_mean, test_residuals, 
                color='blue', alpha=0.6, label='Test',
                marker='o', s=50)

        # Zero line for reference
        ax3.axhline(y=0, color='k', linestyle='--', alpha=0.5)

        # Error bands; fill_between needs x sorted in ascending order
        train_sort_idx = np.argsort(Y_train_mean)
        test_sort_idx = np.argsort(Y_test_mean)

        ax3.fill_between(Y_train_mean[train_sort_idx], 
                        train_residuals[train_sort_idx] - Y_pred_train_std[train_sort_idx],
                        train_residuals[train_sort_idx] + Y_pred_train_std[train_sort_idx],
                        color='gray', alpha=0.2)
        ax3.fill_between(Y_test_mean[test_sort_idx], 
                        test_residuals[test_sort_idx] - Y_pred_test_std[test_sort_idx],
                        test_residuals[test_sort_idx] + Y_pred_test_std[test_sort_idx],
                        color='blue', alpha=0.2)

        self._decorate_axis(ax3, "Residuals Plot", "ΔΔG (Observed)", "Residuals")

        # 4. Error Distribution (faint per-seed histograms under the averaged one)
        for results in all_results:
            train_res = results['Y_pred_train'] - results['Y_train']
            test_res = results['Y_pred_test'] - results['Y_test']
            ax4.hist(train_res, bins=30, alpha=0.1, color='gray', density=True)
            ax4.hist(test_res, bins=30, alpha=0.1, color='blue', density=True)

        ax4.hist(train_residuals, bins=30, alpha=0.5, label='Train', color='gray', density=True)
        ax4.hist(test_residuals, bins=30, alpha=0.5, label='Test', color='blue', density=True)
        self._decorate_axis(ax4, "Error Distribution", "Prediction Error", "Density")

        fig.tight_layout()
        fig.savefig(f"results_{split_name}_averaged.png", dpi=dpi, bbox_inches='tight')
        # Close this specific figure rather than whichever one is current.
        plt.close(fig)
        print(f"\nAveraged plot saved as results_{split_name}_averaged.png")

    def save_aggregated_info(self, all_results, split_name):
        """
        Write a text report of the aggregated and per-seed results.

        The report is saved as ``split_info_<split_name>_averaged.txt``,
        encoded as UTF-8 because it contains the characters '²' and '±'.

        Args:
            all_results (list[dict]): Per-seed outputs of train_and_evaluate().
            split_name (str): Used in the report header and the file name.

        Raises:
            ValueError: If there are no results to report.
        """
        self._require_results(all_results)

        metrics = self._aggregate_metrics(all_results)
        train_r2_mean, train_r2_std = metrics['train_r2']
        test_r2_mean, test_r2_std = metrics['test_r2']
        train_mae_mean, train_mae_std = metrics['train_mae']
        test_mae_mean, test_mae_std = metrics['test_mae']

        with open(f'split_info_{split_name}_averaged.txt', 'w', encoding='utf-8') as f:
            f.write(f"Aggregated Analysis Report: {split_name}\n")
            f.write("=" * 50 + "\n\n")

            # Random Seeds
            f.write("Random Seeds Used:\n")
            f.write("-" * 20 + "\n")
            for i, results in enumerate(all_results):
                f.write(f"Seed {i+1}: {results['random_seed']}\n")
            f.write("\n")

            # Model Performance
            f.write("Model Performance (averaged over seeds):\n")
            f.write("-" * 20 + "\n")
            f.write(f"Training R²: {train_r2_mean:.5f} ± {train_r2_std:.5f}\n")
            f.write(f"Test R²: {test_r2_mean:.5f} ± {test_r2_std:.5f}\n")
            f.write(f"Training MAE: {train_mae_mean:.5f} ± {train_mae_std:.5f} kcal/mol\n")
            f.write(f"Test MAE: {test_mae_mean:.5f} ± {test_mae_std:.5f} kcal/mol\n\n")

            # Individual Seed Results
            f.write("Individual Seed Results:\n")
            f.write("-" * 20 + "\n")
            for results in all_results:
                f.write(f"\nSeed {results['random_seed']}:\n")
                f.write(f"Training R²: {results['train_r2']:.5f}\n")
                f.write(f"Test R²: {results['test_r2']:.5f}\n")
                f.write(f"Training MAE: {results['train_mae']:.5f} kcal/mol\n")
                f.write(f"Test MAE: {results['test_mae']:.5f} kcal/mol\n")
                f.write(f"Training Time: {results['training_time']:.2f} seconds\n")

    def save_results(self, filename="stored_results.pkl"):
        """
        Pickle ``self.stored_results`` to ``filename``.

        Nothing is written (only a message is printed) when there are no
        stored results.
        """
        if self.stored_results is None:
            print("No results to save")
            return

        with open(filename, 'wb') as f:
            pickle.dump(self.stored_results, f)
        print(f"Results saved to {filename}")

    def load_results(self, filename="stored_results.pkl"):
        """
        Load pickled results from ``filename`` into ``self.stored_results``.

        Prints the average metrics of the loaded results. If the file does
        not exist, a message is printed and ``self.stored_results`` is set to
        None.

        SECURITY: unpickling executes arbitrary code embedded in the file.
        Only load result files that were produced by save_results() on a
        trusted machine.
        """
        try:
            with open(filename, 'rb') as f:
                self.stored_results = pickle.load(f)
        except FileNotFoundError:
            print(f"No results file found at {filename}")
            self.stored_results = None
            return

        print(f"Results loaded from {filename}")

        # Print summary statistics of loaded results
        summary = self._aggregate_metrics(self.stored_results)

        print("\nLoaded Results Summary:")
        print(f"Number of models: {len(self.stored_results)}")
        print(f"Average Training R²: {summary['train_r2'][0]:.4f}")
        print(f"Average Test R²: {summary['test_r2'][0]:.4f}")
        print(f"Average Training MAE: {summary['train_mae'][0]:.4f}")
        print(f"Average Test MAE: {summary['test_mae'][0]:.4f}")

In [7]:
def main(embeddings_path='/Users/utkarsh/MMLI/equicat/develop_op/final_molecule_embeddings.json',
         data_path='/Users/utkarsh/MMLI/equicat/science/Y_DATA.csv',
         n_seeds=3):
    """
    Run the complete analysis pipeline with multiple seeds.

    Cached results (loaded through ``analysis.load_results()``) are reused
    when available; otherwise the data is prepared and one stratified split
    plus model search is run per seed, and the results are cached. In both
    cases the aggregated plot and text report are written and a summary is
    printed.

    Args:
        embeddings_path (str): JSON file with the molecular embeddings.
        data_path (str): CSV file with the reaction data.
        n_seeds (int): Number of random seeds (independent runs) to draw.
    """
    print("Starting Chemistry Analysis Pipeline...")
    print("=" * 50)

    # Derive the per-run seeds deterministically from BASE_SEED
    np.random.seed(BASE_SEED)
    seeds = np.random.randint(0, 1000000, size=n_seeds)
    print("\nRandom seeds for reproducibility:")
    print(seeds)

    # Initialize analysis
    analysis = ChemistryAwareAnalysis(
        embeddings_path=embeddings_path,
        data_path=data_path
    )

    # Try to load existing results first
    analysis.load_results()

    # If no results loaded, run the training
    if analysis.stored_results is None:
        # Prepare data
        print("\nPreparing dataset...")
        X, Y, ee_values, reaction_handles = analysis.prepare_data()
        print("Dataset preparation complete.")
        print(f"Total samples: {len(X)}")
        print(f"Feature dimension: {X.shape[1]}")

        # Run analysis for each seed
        all_results = []
        for i, seed in enumerate(seeds):
            print(f"\nRunning analysis for seed {i+1}/{len(seeds)} (seed value: {seed})")
            split_data = analysis.split_data_stratified(
                X, Y, ee_values, reaction_handles,
                test_fraction=0.25,
                n_bins=15,
                random_seed=seed
            )
            results = analysis.train_and_evaluate(
                split_data,
                split_name=f"stratified_seed_{seed}",
                random_seed=seed
            )
            all_results.append(results)

        # Store and save results
        analysis.stored_results = all_results
        analysis.save_results()

    # Plot and save aggregated results
    analysis.plot_aggregated_results(analysis.stored_results, "stratified_xgb_multi_seed")
    analysis.save_aggregated_info(analysis.stored_results, "stratified_xgb_multi_seed")

    # Mean and standard deviation of every metric across seeds
    summary = {}
    for key in ('train_r2', 'test_r2', 'train_mae', 'test_mae'):
        values = [r[key] for r in analysis.stored_results]
        summary[key] = (np.mean(values), np.std(values))

    print("\nAnalysis pipeline completed!")
    print("=" * 50)
    print("\nAggregated Results Summary:")
    print(f"Training R²: {summary['train_r2'][0]:.4f} ± {summary['train_r2'][1]:.4f}")
    print(f"Test R²: {summary['test_r2'][0]:.4f} ± {summary['test_r2'][1]:.4f}")
    print(f"Training MAE: {summary['train_mae'][0]:.4f} ± {summary['train_mae'][1]:.4f} kcal/mol")
    print(f"Test MAE: {summary['test_mae'][0]:.4f} ± {summary['test_mae'][1]:.4f} kcal/mol")

# Run the pipeline when this cell/script is executed directly;
# it is skipped when the code is imported as a module.
if __name__ == "__main__":
    main()

Starting Chemistry Analysis Pipeline...

Random seeds for reproducibility:
[715105 349129 101191]
Initializing ChemistryAwareAnalysis...
Loading embeddings from /Users/utkarsh/MMLI/equicat/develop_op/final_molecule_embeddings.json
Loaded embeddings for 835 molecules
Loaded data with 1075 entries

Data Ranges (Original Scale):
ddG range: [-0.42, 3.14] kcal/mol
ee% range: [-34.00, 99.00]%
No results file found at stored_results.pkl

Preparing dataset...

Preparing data...
Prepared 1050 samples
Missing embeddings for 1 molecules: {'181_i'}
Embedding dimension: (768,)
Dataset preparation complete.
Total samples: 1050
Feature dimension: 768

Running analysis for seed 1/3 (seed value: 715105)

Performing uniform stratified split...

Bin statistics:

Bin 0:
Energy range: -0.42 to 0.02 kcal/mol
Number of samples: 73
Train samples: 55
Test samples: 18

Bin 1:
Energy range: 0.02 to 0.19 kcal/mol
Number of samples: 68
Train samples: 51
Test samples: 17

Bin 2:
Energy range: 0.22 to 0.37 kcal/mol




[CV 2/5] END model__colsample_bytree=0.6, model__gamma=0, model__learning_rate=0.06951927961775606, model__max_depth=5, model__min_child_weight=4, model__n_estimators=100, model__reg_alpha=0.004641588833612782, model__reg_lambda=0.00035938136638046257, model__subsample=0.6; neg_mean_absolute_error: (test=-0.072) r2: (test=-8.940) total time=12.1min
[CV 1/5] END model__colsample_bytree=0.9, model__gamma=0.2, model__learning_rate=0.04832930238571752, model__max_depth=3, model__min_child_weight=3, model__n_estimators=300, model__reg_alpha=0.004641588833612782, model__reg_lambda=2.782559402207126, model__subsample=0.9; neg_mean_absolute_error: (test=-0.241) r2: (test=-27.219) total time=10.1min
[CV 3/5] END model__colsample_bytree=1.0, model__gamma=0.1, model__learning_rate=0.00020691380811147902, model__max_depth=4, model__min_child_weight=3, model__n_estimators=100, model__reg_alpha=0.00035938136638046257, model__reg_lambda=0.016681005372000592, model__subsample=0.6; neg_mean_absolute_er



[CV 3/5] END model__colsample_bytree=0.8, model__gamma=0.5, model__learning_rate=0.0012742749857031334, model__max_depth=7, model__min_child_weight=2, model__n_estimators=100, model__reg_alpha=0.016681005372000592, model__reg_lambda=0.0001, model__subsample=1.0; neg_mean_absolute_error: (test=-0.032) r2: (test=0.327) total time=12.1min
[CV 1/5] END model__colsample_bytree=0.8, model__gamma=0.5, model__learning_rate=0.0012742749857031334, model__max_depth=7, model__min_child_weight=2, model__n_estimators=100, model__reg_alpha=0.016681005372000592, model__reg_lambda=0.0001, model__subsample=1.0; neg_mean_absolute_error: (test=-0.301) r2: (test=-36.178) total time=12.1min
[CV 5/5] END model__colsample_bytree=0.8, model__gamma=0.5, model__learning_rate=0.0012742749857031334, model__max_depth=7, model__min_child_weight=2, model__n_estimators=100, model__reg_alpha=0.016681005372000592, model__reg_lambda=0.0001, model__subsample=1.0; neg_mean_absolute_error: (test=-0.341) r2: (test=-5.235) to

In [8]:
# To load and replot existing results:
analysis = ChemistryAwareAnalysis(
    embeddings_path='/Users/utkarsh/MMLI/equicat/develop_op/final_molecule_embeddings.json',
    data_path='/Users/utkarsh/MMLI/equicat/science/Y_DATA.csv'
)
analysis.load_results()

# load_results() leaves stored_results as None when the pickle file is
# missing; only replot when there is something to plot.
if analysis.stored_results is not None:
    analysis.plot_aggregated_results(analysis.stored_results, "stratified_xgb_multi_seed")
    analysis.save_aggregated_info(analysis.stored_results, "stratified_xgb_multi_seed")
else:
    print("No stored results to plot; run main() first to train and cache them.")

Initializing ChemistryAwareAnalysis...
Loading embeddings from /Users/utkarsh/MMLI/equicat/develop_op/final_molecule_embeddings.json
Loaded embeddings for 835 molecules
Loaded data with 1075 entries

Data Ranges (Original Scale):
ddG range: [-0.42, 3.14] kcal/mol
ee% range: [-34.00, 99.00]%
Results loaded from stored_results.pkl

Loaded Results Summary:
Number of models: 3
Average Training R²: 0.9381
Average Test R²: 0.8998
Average Training MAE: 0.1214
Average Test MAE: 0.1533

Averaged plot saved as results_stratified_xgb_multi_seed_averaged.png
