In [12]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel, VarianceThreshold
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from time import time
import scipy.stats as stats
import re

# Set up display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Define base seed and number of runs
BASE_SEED = 107843579
NUM_RUNS = 3

In [13]:
# Function to load embeddings from JSON file and strip family prefix
def load_embeddings(file_path):
    with open(file_path, 'r') as f:
        raw_embeddings = json.load(f)
    
    embeddings = {}
    family_pattern = re.compile(r'^family\d+_')
    for key, value in raw_embeddings.items():
        stripped_key = family_pattern.sub('', key)
        embeddings[stripped_key] = np.array(value)
    
    return embeddings

# Function to create dataset
def create_dataset(embeddings, Y_df):
    X_data = []
    Y_data = []
    
    for _, row in Y_df.iterrows():
        catalyst_id = row['catalyst_id']
        imine_id = row['imine_id']
        thiol_id = row['thiol_id']
        product_id = row['product_id']
        
        if all(id in embeddings for id in [catalyst_id, imine_id, thiol_id, product_id]):
            combined_embedding = np.concatenate([
                embeddings[catalyst_id],
                embeddings[imine_id],
                embeddings[thiol_id],
                embeddings[product_id]
            ])
            X_data.append(combined_embedding)
            Y_data.append(row['selectivity_ddGact_kcal'])
    
    return np.array(X_data), np.array(Y_data)

In [14]:
def train_and_evaluate(X_data, Y_data, seed):
    X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.7, random_state=seed)
    
    f_select_model = RandomForestRegressor(n_estimators=1000, n_jobs=64, random_state=seed)
    
    pipe_random = Pipeline(steps=[
        ('preprocess', VarianceThreshold(1e-3)),
        ('feature_selection', SelectFromModel(f_select_model, max_features=30)),
        ('model', GradientBoostingRegressor(n_estimators=500, learning_rate=0.01))
    ])
    
    param_dict = {
        'model__learning_rate': np.logspace(-4, 0, 20),
        'model__subsample': np.linspace(0.5, 1.0, 10),
        'model__min_weight_fraction_leaf': (np.random.uniform(0.0, .5, 20)),
        'model__max_depth': [3,4,5,6,7,8,9,10,None],
        'model__max_features': (np.random.uniform(0.01, .99, 20)),
        'model__max_leaf_nodes': [10,100,1000,None],
        'model__n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    }
    
    search = RandomizedSearchCV(pipe_random, cv=5, param_distributions=param_dict,
                              n_iter=100, n_jobs=64, verbose=3, refit='neg_mean_absolute_error',
                              scoring=['neg_mean_absolute_error', 'r2'])
    
    search.fit(X_train, Y_train)
    best_model = search.best_estimator_
    
    Y_pred_train = best_model.predict(X_train)
    Y_pred_test = best_model.predict(X_test)
    
    results = {
        'model': best_model,
        'train_data': (X_train, Y_train, Y_pred_train),
        'test_data': (X_test, Y_test, Y_pred_test),
        'metrics': {
            'train_r2': r2_score(Y_train, Y_pred_train),
            'test_r2': r2_score(Y_test, Y_pred_test),
            'train_mae': mean_absolute_error(Y_train, Y_pred_train),
            'test_mae': mean_absolute_error(Y_test, Y_pred_test)
        }
    }
    
    return results

In [15]:
def plot_aggregated_results(all_results, title, file_dpi=800):
    plt.figure(figsize=(10, 6))
    
    # Plot training data
    train_y_true = []
    train_y_pred = []
    for result in all_results:
        _, y_true, y_pred = result['train_data']
        train_y_true.extend(y_true)
        train_y_pred.extend(y_pred)
    
    # Plot test data
    test_y_true = []
    test_y_pred = []
    for result in all_results:
        _, y_true, y_pred = result['test_data']
        test_y_true.extend(y_true)
        test_y_pred.extend(y_pred)
    
    # Calculate average metrics
    train_r2_mean = np.mean([r['metrics']['train_r2'] for r in all_results])
    train_r2_std = np.std([r['metrics']['train_r2'] for r in all_results])
    test_r2_mean = np.mean([r['metrics']['test_r2'] for r in all_results])
    test_r2_std = np.std([r['metrics']['test_r2'] for r in all_results])
    
    train_mae_mean = np.mean([r['metrics']['train_mae'] for r in all_results])
    train_mae_std = np.std([r['metrics']['train_mae'] for r in all_results])
    test_mae_mean = np.mean([r['metrics']['test_mae'] for r in all_results])
    test_mae_std = np.std([r['metrics']['test_mae'] for r in all_results])
    
    plt.scatter(train_y_true, train_y_pred, color='gray', alpha=0.5,
               label=f'Train (R² = {train_r2_mean:.3f} ± {train_r2_std:.3f})')
    plt.scatter(test_y_true, test_y_pred, color='blue', alpha=0.5,
               label=f'Test (R² = {test_r2_mean:.3f} ± {test_r2_std:.3f})')
    
    plt.plot([-3, 3], [-3, 3], 'k--', alpha=0.5)
    plt.xlabel('Observed')
    plt.ylabel('Predicted')
    plt.title(title)
    plt.legend()
    plt.xlim(-3, 3)
    plt.ylim(-3, 3)
    
    plt.savefig(f"{title}_aggregated.png", dpi=file_dpi)
    plt.close()
    
    # Print aggregated metrics
    print("\nAggregated Results Summary:")
    print(f"Training R²: {train_r2_mean:.4f} ± {train_r2_std:.4f}")
    print(f"Test R²: {test_r2_mean:.4f} ± {test_r2_std:.4f}")
    print(f"Training MAE: {train_mae_mean:.4f} ± {train_mae_std:.4f} kcal/mol")
    print(f"Test MAE: {test_mae_mean:.4f} ± {test_mae_std:.4f} kcal/mol")

In [16]:
def main():
    # Load data
    try:
        embeddings = load_embeddings('/Users/utkarsh/MMLI/equicat/large/final_molecule_embeddings.json')
        Y_df = pd.read_csv('/Users/utkarsh/MMLI/equicat/science/Y_DATA.csv', dtype={
            'catalyst_id': str,
            'imine_id': str,
            'thiol_id': str,
            'product_id': str
        })
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return
    
    # Create dataset
    X_data, Y_data = create_dataset(embeddings, Y_df)
    
    # Perform multiple runs with different seeds
    all_results = []
    for i in range(NUM_RUNS):
        print(f"\nStarting Run {i+1}/{NUM_RUNS}")
        seed = BASE_SEED + i * 2  # Use a prime number offset
        np.random.seed(seed)
        results = train_and_evaluate(X_data, Y_data, seed)
        all_results.append(results)
    
    # Plot aggregated results
    plot_aggregated_results(all_results, "EQUICAT Prediction (Aggregated)")
    
    # Create ensemble predictor function
    def predict_ddg_ensemble(catalyst_id, imine_id, thiol_id, product_id):
        if all(id in embeddings for id in [catalyst_id, imine_id, thiol_id, product_id]):
            combined_embedding = np.concatenate([
                embeddings[catalyst_id],
                embeddings[imine_id],
                embeddings[thiol_id],
                embeddings[product_id]
            ])
            predictions = []
            for result in all_results:
                pred = result['model'].predict([combined_embedding])[0]
                predictions.append(pred)
            mean_pred = np.mean(predictions)
            std_pred = np.std(predictions)
            return mean_pred, std_pred
        else:
            return "One or more components not found in embeddings"
    
    # Example prediction
    example_row = Y_df.iloc[3]
    pred_mean, pred_std = predict_ddg_ensemble(
        example_row['catalyst_id'],
        example_row['imine_id'],
        example_row['thiol_id'],
        example_row['product_id']
    )
    print(f"\nPrediction for reaction {example_row['reaction_handle']}: "
          f"{pred_mean:.4f} ± {pred_std:.4f} kcal/mol")

if __name__ == "__main__":
    main()


Starting Run 1/3
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END model__learning_rate=0.0001623776739188721, model__max_depth=9, model__max_features=0.8187225210610961, model__max_leaf_nodes=100, model__min_weight_fraction_leaf=0.21653137156334307, model__n_estimators=200, model__subsample=0.6111111111111112; neg_mean_absolute_error: (test=-0.552) r2: (test=-0.015) total time=  58.9s
[CV 2/5] END model__learning_rate=0.0001623776739188721, model__max_depth=9, model__max_features=0.8187225210610961, model__max_leaf_nodes=100, model__min_weight_fraction_leaf=0.21653137156334307, model__n_estimators=200, model__subsample=0.6111111111111112; neg_mean_absolute_error: (test=-0.564) r2: (test=0.021) total time= 1.4min
[CV 1/5] END model__learning_rate=0.002976351441631319, model__max_depth=8, model__max_features=0.023550864428603546, model__max_leaf_nodes=1000, model__min_weight_fraction_leaf=0.171578384001023, model__n_estimators=300, model__subsample=0.722222222



[CV 1/5] END model__learning_rate=0.002976351441631319, model__max_depth=9, model__max_features=0.9677901238205576, model__max_leaf_nodes=100, model__min_weight_fraction_leaf=0.10115160702880205, model__n_estimators=800, model__subsample=0.9444444444444444; neg_mean_absolute_error: (test=-0.278) r2: (test=0.685) total time= 3.9min
[CV 2/5] END model__learning_rate=0.002976351441631319, model__max_depth=9, model__max_features=0.9677901238205576, model__max_leaf_nodes=100, model__min_weight_fraction_leaf=0.10115160702880205, model__n_estimators=800, model__subsample=0.9444444444444444; neg_mean_absolute_error: (test=-0.236) r2: (test=0.801) total time= 4.0min
[CV 3/5] END model__learning_rate=0.002976351441631319, model__max_depth=9, model__max_features=0.9677901238205576, model__max_leaf_nodes=100, model__min_weight_fraction_leaf=0.10115160702880205, model__n_estimators=800, model__subsample=0.9444444444444444; neg_mean_absolute_error: (test=-0.276) r2: (test=0.737) total time= 4.0min
[