In [32]:
import yaml
import json
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LassoLarsCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile, mutual_info_regression, VarianceThreshold
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_absolute_error
import matplotlib.pyplot as plt
from time import time
from copy import deepcopy
from pprint import pprint

In [33]:
# Load YAML file
with open("/Users/utkarsh/MMLI/equicat/science/Science_2019_reaction_handles.yml", 'r') as file:
    reaction_handles = yaml.safe_load(file)

# Extract relevant data from YAML
uts_reaction_handles = reaction_handles["Study 1:"]["UTS"]
train_handles = reaction_handles["Study 2:"]["Train"]
unseen_cat_handles = reaction_handles["Study 2:"]["Test: unseen catalysts"]
unseen_subs_handles = reaction_handles["Study 2:"]["Test: unseen substrates"]
unseen_cat_and_subs_handles = reaction_handles["Study 2:"]["Test: unseen subs and catalysts"]

print("Number of train handles:", len(train_handles))

Number of train handles: 384


In [34]:
# Function to load embeddings from JSON file and strip family prefix
def load_embeddings(file_path):
    with open(file_path, 'r') as f:
        raw_embeddings = json.load(f)
    
    embeddings = {}
    family_pattern = re.compile(r'^family\d+_')
    for key, value in raw_embeddings.items():
        stripped_key = family_pattern.sub('', key)
        embeddings[stripped_key] = np.array(value)
    
    return embeddings

# Load embeddings
embeddings = load_embeddings('/Users/utkarsh/MMLI/equicat/develop_op/final_molecule_embeddings.json')
print(f"Loaded embeddings for {len(embeddings)} entities")

# Load Y data
Y_df = pd.read_csv('/Users/utkarsh/MMLI/equicat/science/Y_DATA.csv', dtype={
    'catalyst_id': str,
    'imine_id': str,
    'thiol_id': str,
    'product_id': str
})
print(f"Loaded Y data with {len(Y_df)} rows")

Loaded embeddings for 835 entities
Loaded Y data with 1075 rows


In [35]:
# Create X data using embeddings for catalyst, imine, thiol, and product
X_data = []
Y_data = []
valid_reaction_handles = []

for _, row in Y_df.iterrows():
    catalyst_id = row['catalyst_id']
    imine_id = row['imine_id']
    thiol_id = row['thiol_id']
    product_id = row['product_id']
    reaction_handle = row['reaction_handle']
    
    if all(id in embeddings for id in [catalyst_id, imine_id, thiol_id, product_id]):
        combined_embedding = np.concatenate([
            embeddings[catalyst_id],
            embeddings[imine_id],
            embeddings[thiol_id],
            embeddings[product_id]
        ])
        
        X_data.append(combined_embedding)
        Y_data.append(row['selectivity_ddGact_kcal'])
        valid_reaction_handles.append(reaction_handle)
    else:
        missing_ids = [id for id in [catalyst_id, imine_id, thiol_id, product_id] if id not in embeddings]
        print(f"Missing embedding for reaction: {reaction_handle} - Missing IDs: {missing_ids}")

# Convert to DataFrame
X_df = pd.DataFrame(X_data, index=valid_reaction_handles)
Y_series = pd.Series(Y_data, index=valid_reaction_handles)

print(f"Created dataset with {len(X_df)} samples and {X_df.shape[1]} features")

Missing embedding for reaction: 181_i_1_A - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_1_B - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_1_C - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_1_D - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_1_E - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_A - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_B - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_C - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_D - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_E - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_A - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_B - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_C - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_D - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_E - Missing IDs: ['181

In [36]:
# Helper function for plotting
def multiplot_and_print(estimator, X_train, Y_train, comb_partitions, title, verbose=1, file_dpi=800):
    predicted_train = estimator.predict(X_train)
    r2_train = r2_score(Y_train, predicted_train)
    mae_train = mean_absolute_error(Y_train, predicted_train)

    if verbose:
        print(title)
        print(f"Train R^2: {r2_train:0.5f}, train MAE: {mae_train:0.5f}")

    fig, ax = plt.subplots()

    ax.scatter(
        Y_train,
        predicted_train,
        color="gray",
        label=f"Train (r2= {r2_train:0.3f}, MAE={mae_train:0.3f})"
    )

    for part_name, part_data in comb_partitions.items():
        predicted_test = estimator.predict(part_data[0])
        r2_test = r2_score(part_data[1], predicted_test)
        mae_test = mean_absolute_error(part_data[1], predicted_test)

        if verbose:
            print(f"Test R^2: {r2_test:0.5f}, test MAE: {mae_test:0.5f}")

        ax.scatter(
            part_data[1],
            predicted_test,
            color=part_data[2],
            label=f"{part_name} (r2= {r2_test:0.3f}, MAE={mae_test:0.3f})"
        )

    ax.set_title(title)
    ax.set_xlabel("Observed $\Delta \Delta G^\u2021 [\mathrm{kcal\;mol^{-1}}]$")
    ax.set_ylabel("Predicted $\Delta \Delta G^\u2021 [\mathrm{kcal\;mol^{-1}}]$")
    ax.set_ylim(-3, 3)
    ax.set_xlim(-3, 3)

    plt.legend()
    plt.tight_layout()
    plt.savefig(title + ".png", dpi=file_dpi)
    plt.close()

In [37]:
# Prepare train and test data
valid_train_handles = [handle for handle in train_handles if handle in X_df.index]
X_train = X_df.loc[valid_train_handles]
Y_train = Y_df.loc[Y_df['reaction_handle'].isin(valid_train_handles), ["selectivity_ddGact_kcal"]]
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)

comb_partitions = {}

valid_unseen_subs_handles = [handle for handle in unseen_subs_handles if handle in X_df.index]
X_sub_test = X_df.loc[valid_unseen_subs_handles]
Y_sub_test = Y_df.loc[Y_df['reaction_handle'].isin(valid_unseen_subs_handles), ["selectivity_ddGact_kcal"]]
comb_partitions["Unseen substrates"] = (X_sub_test, Y_sub_test, "green")
print("X_sub_test shape:", X_sub_test.shape)
print("Y_sub_test shape:", Y_sub_test.shape)

valid_unseen_cat_handles = [handle for handle in unseen_cat_handles if handle in X_df.index]
X_cat_test = X_df.loc[valid_unseen_cat_handles]
Y_cat_test = Y_df.loc[Y_df['reaction_handle'].isin(valid_unseen_cat_handles), ["selectivity_ddGact_kcal"]]
comb_partitions["Unseen catalysts"] = (X_cat_test, Y_cat_test, "purple")
print("X_cat_test shape:", X_cat_test.shape)
print("Y_cat_test shape:", Y_cat_test.shape)

valid_unseen_cat_and_subs_handles = [handle for handle in unseen_cat_and_subs_handles if handle in X_df.index]
X_subcat_test = X_df.loc[valid_unseen_cat_and_subs_handles]
Y_subcat_test = Y_df.loc[Y_df['reaction_handle'].isin(valid_unseen_cat_and_subs_handles), ["selectivity_ddGact_kcal"]]
comb_partitions["Unseen subs and cats"] = (X_subcat_test, Y_subcat_test, "blue")
print("X_subcat_test shape:", X_subcat_test.shape)
print("Y_subcat_test shape:", Y_subcat_test.shape)

f_select_model = RandomForestRegressor(n_estimators=500, n_jobs=64, random_state=1234)

base_pipeline = Pipeline(steps=[
    ('variance', VarianceThreshold()),
    ('scaler', MinMaxScaler()),
    # ('feature_selection', SelectFromModel(f_select_model, max_features = 30)),
    ('feature_selection', SelectPercentile(mutual_info_regression, percentile=25)),
])

X_train shape: (368, 768)
Y_train shape: (368, 1)
X_sub_test shape: (207, 768)
Y_sub_test shape: (207, 1)
X_cat_test shape: (304, 768)
Y_cat_test shape: (304, 1)
X_subcat_test shape: (171, 768)
Y_subcat_test shape: (171, 1)


In [39]:
# Model training and evaluation
models = {
    "GBR": GradientBoostingRegressor(n_estimators=1000, ccp_alpha=1e-3),
    "SVR": SVR(kernel='poly', degree=3, epsilon=0.05),
    "RF": RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=7),
    "LL": LassoLarsCV(max_iter=5000, cv=5, n_jobs=-1)
}

In [42]:
for model_name, model in models.items():
    pipe = deepcopy(base_pipeline)
    pipe.steps.append(('model', model))
    
    print(f"\nTraining {model_name}...")
    t0 = time()
    pipe.fit(X_train, Y_train)
    print(f"{model_name}: Fitting took {time() - t0:.3f}s.")

    scores = cross_validate(pipe, X_train, Y_train, cv=5, scoring=['neg_mean_absolute_error', 'r2'], return_train_score=True)
    pprint(scores)
    print(f"{model_name} Q_2: {np.mean(scores['test_r2']):.5f}")

    multiplot_and_print(pipe, X_train, Y_train, comb_partitions, f"BPA_Combinatorial_{model_name}")

print("All models trained and evaluated successfully.")


Training GBR...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


GBR: Fitting took 3.038s.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


{'fit_time': array([2.93766022, 2.29408097, 2.26784682, 2.15340328, 2.22591877]),
 'score_time': array([0.00740504, 0.00371289, 0.00336695, 0.00327277, 0.00335431]),
 'test_neg_mean_absolute_error': array([-0.47937329, -0.58519114, -0.55550473, -0.48552131, -0.37001129]),
 'test_r2': array([-0.09696019, -1.17464649,  0.22379963, -0.07685155, -0.14446218]),
 'train_neg_mean_absolute_error': array([-0.34556285, -0.37381301, -0.32850108, -0.32639431, -0.33683587]),
 'train_r2': array([0.64779298, 0.59758756, 0.60192412, 0.64836692, 0.68800321])}
GBR Q_2: -0.25382
BPA_Combinatorial_GBR
Train R^2: 0.64349, train MAE: 0.34212
Test R^2: 0.69763, test MAE: 0.31181
Test R^2: -0.51103, test MAE: 0.67133
Test R^2: -0.46979, test MAE: 0.67540

Training SVR...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


SVR: Fitting took 0.643s.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'fit_time': array([0.57016492, 0.574646  , 0.56280327, 0.5713768 , 0.62339497]),
 'score_time': array([0.00431013, 0.00393391, 0.00387311, 0.00392509, 0.00400305]),
 'test_neg_mean_absolute_error': array([-0.46927876, -0.52417942, -0.59829291, -0.658965  , -0.38143669]),
 'test_r2': array([-0.02960856, -0.77912023, -0.01447746, -1.08299862, -0.26897743]),
 'train_neg_mean_absolute_error': array([-0.34085145, -0.36982661, -0.32480749, -0.3224177 , -0.33132259]),
 'train_r2': array([0.56330931, 0.51145443, 0.54959292, 0.56382437, 0.61558212])}
SVR Q_2: -0.43504
BPA_Combinatorial_SVR
Train R^2: 0.56734, train MAE: 0.33809
Test R^2: 0.56545, test MAE: 0.36100
Test R^2: -0.12310, test MAE: 0.56659
Test R^2: -0.32787, test MAE: 0.66446

Training RF...


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


RF: Fitting took 1.488s.


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


{'fit_time': array([1.37665296, 1.24443269, 1.26924729, 1.2646358 , 1.28757024]),
 'score_time': array([0.04240179, 0.04070711, 0.04252768, 0.15978122, 0.03991985]),
 'test_neg_mean_absolute_error': array([-0.51304895, -0.5890203 , -0.55832038, -0.51720235, -0.36127287]),
 'test_r2': array([-0.21374161, -1.25706533,  0.19652702, -0.33471876,  0.07250606]),
 'train_neg_mean_absolute_error': array([-0.34065506, -0.36990315, -0.32489756, -0.32233113, -0.3322759 ]),
 'train_r2': array([0.65275486, 0.6031154 , 0.60916509, 0.65400685, 0.69288496])}
RF Q_2: -0.30730
BPA_Combinatorial_RF
Train R^2: 0.64936, train MAE: 0.33803
Test R^2: 0.71553, test MAE: 0.30251
Test R^2: -0.38820, test MAE: 0.65106
Test R^2: -0.36379, test MAE: 0.66301

Training LL...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LL: Fitting took 0.686s.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'fit_time': array([0.82345819, 0.73108292, 0.71718216, 0.69422913, 0.7616601 ]),
 'score_time': array([0.00411177, 0.00333905, 0.00304794, 0.01127076, 0.00334096]),
 'test_neg_mean_absolute_error': array([-0.77561094, -0.74175659, -0.73385369, -0.6354781 , -0.36561611]),
 'test_r2': array([-1.55077393, -2.38521674, -0.0859622 , -0.94548584, -0.01997947]),
 'train_neg_mean_absolute_error': array([-0.34064838, -0.38436008, -0.352966  , -0.51164494, -0.58181791]),
 'train_r2': array([0.65278769, 0.57375571, 0.53714494, 0.        , 0.        ])}
LL Q_2: -0.99748
BPA_Combinatorial_LL
Train R^2: 0.00000, train MAE: 0.54042
Test R^2: -0.03286, test MAE: 0.58499
Test R^2: -0.04266, test MAE: 0.56389
Test R^2: -0.19093, test MAE: 0.63337
All models trained and evaluated successfully.
