In [61]:
import json
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LassoLarsCV
from xgboost import XGBRegressor
from copy import deepcopy
from time import time
from pprint import pprint
import yaml

In [62]:
# Load YAML file
# with open("/Users/utkarsh/MMLI/equicat/science/Science_2019_reaction_handles.yml", 'r') as file:
with open("/Users/utkarsh/MMLI/equicat/science/changed_handles.yml", 'r') as file:
    reaction_handles = yaml.safe_load(file)

# Extract relevant data from YAML
uts_reaction_handles = reaction_handles["Study 1:"]["UTS"]
train_handles = reaction_handles["Study 2:"]["Train"]
unseen_cat_handles = reaction_handles["Study 2:"]["Test: unseen catalysts"]
unseen_subs_handles = reaction_handles["Study 2:"]["Test: unseen substrates"]
unseen_cat_and_subs_handles = reaction_handles["Study 2:"]["Test: unseen subs and catalysts"]

print("Number of train handles:", len(train_handles))
print("Train handles:", train_handles)
print("Number of unseen catalysts:", len(unseen_cat_handles))
print("Unseen catalysts:", unseen_cat_handles)
print("Number of unseen substrates:", len(unseen_subs_handles))
print("Unseen substrates:", unseen_subs_handles)
print("Number of unseen catalysts and substrates:", len(unseen_cat_and_subs_handles))
print("Unseen catalysts and substrates:", unseen_cat_and_subs_handles)

Number of train handles: 384
Train handles: ['99_i_1_A', '99_i_1_B', '99_i_1_C', '99_i_1_D', '99_i_2_A', '99_i_2_B', '99_i_2_C', '99_i_2_D', '99_i_3_A', '99_i_3_B', '99_i_3_C', '99_i_3_D', '99_i_4_A', '99_i_4_B', '99_i_4_C', '99_i_4_D', '242_i_1_A', '242_i_1_B', '242_i_1_C', '242_i_1_D', '242_i_2_A', '242_i_2_B', '242_i_2_C', '242_i_2_D', '242_i_3_A', '242_i_3_B', '242_i_3_C', '242_i_3_D', '242_i_4_A', '242_i_4_B', '242_i_4_C', '242_i_4_D', '202_i_1_A', '202_i_1_B', '202_i_1_C', '202_i_1_D', '202_i_2_A', '202_i_2_B', '202_i_2_C', '202_i_2_D', '202_i_3_A', '202_i_3_B', '202_i_3_C', '202_i_3_D', '202_i_4_A', '202_i_4_B', '202_i_4_C', '202_i_4_D', '328_i_1_A', '328_i_1_B', '328_i_1_C', '328_i_1_D', '328_i_2_A', '328_i_2_B', '328_i_2_C', '328_i_2_D', '328_i_3_A', '328_i_3_B', '328_i_3_C', '328_i_3_D', '328_i_4_A', '328_i_4_B', '328_i_4_C', '328_i_4_D', '365_i_1_A', '365_i_1_B', '365_i_1_C', '365_i_1_D', '365_i_2_A', '365_i_2_B', '365_i_2_C', '365_i_2_D', '365_i_3_A', '365_i_3_B', '365_i_3_

In [63]:
# Function to load embeddings from JSON file and strip family prefix
def load_embeddings(file_path):
    with open(file_path, 'r') as f:
        raw_embeddings = json.load(f)
    
    embeddings = {}
    family_pattern = re.compile(r'^family\d+_')
    for key, value in raw_embeddings.items():
        stripped_key = family_pattern.sub('', key)
        embeddings[stripped_key] = np.array(value)
    
    return embeddings

# Load embeddings
embeddings = load_embeddings('/Users/utkarsh/MMLI/equicat/develop_op/final_molecule_embeddings.json')
print(f"Loaded embeddings for {len(embeddings)} entities")
print(list(embeddings.keys())[:5])

# Load Y data
Y_df = pd.read_csv('/Users/utkarsh/MMLI/equicat/science/Y_DATA.csv', dtype={
    'catalyst_id': str,
    'imine_id': str,
    'thiol_id': str,
    'product_id': str
})
print(f"Loaded Y data with {len(Y_df)} rows")
print(Y_df.head(5))

Loaded embeddings for 835 entities
['54_vi', '177_i', '83_vi', '15_i', '8_i']
Loaded Y data with 1075 rows
  reaction_handle catalyst_id imine_id thiol_id product_id  \
0         1_i_1_A         1_i        1        A        1_A   
1         1_i_1_B         1_i        1        B        1_B   
2         1_i_1_C         1_i        1        C        1_C   
3         1_i_1_D         1_i        1        D        1_D   
4         1_i_1_E         1_i        1        E        1_E   

   selectivity_ee_percent  selectivity_ddGact_kcal  
0                      76                 1.180364  
1                      40                 0.501960  
2                      50                 0.650844  
3                      78                 1.238605  
4                      80                 1.301689  


In [64]:
# Create X data using embeddings for catalyst, imine, thiol, and product
X_data = []
Y_data = []
valid_reaction_handles = []

for _, row in Y_df.iterrows():
    catalyst_id = row['catalyst_id']
    imine_id = row['imine_id']
    thiol_id = row['thiol_id']
    product_id = row['product_id']
    reaction_handle = row['reaction_handle']
    
    if all(id in embeddings for id in [catalyst_id, imine_id, thiol_id, product_id]):
        combined_embedding = np.concatenate([
            embeddings[catalyst_id],
            embeddings[imine_id],
            embeddings[thiol_id],
            embeddings[product_id]
        ])
        
        X_data.append(combined_embedding)
        Y_data.append(row['selectivity_ddGact_kcal'])
        valid_reaction_handles.append(reaction_handle)
    else:
        missing_ids = [id for id in [catalyst_id, imine_id, thiol_id, product_id] if id not in embeddings]
        print(f"Missing embedding for reaction: {reaction_handle} - Missing IDs: {missing_ids}")

# Convert to DataFrame
X_df = pd.DataFrame(X_data, index=valid_reaction_handles)
Y_series = pd.Series(Y_data, index=valid_reaction_handles)

print(f"Created dataset with {len(X_df)} samples and {X_df.shape[1]} features")
print(X_df.head(5))
print(Y_series.head(5))

Missing embedding for reaction: 181_i_1_A - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_1_B - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_1_C - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_1_D - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_1_E - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_A - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_B - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_C - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_D - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_2_E - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_A - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_B - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_C - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_D - Missing IDs: ['181_i']
Missing embedding for reaction: 181_i_3_E - Missing IDs: ['181

In [65]:
# # Helper function for plotting
# def multiplot_and_print(estimator, X_train, Y_train, comb_partitions, title, verbose=1, file_dpi=800):
#     predicted_train = estimator.predict(X_train)
#     r2_train = r2_score(Y_train, predicted_train)
#     mae_train = mean_absolute_error(Y_train, predicted_train)

#     if verbose:
#         print(title)
#         print(f"Train R^2: {r2_train:0.5f}, train MAE: {mae_train:0.5f}")

#     fig, ax = plt.subplots()

#     ax.scatter(
#         Y_train,
#         predicted_train,
#         color="gray",
#         label=f"Train (r2= {r2_train:0.3f}, MAE={mae_train:0.3f})"
#     )

#     for part_name, part_data in comb_partitions.items():
#         print("Part name:", part_name)
#         print("Part data:", part_data)
#         predicted_test = estimator.predict(part_data[0])
#         print(predicted_test)
#         r2_test = r2_score(part_data[1], predicted_test)
#         mae_test = mean_absolute_error(part_data[1], predicted_test)

#         if verbose:
#             print(f"Test R^2: {r2_test:0.5f}, test MAE: {mae_test:0.5f}")

#         ax.scatter(
#             part_data[1],
#             predicted_test,
#             color=part_data[2],
#             label=f"{part_name} (r2= {r2_test:0.3f}, MAE={mae_test:0.3f})"
#         )

#     ax.set_title(title)
#     ax.set_xlabel("Observed $\Delta \Delta G^\u2021 [\mathrm{kcal\;mol^{-1}}]$")
#     ax.set_ylabel("Predicted $\Delta \Delta G^\u2021 [\mathrm{kcal\;mol^{-1}}]$")
#     ax.set_ylim(-3, 3)
#     ax.set_xlim(-3, 3)

#     plt.legend()
#     plt.tight_layout()
#     plt.savefig(title + ".png", dpi=file_dpi)
#     plt.close()


def multiplot_and_print(estimator, X_train, Y_train, comb_partitions, title, verbose=1, file_dpi=800):
    print(f"\n{title}")
    print(f"X_train shape: {X_train.shape}, Y_train shape: {Y_train.shape}")

    Y_train = Y_train.values.ravel()  # Ensure Y_train is 1D
    predicted_train = estimator.predict(X_train)
    r2_train = r2_score(Y_train, predicted_train)
    mae_train = mean_absolute_error(Y_train, predicted_train)

    if verbose:
        print(f"Train R^2: {r2_train:0.5f}, train MAE: {mae_train:0.5f}")

    fig, ax = plt.subplots(figsize=(10, 8))

    ax.scatter(
        Y_train,
        predicted_train,
        color="gray",
        alpha=0.5,
        label=f"Train (R^2= {r2_train:.3f}, MAE={mae_train:.3f})"
    )

    for part_name, (X_test, Y_test, color) in comb_partitions.items():
        print(f"\nPart name: {part_name}")
        print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}")

        Y_test = Y_test.values.ravel()  # Ensure Y_test is 1D
        predicted_test = estimator.predict(X_test)
        r2_test = r2_score(Y_test, predicted_test)
        mae_test = mean_absolute_error(Y_test, predicted_test)

        if verbose:
            print(f"Test R^2: {r2_test:0.5f}, test MAE: {mae_test:0.5f}")

        ax.scatter(
            Y_test,
            predicted_test,
            color=color,
            alpha=0.7,
            label=f"{part_name} (R^2= {r2_test:.3f}, MAE={mae_test:.3f})"
        )

    ax.set_title(title)
    ax.set_xlabel("Observed $\Delta \Delta G^\u2021$ [kcal/mol]")
    ax.set_ylabel("Predicted $\Delta \Delta G^\u2021$ [kcal/mol]")
    ax.set_xlim(-3, 3)
    ax.set_ylim(-3, 3)
    ax.plot([-3, 3], [-3, 3], 'k--', alpha=0.5)  # Add diagonal line

    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{title}.png", dpi=file_dpi)
    plt.close()

    print("\nPlot saved as", f"{title}.png")

In [66]:
# Prepare train and test data
valid_train_handles = [handle for handle in train_handles if handle in X_df.index]
print("Number of valid train handles:", len(valid_train_handles))
X_train = X_df.loc[valid_train_handles]
Y_train = Y_df.loc[Y_df['reaction_handle'].isin(valid_train_handles), ["selectivity_ddGact_kcal"]]
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)

comb_partitions = {}

valid_unseen_subs_handles = [handle for handle in unseen_subs_handles if handle in X_df.index]
print("Number of valid unseen substrates:", len(valid_unseen_subs_handles))
X_sub_test = X_df.loc[valid_unseen_subs_handles]
Y_sub_test = Y_df.loc[Y_df['reaction_handle'].isin(valid_unseen_subs_handles), ["selectivity_ddGact_kcal"]]
comb_partitions["Unseen substrates"] = (X_sub_test, Y_sub_test, "green")
print("X_sub_test shape:", X_sub_test.shape)
print("Y_sub_test shape:", Y_sub_test.shape)

valid_unseen_cat_handles = [handle for handle in unseen_cat_handles if handle in X_df.index]
print("Number of valid unseen catalysts:", len(valid_unseen_cat_handles))
X_cat_test = X_df.loc[valid_unseen_cat_handles]
Y_cat_test = Y_df.loc[Y_df['reaction_handle'].isin(valid_unseen_cat_handles), ["selectivity_ddGact_kcal"]]
comb_partitions["Unseen catalysts"] = (X_cat_test, Y_cat_test, "purple")
print("X_cat_test shape:", X_cat_test.shape)
print("Y_cat_test shape:", Y_cat_test.shape)

valid_unseen_cat_and_subs_handles = [handle for handle in unseen_cat_and_subs_handles if handle in X_df.index]
print("Number of valid unseen catalysts and substrates:", len(valid_unseen_cat_and_subs_handles))
X_subcat_test = X_df.loc[valid_unseen_cat_and_subs_handles]
Y_subcat_test = Y_df.loc[Y_df['reaction_handle'].isin(valid_unseen_cat_and_subs_handles), ["selectivity_ddGact_kcal"]]
comb_partitions["Unseen subs and cats"] = (X_subcat_test, Y_subcat_test, "blue")
print("X_subcat_test shape:", X_subcat_test.shape)
print("Y_subcat_test shape:", Y_subcat_test.shape)

print(comb_partitions.keys())

# f_select_model = RandomForestRegressor(n_estimators=500, n_jobs=64, random_state=1234)

# base_pipeline = Pipeline(steps=[
#     ('variance', VarianceThreshold()),
#     ('scaler', MinMaxScaler()),
#     # ('feature_selection', SelectFromModel(f_select_model, max_features = 30)),
#     ('feature_selection', SelectPercentile(mutual_info_regression, percentile=25)),
# ])

Number of valid train handles: 368
X_train shape: (368, 768)
Y_train shape: (368, 1)
Number of valid unseen substrates: 304
X_sub_test shape: (304, 768)
Y_sub_test shape: (304, 1)
Number of valid unseen catalysts: 171
X_cat_test shape: (171, 768)
Y_cat_test shape: (171, 1)
Number of valid unseen catalysts and substrates: 207
X_subcat_test shape: (207, 768)
Y_subcat_test shape: (207, 1)
dict_keys(['Unseen substrates', 'Unseen catalysts', 'Unseen subs and cats'])


In [42]:
# plt.figure(figsize=(12, 6))
# plt.hist(Y_train, bins=30, alpha=0.5, label='Train')
# for name, (_, y, _) in comb_partitions.items():
#     plt.hist(y, bins=30, alpha=0.5, label=name)
# plt.legend()
# plt.title('Distribution of target variable across partitions')
# plt.savefig('target_distribution.png')
# plt.close()

In [67]:
# Define models
models = {
    "GBR": GradientBoostingRegressor(n_estimators=500, ccp_alpha=1e-3),
    "SVR": SVR(kernel='poly', degree=3, epsilon=0.05),
    "RF": RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=7),
    "LL": LassoLarsCV(max_iter=5000, cv=5, n_jobs=-1),
    # "XGB": XGBRegressor()
}

# XGBoost parameter grid
xgb_param_grid = {
    'model__learning_rate': np.logspace(-4, 0, 20),
    'model__subsample': np.linspace(0.5, 1.0, 10),
    'model__colsample_bytree': np.linspace(0.1, 1.0, 10),
    'model__max_depth': [3, 4, 5, 6, 7, 8, 9, 10, None],
    'model__min_child_weight': np.linspace(1, 10, 10),
    'model__gamma': np.linspace(0, 1, 10),
    'model__n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
}

In [68]:
# Train and evaluate models
for model_name, model in models.items():
    pipe = deepcopy(base_pipeline)
    pipe.steps.append(('model', model))
    
    print(f"\nTraining {model_name}...")
    t0 = time()
    
    if model_name == "XGB":
        search = RandomizedSearchCV(pipe, cv=5, param_distributions=xgb_param_grid,
                                    n_iter=100, n_jobs=64, verbose=3, refit='neg_mean_absolute_error',
                                    scoring=['neg_mean_absolute_error', 'r2'])
        search.fit(X_train, Y_train)
        pipe = search.best_estimator_
        print("Best XGBoost parameters:")
        pprint(search.best_params_)
    else:
        pipe.fit(X_train, Y_train)
    
    print(f"{model_name}: Fitting took {time() - t0:.3f}s.")

    scores = cross_validate(pipe, X_train, Y_train, cv=5, scoring=['neg_mean_absolute_error', 'r2'], return_train_score=True)
    pprint(scores)
    print(f"{model_name} Q_2: {np.mean(scores['test_r2']):.5f}")

    multiplot_and_print(pipe, X_train, Y_train, comb_partitions, f"BPA_Combinatorial_{model_name}")

print("All models trained and evaluated successfully.")


Training GBR...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


GBR: Fitting took 2.092s.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


{'fit_time': array([1.44683814, 1.33224916, 1.36076784, 1.31889606, 1.2964499 ]),
 'score_time': array([0.00700998, 0.00314021, 0.00350809, 0.00308084, 0.00316   ]),
 'test_neg_mean_absolute_error': array([-0.47636851, -0.59242917, -0.53919032, -0.4986625 , -0.36463755]),
 'test_r2': array([-0.07327918, -1.26688212,  0.25352063, -0.20850658, -0.03549744]),
 'train_neg_mean_absolute_error': array([-0.34556285, -0.37381301, -0.32850108, -0.32639431, -0.33683587]),
 'train_r2': array([0.64779298, 0.59758756, 0.60192412, 0.64836692, 0.68800321])}
GBR Q_2: -0.26613

BPA_Combinatorial_GBR
X_train shape: (368, 768), Y_train shape: (368, 1)
Train R^2: 0.64349, train MAE: 0.34212

Part name: Unseen substrates
X_test shape: (304, 768), Y_test shape: (304, 1)
Test R^2: -0.48964, test MAE: 0.67457

Part name: Unseen catalysts
X_test shape: (171, 768), Y_test shape: (171, 1)
Test R^2: -0.42712, test MAE: 0.68157

Part name: Unseen subs and cats
X_test shape: (207, 768), Y_test shape: (207, 1)
Test 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


SVR: Fitting took 0.647s.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'fit_time': array([0.73351908, 0.606148  , 0.82069302, 0.55063295, 0.66144991]),
 'score_time': array([0.00568414, 0.00507808, 0.00370193, 0.00370908, 0.00376296]),
 'test_neg_mean_absolute_error': array([-0.45647728, -0.5301645 , -0.53074607, -0.68370553, -0.36675094]),
 'test_r2': array([ 0.08588953, -0.80649483,  0.15653902, -1.16466236, -0.0660978 ]),
 'train_neg_mean_absolute_error': array([-0.34098012, -0.36994758, -0.32492408, -0.32242144, -0.33143539]),
 'train_r2': array([0.55946022, 0.51674921, 0.55837175, 0.56569237, 0.61429592])}
SVR Q_2: -0.35897

BPA_Combinatorial_SVR
X_train shape: (368, 768), Y_train shape: (368, 1)
Train R^2: 0.56919, train MAE: 0.33819

Part name: Unseen substrates
X_test shape: (304, 768), Y_test shape: (304, 1)
Test R^2: -0.11459, test MAE: 0.57193

Part name: Unseen catalysts
X_test shape: (171, 768), Y_test shape: (171, 1)
Test R^2: -0.31707, test MAE: 0.67044

Part name: Unseen subs and cats
X_test shape: (207, 768), Y_test shape: (207, 1)
Test 

  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


RF: Fitting took 1.052s.


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


{'fit_time': array([0.94578671, 0.88772798, 0.89824486, 0.89819622, 0.89653301]),
 'score_time': array([0.02945995, 0.02910781, 0.03048229, 0.02977991, 0.03067732]),
 'test_neg_mean_absolute_error': array([-0.51525397, -0.58610121, -0.55757102, -0.52389869, -0.36139845]),
 'test_r2': array([-0.22581374, -1.2505895 ,  0.19548978, -0.3717181 ,  0.06607441]),
 'train_neg_mean_absolute_error': array([-0.34069842, -0.36993406, -0.32487202, -0.32230933, -0.33221366]),
 'train_r2': array([0.65271655, 0.60308284, 0.60915832, 0.65397022, 0.69285508])}
RF Q_2: -0.31731

BPA_Combinatorial_RF
X_train shape: (368, 768), Y_train shape: (368, 1)
Train R^2: 0.64932, train MAE: 0.33800

Part name: Unseen substrates
X_test shape: (304, 768), Y_test shape: (304, 1)
Test R^2: -0.37540, test MAE: 0.64795

Part name: Unseen catalysts
X_test shape: (171, 768), Y_test shape: (171, 1)
Test R^2: -0.35439, test MAE: 0.65942

Part name: Unseen subs and cats
X_test shape: (207, 768), Y_test shape: (207, 1)
Test R^

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LL: Fitting took 2.993s.


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


{'fit_time': array([1.13403606, 0.71021914, 0.70692301, 0.94511938, 0.70389986]),
 'score_time': array([0.00330091, 0.00626612, 0.00312686, 0.00320363, 0.00355315]),
 'test_neg_mean_absolute_error': array([-0.71477043, -0.72290221, -0.72781451, -0.6354781 , -0.49224836]),
 'test_r2': array([-1.27730683, -2.22577447, -0.12148633, -0.94548584, -1.00569811]),
 'train_neg_mean_absolute_error': array([-0.34115504, -0.41338563, -0.34299651, -0.51164494, -0.33233158]),
 'train_r2': array([0.65244861, 0.46990985, 0.57143254, 0.        , 0.69284152])}
LL Q_2: -1.11515

BPA_Combinatorial_LL
X_train shape: (368, 768), Y_train shape: (368, 1)
Train R^2: 0.00000, train MAE: 0.54042

Part name: Unseen substrates
X_test shape: (304, 768), Y_test shape: (304, 1)
Test R^2: -0.04266, test MAE: 0.56389

Part name: Unseen catalysts
X_test shape: (171, 768), Y_test shape: (171, 1)
Test R^2: -0.19093, test MAE: 0.63337

Part name: Unseen subs and cats
X_test shape: (207, 768), Y_test shape: (207, 1)
Test R^

In [23]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.inspection import permutation_importance

# def analyze_feature_importance(model, X, y, feature_names, model_name):
#     if hasattr(model, 'feature_importances_'):
#         # For tree-based models (RF, GBR, XGB)
#         importances = model.feature_importances_
#         indices = np.argsort(importances)[::-1]
        
#         plt.figure(figsize=(12, 8))
#         plt.title(f"Feature Importances ({model_name})")
#         plt.bar(range(min(20, len(importances))), importances[indices][:20], align="center")
#         plt.xticks(range(min(20, len(importances))), [feature_names[i] for i in indices[:20]], rotation=90)
#         plt.tight_layout()
#         plt.savefig(f"feature_importances_{model_name.lower()}.png")
#         plt.close()
        
#         print(f"\nTop 20 features for {model_name}:")
#         for f in range(min(20, len(importances))):
#             print("%d. %s (%f)" % (f + 1, feature_names[indices[f]], importances[indices[f]]))
    
#     # Permutation Importance (works for all models)
#     perm_importance = permutation_importance(model, X, y, n_repeats=10, random_state=42, n_jobs=-1)
    
#     perm_importances = pd.DataFrame({
#         'feature': feature_names[:len(perm_importance.importances_mean)],
#         'importance': perm_importance.importances_mean,
#         'std': perm_importance.importances_std
#     }).sort_values('importance', ascending=False)
    
#     plt.figure(figsize=(12, 8))
#     plt.title(f"Permutation Importances ({model_name})")
#     plt.bar(range(20), perm_importances['importance'][:20], yerr=perm_importances['std'][:20], align="center")
#     plt.xticks(range(20), perm_importances['feature'][:20], rotation=90)
#     plt.tight_layout()
#     plt.savefig(f"permutation_importances_{model_name.lower()}.png")
#     plt.close()
    
#     print(f"\nTop 20 features by permutation importance for {model_name}:")
#     print(perm_importances.head(20))

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, mutual_info_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

# Define a new base pipeline
base_pipeline = Pipeline(steps=[
    ('variance', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('feature_selection', SelectPercentile(mutual_info_regression, percentile=25)),
])

# Define simpler models with regularization
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "RF": RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
}

In [47]:
# Define parameter grids for each model
param_grids = {
    "Linear": {},
    "Ridge": {'model__alpha': np.logspace(-6, 6, 13)},
    "Lasso": {'model__alpha': np.logspace(-6, 6, 13)},
    "ElasticNet": {
        'model__alpha': np.logspace(-6, 6, 13),
        'model__l1_ratio': np.linspace(0.1, 0.9, 9)
    },
    "RF": {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4]
    }
}


In [48]:
def train_and_evaluate(X_train, Y_train, model_name, model, param_grid, comb_partitions):
    pipeline = Pipeline([
        ('base', base_pipeline),
        ('model', model)
    ])
    
    # Perform grid search with cross-validation
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, Y_train.values.ravel())
    
    print(f"\nBest parameters for {model_name}:")
    print(grid_search.best_params_)
    
    # Get the best estimator
    best_estimator = grid_search.best_estimator_
    
    # Perform cross-validation on the training set
    cv_scores = cross_val_score(best_estimator, X_train, Y_train.values.ravel(), cv=5, scoring='r2')
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV R^2: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")
    
    # Plot and print results
    multiplot_and_print(best_estimator, X_train, Y_train, comb_partitions, f"BPA_Combinatorial_{model_name}")
    
    return best_estimator

In [49]:
# Train and evaluate each model
best_models = {}
for model_name, model in models.items():
    print(f"\nTraining and evaluating {model_name}")
    best_models[model_name] = train_and_evaluate(X_train, Y_train, model_name, model, param_grids[model_name], comb_partitions)



Training and evaluating Linear

Best parameters for Linear:
{}
Cross-validation scores: [-6.61833703e+27 -1.43745982e+28 -1.92772296e+27 -1.43584673e+28
 -2.38496972e+28]
Mean CV R^2: -12225764541330651860395098112.0000 (+/- 15013983359583576394901225472.0000)

BPA_Combinatorial_Linear
X_train shape: (368, 768), Y_train shape: (368, 1)
Train R^2: 0.61735, train MAE: 0.35179

Part name: Unseen substrates
X_test shape: (207, 768), Y_test shape: (207, 1)
Test R^2: 0.69646, test MAE: 0.31288

Part name: Unseen catalysts
X_test shape: (304, 768), Y_test shape: (304, 1)
Test R^2: -1644339616134846208374996992.00000, test MAE: 20986609839537.30078

Part name: Unseen subs and cats
X_test shape: (171, 768), Y_test shape: (171, 1)
Test R^2: -1490474672968638370731261952.00000, test MAE: 20986609839537.30859

Plot saved as BPA_Combinatorial_Linear.png

Training and evaluating Ridge

Best parameters for Ridge:
{'model__alpha': 10000.0}
Cross-validation scores: [-1.64987489e-02 -1.42682652e+00  3.

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



Best parameters for Lasso:
{'model__alpha': 1.0}
Cross-validation scores: [-0.08490417 -1.48528503 -0.00982976 -0.94548584 -0.01997947]
Mean CV R^2: -0.5091 (+/- 1.2039)

BPA_Combinatorial_Lasso
X_train shape: (368, 768), Y_train shape: (368, 1)
Train R^2: 0.00000, train MAE: 0.54042

Part name: Unseen substrates
X_test shape: (207, 768), Y_test shape: (207, 1)
Test R^2: -0.03286, test MAE: 0.58499

Part name: Unseen catalysts
X_test shape: (304, 768), Y_test shape: (304, 1)
Test R^2: -0.04266, test MAE: 0.56389

Part name: Unseen subs and cats
X_test shape: (171, 768), Y_test shape: (171, 1)
Test R^2: -0.19093, test MAE: 0.63337

Plot saved as BPA_Combinatorial_Lasso.png

Training and evaluating ElasticNet


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c


Best parameters for ElasticNet:
{'model__alpha': 1.0, 'model__l1_ratio': 0.30000000000000004}
Cross-validation scores: [-0.11429581 -1.51013857 -0.01073523 -0.8878431  -0.02151605]
Mean CV R^2: -0.5089 (+/- 1.1958)

BPA_Combinatorial_ElasticNet
X_train shape: (368, 768), Y_train shape: (368, 1)
Train R^2: 0.00917, train MAE: 0.53830

Part name: Unseen substrates
X_test shape: (207, 768), Y_test shape: (207, 1)
Test R^2: -0.02365, test MAE: 0.58229

Part name: Unseen catalysts
X_test shape: (304, 768), Y_test shape: (304, 1)
Test R^2: -0.04485, test MAE: 0.56444

Part name: Unseen subs and cats
X_test shape: (171, 768), Y_test shape: (171, 1)
Test R^2: -0.19220, test MAE: 0.63377

Plot saved as BPA_Combinatorial_ElasticNet.png

Training and evaluating RF

Best parameters for RF:
{'model__max_depth': 30, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 50}
Cross-validation scores: [-0.20111788 -0.93696238  0.18180083 -0.44117455  0.06042239]
Mean CV R^

In [50]:
# Function to analyze feature importances for Random Forest
def analyze_rf_feature_importance(rf_model, X_train):
    feature_importance = rf_model.named_steps['model'].feature_importances_
    feature_names = X_train.columns[rf_model.named_steps['base'].named_steps['feature_selection'].get_support()]
    
    # Sort features by importance
    sorted_idx = np.argsort(feature_importance)
    sorted_features = feature_names[sorted_idx]
    sorted_importance = feature_importance[sorted_idx]
    
    # Plot feature importances
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(sorted_importance)), sorted_importance)
    plt.yticks(range(len(sorted_importance)), sorted_features)
    plt.xlabel('Feature Importance')
    plt.title('Random Forest Feature Importance')
    plt.tight_layout()
    plt.savefig('RF_Feature_Importance.png')
    plt.close()
    
    print("\nTop 20 most important features:")
    for i, (feature, importance) in enumerate(zip(sorted_features[-20:], sorted_importance[-20:]), 1):
        print(f"{i}) {feature}: {importance:.4f}")

# Analyze feature importance for Random Forest
if 'RF' in best_models:
    analyze_rf_feature_importance(best_models['RF'], X_train)


Top 20 most important features:
1) 28: 0.0065
2) 186: 0.0068
3) 54: 0.0069
4) 14: 0.0077
5) 21: 0.0080
6) 12: 0.0084
7) 109: 0.0089
8) 146: 0.0106
9) 31: 0.0109
10) 4: 0.0126
11) 27: 0.0136
12) 34: 0.0163
13) 97: 0.0183
14) 73: 0.0184
15) 93: 0.0212
16) 152: 0.0250
17) 184: 0.0449
18) 68: 0.1022
19) 38: 0.1565
20) 98: 0.3259


In [51]:
# Plot learning curves for the best model
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, title):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=-1, 
        train_sizes=np.linspace(0.1, 1.0, 10), scoring='r2')
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score (R^2)")
    plt.grid()
    
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    
    plt.legend(loc="best")
    plt.savefig(f'{title.replace(" ", "_")}.png')
    plt.close()

# Plot learning curves for each model
for model_name, model in best_models.items():
    plot_learning_curve(model, X_train, Y_train.values.ravel(), f'Learning Curve - {model_name}')

print("All models trained, evaluated, and analyzed.")

All models trained, evaluated, and analyzed.


In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest, f_regression

# Assuming X_train, Y_train, and comb_partitions are already defined

# 1. Analyze Feature Importance
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train.values.ravel())

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 20 important features:")
print(feature_importance.head(20))

# 2. Error Analysis
def analyze_errors(model, X, y_true, set_name):
    y_pred = model.predict(X)
    
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values.ravel()
    elif isinstance(y_true, pd.Series):
        y_true = y_true.values
    
    errors = np.abs(y_true - y_pred)
    error_df = pd.DataFrame({
        'true': y_true,
        'predicted': y_pred,
        'error': errors
    }).sort_values('error', ascending=False)
    
    print(f"\nTop 10 errors for {set_name}:")
    print(error_df.head(10))

    return error_df

train_errors = analyze_errors(rf, X_train, Y_train, "Training Set")
for name, (X_test, Y_test, _) in comb_partitions.items():
    test_errors = analyze_errors(rf, X_test, Y_test, name)

# 3. Visualize Embeddings
def plot_embeddings(X, y, title):
    pca = PCA(n_components=50)
    X_pca = pca.fit_transform(X)
    
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X_pca)
    
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y.values.ravel(), cmap='viridis')
    plt.colorbar(scatter)
    plt.title(title)
    plt.savefig(f'{title.replace(" ", "_")}.png')
    plt.close()

plot_embeddings(X_train, Y_train, "Training Set Embeddings")
for name, (X_test, Y_test, _) in comb_partitions.items():
    plot_embeddings(X_test, Y_test, f"{name} Embeddings")

# 4. Simple Baseline Model
k_best = SelectKBest(f_regression, k=10)
X_train_simple = k_best.fit_transform(X_train, Y_train.values.ravel())
simple_model = RandomForestRegressor(n_estimators=100, random_state=42)
simple_model.fit(X_train_simple, Y_train.values.ravel())

print("\nSimple Baseline Model Performance:")
print("Train R2:", r2_score(Y_train, simple_model.predict(X_train_simple)))
for name, (X_test, Y_test, _) in comb_partitions.items():
    X_test_simple = k_best.transform(X_test)
    print(f"{name} R2:", r2_score(Y_test, simple_model.predict(X_test_simple)))

# 5. Cross-Validation Strategy
def stratified_group_kfold(X, y, groups, n_splits=5):
    unique_groups = np.unique(groups)
    n_samples = len(X)
    group_to_indices = {g: np.where(groups == g)[0] for g in unique_groups}
    
    indices = np.arange(n_samples)
    for i in range(n_splits):
        test_groups = np.random.choice(unique_groups, size=len(unique_groups)//n_splits, replace=False)
        test_indices = np.concatenate([group_to_indices[g] for g in test_groups])
        train_indices = np.setdiff1d(indices, test_indices)
        yield train_indices, test_indices

# Assuming you have a 'group' column that identifies catalyst/substrate groups
groups = X_train.index

cv_scores = cross_val_score(rf, X_train, Y_train.values.ravel(), cv=stratified_group_kfold(X_train, Y_train.values.ravel(), groups))
print("\nStratified Group K-Fold CV Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

# Additional Analyses

# Ensure Y_train is a 1D array
Y_train_1d = Y_train.values.ravel()

# Distribution of target variable
plt.figure(figsize=(12, 6))
sns.histplot(Y_train_1d, kde=True, color='blue', label='Train')
for name, (_, Y_test, _) in comb_partitions.items():
    sns.histplot(Y_test.values.ravel(), kde=True, alpha=0.5, label=name)
plt.legend()
plt.title('Distribution of Target Variable')
plt.savefig('target_distribution.png')
plt.close()

# Correlation of features with target
correlation = X_train.corrwith(Y_train.iloc[:, 0])
top_corr = correlation.abs().sort_values(ascending=False).head(20)
plt.figure(figsize=(12, 8))
sns.barplot(x=top_corr.values, y=top_corr.index)
plt.title('Top 20 Features Correlated with Target')
plt.savefig('feature_correlation.png')
plt.close()

# PCA to check for clustering
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

plt.figure(figsize=(12, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=Y_train_1d, cmap='viridis')
plt.colorbar(scatter, label='Target Value')
plt.title('PCA of Features Colored by Target Value')
plt.savefig('pca_visualization.png')
plt.close()

# Check for multicollinearity
correlation_matrix = X_train.corr()
plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False)
plt.title('Feature Correlation Heatmap')
plt.savefig('correlation_heatmap.png')
plt.close()

# Print some additional information
print("\nShape of X_train:", X_train.shape)
print("Shape of Y_train:", Y_train.shape)
print("\nUnique values in Y_train:", np.unique(Y_train_1d).shape[0])
print("Min value in Y_train:", np.min(Y_train_1d))
print("Max value in Y_train:", np.max(Y_train_1d))

# Print information about the partitions
for name, (X_test, Y_test, _) in comb_partitions.items():
    print(f"\n{name}:")
    print("Shape of X_test:", X_test.shape)
    print("Shape of Y_test:", Y_test.shape)
    y_test_1d = Y_test.values.ravel()
    print("Unique values in Y_test:", np.unique(y_test_1d).shape[0])
    print("Min value in Y_test:", np.min(y_test_1d))
    print("Max value in Y_test:", np.max(y_test_1d))

Top 20 important features:
     feature  importance
98        98    0.214366
38        38    0.105375
68        68    0.068443
152      152    0.024051
184      184    0.020077
4          4    0.017338
57        57    0.014764
73        73    0.010990
474      474    0.010204
31        31    0.010024
424      424    0.009323
557      557    0.009201
28        28    0.008335
618      618    0.008282
453      453    0.007856
93        93    0.007102
97        97    0.007101
442      442    0.007042
401      401    0.007014
12        12    0.006871

Top 10 errors for Training Set:
         true  predicted     error
247  2.479053   2.163926  0.315127
266  1.744971   1.526178  0.218793
236  3.135881   2.922421  0.213460
74  -0.215698  -0.006231  0.209467
167  0.715833   0.922058  0.206226
122  0.750075   0.547394  0.202682
58   0.750075   0.550081  0.199994
165  1.301689   1.120171  0.181517
239  3.135881   2.960202  0.175679
341  0.821563   0.647846  0.173717

Top 10 errors for Unseen subs