# Benchmark different featurizations on 3 HTE datasets

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from tqdm import tqdm
import os
import time
from sklearn.decomposition import PCA

In [20]:
# Load the raw reaction tables, one per HTE dataset.
df_19 = pd.read_csv('data/19_science/19_science_total.csv')
df_18 = pd.read_excel('data/18_science/18 science_original_chem.xlsx')
# Suzuki: rows containing any missing value are dropped and the index is
# renumbered from 0.
# NOTE(review): the descriptor tables are later paired with this frame by row
# position — confirm they were generated from the same filtered rows.
df_sm = (
    pd.read_excel("data/suzuki/suzuki.xlsx")
    .dropna()
    .reset_index(drop=True)
)
df_pep = pd.read_csv('data/peptide_data/conjugate_addition.csv')

In [17]:
# Root folder that holds the pre-computed descriptor tables.
des_path = 'HTE_descriptors'


def read_des(folder='folder_name'):
    """Load every CSV in ``folder`` and join them column-wise into one frame.

    Each file is read with ``pd.read_csv``; an ``Original_SMILES`` column is
    dropped when present, and every remaining column is renamed to
    ``<file name without extension>_<column>`` so columns coming from
    different files cannot collide. The shape of each table and of the
    combined table is printed.

    Args:
        folder: Directory containing the descriptor ``.csv`` files.

    Returns:
        A ``pandas.DataFrame`` with the columns of all files placed side by
        side (``pd.concat(..., axis=1)``), in sorted file-name order.

    Raises:
        ValueError: If ``folder`` contains no ``.csv`` file.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the column
    # layout of the combined frame identical on every machine and run.
    file_list = sorted(f for f in os.listdir(folder) if f.endswith('.csv'))
    if not file_list:
        raise ValueError(f"No .csv descriptor files found in {folder!r}")

    fp_dfs = []
    for filename in file_list:
        df = pd.read_csv(os.path.join(folder, filename))

        # The SMILES column is an identifier, not a feature.
        df = df.drop(columns=['Original_SMILES'], errors='ignore')

        # Use the file name (without extension) as the column prefix.
        prefix = os.path.splitext(filename)[0]
        df = df.add_prefix(f"{prefix}_")
        print(f"{filename}的维度:", df.shape)
        fp_dfs.append(df)

    # Tables are aligned on their row index; a file with a different number
    # of rows silently produces NaN-padded columns, so make that visible.
    row_counts = {name: len(df) for name, df in zip(file_list, fp_dfs)}
    if len(set(row_counts.values())) > 1:
        print(f"WARNING: row counts differ between files in {folder!r}: {row_counts}")

    # Merge all per-component tables side by side.
    combined_fp = pd.concat(fp_dfs, axis=1)
    print("合并后的维度:", combined_fp.shape)
    return combined_fp

# One-hot encodings of the reaction components.
OHE_19 = pd.read_csv(os.path.join(des_path, "OH_encode/19_onehot.csv"))
OHE_18 = pd.read_csv(os.path.join(des_path, "OH_encode/18_onehot.csv"))
OHE_sm = pd.read_csv(os.path.join(des_path, "OH_encode/sm_onehot.csv"))
OHE_pep = pd.read_csv(os.path.join(des_path, "OH_encode/conjugate_onehot.csv"))

print(f"OHE19:{OHE_19.shape}, OHE18:{OHE_18.shape}, OHE_sm:{OHE_sm.shape}, OHE_pep:{OHE_pep.shape}")

# Molecular fingerprints. MACCS keys are loaded here; to benchmark Morgan
# fingerprints instead, point these calls at the 'MFP_descriptor/...' folders.
FP_19 = read_des(os.path.join(des_path, 'MACCS/19_data'))
FP_18 = read_des(os.path.join(des_path, 'MACCS/18_data'))
FP_sm = read_des(os.path.join(des_path, 'MACCS/suzuki'))
FP_pep = read_des(os.path.join(des_path, 'MACCS/conjugate'))

# RDKit molecular property descriptors.
# The mixed capitalisation (RDkit_* / RDKit_18) is kept because the benchmark
# cells reference exactly these names.
RDkit_19 = read_des(os.path.join(des_path, 'RDKit_descriptors/19_data'))
RDKit_18 = read_des(os.path.join(des_path, 'RDKit_descriptors/18_data'))
RDkit_sm = read_des(os.path.join(des_path, 'RDKit_descriptors/suzuki'))
# The conjugate table gets its own name; it was previously assigned to
# RDkit_sm, which replaced the suzuki descriptors loaded on the line above.
RDkit_pep = read_des(os.path.join(des_path, 'RDKit_descriptors/conjugate'))

# Mordred descriptors.
Mord_19 = read_des(os.path.join(des_path, 'Mordred/19_data'))
Mord_18 = read_des(os.path.join(des_path, 'Mordred/18_data'))
Mord_sm = read_des(os.path.join(des_path, 'Mordred/suzuki'))
Mord_pep = read_des(os.path.join(des_path, 'Mordred/conjugate'))

# Uni-Mol representations.
UniMol_19 = read_des(os.path.join(des_path, 'unimol/19_data'))
UniMol_18 = read_des(os.path.join(des_path, 'unimol/18_data'))
UniMol_sm = read_des(os.path.join(des_path, 'unimol/suzuki'))
UniMol_pep = read_des(os.path.join(des_path, 'unimol/conjugate'))

OHE19:(1075, 53), OHE18:(3955, 44), OHE_sm:(4620, 33), OHE_pep:(200, 55)
Catalyst_smi_MACCS.csv的维度: (1075, 167)
Imine_smi_MACCS.csv的维度: (1075, 167)
MACCS.csv的维度: (5, 167)
Thiol_smi_MACCS.csv的维度: (1075, 167)
合并后的维度: (1075, 668)
Additive_MACCS.csv的维度: (3955, 167)
Aryl halide_MACCS.csv的维度: (3955, 167)
Base_MACCS.csv的维度: (3955, 167)
Ligand_MACCS.csv的维度: (3955, 167)
合并后的维度: (3955, 668)
ligand_fingerprints.csv的维度: (4620, 167)
reactant_1_fingerprints.csv的维度: (4620, 167)
reactant_2_fingerprints.csv的维度: (4620, 167)
reagent_1_fingerprints.csv的维度: (4620, 167)
solvent_1_fingerprints.csv的维度: (4620, 167)
合并后的维度: (4620, 835)
peptide_SMILES_MACCS.csv的维度: (200, 167)
Reactant_1_smi_MACCS.csv的维度: (200, 167)
Reactant_2_smi_MACCS.csv的维度: (200, 167)
合并后的维度: (200, 501)
Catalyst_smi_descriptors.csv的维度: (1075, 209)
Imine_smi_descriptors.csv的维度: (1075, 209)
Thiol_smi_descriptors.csv的维度: (1075, 209)
合并后的维度: (1075, 627)
Additive_descriptors.csv的维度: (3955, 209)
Aryl halide_descriptors.csv的维度: (3955, 209)
Base_desc

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Clean up an RDKit feature matrix.
def clean_rdkit_features(X):
    """Mean-impute missing values in ``X`` and then standardize the result.

    A fresh ``SimpleImputer(strategy='mean')`` and a fresh ``StandardScaler``
    are fitted on ``X`` itself on every call, and the transformed data is
    returned.

    NOTE(review): because both transformers are fitted on whatever is passed
    in, calling this before a train/test split lets the held-out rows
    contribute to the imputation and scaling statistics.
    """
    filled = SimpleImputer(strategy='mean').fit_transform(X)
    return StandardScaler().fit_transform(filled)

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from mordred import Calculator, descriptors

# Set up the Mordred descriptor calculator (created with ignore_3D=True).
calc = Calculator(descriptors, ignore_3D=True)

# Example SMILES (replace with your own). The last entry is not a parsable
# SMILES string, so it exercises the filter below.
smiles_list = [
    "CC(C)O",
    "c1ccccc1",
    "C1CCCCC1",
    "CC(=O)O",
    "INVALID_SMILES",
]

# Parse each SMILES once and keep only those for which MolFromSmiles returned
# a molecule; valid_indices remembers their positions in smiles_list.
parsed = [Chem.MolFromSmiles(smi) for smi in smiles_list]
valid_indices = [idx for idx, mol in enumerate(parsed) if mol is not None]
mols = [parsed[idx] for idx in valid_indices]

# Compute the descriptor table for the valid molecules.
df = calc.pandas(mols)

# Turn +/-inf into NaN, then drop every column that contains a NaN.
df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=1, how='any')

# Used to drop columns that contain non-numeric values (e.g. strings).
def is_column_all_numeric(col):
    """Return whether every element of ``col`` is a Python/NumPy int or float.

    Args:
        col: Object exposing ``apply`` (a ``pandas.Series`` at the call site).

    Returns:
        The result of ``.all()`` over the per-element ``isinstance`` check;
        ``False`` if the check itself raises an ordinary exception (for
        example when ``col`` has no ``apply`` method).
    """
    numeric_types = (int, float, np.integer, np.floating)
    try:
        return col.apply(lambda x: isinstance(x, numeric_types)).all()
    except Exception:
        # Best-effort: anything that cannot be checked is treated as
        # non-numeric. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return False

# Keep only the columns whose values are all plain numbers.
numeric_cols = [col for col in df.columns if is_column_all_numeric(df[col])]
# .copy() makes df_cleaned an independent frame, so adding a column below
# does not raise pandas' SettingWithCopyWarning.
df_cleaned = df[numeric_cols].copy()

# Attach the source SMILES of each surviving molecule (optional).
df_cleaned['Original_SMILES'] = [smiles_list[i] for i in valid_indices]

# Inspect the result.
print("最终维度（所有值为纯数值）:", df_cleaned.shape)

# Save to disk.
df_cleaned.to_csv('final_clean_mordred_descriptors.csv', index=False)


In [9]:
def evaluate_model(X, y, model_type="MLP", n_runs=30, test_size=0.3, random_state=None):
    """Score a regressor over repeated random train/test splits.

    For each of ``n_runs`` iterations the data is split with
    ``train_test_split``, a fresh model is fitted on the training part and
    R2 / MAE are computed on the held-out part. The recorded time covers
    ``fit`` plus ``predict``.

    Args:
        X: Feature matrix.
        y: Target values, row-aligned with ``X``.
        model_type: ``"MLP"`` (``MLPRegressor``, hidden layers 256 and 128)
            or ``"RF"`` (``RandomForestRegressor``, 300 trees, max depth 15).
        n_runs: Number of random splits to evaluate.
        test_size: Passed to ``train_test_split``; default 0.3.
        random_state: ``None`` (default) leaves the splits unseeded, as
            before. An int makes them reproducible: run ``i`` is split with
            seed ``random_state + i``. The model seeds are fixed either way.

    Returns:
        dict with ``R2_mean``, ``R2_std``, ``MAE_mean``, ``MAE_std``,
        ``Time_mean`` and ``Time_std`` (``np.mean`` / ``np.std`` over runs).

    Raises:
        ValueError: If ``model_type`` is neither ``"MLP"`` nor ``"RF"``.
    """
    # Fail before doing any work instead of inside the first iteration.
    if model_type not in ("MLP", "RF"):
        raise ValueError("Unsupported model.")

    r2_scores, maes, times = [], [], []

    for run in tqdm(range(n_runs)):
        split_seed = None if random_state is None else random_state + run
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_seed
        )

        if model_type == "MLP":
            model = MLPRegressor(hidden_layer_sizes=(256, 128), activation='relu',
                                 solver='adam', alpha=1e-4, max_iter=1000, random_state=0)
        else:
            model = RandomForestRegressor(n_estimators=300, max_depth=15,
                                          random_state=42, n_jobs=-1)

        # perf_counter is a monotonic clock intended for measuring durations.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        duration = time.perf_counter() - start_time

        r2_scores.append(r2_score(y_test, y_pred))
        maes.append(mean_absolute_error(y_test, y_pred))
        times.append(duration)

    return {
        'R2_mean': np.mean(r2_scores),
        'R2_std': np.std(r2_scores),
        'MAE_mean': np.mean(maes),
        'MAE_std': np.std(maes),
        'Time_mean': np.mean(times),
        'Time_std': np.std(times)
    }

In [23]:
# Benchmark on the 19_science dataset: random forest only, PCA-reduced features.
metric_keys = ["R2_mean", "R2_std", "MAE_mean", "MAE_std"]

# Features with PCA. The same PCA object is refitted on each descriptor table.
# To run without PCA, bind the raw tables instead:
#   X_onehot = OHE_19, X_fp = FP_19.fillna(0), X_desc = RDkit_19, X_mord = Mord_19
# NOTE(review): each PCA is fitted on all rows before evaluate_model splits
# them, so held-out rows influence the projection.
pca = PCA(n_components=100)
X_fp = pca.fit_transform(FP_19.fillna(0))
X_desc = pca.fit_transform(RDkit_19)
X_mord = pca.fit_transform(Mord_19)
X_unimol = pca.fit_transform(UniMol_19)

y = df_19['Output'].values

# Feature sets to evaluate; only Uni-Mol is active in this run.
feature_sets = [
    # (X_onehot, "OneHot_SMILES", X_onehot.shape[1]),
    # (X_fp, "Fingerprint", X_fp.shape[1]),
    # (clean_rdkit_features(X_desc), "RDKit_Descriptors", X_desc.shape[1]),
    # (X_mord, "Mordred", X_mord.shape[1]),
    (X_unimol, "Unimol", X_unimol.shape[1]),
]
model_names = ["RF"]

results = []
for X, name, dim in feature_sets:
    for model in model_names:
        result = evaluate_model(X, y, model_type=model, n_runs=30)
        results.append([name, model, dim, *(result[key] for key in metric_keys)])

result_df = pd.DataFrame(
    results,
    columns=["Feature_Type", "Model", "Dim", *metric_keys],
)
print(result_df)
result_df.to_csv('results/ADD/19_science_7-3_split_30_pca_unimol.csv')

100%|██████████| 30/30 [00:21<00:00,  1.37it/s]

  Feature_Type Model  Dim   R2_mean    R2_std  MAE_mean   MAE_std
0       Unimol    RF  100  0.907735  0.012775  0.144409  0.008373





In [26]:
# Benchmark on the conjugate-addition dataset: random forest only, no PCA.
metric_keys = ["R2_mean", "R2_std", "MAE_mean", "MAE_std"]

# Raw (non-PCA) feature tables.
X_onehot = OHE_pep
X_fp = FP_pep.fillna(0)
# NOTE(review): X_desc is bound to the MACCS fingerprint table, not to an
# RDKit descriptor table, so the commented-out "RDKit_Descriptors" entry below
# would not evaluate RDKit features if re-enabled.
X_desc = FP_pep
X_mord = Mord_pep
X_unimol = UniMol_pep

y = df_pep['Reaction I eesyn (%)'].values

# Feature sets to evaluate; only the one-hot encoding is active in this run.
feature_sets = [
    (X_onehot, "OneHot_SMILES", X_onehot.shape[1]),
    # (X_fp, "Fingerprint", X_fp.shape[1]),
    # (clean_rdkit_features(X_desc), "RDKit_Descriptors", X_desc.shape[1]),
    # (X_mord, "Mordred", X_mord.shape[1]),
    # (X_unimol, 'Unimol', X_unimol.shape[1]),
]
model_names = ["RF"]

results = []
for X, name, dim in feature_sets:
    for model in model_names:
        result = evaluate_model(X, y, model_type=model, n_runs=30)
        results.append([name, model, dim, *(result[key] for key in metric_keys)])

result_df = pd.DataFrame(
    results,
    columns=["Feature_Type", "Model", "Dim", *metric_keys],
)
print(result_df)
result_df.to_csv('results/ADD/conjugate_7-3_split_30_oh.csv')

100%|██████████| 30/30 [00:08<00:00,  3.54it/s]

    Feature_Type Model  Dim   R2_mean    R2_std   MAE_mean   MAE_std
0  OneHot_SMILES    RF   55  0.675637  0.111148  29.943669  3.762986





In [50]:
# Benchmark on the 18_science dataset: random forest only, PCA-reduced
# features, with timing columns included in the report.
metric_keys = [
    "R2_mean", "R2_std", "MAE_mean", "MAE_std",
    "Time_mean", "Time_std",
]

y = df_18['Output'].values

# Features with PCA. The same PCA object is refitted on each descriptor table.
# To run without PCA, bind the raw tables instead:
#   X_onehot = OHE_18, X_fp = FP_18, X_desc = RDKit_18, X_mord = Mord_18
pca = PCA(n_components=100)
X_fp = pca.fit_transform(FP_18.fillna(0))
X_desc = pca.fit_transform(RDKit_18)
X_mord = pca.fit_transform(Mord_18)

# Feature sets to evaluate.
# NOTE(review): clean_rdkit_features is applied to the PCA output, so the PCA
# itself is fitted on the unscaled RDKit descriptors — confirm this is intended.
feature_sets = [
    # (X_onehot, "OneHot_SMILES", X_onehot.shape[1]),
    (X_fp, "Fingerprint", X_fp.shape[1]),
    (clean_rdkit_features(X_desc), "RDKit_Descriptors", X_desc.shape[1]),
    (X_mord, "Mordred", X_mord.shape[1]),
]
model_names = ["RF"]

results = []
for X, name, dim in feature_sets:
    for model in model_names:
        result = evaluate_model(X, y, model_type=model, n_runs=30)
        results.append([name, model, dim, *(result[key] for key in metric_keys)])

result_df = pd.DataFrame(
    results,
    columns=["Feature_Type", "Model", "Dim", *metric_keys],
)
print(result_df)
result_df.to_csv('results/ADD/18_science_RF_300_pca.csv')

100%|██████████| 30/30 [02:49<00:00,  5.65s/it]
100%|██████████| 30/30 [08:52<00:00, 17.75s/it]
100%|██████████| 30/30 [03:39<00:00,  7.32s/it]

        Feature_Type Model  Dim   R2_mean    R2_std  MAE_mean   MAE_std  \
0        Fingerprint    RF  100  0.862435  0.011537  6.775841  0.187797   
1  RDKit_Descriptors    RF  100  0.855471  0.009001  7.453152  0.169071   
2            Mordred    RF  100  0.869478  0.010302  6.591533  0.143120   

   Time_mean  Time_std  
0   5.642702  0.122901  
1  17.749761  0.559831  
2   7.317358  0.207988  





In [54]:
# Benchmark on the suzuki dataset: random forest only, PCA-reduced features.
metric_keys = ["R2_mean", "R2_std", "MAE_mean", "MAE_std"]

y = df_sm['Output'].values

# Features with PCA. The same PCA object is refitted on each descriptor table.
# To run without PCA, bind the raw tables instead:
#   X_onehot = OHE_sm, X_fp = FP_sm, X_desc = RDkit_sm, X_mord = Mord_sm
# NOTE(review): this cell needs RDkit_sm to hold the suzuki descriptors (same
# number of rows as df_sm) — verify what is bound to that name before running.
pca = PCA(n_components=100)
X_fp = pca.fit_transform(FP_sm.fillna(0))
X_desc = pca.fit_transform(RDkit_sm.fillna(0))
X_mord = pca.fit_transform(Mord_sm)

# Feature sets to evaluate.
# NOTE(review): clean_rdkit_features is applied to the PCA output, so the PCA
# itself is fitted on the unscaled RDKit descriptors — confirm this is intended.
feature_sets = [
    # (X_onehot, "OneHot_SMILES", X_onehot.shape[1]),
    (X_fp, "Fingerprint", X_fp.shape[1]),
    (clean_rdkit_features(X_desc), "RDKit_Descriptors", X_desc.shape[1]),
    (X_mord, "Mordred", X_mord.shape[1]),
    # (X_unimol, "Uni-Mol", X_unimol.shape[1]),
]
model_names = ["RF"]

results = []
for X, name, dim in feature_sets:
    for model in model_names:
        result = evaluate_model(X, y, model_type=model, n_runs=30)
        results.append([name, model, dim, *(result[key] for key in metric_keys)])

result_df = pd.DataFrame(
    results,
    columns=["Feature_Type", "Model", "Dim", *metric_keys],
)
print(result_df)
result_df.to_csv('results/ADD/suzuki_RF_300_pca.csv')

100%|██████████| 30/30 [00:32<00:00,  1.09s/it]
100%|██████████| 30/30 [07:18<00:00, 14.61s/it]
100%|██████████| 30/30 [02:52<00:00,  5.74s/it]

        Feature_Type Model  Dim   R2_mean    R2_std  MAE_mean   MAE_std
0        Fingerprint    RF  100  0.843082  0.008093  0.076762  0.001839
1  RDKit_Descriptors    RF  100  0.747930  0.010618  0.104555  0.002450
2            Mordred    RF  100  0.799522  0.009492  0.091249  0.002502



