# try autoencoder SMILES and use the latent space to train model

In [137]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.nn.functional as F
import random
from sklearn.decomposition import PCA

In [138]:
# Load data

# if use 19_science data
# df = pd.read_csv("data/19_science/19_science_total.csv")
# df["full_smi"] = df["Catalyst_smi"] + "." + df["Imine_smi"] + "." + df["Thiol_smi"]

# if use 18_science data
# df = pd.read_excel('data/18_science/18 science_original_chem.xlsx')
# df["full_smi"] = df["Ligand"] + "." + df["Additive"] + "." + df["Base"]+"." + df["Aryl halide"]

# if use suzuki data
# df = pd.read_excel('data/suzuki/suzuki.xlsx').dropna().reset_index(drop=True)
# df["full_smi"] = df["reactant_1"] + "." + df["reactant_2"] + "." + df["ligand"]+"." + df["reagent_1"]+"." + df["solvent_1"]

# if use conjugate data
df = pd.read_csv('data/peptide_data/conjugate_addition.csv').dropna().reset_index(drop=True)
df["full_smi"] = df["peptide_SMILES"] + "." + df["Reactant_1_smi"] + "." + df["Reactant_2_smi"]

In [139]:

# 构建字符索引
all_chars = set("".join(df["full_smi"].tolist()))
char_to_index = {char: idx for idx, char in enumerate(sorted(all_chars))}
index_to_char = {idx: char for char, idx in char_to_index.items()}

MAX_LEN = max(df["full_smi"].apply(len))
NCHARS = len(char_to_index)

# one-hot 编码函数
def smiles_to_onehot(smiles, max_len=MAX_LEN, n_chars=NCHARS):
    onehot = np.zeros((max_len, n_chars), dtype=np.float32)
    for i, c in enumerate(smiles[:max_len]):
        if c in char_to_index:
            onehot[i, char_to_index[c]] = 1.0
    return onehot

In [140]:
class SMILESEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, conv_channels=64, latent_dim=128):
        super(SMILESEncoder, self).__init__()
        self.conv1 = nn.Conv1d(input_dim, conv_channels, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(conv_channels, conv_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(conv_channels * MAX_LEN, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x):
        # x shape: (batch, seq_len, n_chars)
        x = x.permute(0, 2, 1)  # 转为 (batch, channels, seq_len)
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        z = self.fc2(x)
        return z


In [141]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import time

def evaluate_model(X, y, model_type="MLP", n_runs=30):
    r2_scores, maes, times = [], [], []

    for _ in tqdm(range(n_runs)):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

        if model_type == "MLP":
            model = MLPRegressor(hidden_layer_sizes=(256, 128), activation='relu',
                                 solver='adam', alpha=1e-4, max_iter=1000, random_state=0, early_stopping=False)
        elif model_type == "RF":
            model = RandomForestRegressor(n_estimators=300,max_depth=15, random_state=42, n_jobs=-1) # , min_samples_split=5
        else:
            raise ValueError("Unsupported model.")
        
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        duration = time.time() - start_time

        r2_scores.append(r2_score(y_test, y_pred))
        maes.append(mean_absolute_error(y_test, y_pred))
        times.append(duration)
        
    return {
        'R2_mean': np.mean(r2_scores),
        'R2_std': np.std(r2_scores),
        'MAE_mean': np.mean(maes),
        'MAE_std': np.std(maes),
        'Time_mean': np.mean(times),
        'Time_std': np.std(times)
    }

In [146]:
# 编码所有 SMILES
onehot_encoded = np.stack([smiles_to_onehot(smi) for smi in df["full_smi"]])
input_tensor = torch.tensor(onehot_encoded)

# 构建并编码
encoder = SMILESEncoder(input_dim=NCHARS, latent_dim=128)
with torch.no_grad():
    latent_vectors = encoder(input_tensor)

# 保存 latent vectors 和产率
X_latent = latent_vectors.numpy()
y_output = df["Reaction I eesyn (%)"].values

results = []

for X, name, dim in [
    # (X_onehot, "OneHot_SMILES", X_onehot.shape[1]),
    (X_latent, "Autoencoder", X_latent.shape[1]),
]:
    for model in ["RF","MLP"]:
        result = evaluate_model(X, y_output, model_type=model, n_runs=30)
        results.append([name, model, dim,
                        result['R2_mean'], result['R2_std'],
                        result['MAE_mean'], result['MAE_std']])

result_df = pd.DataFrame(results, columns=[
    "Feature_Type", "Model", "Dim",
    "R2_mean", "R2_std", "MAE_mean", "MAE_std"
])
print(result_df)
result_df.to_csv('results/ADD/conjugate_Autoencoder.csv')

100%|██████████| 30/30 [00:27<00:00,  1.08it/s]
100%|██████████| 30/30 [00:03<00:00,  8.73it/s]

  Feature_Type Model  Dim   R2_mean    R2_std   MAE_mean   MAE_std
0  Autoencoder    RF  128  0.254873  0.074901  52.698247  3.144060
1  Autoencoder   MLP  128 -0.027859  0.039978  66.564334  1.976207





# peptide

In [108]:
DATA_DIR = './data/peptide_data/'
# OUT_DIR = 'out/models_unimol_infer'+datetime.now().strftime('%y%m%d%H%M')+'/'
OUT_DIR = 'out/'

INPUTS_tripep_repr = 'peptide_SMILES.csv'  # Unscaled  data 
INPUTS_reactant1_repr = 'Reactant_1_smi.csv'
INPUTS_reactant2_repr = 'Reactant_2_smi.csv'
INPUTS_Origin_DF = 'conjugate_addition.csv'

inputs_tripep_repr = pd.read_csv(DATA_DIR + INPUTS_tripep_repr)
inputs_reactant1_repr = pd.read_csv(DATA_DIR + INPUTS_reactant1_repr)
inputs_reactant2_repr = pd.read_csv(DATA_DIR + INPUTS_reactant2_repr)

yields = pd.read_csv(DATA_DIR + INPUTS_Origin_DF)['Reaction I eesyn (%)']

inputs = np.concatenate([inputs_tripep_repr ,inputs_reactant1_repr,inputs_reactant2_repr],axis=1)

In [114]:
results = []

X = inputs

for X, name, dim in [
    # (X_onehot, "OneHot_SMILES", X_onehot.shape[1]),
    (inputs, "unimol", inputs.shape[1]),
]:
    for model in ["RF","MLP"]:
        result = evaluate_model(X, yields, model_type=model, n_runs=30)
        results.append([name, model, dim,
                        result['R2_mean'], result['R2_std'],
                        result['MAE_mean'], result['MAE_std']])

result_df = pd.DataFrame(results, columns=[
    "Feature_Type", "Model", "Dim",
    "R2_mean", "R2_std", "MAE_mean", "MAE_std"
])
print(result_df)
result_df.to_csv('results/ADD/peptide_unimol_70_30.csv')

100%|██████████| 30/30 [01:20<00:00,  2.67s/it]
100%|██████████| 30/30 [02:27<00:00,  4.93s/it]

  Feature_Type Model   Dim   R2_mean    R2_std   MAE_mean   MAE_std
0       unimol    RF  1536  0.826865  0.080260  21.005526  3.579010
1       unimol   MLP  1536  0.634688  0.133655  31.171844  5.432956





## 按照多肽种类划分 没用，文章可能是有问题

In [40]:
import random

# 读取数据
df = pd.read_csv("./data/peptide_data/conjugate_addition.csv")

DATA_DIR = './data/peptide_data/'
# OUT_DIR = 'out/models_unimol_infer'+datetime.now().strftime('%y%m%d%H%M')+'/'
OUT_DIR = 'out/'

INPUTS_tripep_repr = 'peptide_SMILES.csv'  # Unscaled  data 
INPUTS_reactant1_repr = 'Reactant_1_smi.csv'
INPUTS_reactant2_repr = 'Reactant_2_smi.csv'
INPUTS_Origin_DF = 'conjugate_addition.csv'

inputs_tripep_repr = pd.read_csv(DATA_DIR + INPUTS_tripep_repr)
inputs_reactant1_repr = pd.read_csv(DATA_DIR + INPUTS_reactant1_repr)
inputs_reactant2_repr = pd.read_csv(DATA_DIR + INPUTS_reactant2_repr)

yields = pd.read_csv(DATA_DIR + INPUTS_Origin_DF)['Reaction I eesyn (%)']

inputs = np.concatenate([inputs_tripep_repr ,inputs_reactant1_repr,inputs_reactant2_repr],axis=1)

# 设置随机种子（可选）
random.seed(42)

peptide_list = df['Peptide'].unique().tolist()
peptide_list = [e for e in peptide_list if e != "P5"]
selected_9 = random.sample(peptide_list, 9)
final_selection = ["P5"] + selected_9

test_idx = df[df["Peptide"].isin(final_selection)].index

X_test = inputs[test_idx]
X_train = np.delete(inputs, test_idx, axis=0)

y = yields  # 如果有标签列的话
y_test = y[test_idx]
y_train = np.delete(y, test_idx, axis=0)

results = []

for model in ["RF"]:
    result = evaluate_model(X_train, y_train, X_test,y_test, model_type=model, n_runs=10)
    print(result)
    results.append([name, model, dim,
                    result['R2_mean'], result['R2_std'],
                    result['MAE_mean'], result['MAE_std']])
result_df = pd.DataFrame(results, columns=[
    "Feature_Type", "Model", "Dim",
    "R2_mean", "R2_std", "MAE_mean", "MAE_std"
])
print(result_df)
result_df.to_csv('results/ADD/peptide_sequence_split_2.csv')


160 40


100%|██████████| 10/10 [00:30<00:00,  3.03s/it]

{'R2_mean': -0.037125025195120065, 'R2_std': 0.0, 'MAE_mean': 64.12308333333331, 'MAE_std': 1.4210854715202004e-14, 'Time_mean': 3.0255325555801393, 'Time_std': 0.040060729718162005}
  Feature_Type Model   Dim   R2_mean        R2_std   MAE_mean       MAE_std
0       unimol    RF  1536  0.850623  7.129368e-02  18.994637  3.418826e+00
1       unimol   MLP  1536  0.691957  1.000335e-01  29.409517  4.701280e+00
2       unimol    RF  1536 -0.037125  0.000000e+00  64.123083  1.421085e-14
3       unimol   MLP  1536 -0.736334  1.110223e-16  79.703945  0.000000e+00
4       unimol    RF  1536 -0.037125  0.000000e+00  64.123083  1.421085e-14
5       unimol   MLP  1536 -0.736334  1.110223e-16  79.703945  0.000000e+00
6       unimol    RF  1536 -0.037125  0.000000e+00  64.123083  1.421085e-14





In [81]:
smi_columns = ['peptide_SMILES', 'Reactant_1_smi', 'Reactant_2_smi']

# 对每个分子列进行 One-Hot 编码
encoded_parts = []
for col in smi_columns:
    onehot = pd.get_dummies(df[col], prefix=col, dtype=int)
    encoded_parts.append(onehot)

# 合并编码后的部分和输出
encoded_df = pd.concat(encoded_parts, axis=1)

# 存储为新表格
encoded_df.to_csv('HTE_descriptors/OH_encode/pep_onehot.csv', index=False)
# print(encoded_df.shape)

(200, 55)


In [102]:
des_path = 'HTE_descriptors'

def read_des(folder='folder_name'):
    # folder = 'MFP_descriptor'
    file_list = [f for f in os.listdir(folder) if f.endswith('.csv')]
    # 读取并处理每个文件
    fp_dfs = []
    for filename in file_list:
        filepath = os.path.join(folder, filename)
        df = pd.read_csv(filepath)

        if 'Original_SMILES' in df.columns:
            df = df.drop(columns=['Original_SMILES'])
    
        # 使用文件名作为前缀（去掉扩展名）
        prefix = os.path.splitext(filename)[0]
        df.columns = [f"{prefix}_{col}" for col in df.columns]
        print(f"{filename}的维度:", df.shape)
        fp_dfs.append(df)

    # 合并所有分子指纹特征
    combined_fp = pd.concat(fp_dfs, axis=1)
    # 输出结果查看
    print("合并后的维度:", combined_fp.shape)
    return combined_fp

des_path = 'HTE_descriptors'
RDkit_pep = read_des(os.path.join(des_path, 'MACCS/peptide'))

peptide_SMILES_MACCS.csv的维度: (200, 167)
Reactant_1_smi_MACCS.csv的维度: (200, 167)
Reactant_2_smi_MACCS.csv的维度: (200, 167)
合并后的维度: (200, 501)


In [135]:
def evaluate_model(X_train, y_train,X_test,y_test, model_type="MLP", n_runs=30):
    r2_scores, maes, times = [], [], []

    for _ in tqdm(range(n_runs)):
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

        if model_type == "MLP":
            model = MLPRegressor(hidden_layer_sizes=(256, 128), activation='relu',
                                 solver='adam', alpha=1e-4, max_iter=1000, random_state=0)
        elif model_type == "RF":
            model = RandomForestRegressor(n_estimators=300,max_depth=15, random_state=42, n_jobs=-1) # , min_samples_split=5
        else:
            raise ValueError("Unsupported model.")
        
        start_time = time.time()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        duration = time.time() - start_time

        r2_scores.append(r2_score(y_test, y_pred))
        maes.append(mean_absolute_error(y_test, y_pred))
        times.append(duration)

        # model.fit(X_train, y_train)
        # y_pred = model.predict(X_test)
        # r2_scores.append(r2_score(y_test, y_pred))
        # maes.append(mean_absolute_error(y_test, y_pred))

    return {
        'R2_mean': np.mean(r2_scores),
        'R2_std': np.std(r2_scores),
        'MAE_mean': np.mean(maes),
        'MAE_std': np.std(maes),
        'Time_mean': np.mean(times),
        'Time_std': np.std(times)
    }

In [132]:
# 读取数据
DATA_DIR = './data/peptide_data/'
# OUT_DIR = 'out/models_unimol_infer'+datetime.now().strftime('%y%m%d%H%M')+'/'
OUT_DIR = 'out/peptide/'

INPUTS_tripep_repr = 'peptide_SMILES.csv'  # Unscaled  data 
INPUTS_reactant1_repr = 'Reactant_1_smi.csv'
INPUTS_reactant2_repr = 'Reactant_2_smi.csv'
INPUTS_Origin_DF = 'conjugate_addition.csv'

inputs_tripep_repr = pd.read_csv(DATA_DIR + INPUTS_tripep_repr)
inputs_reactant1_repr = pd.read_csv(DATA_DIR + INPUTS_reactant1_repr)
inputs_reactant2_repr = pd.read_csv(DATA_DIR + INPUTS_reactant2_repr)

yields = pd.read_csv(DATA_DIR + INPUTS_Origin_DF)['Reaction I eesyn (%)']


inputs = np.concatenate([encoded_df]) # np.concatenate([encoded_df])
# inputs = np.concatenate([inputs_tripep_repr ,inputs_reactant1_repr,inputs_reactant2_repr],axis=1)

total_cleaned_len = len(inputs_tripep_repr)

# # pca
# pca = PCA(n_components=100)
# inputs_reduced = pca.fit_transform(inputs)

# random.seed(42)

peptide_list = df['Peptide'].unique().tolist()
# peptide_list = [e for e in peptide_list if e != "P5"]
selected_9 = random.sample(peptide_list, 10)

# final_selection =  selected_9 # ["P5"] +
final_selection = ['P9', 'P41', 'P33', 'P32', 'P25', 'P24', 'P22', 'P20', 'P2', 'P5'] #['P9', 'P27', 'P15', 'P16', 'P14', 'P12', 'P11', 'P11', 'P1', 'P5']

print(final_selection)
test_idx = df[df["Peptide"].isin(final_selection)].index.to_list()

test_indices = np.zeros((total_cleaned_len), dtype=bool)
test_indices[test_idx] = True  # 只在 test_idx 的位置上设为 True

# 训练集布尔索引是取反
train_indices = ~test_indices

# 保存索引文件
np.savetxt(OUT_DIR + 'clean_data_train_indices.csv', train_indices, delimiter=',')
np.savetxt(OUT_DIR + 'clean_data_test_indices.csv', test_indices, delimiter=',')

['P9', 'P41', 'P33', 'P32', 'P25', 'P24', 'P22', 'P20', 'P2', 'P5']


In [133]:
# Load yield data
ee = np.array(yields)
ee = ee.flatten()
ee = np.nan_to_num(ee, nan=0)
print('len(inputs): ', len(inputs))
# Use the indices to generate train/test sets
X_train = inputs[train_indices]
#X_train = inputs[:2]
y_train = ee[train_indices]
featuresTrain = torch.from_numpy(X_train)
targetsTrain = torch.from_numpy(y_train)
batch_size = len(X_train)
print('batch_size: ', batch_size)

X_test = inputs[test_indices]
y_test = ee[test_indices]
featuresTest = torch.from_numpy(X_test)
targetsTest = torch.from_numpy(y_test)#.type(torch.LongTensor)
batch_size_test = len(X_test)
print('batch_size_test: ', batch_size_test)

train = torch.utils.data.TensorDataset(featuresTrain,targetsTrain)
test = torch.utils.data.TensorDataset(featuresTest,targetsTest)

train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = False)
test_loader = torch.utils.data.DataLoader(test, batch_size = batch_size_test, shuffle = False)

len(inputs):  200
batch_size:  160
batch_size_test:  40


In [136]:
results = []
for model in ["RF"]:
    result = evaluate_model(X_train, y_train, X_test,y_test, model_type=model, n_runs=1)
    print(result)
    results.append([name, model, dim,
                    result['R2_mean'], result['R2_std'],
                    result['MAE_mean'], result['MAE_std']])
result_df = pd.DataFrame(results, columns=[
    "Feature_Type", "Model", "Dim",
    "R2_mean", "R2_std", "MAE_mean", "MAE_std"
])
print(result_df)
result_df.to_csv('results/ADD/peptide_sequence_split_new.csv')


100%|██████████| 1/1 [00:00<00:00,  3.70it/s]

{'R2_mean': -0.5399905718486462, 'R2_std': 0.0, 'MAE_mean': 27.863472672242473, 'MAE_std': 0.0, 'Time_mean': 0.26892805099487305, 'Time_std': 0.0}
  Feature_Type Model   Dim   R2_mean  R2_std   MAE_mean  MAE_std
0       unimol    RF  1536 -0.539991     0.0  27.863473      0.0



