In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from tqdm import tqdm
import os
import time
import optuna
from utils import MLPRegressor, train_model, evaluate_model

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


In [None]:
file_path = '../data/18_science/18 science_original_chem.xlsx'
sheet_names = pd.ExcelFile(file_path).sheet_names

df_18 = pd.read_excel(file_path, sheet_name='FullCV_01')
df_test1 = pd.read_excel(file_path, sheet_name='Test1')
df_test2 = pd.read_excel(file_path, sheet_name='Test2')
df_test3 = pd.read_excel(file_path, sheet_name='Test3')
df_test4 = pd.read_excel(file_path, sheet_name='Test4')


In [None]:
def get_train_test_indices(df, test_df, test_size=300):
    # get train_test indices as the original data given
    id_cols = ['Ligand', 'Additive', 'Base', 'Aryl halide']

    test_subset = test_df.tail(test_size).copy()

    df['key'] = df[id_cols].astype(str).agg('|'.join, axis=1)
    test_subset['key'] = test_subset[id_cols].astype(str).agg('|'.join, axis=1)

    test_keys = set(test_subset['key'])
    df_indices = df.reset_index()

    test_indices = df_indices[df_indices['key'].isin(test_keys)]['index'].tolist()
    train_indices = df_indices[~df_indices['key'].isin(test_keys)]['index'].tolist()

    df.drop(columns='key', inplace=True)

    return train_indices, test_indices


In [11]:
train_idx1, test_idx1 = get_train_test_indices(df_18, df_test1,test_size=897)
train_idx2, test_idx2 = get_train_test_indices(df_18, df_test2,test_size=899)
train_idx3, test_idx3 = get_train_test_indices(df_18, df_test3,test_size=896)
train_idx4, test_idx4 = get_train_test_indices(df_18, df_test4,test_size=899)

In [None]:
# read descriptor
des_path = '../HTE_descriptors'

def read_des(folder='folder_name'):
    # folder = 'MFP_descriptor'
    file_list = [f for f in os.listdir(folder) if f.endswith('.csv')]
    fp_dfs = []
    for filename in file_list:
        filepath = os.path.join(folder, filename)
        df = pd.read_csv(filepath)

        if 'Original_SMILES' in df.columns:
            df = df.drop(columns=['Original_SMILES'])
    
        prefix = os.path.splitext(filename)[0]
        df.columns = [f"{prefix}_{col}" for col in df.columns]
        print(f"{filename}的维度:", df.shape)
        fp_dfs.append(df)
    combined_fp = pd.concat(fp_dfs, axis=1)
    print("合并后的维度:", combined_fp.shape)
    return combined_fp

# one-hot encode
OHE_18 = pd.read_csv(os.path.join(des_path, f"OH_encode/18_onehot.csv"))

# （Morgan Fingerprint）

FP_18 = read_des(os.path.join(des_path, 'MACCS/18_data'))

# RDKit 

RDKit_18 = read_des(os.path.join(des_path, 'RDKit_descriptors/18_data'))

# mordred
Mord_18 = read_des(os.path.join(des_path, 'Mordred/18_data'))

UniMol_18 = read_des(os.path.join(des_path, 'unimol/18_task1'))

Additive_repr.csv的维度: (3955, 1536)
Aryl halide_repr.csv的维度: (3955, 1536)
Base_repr.csv的维度: (3955, 1536)
Ligand_repr.csv的维度: (3955, 1536)
合并后的维度: (3955, 6144)


In [12]:
test_sets = [df_test1, df_test2, df_test3, df_test4]
test_size = [897, 899, 896, 899]

features = {
    "Uni-Mol": (UniMol_18),
}
X = UniMol_18  # 特征
y = df_18['Output'].values  # 标签

results = []

for i, test_df in enumerate(test_sets, 1):
    print(f"Processing Task {i}...")
    train_idx, test_idx = get_train_test_indices(df_18.copy(), test_df,test_size=test_size[i-1])
    X_train, X_test = X.iloc[train_idx].values, X.iloc[test_idx].values
    y_train, y_test = y[train_idx], y[test_idx]

    # 训练模型
    model = RandomForestRegressor(random_state=42, n_estimators= 350, max_depth= 30, min_samples_split=12, min_samples_leaf=3, max_features='log2')
    model.fit(X_train, y_train)

    # 预测与评估
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append([
        f'Task{i}', X.shape[1], r2, mae, mse, mae])

    print(f"Task: {i}")
    print(f"Test MSE: {mae:.4f}")
    print(f"Test R² : {r2:.4f}")
results_df = pd.DataFrame(results, columns=[
    "Feature", "Dimension", "R²", "MAE", "MSE", "MAE"
])
# results_df.to_csv('Buchwald_task_rf.csv', index=False)

Processing Task 1...
Task: 1
Test MSE: 10.8601
Test R² : 0.7070
Processing Task 2...
Task: 2
Test MSE: 9.9201
Test R² : 0.7626
Processing Task 3...
Task: 3
Test MSE: 12.4978
Test R² : 0.6506
Processing Task 4...
Task: 4
Test MSE: 14.4377
Test R² : 0.4950


## Try mlp model

In [None]:
def objective(trial, X_train, y_train, X_test, y_test, input_dim):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 超参数搜索空间16, 
    batch_size = trial.suggest_categorical("batch_size", [16])
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    hidden_dim = trial.suggest_categorical("hidden_dim", [256,128])
    num_layers = trial.suggest_int("num_layers", 2, 3)
    dropout = trial.suggest_float("dropout", 0.0, 0.3)
    epochs = 300

    # 构建数据集
    train_ds = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    test_ds = TensorDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size)

    # 构建模型
    model = MLPRegressor(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        dropout=dropout
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        train_model(model, train_loader, optimizer, criterion, device)

    y_pred, y_true = evaluate_model(model, test_loader, device)
    return mean_squared_error(y_true, y_pred)

def run_mlp_with_optuna(X, y, df, test_sets, n_trials=30):
    results = []
    input_dim = X.shape[1]

    for i, test_df in enumerate(test_sets, 1):
        train_idx, test_idx = get_train_test_indices(df.copy(), test_df, test_size=899)
        X_train, X_test = X.iloc[train_idx].values, X.iloc[test_idx].values
        y_train, y_test = y[train_idx], y[test_idx]

        study = optuna.create_study(direction="minimize")
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test, input_dim),
                       n_trials=n_trials)

        best_params = study.best_params
        print(f"Task {i} best params: {best_params}")

        # 训练最终模型
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        train_ds = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
        test_ds = TensorDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))
        train_loader = DataLoader(train_ds, batch_size=best_params['batch_size'], shuffle=True)
        test_loader = DataLoader(test_ds, batch_size=best_params['batch_size'])

        model = MLPRegressor3(
            input_dim=input_dim,
            hidden_dim=best_params['hidden_dim'],
            num_layers=best_params['num_layers'],
            dropout=best_params['dropout']
        ).to(device)
        optimizer = optim.Adam(model.parameters(), lr=best_params['lr'])
        criterion = nn.MSELoss()
        for epoch in range(900):
            train_model(model, train_loader, optimizer, criterion, device)

        y_pred, y_true = evaluate_model(model, test_loader, device)

        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)

        print(f"Task {i}: R²={r2:.4f}, MAE={mae:.4f}, MSE={mse:.4f}")
        results.append([f"Task{i}", input_dim, r2, mae, mse, mae])

    # 保存结果
    results_df = pd.DataFrame(results, columns=["Task", "Dimension", "R²", "MAE", "MSE", "MAE"])
    return results_df


In [20]:
# 可能需要在win电脑上运行一下
X = UniMol_18  # 特征矩阵
y = df_18['Output'].values
test_sets = [df_test1]

results_df = run_mlp_with_optuna(X, y, df_18, test_sets, n_trials=30)

[I 2025-08-02 21:05:25,807] A new study created in memory with name: no-name-fb3edc65-d5d3-45fc-aeaa-c1deb7e2c761
[I 2025-08-02 21:08:02,033] Trial 0 finished with value: 228.97921752929688 and parameters: {'batch_size': 16, 'lr': 2.989730841471986e-05, 'hidden_dim': 128, 'num_layers': 3, 'dropout': 0.1181912399167215}. Best is trial 0 with value: 228.97921752929688.
[I 2025-08-02 21:10:11,660] Trial 1 finished with value: 200.3040008544922 and parameters: {'batch_size': 16, 'lr': 4.3988132275403704e-05, 'hidden_dim': 128, 'num_layers': 2, 'dropout': 0.24567830073735186}. Best is trial 1 with value: 200.3040008544922.
[I 2025-08-02 21:12:48,325] Trial 2 finished with value: 248.79977416992188 and parameters: {'batch_size': 16, 'lr': 0.0003841476882670736, 'hidden_dim': 128, 'num_layers': 3, 'dropout': 0.16365263610135455}. Best is trial 1 with value: 200.3040008544922.
[I 2025-08-02 21:15:25,416] Trial 3 finished with value: 248.624267578125 and parameters: {'batch_size': 16, 'lr': 1.8

Task 1 best params: {'batch_size': 16, 'lr': 1.0071528108380703e-05, 'hidden_dim': 128, 'num_layers': 2, 'dropout': 0.24299760610273677}
Task 1: R²=0.7596, MAE=9.2901, MSE=178.7480
