In [1]:
# Load dependence 
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr

In [3]:
# Load raw data
descs = [desc_name[0] for desc_name in Descriptors._descList]
desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(descs)

df = pd.read_csv('./RuCHFunctionalizationDataset/dataset.csv')
df_DG = pd.read_csv('./RuCHFunctionalizationDataset/DG.csv')
df_RX = pd.read_csv('./RuCHFunctionalizationDataset/RX.csv')

num = df['number'].to_list()
num = np.array(num)
target = df['tag'].to_numpy()

target_DG_columns = ['lumo', 'EA', 'bv-3.5']
target_DG = df_DG[target_DG_columns].to_numpy()

target_RX_columns = ['somo', 'bv-3.5','spin']
target_RX = df_RX[target_RX_columns].to_numpy()

DG_smiles = df['DG'].to_list()
RX_smiles = df['RX'].to_list()
cat_smiles = df['catalyst'].to_list()
sol_smiles = df['solvent'].to_list()
lig_smiles = df['ligand'].to_list()
ad_smiles = df['addictive'].to_list()

In [5]:
# Generate discriptors
def gen_mol_form_smi(smi_list):
    mol_list = []
    for smi in smi_list:
        if isinstance(smi, str) == True :
            mol = Chem.MolFromSmiles(smi)
            if mol == None:
                print(smi)
            mol_list.append(mol)
        else:
            mol = 0
            mol_list.append(mol)
    return mol_list

DG_mols =  gen_mol_form_smi(DG_smiles)
RX_mols =  gen_mol_form_smi(RX_smiles)
cat_mols =  gen_mol_form_smi(cat_smiles)
sol_mols =  gen_mol_form_smi(sol_smiles)
lig_mols =  gen_mol_form_smi(lig_smiles)
ad_mols = gen_mol_form_smi(ad_smiles)
        
def gen_desc_from_mol(mol_list):
    desc_list = []
    for mol in mol_list:
        if mol != 0:
            desc = desc_calc.CalcDescriptors(mol)
            desc_list.append(desc)
        else:
            n = 208
            decs = [0]*n
            desc_list.append(decs)
    decs = np.array(desc_list)
    return decs

DG_descs = gen_desc_from_mol(DG_mols)
RX_descs = gen_desc_from_mol(RX_mols)
cat_descs = gen_desc_from_mol(cat_mols)
sol_descs = gen_desc_from_mol(sol_mols)
lig_descs = gen_desc_from_mol(lig_mols)
ad_descs = gen_desc_from_mol(ad_mols)

f_p_2 = df_DG['f+2'].to_numpy()
f_p_3 = df_DG['f+3'].to_numpy()
f_p_4 = df_DG['f+4'].to_numpy()
f_p_5 = df_DG['f+5'].to_numpy()
f_p_6 = df_DG['f+6'].to_numpy()

f_m_2 = df_DG['f-2'].to_numpy()
f_m_3 = df_DG['f-3'].to_numpy()
f_m_4 = df_DG['f-4'].to_numpy()
f_m_5 = df_DG['f-5'].to_numpy()
f_m_6 = df_DG['f-6'].to_numpy()

f_0_2 = df_DG['f0-2'].to_numpy()
f_0_3 = df_DG['f0-3'].to_numpy()
f_0_4 = df_DG['f0-4'].to_numpy()
f_0_5 = df_DG['f0-5'].to_numpy()
f_0_6 = df_DG['f0-6'].to_numpy()

Q_2 = df_DG['Q2'].to_numpy()
Q_3 = df_DG['Q3'].to_numpy()
Q_4 = df_DG['Q4'].to_numpy()
Q_5 = df_DG['Q5'].to_numpy()
Q_6 = df_DG['Q6'].to_numpy()

DG_phys = np.column_stack([f_p_2, f_p_3, f_p_4, f_p_5, f_p_6, f_m_2, f_m_3, f_m_4, f_m_5, f_m_6,
                          f_0_2, f_0_3, f_0_4, f_0_5, f_0_6, Q_2, Q_3, Q_4, Q_5, Q_6])

Qc_R = df_RX['Qc'].to_numpy()
f0 = df_RX['f0'].to_numpy()
f_m = df_RX['f-'].to_numpy()
f_p = df_RX['f+'].to_numpy()

RX_phys = np.column_stack([Qc_R, f0, f_m, f_p])

#feature clean
all_descriptors = np.concatenate([DG_descs, RX_descs, cat_descs, lig_descs, sol_descs, ad_descs, DG_phys, RX_phys],axis=1)
all_descriptors = pd.DataFrame(all_descriptors)
all_descriptors = all_descriptors.dropna(axis=1,how='any')
all_descriptors= np.array(all_descriptors)
all_descriptors = np.unique(all_descriptors, axis=1)

DG_descs = pd.DataFrame(DG_descs)
DG_descs = DG_descs.dropna(axis=1,how='any')
DG_descs = np.array(DG_descs)
DG_descs = np.unique(DG_descs, axis=1)

RX_descs = pd.DataFrame(RX_descs)
RX_descs = RX_descs.dropna(axis=1,how='any')
RX_descs = np.array(RX_descs)
RX_descs = np.unique(RX_descs, axis=1)

reaction_dim = len(all_descriptors[0])
DG_dim = len(DG_descs[0])
RX_dim = len(RX_descs[0])

In [6]:
# Find substituted arenes
def ortho_substituted(mol):
    mol_no_H = AllChem.RemoveHs(mol)
    for idx, atom in enumerate(mol_no_H.GetAtoms()):
        if idx == 1 :
            if atom.GetDegree() > 2:
                return True
        elif idx == 5:
            if atom.GetDegree() > 2:
                return True
            
def meta_substituted(mol):
    mol_no_H = AllChem.RemoveHs(mol)
    for idx, atom in enumerate(mol_no_H.GetAtoms()):
        if idx == 2 :
            if atom.GetDegree() > 2:
                return True
        elif idx == 4:
            if atom.GetDegree() > 2:
                return True
            

ortho_sub_list = []
meta_sub_list = []

for idx, mol in enumerate(DG_mols):
    if ortho_substituted(mol) == True:
        ortho_sub_list.append(idx)

for idx, mol in enumerate(DG_mols):
    if meta_substituted(mol) == True:
        meta_sub_list.append(idx)

In [9]:
# Define multi-task neural network model and dataset
class MultiTaskNN(nn.Module):
    def __init__(self, in_dim1,in_dim2,in_dim3, hidden_dim, regression_output_dim1, regression_output_dim2, num_classes):
        super(MultiTaskNN, self).__init__()
        
        self.fc1 = nn.Linear(in_dim1, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        
        self.fc3 = nn.Linear(in_dim2, hidden_dim)
        self.fc4 = nn.Linear(in_dim3, hidden_dim)
        
        self.regression_output1 = nn.Linear(hidden_dim, regression_output_dim1)
        self.regression_output2 = nn.Linear(hidden_dim, regression_output_dim2)
        self.classification_output = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, h1, h2, h3):
            
        h1 = F.relu(h1)
        h1 = F.relu(self.fc1(h1))
        h1 = F.relu(self.fc2(h1))
        
        h2 = F.relu(self.fc3(h2))
        h2 = F.relu(self.fc2(h2))
        
        h3 = F.relu(self.fc4(h3))
        h3 = F.relu(self.fc2(h3))
        
        regression_output1 = self.regression_output1(h1)
        regression_output2 = self.regression_output2(h2)
        classification_output = self.classification_output(h3)
        
        return regression_output1, regression_output2, classification_output

class MultiTaskDataset(Dataset):
    def __init__(self, DG_list, RX_list, reaction_list, label1_list, label2_list, label3_list, num_list):
        self.DG_list = DG_list
        self.RX_list = RX_list
        self.reaction_list = reaction_list
        self.label1_list = label1_list
        self.label2_list = label2_list
        self.label3_list = label3_list
        self.num_list = num_list

    def __len__(self):
        return len(self.DG_list)

    def __getitem__(self, index):
        DG = self.DG_list[index]
        RX = self.RX_list[index]
        reaction = self.reaction_list[index]
        label1 = self.label1_list[index]
        label2 = self.label2_list[index]
        label3 = self.label3_list[index]
        num = self.num_list[index]
        return DG, RX, reaction, label1, label2, label3, num
    
target = torch.tensor(target, dtype=torch.long)
num = torch.tensor(num, dtype=torch.long)
target_DG = torch.tensor(target_DG, dtype=torch.float)
target_RX = torch.tensor(target_RX, dtype=torch.float)
DG_descs = torch.tensor(DG_descs, dtype=torch.float)
RX_descs = torch.tensor(RX_descs, dtype=torch.float)
all_descriptors = torch.tensor(all_descriptors, dtype=torch.float)
dataset = MultiTaskDataset(DG_descs, RX_descs, all_descriptors, target_DG, target_RX, target, num)

dataloader = DataLoader(
    dataset,
    batch_size=30,
    shuffle=True)

In [13]:
# Define evaluation function
def evaluate(model, dataloader, o_sub_list, m_sub_list):
    model.eval()
    correct_task3 = 0
    total = 0
    num_list = []
    pred_task1 = []
    label_task1 = []
    pred_task2 = []
    label_task2 = []

    for h1, h2, h3, labels1, labels2, labels3, num in dataloader:
        
        task1_output, task2_output, task3_output = model(h1, h2, h3)
        
        total += len(labels3)
        _, predicted = torch.max(task3_output, dim=1)
        
        for i in range(len(labels3)):
            if num[i] in o_sub_list:
                if labels3[i] == predicted[i]:
                    correct_task3 += 1
                if labels3[i] == 4 and predicted[i] == 0:
                    correct_task3 += 1
                    predicted[i] = 4
            elif num[i] in m_sub_list:
                if labels3[i] == predicted[i]:
                    correct_task3 += 1
                if labels3[i] == 4 and predicted[i] == 0:
                    correct_task3 += 1
                    predicted[i] = 4
                if labels3[i] == 3 and predicted[i] == 1:
                    correct_task3 += 1
                    predicted[i] = 3
            else:
                if labels3[i] == 0 and predicted[i] == 0:
                    correct_task3 += 1
                elif labels3[i] == 0 and predicted[i] == 4:
                    correct_task3 += 1
                elif labels3[i] == 1 and predicted[i] == 1:
                    correct_task3 += 1
                elif labels3[i] == 1 and predicted[i] == 3:
                    correct_task3 += 1
                elif labels3[i] == 2 and predicted[i] == 2:
                    correct_task3 += 1

        num_list.append(num)
        task1_output = task1_output.tolist()
        task2_output = task2_output.tolist()
        labels1 = labels1.tolist()
        labels2 = labels2.tolist()
        pred_task1.extend(task1_output)
        label_task1.extend(labels1)
        pred_task2.extend(task2_output)
        label_task2.extend(labels2)               
        
    accuracy_task3 = 1.0 * correct_task3 / total

    MAE1 = mean_absolute_error(label_task1, pred_task1)
    MAE2 = mean_absolute_error(label_task2, pred_task2)    
    
    label_task1 = np.array(label_task1)
    pred_task1 = np.array(pred_task1)
    label_task1_flat = label_task1.flatten()
    pred_task1_flat = pred_task1.flatten()
    r2_1, _ = pearsonr(label_task1_flat, pred_task1_flat)
    
    label_task2 = np.array(label_task2)
    pred_task2 = np.array(pred_task2)
    label_task2_flat = label_task2.flatten()
    pred_task2_flat = pred_task2.flatten()
    r2_2, _ = pearsonr(label_task2_flat, pred_task2_flat)

    return accuracy_task3, predicted, labels3, num_list, MAE1, MAE2, r2_1, r2_2, label_task1, pred_task1, label_task2, pred_task2

In [15]:
# Training
kf = KFold(n_splits=10, shuffle=True,random_state=0)
fold_num = 1
all_acc_list = []
all_loss_list = []
r2_1_all_list = []
r2_2_all_list = []
MAE1_all_list = []
MAE2_all_list = []
target_list_cm = []
pred_list_cm = []
output1_list = []
loss_list = []
loss_list1 = []
loss_list2 = []
MAE1_val_list = []
MAE2_val_list = []
r2_1_val_list = []
r2_2_val_list = []
num_val_end = []
pred1_all = []
label1_all = []
pred2_all = []
label2_all = []

for train_indices, val_indices in kf.split(dataset):
    train_acc_list = []
    valid_acc_list = [0.0]    
    train_dataset = [dataset[i] for i in train_indices]
    val_dataset = [dataset[i] for i in val_indices]

    train_loader = DataLoader(
        train_dataset,
        batch_size=30,
        drop_last=False,
        shuffle=True)
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=30,
        drop_last=False,
        shuffle=True)
    
    model = MultiTaskNN(DG_dim,RX_dim,reaction_dim, 250, 3, 3, 5)
    opt = torch.optim.Adam(model.parameters(), lr=0.001)
    
    predicted_val_list = []
    labels_val_list = []
    num_val_list = []
    
    for epoch in range(200):
        model.train()
        for inputs_task1, inputs_task2, inputs_task3, labels_task1, labels_task2, labels_task3, num in train_loader:
            opt.zero_grad()
            output1, output2, output3 = model(inputs_task1, inputs_task2, inputs_task3)
            output1_list.extend(output1)
            
            loss1 = F.mse_loss(output1, labels_task1)
            loss2 = F.l1_loss(output2, labels_task2)
            loss3 = F.cross_entropy(output3, labels_task3)
            total_loss = loss1*0.4 + loss2*0.4 + loss3*0.2
            
            total_loss.backward()
            opt.step()

        train_acc, predicted_train, labels_train, num_train, MAE1_t, MAE2_t, r2_1_t, r2_2_t, _, _, _, _ = evaluate(model, train_loader, ortho_sub_list, meta_sub_list)
        train_acc_list.append(train_acc)
        valid_acc, predicted_val, labels_val, num_val, MAE1_val, MAE2_val, r2_1_val, r2_2_val, label1_v, pred1_v, label2_v, pred2_v = evaluate(model, val_loader, ortho_sub_list, meta_sub_list)
        valid_acc_list.append(valid_acc)
        MAE1_val_list.append(MAE1_val)
        MAE2_val_list.append(MAE2_val)
        r2_1_val_list.append(r2_1_val)
        r2_2_val_list.append(r2_2_val)
        loss4 = total_loss
        loss4 = loss4.detach() 
        loss1 = loss1.detach() 
        loss2 = loss2.detach() 
        loss4 = loss4.numpy()
        loss1 = loss1.numpy()
        loss2 = loss2.numpy()
        loss_list.append(loss4)
        loss_list1.append(loss1)
        loss_list2.append(loss2)
        #print("Epoch {:05d} | Loss {:.4f} | Train Acc. {:.4f} | Validation Acc. {:.4f} ".format(epoch, total_loss , train_acc, valid_acc))
            
        if max(valid_acc_list[:-1]) < valid_acc_list[-1]:
            num_val_end_m = torch.empty(0)
            target_max = []
            pred_max = []
            pred1 = []
            label1 = []
            pred2 = []
            label2 = []
            
            num_val_end_m = torch.cat((num_val_end_m, num_val[0]), dim=0)
            target_max.append(labels_val)
            pred_max.append(predicted_val)
            label1.append(label1_v)
            pred1.append(pred1_v)
            label2.append(label2_v)
            pred2.append(pred2_v)
        
    num_val_end.append(num_val_end_m)
    pred1_all.append(pred1)
    label1_all.append(label1)
    pred2_all.append(pred2)
    label2_all.append(label2)
    
    target_list_cm.extend(target_max)
    pred_list_cm.extend(pred_max)
    
    print(
        "Fold {:02d} | Loss {:.4f} | Train Acc. {:.4f} | Validation Acc. {:.4f} ".format(
        fold_num, min(loss_list) , max(train_acc_list), max(valid_acc_list))
        )
    all_acc_list.append(max(valid_acc_list))
    all_loss_list.append(min(loss_list))
    r2_1_all_list.append(max(r2_1_val_list))
    r2_2_all_list.append(max(r2_2_val_list))
    MAE1_all_list.append(MAE1_val_list[-1])
    MAE2_all_list.append(MAE2_val_list[-1])
    fold_num += 1

average_accuracy = np.mean(all_acc_list)
average_loss = np.mean(all_loss_list)
average_r2_1 = np.mean(r2_1_all_list)
average_r2_2 = np.mean(r2_2_all_list)
average_MAE1 = np.mean(MAE1_all_list)
average_MAE2 = np.mean(MAE2_all_list)
print("Accuracy_all {:.4f}, Loss {:.4f}".format(average_accuracy, average_loss))


Fold 01 | Loss 0.0626 | Train Acc. 0.9391 | Validation Acc. 0.7308 
Fold 02 | Loss 0.0626 | Train Acc. 0.8913 | Validation Acc. 0.9615 
Fold 03 | Loss 0.0543 | Train Acc. 0.9174 | Validation Acc. 0.8077 
Fold 04 | Loss 0.0543 | Train Acc. 0.8826 | Validation Acc. 0.9231 
Fold 05 | Loss 0.0543 | Train Acc. 0.8913 | Validation Acc. 0.9615 
Fold 06 | Loss 0.0493 | Train Acc. 0.9217 | Validation Acc. 0.9231 
Fold 07 | Loss 0.0493 | Train Acc. 0.9307 | Validation Acc. 0.7600 
Fold 08 | Loss 0.0493 | Train Acc. 0.8788 | Validation Acc. 0.9600 
Fold 09 | Loss 0.0436 | Train Acc. 0.9394 | Validation Acc. 0.9600 
Fold 10 | Loss 0.0436 | Train Acc. 0.9091 | Validation Acc. 0.8800 
Accuracy_all 0.8868, Loss 0.0523
