In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from tqdm import tqdm

import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader

# Import Dataset

In [3]:
df_ligands = pd.read_csv("dataset_20220217_2/ligand.csv")
df_centroids = pd.read_csv("dataset_20220217_2/centroids.csv")
df_pair = pd.read_csv("dataset_20220217_2/pair.csv")

# Smiles string - One-hot encoding

In [4]:
# define SMILES characters ----------------------------------------------------
SMILES_CHARS = [' ',
                '#', '%', '(', ')', '+', '-', '.', '/',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '=', '@',
                'A', 'B', 'C', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P',
                'R', 'S', 'T', 'V', 'W', 'X', 'Z',
                '[', '\\', ']',
                'a', 'b', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's',
                't', 'u']
                
# define encoder and decoder --------------------------------------------------
smi2index = dict( (c,i) for i,c in enumerate( SMILES_CHARS ) )
index2smi = dict( (i,c) for i,c in enumerate( SMILES_CHARS ) )

def smiles_encoder( smiles, maxlen=200 ):
    X = np.zeros( ( maxlen, len( SMILES_CHARS ) ) )
    for i, c in enumerate( smiles ):
        X[i, smi2index[c] ] = 1
    return X

def smiles_decoder( X ):
    smi = ''
    X = X.argmax( axis=-1 )
    for i in X:
        smi += index2smi[ i ]
    return smi

In [5]:
SMISET = {"C": 67, "l": 1, ".": 2, "c": 3, "1": 4, "2": 5, "(": 6,
          "N": 7, "=": 8, "3": 9, ")": 10, "n": 11, "[": 12, "H": 13,
           "]": 14, "O": 15, "@": 16, "s": 17, "+": 18, "/": 19, "S": 20,
            "F": 21, "-": 22, "4": 23, "B": 24, "r": 25, "o": 26, "\\": 27,
             "#": 28, "5": 29, "a": 30, "P": 31, "e": 32, "6": 33, "7": 34,
              "I": 35, "A": 36, "i": 37, "8": 38, "9": 39, "Z": 40, "K": 41,
               "L": 42, "%": 43, "0": 44, "T": 45, "g": 46, "G": 47, "d": 48,
                "M": 49, "b": 50, "u": 51, "t": 52, "R": 53, "p": 54, "m": 55,
                 "W": 56, "Y": 57, "V": 58, "~": 59, "U": 60, "E": 61, "f": 62,
                  "X": 63, "D": 64, "y": 65, "h": 66}

PROTSET = {"A": 1, "R": 2, "N": 3, "D": 4, "C": 5, "Q": 6,
           "E": 7, "G": 8, "H": 9, "I": 10, "L": 11, "K": 12,
           "M": 13, "F": 14, "P": 15, "S": 16, "T": 17, "W": 18,
           "Y": 19, "V": 20, "O": 21}

pro_missing_ls = []

def one_hot_smiles(line, MAX_SMI_LEN=200):
    X = np.zeros((1, MAX_SMI_LEN, len(SMISET)))  # +1

    if type(line)!=str:
        print('SMILE format is not str!')
    for i, ch in enumerate(line[:MAX_SMI_LEN]):
        tmp=SMISET.get(ch)
        if tmp:
            X[0, i, tmp - 1] = 1
        else:
            print(line,'exits not in SMISET character',ch)
#     X = X.tolist() 
    return X

def one_hot_protein(line, MAX_SEQ_LEN=1200):
    X = np.zeros((1, MAX_SEQ_LEN, len(PROTSET)))
    for i, ch in enumerate(line[:MAX_SEQ_LEN]):
        tmp=PROTSET.get(ch)
        if tmp:
            X[0, i, tmp - 1] = 1
        else:
#             print('exits not in PROTSET character',ch)
            if ch not in pro_missing_ls:
                pro_missing_ls.append(ch)
#     X = X.tolist()
    return X

In [131]:
import os
from os import listdir
from os.path import isfile, join

mypath = os.path.join(os.getcwd(), "dataset_20220217_2", "pdbs")
pdb_files_dic = {f: os.path.join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))}

unique_atom_dic = {}

for i in pdb_files_dic.keys():
    path = pdb_files_dic[i]
    X_list, Y_list, Z_list, atomtype_list = read_pdb(path)
    
    curr_unique = list(set(atomtype_list))
    for i in curr_unique:
        if i not in unique_atom_dic.keys():
            unique_atom_dic[i] = 1
        else: 
            unique_atom_dic[i] = unique_atom_dic[i]+1

In [9]:
unique_smile_dic = {}

for i in df_ligands["Smiles"]:
    curr_unique = list(i)
    for i in curr_unique:
        if i not in unique_smile_dic.keys():
            unique_smile_dic[i] = 1
        else: 
            unique_smile_dic[i] = unique_smile_dic[i]+1

In [12]:
for i in unique_smile_dic.keys():
    if i not in SMISET.keys():
        print (i)

 


# Spatial matrix 

In [13]:
import periodictable as pt
unique_atoms = {'C': pt.C.number, 
                'N': pt.N.number,  
                'O': pt.O.number, 
                'S': pt.S.number,
               'P': pt.P.number}

In [15]:
def read_pdb(filename: str):
    """Read a protein file to get four atom information lists.
    
    You can copy this function to your project code.
    """ 
    with open(filename, 'r') as file:
        strline_L = file.readlines()
    strline_L=[strline.strip() for strline in strline_L]

    X_list=[float(strline.split()[-3]) for strline in strline_L]
    Y_list=[float(strline.split()[-2]) for strline in strline_L]
    Z_list=[float(strline.split()[-1]) for strline in strline_L]
    atomtype_list=[strline.split()[-7][0] for strline in strline_L]

    return X_list, Y_list, Z_list, atomtype_list

def convert_protein_list_to_matrix(X_list, Y_list, Z_list):
    protein_coords = np.concatenate((np.array(X_list).reshape(-1, 1), 
                                     np.array(Y_list).reshape(-1, 1), 
                                     np.array(Z_list).reshape(-1, 1)), 
                                    axis = 1)
    return protein_coords

def find_closet_k_atoms(atomtype_list, complex_coords, atom_index, k):
    target_coords = complex_coords[atom_index]
    euclidean_dist = np.sum(np.square(complex_coords - target_coords), axis = 1)
    closest_k_index = np.argsort(euclidean_dist)[:k]
    
    if atom_index in closest_k_index:
        closest_k_index_updated = [j for j in closest_k_index if j!=atom_index]
        closest_k_index_updated.append(np.argsort(euclidean_dist)[k])
    else:
        closest_k_index_updated = closest_k_index
    
    closest_k_dist = euclidean_dist[closest_k_index_updated]
    closest_k_coords = complex_coords[closest_k_index_updated]
    closest_k_atoms = np.array(atomtype_list)[closest_k_index_updated]
    closest_k_atoms_num = np.array([unique_atoms[i] for i in closest_k_atoms])
    
    return closest_k_atoms.reshape(1,k), closest_k_atoms_num.reshape(1,k), closest_k_dist.reshape(1,k), closest_k_coords

def featurize_pdb(path, k, max_length):
    X_list, Y_list, Z_list, atomtype_list = read_pdb(path)
    curr_protein_coords = convert_protein_list_to_matrix(X_list, Y_list, Z_list)
    output_dist = np.zeros([max_length, k])
    output_atoms = []
    output_atoms_num = np.zeros([max_length, k])
    for i in range(curr_protein_coords.shape[0]):
        k_atoms, k_atoms_num, k_dist, k_coords = find_closet_k_atoms(atomtype_list,
                                                                     curr_protein_coords,
                                                                     i,
                                                                     k)
        output_atoms.append(list(k_atoms)[0])
        output_atoms_num[i] = k_atoms_num
        output_dist[i] = k_dist
    
    return output_dist, output_atoms, output_atoms_num

In [99]:
output_dist, output_atoms, output_atoms_num = featurize_pdb('/Users/user/Desktop/CS5242/project/cs5242_project/dataset_20220217_2/pdbs/102D.pdb', 12, 1200)

In [105]:
np.concatenate((output_dist, output_atoms_num), axis = 1)

array([[ 2.061981,  5.776022,  6.73325 , ...,  6.      ,  6.      ,
         6.      ],
       [ 2.061981,  2.253537,  5.413489, ...,  6.      , 15.      ,
         6.      ],
       [ 2.081778,  2.253537,  2.412322, ..., 15.      ,  8.      ,
         6.      ],
       ...,
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]])

In [None]:
for j in tqdm(pdb_files_dic.keys()):
    path = pdb_files_dic[j]
#     print(path)
    X_list, Y_list, Z_list, atomtype_list = read_pdb(path)
    curr_protein_coords = convert_protein_list_to_matrix(X_list, Y_list, Z_list)
    output_dist = np.zeros([curr_protein_coords.shape[0], k])
    output_atoms = []
    for i in range(curr_protein_coords.shape[0]):
        k_atoms, k_dist, k_coords = find_closet_k_atoms(atomtype_list, 
                                                        curr_protein_coords,
                                                        i,
                                                        k)
        output_atoms.append(list(k_atoms)[0])
        output_dist[i] = k_dist

In [106]:
def build_molecule_features(x_list: list, y_list: list, z_list: list, atom_type_list: list,
                            molecule_is_protein: bool) -> np.array:
    """
    Convert the data extract from file into a np.ndarray.
    The information of one atom is represented as a line in the array.
    See settings.py for values used to represented categorical features (molecule type and atom type)
    :param x_list: list of x coordinates
    :param y_list: list of y coordinates
    :param z_list: list of z coordinates
    :param atom_type_list: list of atom type (string)
    :param molecule_is_protein: boolean
    :return: np.ndarray of dimension (nb_atoms, 3 + nb_atom_features)
    """
    nb_atoms = len(x_list)

    # One hot encoding for atom type and molecule types
    is_hydrophobic_list = np.array([1. if atom_type in HYDROPHOBIC_TYPES else 0. for atom_type in atom_type_list])
    is_polar_list = 1. - is_hydrophobic_list

#     is_from_protein_list = (1. * molecule_is_protein) * np.ones((nb_atoms,))
#     is_from_ligand_list = 1. - is_from_protein_list

    # See `FEATURES_NAMES` in settings to see how the features are organized
    molecule_features = np.array([x_list, y_list, z_list,
                                  is_hydrophobic_list, is_polar_list]).T


    return molecule_features

# Data Preparation

In [16]:
def generate_negative_example(df_pairs, df_ligands, ratio, seed):
    proteins_ls = list(df_pairs["PID"])
    ligands_ls = list(df_ligands["LID"])
    np.random.seed(seed)
    
    out_proteins_ls = []
    out_ligands_ls = []
    target_ls = []
    
    for i in proteins_ls:
        paired_ligand = df_pairs[df_pairs["PID"] == i]["LID"].values[0]
        for j in range(ratio):
            out_proteins_ls.append(i)
            chosen_ligand = np.random.choice([k for k in ligands_ls if k!=paired_ligand], 1)[0] 
            out_ligands_ls.append(chosen_ligand)
            target_ls.append(0)
    
    df_out = pd.DataFrame({"PID": out_proteins_ls,
                          "LID": out_ligands_ls,
                          "target": target_ls})
    
    return df_out

In [19]:
num_positive = len(df_pair)
df_positive = df_pair.copy()
df_positive["target"] = 1
df_train_positive = df_positive.iloc[0:int(np.floor(num_positive * 0.7)), :]
df_validation_positive = df_positive.iloc[int(np.floor(num_positive * 0.7)):, :]
df_train_negative = generate_negative_example(df_train_positive, df_ligands, 2, 0)
df_validation_negative = generate_negative_example(df_validation_positive, df_ligands, 2, 0)

In [20]:
df_train = pd.concat([df_train_positive, df_train_negative])
df_test = pd.concat([df_validation_positive, df_validation_negative])
df_train.reset_index(inplace=True)
df_test.reset_index(inplace=True)

In [21]:
def process_PDB(pid, pdbs_dir):
    X_list, Y_list, Z_list, atomtype_list = read_pdb(path)
    return one_hot_protein(atomtype_list)

def batch_process_SMILE(ligands):
    return one_hot_smiles(ligands)

# Defining Network

In [None]:
'''
To add a regularization term for the weight parameter, you could manually add it to the loss:
output = model(input)
loss = criterion(output, target)
loss = loss + torch.norm(model.layer.weight, p=2)

intialize weight:
def weight_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain('relu'))
        nn.init.zeros_(m.bias)

model.apply(weight_init)

'''

In [432]:
test_conv2d = nn.Conv2d(1, 8, kernel_size = (1, 1), stride = 1, padding="same", bias = False)

In [427]:
for i, data in enumerate(trainloader, 0):
    if i == 0:
        ligand = data[0].to(device)
        protein = data[1].to(device)
        target = data[2].to(device)
    else:
        break

In [430]:
protein.shape

torch.Size([128, 1, 1200, 21])

In [435]:
test_conv2d(protein.float()).shape

torch.Size([128, 8, 1200, 21])

In [436]:
# inception block
class Conv2dLayer(nn.Module):
    def __init__(self, out_channels, num_row, num_col,
              padding='same', strides=1, use_bias=False, in_channels=1):
        super().__init__()
        print (f"current out channels: {out_channels}")
        self.net = nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size = (num_row, num_col),
              stride = strides, padding=padding, bias = use_bias),
        nn.ReLU()
        )
    def forward(self, x):
        output = self.net(x)
        print (f"conv2d current output {x.shape}")
        return output

class InceptionBlock(nn.Module):
    def __init__(self, filters_1x1, filters_3x3_reduce, filters_3x3,
                    filters_5x5_reduce, filters_5x5, filters_pool_proj, in_channels = 1):
        super().__init__()
        self.layer0 = Conv2dLayer(filters_1x1, 1, 1, in_channels = in_channels)
        self.layer1 = Conv2dLayer(filters_3x3_reduce, 1, 1, in_channels = in_channels)
        self.layer2 = Conv2dLayer(filters_3x3, 3, 3, in_channels = filters_3x3_reduce)
        self.layer3 = Conv2dLayer(filters_5x5_reduce, 1, 1, in_channels = in_channels)
        self.layer4 = Conv2dLayer(filters_5x5, 3, 3, in_channels = filters_5x5_reduce)
        self.layer5 = Conv2dLayer(filters_5x5, 3, 3, in_channels = filters_5x5)
        self.layer6 = nn.MaxPool2d(kernel_size=3, stride=2, padding="same")
        self.layer7 = Conv2dLayer(filters_pool_proj, 1, 1, in_channels = in_channels)
        
    def forward(self, x):
        branch_0 = self.layer0(x)
        branch_1 = self.layer1(x)
        branch_1 = self.layer2(branch_1)
        branch_2 = self.layer3(x)
        branch_2 = self.layer4(branch_2)
        branch_2 = self.layer5(branch_2)
        branch_3 = self.layer6(x)
        branch_3 = self.layer7(branch_3)
        
        x_out = torch.cat([branch_0, branch_1, branch_2, branch_3], dim=1)
        return x_out

class InceptionBlockB(nn.Module):
    def __init__(self, filters_1x1, filters_5x5_reduce, filters_5x5,
                      filters_7x7_reduce, filters_1x7,filters_7x1,filters_pool_proj):
        super().__init__()
        self.layer0 = Conv2dLayer(filters_1x1, 1, 1)
        self.layer1 = Conv2dLayer(filters_7x7_reduce, 1, 1)
        self.layer2 = Conv2dLayer(filters_1x7, 1, 7)
        self.layer3 = Conv2dLayer(filters_7x1, 7, 1)
        self.layer4 = Conv2dLayer(filters_5x5_reduce, 1, 1)
        self.layer5 = Conv2dLayer(filters_5x5, 3, 3)
        self.layer6 = nn.AvgPool2d(kernel_size=3, stride=1, padding="same")
        self.layer7 = Conv2dLayer(filters_pool_proj, 1, 1)
        
    def forward(self, x):
        branch_0 = self.layer0(x)
        branch_1 = self.layer1(x)
        branch_1 = self.layer2(branch_1)
        branch_1 = self.layer3(branch_1)
        branch_2 = self.layer4(x)
        branch_2 = self.layer5(branch_2)
        branch_2 = self.layer5(branch_2)
        branch_3 = self.layer6(x)
        branch_3 = self.layer7(branch_3)
        
        x_out = torch.cat([branch_0, branch_1, branch_2, branch_3], dim=1)
        return x_out

class SimpleBlock(nn.Module):
    def __init__(self, nb_filter, num_row, num_col):
        self.layer = Conv2dLayer(nb_filter, num_row, num_col)
        
    def forward(self, x):
        x = self.layer(x)
        x = self.layer(x)
        return x

# class Attention3DBlock(nn.Module):
#     def __init__(self, hidden_states,input_shape1, input_shape2, out_shape=128):
#         self.dense1 = nn.linear(input_shape1, out_shape, bias=False) # identify input_shape here
#         self.softmax = nn.Softmax()
#         self.dense2 = nn.linear(input_shape2, out_shape) # identify input_shape here
#         self.tanh = nn.Tanh()
        
#     def forward(self, x):
#         score_first_part = self.dense1(x)
#         h_t = score_first_part[:, -1, :]
#         score = torch.dot(score_first_part, h_t)
#         attention_weights = self.softmax(score)
#         context_vector = torch.dot(hidden_states, attention_weights)   # how does keras dot work????
#         pre_activation = torch.tensordot([context_vector, h_t], dims=([1], [1]))
#         attention_vector = self.dense2(pre_activation)
        
#         return attention_vector

In [437]:
class MyNet(nn.Module):
    def __init__(self, alpha, device, pro_branch_switch1='inception_block', pro_branch_switch2='inception_block', 
                 pro_branch_switch3='inception_block_b', pro_add_attention=False,
                comp_branch_switch1='inception_block', comp_branch_switch2='inception_block', 
                 comp_branch_switch3='inception_block_b', comp_add_attention=False):
        super().__init__()
        self.pro_branch_switch1 = pro_branch_switch1
        self.pro_branch_switch2 = pro_branch_switch2
        self.pro_branch_switch3 = pro_branch_switch3
        self.pro_add_attention = pro_add_attention
        self.comp_branch_switch1 = comp_branch_switch1
        self.comp_branch_switch2 = comp_branch_switch2
        self.comp_branch_switch3 = comp_branch_switch3
        self.comp_add_attention = comp_add_attention
        self.alpha = alpha
        self.device = device
        self._create_network()
        self._init_params()
        
    
    def _create_network(self):
        protein_blocks = []
        if self.pro_branch_switch1 == "inception_block":
            protein_blocks.append(InceptionBlock(filters_1x1=8, filters_3x3_reduce=1, filters_3x3=32,
                                         filters_5x5_reduce=1, filters_5x5=32, filters_pool_proj=16, in_channels=1))
        else:
            protein_blocks.append(SimpleBlock(nb_filter=32, num_row=3, num_col=3))
        protein_blocks.append(nn.MaxPool2d(kernel_size=3, stride=3, padding="same"))
        
        if self.pro_branch_switch2 == "inception_block":
            protein_blocks.append(InceptionBlock(filters_1x1=16, filters_3x3_reduce=16, filters_3x3=64,
                                         filters_5x5_reduce=16, filters_5x5=64, filters_pool_proj=32))
        else:
            protein_blocks.append(SimpleBlock(nb_filter=64,num_row=3,num_col=3))
        protein_blocks.append(nn.MaxPool2d(kernel_size=3, stride=3, padding="same"))
        
        if self.pro_branch_switch3 == "inception_block":
            protein_blocks.append(InceptionBlock(filters_1x1=32, filters_3x3_reduce=64, filters_3x3=128,
                                         filters_5x5_reduce=64, filters_5x5=128, filters_pool_proj=64))
        elif self.pro_branch_switch3 == "inception_block_b":
            protein_blocks.append(InceptionBlockB(filters_1x1=32, filters_5x5_reduce=64, filters_5x5=128,
                                         filters_7x7_reduce=64, filters_1x7=128,filters_7x1=128, filters_pool_proj=64))
        else:
            protein_blocks.append(SimpleBlock(nb_filter=128,num_row=3,num_col=3))
        protein_blocks.append(nn.MaxPool2d(kernel_size=3, stride=3, padding="same"))
        
        if self.pro_add_attention:
            pass
        else:
            protein_blocks.append(nn.Flatten(start_dim=1, end_dim=-1))
            protein_blocks.append(nn.LazyLinear(1024)) # identify input_shape here
            protein_blocks.append(nn.ReLU())
        protein_blocks.append(nn.Dropout(self.alpha))
        self.protein_blocks = nn.Sequential(*protein_blocks)
        
        ligand_blocks = []
        if self.comp_branch_switch1 == "inception_block":
            ligand_blocks.append(InceptionBlock(filters_1x1=8, filters_3x3_reduce=1, filters_3x3=16,
                                         filters_5x5_reduce=1, filters_5x5=16, filters_pool_proj=16))
        else:
            ligand_blocks.append(SimpleBlock(nb_filter=32, num_row=3, num_col=3))
        ligand_blocks.append(nn.MaxPool2d(kernel_size=2, stride=2, padding="same"))
        
        if self.comp_branch_switch2 == "inception_block":
            ligand_blocks.append(InceptionBlock(filters_1x1=16, filters_3x3_reduce=16, filters_3x3=64,
                                         filters_5x5_reduce=16, filters_5x5=64, filters_pool_proj=32))
        else:
            ligand_blocks.append(SimpleBlock(nb_filter=64,num_row=3,num_col=3))
        ligand_blocks.append(nn.MaxPool2d(kernel_size=2, stride=2, padding="same"))
        
        if self.comp_branch_switch3 == "inception_block":
            ligand_blocks.append(InceptionBlock(filters_1x1=32, filters_3x3_reduce=32, filters_3x3=128,
                                         filters_5x5_reduce=32, filters_5x5=128, filters_pool_proj=32))
        elif self.comp_branch_switch3 == "inception_block_b":
            ligand_blocks.append(InceptionBlockB(filters_1x1=32, filters_5x5_reduce=32, filters_5x5=128,
                                         filters_7x7_reduce=32, filters_1x7=128,filters_7x1=128, filters_pool_proj=32))
        else:
            ligand_blocks.append(SimpleBlock(nb_filter=128,num_row=3,num_col=3))
        ligand_blocks.append(nn.MaxPool2d(kernel_size=2, stride=2, padding="same"))
        
        if self.comp_add_attention:
            pass
        else:
            ligand_blocks.append(nn.Flatten(start_dim=1, end_dim=-1))
            ligand_blocks.append(nn.LazyLinear(1024)) # identify input_shape here
            ligand_blocks.append(nn.ReLU())
        ligand_blocks.append(nn.Dropout(self.alpha))
        self.ligand_blocks = nn.Sequential(*ligand_blocks)
        
        combined_blocks = []
        combined_blocks.append(nn.Linear(2048, 512))
        combined_blocks.append(nn.ReLU())
        combined_blocks.append(nn.Dropout(self.alpha)) 
        self.combined_blocks = nn.Sequential(*combined_blocks)
        
        self.fc_pro_ligand_1 = nn.Linear(512, 64)
        self.fc_pro_ligand_2 = nn.Linear(64, 1)
        self.fc_sigmoid = nn.Sigmoid()
        self.fc_relu = nn.ReLU()
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)

    def _init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                
    def process_PDB(self, pid, pdbs_dir):
        X_list, Y_list, Z_list, atomtype_list = read_pdb(pdbs_dir)
        return one_hot_protein(atomtype_list)

    def batch_process_SMILE(self, ligands):
        return one_hot_smiles(ligands)
    
    def forward(self, protein_input, ligand_input):
        '''
        protein_input: shape(1200, num_encoding)
        ligand_input: shape(200, num_encoding)
        '''
        
        protein_output = self.protein_blocks(protein_input)
        ligand_output = self.ligand_blocks(ligand_input)
        
        protein_ligand = torch.concat([protein_output, ligand_output], dim = -1)
        protein_ligand_out = self.combined_blocks(protein_ligand)
        
        dense1 = torch.empty(1, 5)
        
        x = self.dropout1(protein_ligand_out)
        x = self.fc_pro_ligand_1(x)
        x = self.fc_relu(x)
        x = self.fc_pro_ligand_2(x)
        x = self.fc_sigmoid(x)
        dense1[0][0] = x

        x = self.dropout2(protein_ligand_out)
        x = self.fc_pro_ligand_1(x)
        x = self.fc_relu(x)
        x = self.fc_pro_ligand_2(x)
        x = self.fc_sigmoid(x)
        dense1[0][1] = x

        x = self.dropout3(protein_ligand_out)
        x = self.fc_pro_ligand_1(x)
        x = self.fc_relu(x)
        x = self.fc_pro_ligand_2(x)
        x = self.fc_sigmoid(x)
        dense1[0][2] = x
        
        x = self.dropout4(protein_ligand_out)
        x = self.fc_pro_ligand_1(x)
        x = self.fc_relu(x)
        x = self.fc_pro_ligand_2(x)
        x = self.fc_sigmoid(x)
        dense1[0][3] = x
        
        x = self.dropout5(protein_ligand_out)
        x = self.fc_pro_ligand_1(x)
        x = self.fc_relu(x)
        x = self.fc_pro_ligand_2(x)
        x = self.fc_sigmoid(x)
        dense1[0][4] = x
        
        out = torch.mean(dense1)
        
        return out
    
    def inference(self, PID, pdbs_dir, centroid, LIDs, ligands):
        p = self.process_PDB(PID, pdbs_dir).to(self.device)
        l = self.batch_prcoess_SMILE(ligands).to(self.device)
        p = self.batch_extend(p, c)
        
        return self.forward(p, l)

In [438]:
class CustomDataset(Dataset):
    def __init__(self, df_pair, df_ligands, path):
        self.df_pair = df_pair
        self.df_ligands = df_ligands
        self.path = path

    def __len__(self):
        return len(self.df_pair)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        pid = self.df_pair["PID"][idx]
        lid = self.df_pair["LID"][idx]
        target = np.array([self.df_pair["target"][idx]])
        
        out_ligand = one_hot_smiles(self.df_ligands[self.df_ligands["LID"] == lid]["Smiles"].values[0])
        X_list, Y_list, Z_list, atomtype_list = read_pdb(f"{path}/{pid}.pdb")
        out_protein = one_hot_protein(atomtype_list)       
        return out_ligand, out_protein, target

In [406]:
# summary(model)

In [414]:
num_epoch = 3 # 300
batch_size = 128 
dropout_alpha = 0.5
learning_rate = 0.0001

In [439]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = MyNet(dropout_alpha, device).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

current out channels: 8
current out channels: 1
current out channels: 32
current out channels: 1
current out channels: 32
current out channels: 32
current out channels: 16
current out channels: 16
current out channels: 16
current out channels: 64
current out channels: 16
current out channels: 64
current out channels: 64
current out channels: 32
current out channels: 32
current out channels: 64
current out channels: 128
current out channels: 128
current out channels: 64
current out channels: 128
current out channels: 64
current out channels: 8
current out channels: 1
current out channels: 16
current out channels: 1
current out channels: 16
current out channels: 16
current out channels: 16
current out channels: 16
current out channels: 16
current out channels: 64
current out channels: 16
current out channels: 64
current out channels: 64
current out channels: 32
current out channels: 32
current out channels: 32
current out channels: 128
current out channels: 128
current out channels: 32
c

In [440]:
from torchsummary import summary
summary(model, [(1, 200, 67), (1, 1200, 21)])

conv2d current output torch.Size([2, 1, 200, 67])
conv2d current output torch.Size([2, 1, 200, 67])
conv2d current output torch.Size([2, 1, 200, 67])
conv2d current output torch.Size([2, 1, 200, 67])
conv2d current output torch.Size([2, 1, 200, 67])
conv2d current output torch.Size([2, 32, 200, 67])


TypeError: max_pool2d(): argument 'padding' (position 4) must be tuple of ints, not str

In [416]:
path = os.path.join(os.getcwd(), "dataset_20220217_2", "pdbs")
custom_dataset = CustomDataset(df_train, df_ligands, path)
trainloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

In [441]:
loss_log=[]
for epoch in tqdm(range(num_epoch)):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        ligand = data[0].to(device)
        protein = data[1].to(device)
        target = data[2].to(device)
        print(ligand.shape)
        print(protein.shape)
        print(target.shape)

        optimizer.zero_grad()
        outputs = model(protein.float(), ligand.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        loss_log.append(loss.item())
        running_loss += loss.item()
        if (i + 1) % 128 == 0:
            print('epoch {:3d} | {:5d} batches loss: {:.4f}'.format(epoch, i + 1, running_loss/128))
            running_loss = 0.0

print('Finished Training')

  0%|          | 0/3 [00:00<?, ?it/s]

torch.Size([128, 1, 200, 67])
torch.Size([128, 1, 1200, 21])
torch.Size([128, 1])
conv2d current output torch.Size([128, 1, 1200, 21])
conv2d current output torch.Size([128, 1, 1200, 21])
conv2d current output torch.Size([128, 1, 1200, 21])
conv2d current output torch.Size([128, 1, 1200, 21])
conv2d current output torch.Size([128, 1, 1200, 21])


  0%|          | 0/3 [00:03<?, ?it/s]

conv2d current output torch.Size([128, 32, 1200, 21])





TypeError: max_pool2d(): argument 'padding' (position 4) must be tuple of ints, not str

# Keras

In [32]:
import keras
from keras.models import Model
from keras.layers import Input,Dense, Dropout, Activation, Flatten,Reshape,concatenate,LSTM,Bidirectional, Average
from keras.layers import Conv2D, MaxPooling2D,Conv1D,MaxPooling1D,AveragePooling2D
from keras.layers import Lambda, dot
import tensorflow as tf
#import string
# from keras.utils import multi_gpu_model
from keras.utils.vis_utils import plot_model
import numpy as np
import os
from keras import regularizers
from keras import initializers
import tensorflow

def attention_3d_block(hidden_states,out_shape=128,name='pro_'):
    hidden_size = int(hidden_states.shape[2])
    # Inside dense layer
    #              hidden_states            dot               W            =>           score_first_part
    # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
    # W is the trainable weight matrix of attention Luong's multiplicative style score
    score_first_part = Dense(hidden_size, use_bias=False, name=name+'attention_score_vec')(hidden_states)
    #            score_first_part           dot        last_hidden_state     => attention_weights
    # (batch_size, time_steps, hidden_size) dot   (batch_size, hidden_size)  => (batch_size, time_steps)
    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name=name+'last_hidden_state')(hidden_states)
    score = dot([score_first_part, h_t], [2, 1], name=name+'attention_score')
    attention_weights = Activation('softmax', name=name+'attention_weight')(score)
    # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
    context_vector = dot([hidden_states, attention_weights], [1, 1], name=name+'context_vector')
    pre_activation = concatenate([context_vector, h_t], name=name+'attention_output')
    attention_vector = Dense(out_shape, use_bias=False, activation='tanh', name=name+'attention_vector')(pre_activation)
    return attention_vector

def conv2d_bn(x, nb_filter, num_row, num_col,name,
              padding='same', strides=(1, 1), use_bias=False):
    x = Conv2D(nb_filter, (num_row, num_col),
                      name=name,
                      strides=strides,
                      padding=padding,
                      use_bias=use_bias,
                      kernel_regularizer=regularizers.l2(0.00004),
                      kernel_initializer=initializers.VarianceScaling(scale=2.0, mode='fan_in', distribution='normal', seed=None))(x)
    x = Activation('relu')(x)
    return x

def block_inception(input,filters_1x1, filters_3x3_reduce, filters_3x3,
                      filters_5x5_reduce, filters_5x5, filters_pool_proj,layer_name):
    branch_0 = conv2d_bn(input, filters_1x1, 1, 1,name=layer_name+'_branch_0')
    branch_1 = conv2d_bn(input, filters_3x3_reduce, 1, 1,name=layer_name+'_branch_1_3x3_reduce')
    branch_1 = conv2d_bn(branch_1, filters_3x3, 3, 3,name=layer_name+'_branch_1_3x3')
    branch_2 = conv2d_bn(input, filters_5x5_reduce, 1, 1,name=layer_name+'_branch_2_5x5_reduce')
    branch_2 = conv2d_bn(branch_2, filters_5x5, 3, 3,name=layer_name+'_branch_2_3x3_0')
    branch_2 = conv2d_bn(branch_2, filters_5x5, 3, 3,name=layer_name+'_branch_2_3x3_1')
    branch_3 = MaxPooling2D((3,3), strides=(1,1), padding='same',name=layer_name+'_branch_3_maxpooling')(input)#AveragePooling2D
    branch_3 = conv2d_bn(branch_3, filters_pool_proj, 1, 1,name=layer_name+'_branch_3_pool_proj')
    x = concatenate([branch_0, branch_1, branch_2, branch_3], axis=-1,name=layer_name+'_concat')
    return x

def block_inception_b(input,filters_1x1, filters_5x5_reduce, filters_5x5,
                      filters_7x7_reduce, filters_1x7,filters_7x1,filters_pool_proj,layer_name):
    branch_0 = conv2d_bn(input, filters_1x1, 1, 1,name=layer_name+'_branch_0')

    branch_1 = conv2d_bn(input,filters_7x7_reduce, 1, 1,name=layer_name+'_branch_1_7x7_reduce')
    branch_1 = conv2d_bn(branch_1,filters_1x7, 1, 7,name=layer_name+'_branch_1_7x7_0')
    branch_1 = conv2d_bn(branch_1,filters_7x1, 7, 1,name=layer_name+'_branch_1_7x7_1')

    branch_2 = conv2d_bn(input, filters_5x5_reduce, 1, 1,name=layer_name+'_branch_2_5x5_reduce')
    branch_2 = conv2d_bn(branch_2, filters_5x5, 3, 3,name=layer_name+'_branch_2_3x3_0')
    branch_2 = conv2d_bn(branch_2, filters_5x5, 3, 3,name=layer_name+'_branch_2_3x3_1')

    branch_3 = AveragePooling2D((3,3), strides=(1,1), padding='same')(input)
    branch_3 = conv2d_bn(branch_3, filters_pool_proj, 1, 1,name=layer_name+'_branch_3_pool_proj')

    x = concatenate([branch_0, branch_1, branch_2, branch_3], axis=-1)#branch_2,
    return x

def simple_block(input,nb_filter,num_row,num_col,layer_name):
    input = Conv2D(nb_filter, (num_row, num_col), padding='same', activation='relu', name=layer_name+'_conv0')(input)
    input = Conv2D(nb_filter, (num_row, num_col), padding='same', activation='relu', name=layer_name+'_conv1')(input)
    return input

def get_model_classification(save_dir,alpha,
                                          pro_branch_switch1='',pro_branch_switch2='',
                                          pro_branch_switch3='',pro_add_attention=False,
                                          comp_branch_switch1='',comp_branch_switch2='',
                                          comp_branch_switch3='',comp_add_attention=False,
               ):
    ###MODEL
    ##input
    protein_input = Input(shape=(1200, 21, 1), name='protein_input')
    comp_input = Input(shape=(200, 67, 1), name='comp_input')
    ##protein branch
    # layer1
    with tf.device('/gpu:0'):
        if pro_branch_switch1 == 'inception_block':
            pro_layer1 = block_inception(protein_input, filters_1x1=8, filters_3x3_reduce=1, filters_3x3=32,
                                         filters_5x5_reduce=1, filters_5x5=32, filters_pool_proj=16,layer_name='pro_layer1')
        else:
            pro_layer1 = simple_block(protein_input, nb_filter=32, num_row=3, num_col=3, layer_name='pro_layer1')
        pro_layer1 = MaxPooling2D(pool_size=(3, 3),padding='same', name='pro_layer1_poll')(pro_layer1)
        # layer2
        if pro_branch_switch2=='inception_block':
            pro_layer2 = block_inception(pro_layer1, filters_1x1=16, filters_3x3_reduce=16, filters_3x3=64,
                                         filters_5x5_reduce=16, filters_5x5=64, filters_pool_proj=32,layer_name='pro_layer2')
        else:
            pro_layer2=simple_block(pro_layer1,nb_filter=64,num_row=3,num_col=3,layer_name='pro_layer2')
        pro_layer2 = MaxPooling2D(pool_size=(3, 3), padding='same',name='pro_layer2_poll')(pro_layer2)
            # layer3
        if pro_branch_switch3=='inception_block':
            pro_layer3 = block_inception(pro_layer2, filters_1x1=32, filters_3x3_reduce=64, filters_3x3=128,
                                         filters_5x5_reduce=64, filters_5x5=128, filters_pool_proj=64,layer_name='pro_layer3')
        elif pro_branch_switch3=='inception_block_b':
            pro_layer3 = block_inception_b(pro_layer2, filters_1x1=32, filters_5x5_reduce=64, filters_5x5=128,
                                         filters_7x7_reduce=64, filters_1x7=128,filters_7x1=128, filters_pool_proj=64,layer_name='pro_layer3')
        else:
            pro_layer3 = simple_block(pro_layer2, nb_filter=128, num_row=3, num_col=3, layer_name='pro_layer3')
        pro_layer3 = MaxPooling2D(pool_size=(3, 3), padding='same',name='pro_layer3_pool')(pro_layer3)
        # layer4
        if pro_add_attention:
            h_t = Lambda(tf.reshape,output_shape=[45,352,], arguments={'shape': [-1, 45, 352]}, name='pro_convert_to_timestep')(pro_layer3)
            pro_layer_tran_result = attention_3d_block(h_t,1024,'pro_')#batch*1024
        else:
            pro_layer_tran_result = Flatten(name='pro_layer4_flatten')(pro_layer3)
            pro_layer_tran_result = Dense(1024, activation='relu', name='pro_layer5_den')(pro_layer_tran_result)
        pro_layer_tran_result = Dropout(alpha, name='pro_drop1')(pro_layer_tran_result)
    ##compound branch
    # layer1
    with tf.device('/gpu:1'):
        if comp_branch_switch1=='inception_block':
            comp_layer1 = block_inception(comp_input, filters_1x1=8, filters_3x3_reduce=1, filters_3x3=16,
                                         filters_5x5_reduce=1, filters_5x5=16, filters_pool_proj=16,
                                         layer_name='comp_layer1')
        else:
            comp_layer1 = simple_block(comp_input, 32, 3, 3, 'comp_layer1')
        comp_layer1 = MaxPooling2D(pool_size=(2, 2),padding='same', name='comp_layer1_poll')(comp_layer1)
        # layer2
        if comp_branch_switch2=='inception_block':
            comp_layer2 = block_inception(comp_layer1, filters_1x1=16, filters_3x3_reduce=16, filters_3x3=64,
                                          filters_5x5_reduce=16, filters_5x5=64, filters_pool_proj=32,layer_name='comp_layer2')
        else:
            comp_layer2=simple_block(comp_layer1,64,3,3,'comp_layer2')
        comp_layer2 = MaxPooling2D(pool_size=(2, 2), padding='same',name='comp_layer2_poll')(comp_layer2)
        # layer3
        if comp_branch_switch3=='inception_block':
            comp_layer3 = block_inception(comp_layer2, filters_1x1=32, filters_3x3_reduce=32, filters_3x3=128,
                                          filters_5x5_reduce=32, filters_5x5=128, filters_pool_proj=32,layer_name='comp_layer3')
        elif comp_branch_switch3=='inception_block_b':
            comp_layer3 = block_inception_b(comp_layer2, filters_1x1=32, filters_5x5_reduce=32, filters_5x5=128,
                                           filters_7x7_reduce=32, filters_1x7=128, filters_7x1=128,
                                           filters_pool_proj=32, layer_name='comp_layer3')
        else:
            comp_layer3=simple_block(comp_layer2,128,3,3,'comp_layer3')
        comp_layer3 = MaxPooling2D(pool_size=(2, 2), padding='same',name='comp_layer3_pool')(comp_layer3)
        # layer4
        if comp_add_attention:
            h_t = Lambda(tf.reshape,output_shape=[25*8,320,], arguments={'shape': [-1, 25*8, 320]}, name='comp_convert_to_timestep')(comp_layer3)
            comp_layer_tran_result = attention_3d_block(h_t,1024,'comp_')#batch*1024
        else:
            comp_layer_tran_result = Flatten(name='comp_layer4_flatten')(comp_layer3)
            comp_layer_tran_result = Dense(640, activation='relu', name='comp_layer5_den')(comp_layer_tran_result)
        # layer5
        comp_layer_tran_result = Dropout(alpha, name='comp_drop1')(comp_layer_tran_result)
    with tf.device('/gpu:2'):
        pro_com = keras.layers.concatenate([pro_layer_tran_result, comp_layer_tran_result])
        # We stack a deep densely-connected network on top
        fc_pro_com = Dense(512, activation='relu', name='den1')(pro_com)
        fc_pro_com = Dropout(alpha, name='drop1')(fc_pro_com)
        dense1 = []
        FC1 = Dense(64, activation='relu')
        for p in np.linspace(0.1,0.5, 5):
            x = Dropout(p)(fc_pro_com)
            x = FC1(x)
            x = Dense(1,activation='sigmoid')(x)
            dense1.append(x)
        class_out = Average()(dense1)
    classification_model = Model([protein_input, comp_input],class_out)
    plot_model(classification_model, to_file=save_dir + '/model_with_classification.png', show_shapes=True)
    return classification_model

In [33]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()
        self.interval = interval
        self.x_val,self.y_val = validation_data
    def on_epoch_end(self, epoch, log={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.x_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print('\n ROC_AUC - epoch:%d - AUC score:%.6f \n' % (epoch+1, score))


In [34]:
def read_class(df_pair, df_ligands, path):
    x_prot=[]
    x_comp=[]
    y_=[]
    for i in range(len(df_pair)):
        pid = df_pair["PID"][i]
        lid = df_pair["LID"][i]
        target = np.array([df_pair["target"][i]])

        out_ligand = one_hot_smiles(df_ligands[df_ligands["LID"] == lid]["Smiles"].values[0])
        X_list, Y_list, Z_list, atomtype_list = read_pdb(f"{path}/{pid}.pdb")
        out_protein = one_hot_protein(atomtype_list)  
        
        x_prot.append(out_protein)
        x_comp.append(out_ligand)
        y_.append(target)

    print('shape:', len(x_prot))
    x_prot=np.array(x_prot)
    x_prot = x_prot.reshape([-1, 1200, 21, 1])
    x_comp=np.array(x_comp)
    x_comp = x_comp.reshape([-1, 200, 67, 1])
    y_=np.array(y_)
    y_=y_.reshape([-1,1])
    return x_prot,x_comp,y_

In [35]:
def read_class_generator(df_pair, df_ligands, path, batch_size, ft_flag = False):
    x_prot=[]
    x_comp=[]
    y_=[]
    while 1:
        df_pair = df_pair.sample(frac=1).reset_index(drop=True)
        for i in range(len(df_pair)):
            
            pid = df_pair["PID"][i]
            lid = df_pair["LID"][i]
            target = np.array([df_pair["target"][i]])
#             print (pid)
#             print(lid)

            out_ligand = one_hot_smiles(df_ligands[df_ligands["LID"] == lid]["Smiles"].values[0])
            X_list, Y_list, Z_list, atomtype_list = read_pdb(f"{path}/{pid}.pdb")
            out_protein = one_hot_protein(atomtype_list)             

            x_prot.append(out_protein)
            x_comp.append(out_ligand)
            y_.append(target)
            if len(x_prot)==batch_size:
                x_prot=np.array(x_prot)
                x_prot = x_prot.reshape([-1, 1200, 21, 1])
                x_comp=np.array(x_comp)
                x_comp = x_comp.reshape([-1, 200, 67, 1])
                y_=np.array(y_)
                y_=y_.reshape([-1,1])
                if ft_flag:
                    yield ({'protein_input':x_prot, 'comp_input':x_comp}, {'average':y_})
                else:
                    yield (x_prot, x_comp, y_)  
                x_prot=[]
                x_comp=[]
                y_=[]

In [None]:
import keras
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

from keras import backend as K
K.clear_session()

save_dir = "."
model_name = "first_model"
alpha = 0.5
batch_size = 128
epochs = 300
lr = 0.0001
patience = 10 
path = os.path.join(os.getcwd(), "dataset_20220217_2", "pdbs")

model = get_model_classification(save_dir, alpha,                 
                    pro_branch_switch1 = 'inception_block', pro_branch_switch2 = 'inception_block',
                    pro_branch_switch3='inception_block_b', pro_add_attention = False,
                    comp_branch_switch1 = 'inception_block', comp_branch_switch2 = 'inception_block',
                    comp_branch_switch3 = 'inception_block_b', comp_add_attention = False)

validation_x_prot, validation_x_comp, validation_y=read_class(df_test, df_ligands, path)

optimizer = tf.keras.optimizers.Adam(lr=lr)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=patience)
bestfile = save_dir + "/%s_best_model.hdf5" % model_name
checkpoint = ModelCheckpoint(bestfile, monitor='val_loss', verbose=1, save_best_only=True,
                             mode='min')    
# AUC
RocAuc = RocAucEvaluation(validation_data=([validation_x_prot, validation_x_comp],
                                               validation_y), interval=1)

path = os.path.join(os.getcwd(), "dataset_20220217_2", "pdbs")
history = model.fit(read_class_generator(df_train, df_ligands, path, batch_size, ft_flag = True),
                    steps_per_epoch=739,
                    epochs=epochs,
                    validation_data=([validation_x_prot, validation_x_comp], validation_y),
                    callbacks=[RocAuc,early_stopping, checkpoint]
                    )

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.
shape: 2694


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/300
 17/739 [..............................] - ETA: 2:27:41 - loss: 0.8321 - accuracy: 0.6521 Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=Cc6c(c(c7n6[Co]3(N45)N8C(=C7)C(=C(C8=C2)CCC(=O)O)C)CCC(=O)O)C)CCC(=O)O)C exits not in SMISET character  
 34/739 [>.............................] - ETA: 2:23:15 - loss: 0.8304 - accuracy: 0.6473 Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=Cc6c(c(c7n6[Co]3(N45)N8C(=C7)C(=C(C8=C2)CCC(=O)O)C)CCC(=O)O)C)CCC(=O)O)C exits not in SMISET character  
 62/739 [=>............................] - ETA: 2:16:19 - loss: 0.8241 - accuracy: 0.6501 Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=Cc6c(c(c7n6[Co]3(N45)N8C(=C7)C(=C(C8=C2)CCC(=O)O)C)CCC(=O)O)C)CCC(=O)O)C exits not in SMISET character  
 80/739 [==>...........................] - ETA: 2:09:45 - loss: 0.8205 - accuracy: 0.6544 Cc1c2n3c(c1CCC(=O)O)C=C4C(=C(C5=Cc6c(c(c7n6[Co]3(N45)N8C(=C7)C(=C(C8=C2)CCC(=O)O)C)CCC(=O)O)C)CCC(=O)O)C exits not in SMISET character  
103/739 [===>..........................] - ETA: 2:03:30 - loss: 0.8175 - accurac