Seven encoders covering different modalities of compounds:
- Multi-Layer Perceptron (MLP) on Morgan fingerprints
- Daylight fingerprint
- RDKit 2D fingerprint
- Convolutional Neural Network (CNN) on SMILES strings
- Recurrent Neural Network (RNN) on top of CNN
- Transformer encoder on substructure fingerprints
- Message-passing graph neural network on the molecular graph

In [4]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
def smile_to_fp(smile, radius, nBits):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Args:
        smile: SMILES string of the compound.
        radius: Morgan fingerprint radius passed to RDKit.
        nBits: Length of the fingerprint bit vector.

    Returns:
        A NumPy array built from the RDKit bit vector, or ``None`` when
        RDKit cannot parse ``smile``.
    """
    # Parse once; MolFromSmiles returns None for SMILES it cannot parse.
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return np.array(fp)


In [5]:
# Load the small BindingDB CSV and preview two randomly sampled rows.
ds_small = pd.read_csv("../data/BindingDB_Small.csv")
ds_small.sample(n=2)

Unnamed: 0,Identifier,Uniprot ID,SMILES,Sequence,Value,Split
22,1555345,O00748,Cc1ccc(CC2C(=O)N(N=C2c2ccccc2)c2ccccc2)cc1,MRLHRLRARLSAVACGLLLLLVRGQGQDSASPIRTTHTGQVLGSLV...,7.21251,Train
13,1133322,P42336,Cc1nc(N)c2ccc(C)c(C(=O)Nc3cnc4[nH]ccc4c3)c2n1,MPPRPSSGELWGIHLMPPRILVECLLPNGMIVTLECLREATLITIK...,8.457422,Val


## Multi-Layer Perceptrons (MLP) on Morgan
SMILES → Morgan fingerprint (e.g. 2048 bits; the demo below uses 1024) → MLP → latent drug embedding

In [25]:
class MLPEncoder(nn.Module):
    """Feed-forward encoder mapping a fixed-size feature vector to an embedding.

    Each hidden stage is ``Linear -> LayerNorm -> ReLU -> Dropout``; a final
    linear projection maps the last hidden width to ``output_dim``.

    Args:
        input_dim: Size of the input feature vector.
        output_dim: Size of the returned embedding.
        hidden_dims: Sequence of hidden layer widths. May be empty, in which
            case the encoder reduces to a single linear projection.
        dropout: Dropout probability applied after every hidden stage.
    """

    def __init__(self, input_dim, output_dim, hidden_dims, dropout=0.2):
        super().__init__()

        # Unpacking accepts any sequence (list or tuple) of hidden widths.
        dims = [input_dim, *hidden_dims]

        layers = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            layers.append(nn.Linear(in_dim, out_dim))
            layers.append(nn.LayerNorm(out_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))

        self.encoder = nn.Sequential(*layers)
        # dims[-1] equals input_dim when there are no hidden layers, so the
        # projection stays well-defined for an empty hidden_dims.
        self.projection = nn.Linear(dims[-1], output_dim)

    def forward(self, x):
        """Return the embedding for ``x``; the input is cast to float first."""
        x = x.float()
        x = self.encoder(x)
        return self.projection(x)


In [27]:
# Featurize one SMILES string ("CCO") as a 1024-bit Morgan fingerprint.
fp = smile_to_fp("CCO", radius=2, nBits=1024)
# Add a batch dimension and move the input to the same device as the model;
# without the move, the forward pass fails when CUDA is available.
fp_tensor = torch.tensor(fp).unsqueeze(0).to(device)
drug_encoder = MLPEncoder(input_dim=1024, output_dim=16, hidden_dims=[16, 16]).to(device)
embedding = drug_encoder(fp_tensor)
embedding



tensor([[ 0.4850, -0.6580,  1.2225, -0.8588,  0.3488, -0.0358, -0.2499, -0.4974,
          0.3613, -1.0361, -1.4517,  0.3751, -0.0835,  0.3028, -0.2415, -0.9294]],
       grad_fn=<AddmmBackward0>)

## Daylight Fingerprint

In [29]:
from rdkit.Chem.Fingerprints import FingerprintMols
def smiles2daylight(s):
    """Convert a SMILES string into a 2048-dimensional 0/1 feature vector.

    The on-bits reported by ``FingerprintMols.FingerprintMol`` are set to 1
    in a zero-initialised float array.

    Args:
        s: SMILES string of the compound.

    Returns:
        A NumPy float array of shape ``(2048,)``. If the SMILES cannot be
        parsed or fingerprinting fails, a message is printed and an all-zero
        vector is returned.
    """
    num_bits = 2048
    features = np.zeros((num_bits,))
    try:
        mol = Chem.MolFromSmiles(s)
        if mol is None:
            # Route unparsable SMILES through the same fallback as other failures.
            raise ValueError("unparsable SMILES")
        bv = FingerprintMols.FingerprintMol(mol)
        # A list index also works when there are no on-bits at all.
        on_bits = list(bv.GetOnBits())
        features[on_bits] = 1
    except Exception:
        # Best-effort featurization: report and fall back to all zeros.
        # The f-string avoids a secondary TypeError when s is not a str.
        print(f'rdkit not found this smiles: {s} convert to all 0 features')
        features = np.zeros((num_bits,))
    return features

In [30]:
# Quick check: featurize the SMILES string "CCO" and display the result.
smiles2daylight("CCO")

array([1., 0., 0., ..., 0., 0., 0.], shape=(2048,))