In [2]:
### Import all needed modules
from pathlib import Path

import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from typing import List

import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import metrics
#from apex import amp

In [3]:
# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
    device = torch.device('cpu')
    
else:
    print('CUDA is available!  Training on GPU ...')
    device = torch.device("cuda:0")
   

CUDA is not available.  Training on CPU ...


In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
from torch.utils.data import Dataset

from typing import List

CHAR_SMI_SET = {"(": 1, ".": 2, "0": 3, "2": 4, "4": 5, "6": 6, "8": 7, "@": 8,
                "B": 9, "D": 10, "F": 11, "H": 12, "L": 13, "N": 14, "P": 15, "R": 16,
                "T": 17, "V": 18, "Z": 19, "\\": 20, "b": 21, "d": 22, "f": 23, "h": 24,
                "l": 25, "n": 26, "r": 27, "t": 28, "#": 29, "%": 30, ")": 31, "+": 32,
                "-": 33, "/": 34, "1": 35, "3": 36, "5": 37, "7": 38, "9": 39, "=": 40,
                "A": 41, "C": 42, "E": 43, "G": 44, "I": 45, "K": 46, "M": 47, "O": 48,
                "S": 49, "U": 50, "W": 51, "Y": 52, "[": 53, "]": 54, "a": 55, "c": 56,
                "e": 57, "g": 58, "i": 59, "m": 60, "o": 61, "s": 62, "u": 63, "y": 64}

CHAR_SMI_SET_LEN = len(CHAR_SMI_SET)
PT_FEATURE_SIZE = 40


def label_smiles(line, max_smi_len):
    X = np.zeros(max_smi_len, dtype=np.int)
    for i, ch in enumerate(line[:max_smi_len]):
        X[i] = CHAR_SMI_SET[ch] - 1

    return X


class MyDataset(Dataset):
    def __init__(self, data_path, phase, max_smi_len):
        data_path = Path(data_path)
        
        ## Need for protein Tensor
        
        
        affinity = {}
        affinity_df = pd.read_csv(data_path / 'affinity_data.csv')
        for _, row in affinity_df.iterrows():
            affinity[row[0]] = row[1]
        self.affinity = affinity

        ligands_df = pd.read_csv(data_path / f"{phase}_smi.csv")
        ligands = {i["pdbid"]: i["smiles"] for _, i in ligands_df.iterrows()}
        self.smi = ligands
        self.max_smi_len = max_smi_len
        
        seq_path = data_path / phase / 'global'
        self.seq_path = sorted(list(seq_path.glob('*')))
        
        pro_path = data_path / phase / 'descriptors'
        self.pro_path = sorted(list(pro_path.glob('*')))
        
      
        #protein = {}
        #pro_tensor = pd.read_csv(data_path / f"{phase}_2048.csv")
        #descriptors = pro_tensor.iloc[1:, 1:].values
        #self.descriptors = descriptors
        
        self.length = len(self.smi)

    def __getitem__(self, idx):
        seq = self.seq_path[idx]
        pro = self.pro_path[idx]
        #pkt = self.pkt_path[idx]
        #assert seq.name == pkt.name
        
        #_seq_tensor = pd.read_csv(seq, index_col=0).drop(['idx'], axis=1).values[:self.max_seq_len]
        #seq_tensor = np.zeros((self.max_seq_len, PT_FEATURE_SIZE))
        #seq_tensor[:len(_seq_tensor)] = _seq_tensor
        
        _pro_tensor = pd.read_csv(pro, header=None)
        pro_tensor = np.zeros((1, 2048))
        pro_tensor[:len(_pro_tensor)+1] = _pro_tensor

        
                                
        


        return ( pro_tensor.astype(np.float32),
                label_smiles(self.smi[seq.name.split('.')[0]], self.max_smi_len),
                np.array(self.affinity[seq.name.split('.')[0]], dtype=np.float32))

    def __len__(self):
        return self.length

In [5]:
import sys
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.auto import tqdm

data_path = 'data/'
max_seq_len = 1000  
max_pkt_len = 63
max_smi_len = 150
batch_size = 1

data_loaders = {phase_name:
                    DataLoader(MyDataset(data_path, phase_name,
                                          max_smi_len),
                               batch_size=batch_size,
                               pin_memory=True,
                               num_workers=0,
                               shuffle=True)
                for phase_name in ['training', 'validation', 'test']}

In [6]:
protein_tensor,  label_smi, affinity = next(iter(data_loaders['training']))
print(label_smi.size())
print(affinity.size())
print(protein_tensor)
print(affinity)

torch.Size([1, 150])
torch.Size([1])
tensor([[[0.0788, 0.0080, 0.0482,  ..., 0.0032, 0.0000, 0.0000]]])
tensor([3.3300])


In [7]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import metrics

#from dataset import PT_FEATURE_SIZE

CHAR_SMI_SET_LEN = 64


class Squeeze(nn.Module):
    def forward(self, input: torch.Tensor):
        return input.squeeze()


class CDilated(nn.Module):
    def __init__(self, nIn, nOut, kSize, stride=1, d=1):
        super().__init__()
        padding = int((kSize - 1) / 2) * d
        self.conv = nn.Conv1d(nIn, nOut, kSize, stride=stride, padding=padding, bias=False, dilation=d)

    def forward(self, input):
        output = self.conv(input)
        return output


class DilatedParllelResidualBlockA(nn.Module):
    def __init__(self, nIn, nOut, add=True):
        super().__init__()
        n = int(nOut / 5)
        n1 = nOut - 4 * n
        self.c1 = nn.Conv1d(nIn, n, 1, padding=0)
        self.br1 = nn.Sequential(nn.BatchNorm1d(n), nn.PReLU())
        self.d1 = CDilated(n, n1, 3, 1, 1)  # dilation rate of 2^0
        self.d2 = CDilated(n, n, 3, 1, 2)  # dilation rate of 2^1
        self.d4 = CDilated(n, n, 3, 1, 4)  # dilation rate of 2^2
        self.d8 = CDilated(n, n, 3, 1, 8)  # dilation rate of 2^3
        self.d16 = CDilated(n, n, 3, 1, 16)  # dilation rate of 2^4
        self.br2 = nn.Sequential(nn.BatchNorm1d(nOut), nn.PReLU())

        if nIn != nOut:
#             print(f'{nIn}-{nOut}: add=False')
            add = False
        self.add = add

    def forward(self, input):
        # reduce
        output1 = self.c1(input)
        output1 = self.br1(output1)
        # split and transform
        d1 = self.d1(output1)
        d2 = self.d2(output1)
        d4 = self.d4(output1)
        d8 = self.d8(output1)
        d16 = self.d16(output1)

        # heirarchical fusion for de-gridding
        add1 = d2
        add2 = add1 + d4
        add3 = add2 + d8
        add4 = add3 + d16

        # merge
        combine = torch.cat([d1, add1, add2, add3, add4], 1)

        # if residual version
        if self.add:
            combine = input + combine
        output = self.br2(combine)
        return output

class DilatedParllelResidualBlockB(nn.Module):
    def __init__(self, nIn, nOut, add=True):
        super().__init__()
        n = int(nOut / 4)
        n1 = nOut - 3 * n
        self.c1 = nn.Conv1d(nIn, n, 1, padding=0)
        self.br1 = nn.Sequential(nn.BatchNorm1d(n), nn.PReLU())
        self.d1 = CDilated(n, n1, 3, 1, 1)  # dilation rate of 2^0
        self.d2 = CDilated(n, n, 3, 1, 2)  # dilation rate of 2^1
        self.d4 = CDilated(n, n, 3, 1, 4)  # dilation rate of 2^2
        self.d8 = CDilated(n, n, 3, 1, 8)  # dilation rate of 2^3
        self.br2 = nn.Sequential(nn.BatchNorm1d(nOut), nn.PReLU())

        if nIn != nOut:
#             print(f'{nIn}-{nOut}: add=False')
            add = False
        self.add = add

    def forward(self, input):
        # reduce
        output1 = self.c1(input)
        output1 = self.br1(output1)
        # split and transform
        d1 = self.d1(output1)
        d2 = self.d2(output1)
        d4 = self.d4(output1)
        d8 = self.d8(output1)

        # heirarchical fusion for de-gridding
        add1 = d2
        add2 = add1 + d4
        add3 = add2 + d8

        # merge
        combine = torch.cat([d1, add1, add2, add3], 1)

        # if residual version
        if self.add:
            combine = input + combine
        output = self.br2(combine)
        return output







In [14]:
class DeepDTAF(nn.Module):

    def __init__(self):
        super().__init__()

        smi_embed_size = 128
        #seq_embed_size = 128
        
        #seq_oc = 128
        #pkt_oc = 128
        smi_oc = 128

        self.smi_embed = nn.Embedding(CHAR_SMI_SET_LEN, smi_embed_size)
        
        self.protein = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.Dropout(0.5),
            nn.PReLU(),
            nn.Linear(1024, 512),
            nn.Dropout(0.5),
            nn.PReLU(),
            nn.Linear(512,256),
            nn.PReLU())

        conv_smi = []
        ic = smi_embed_size
        for oc in [32, 64, smi_oc]:
            conv_smi.append(DilatedParllelResidualBlockB(ic, oc))
            ic = oc
        conv_smi.append(nn.AdaptiveMaxPool1d(1))
        conv_smi.append(Squeeze())
        self.conv_smi = nn.Sequential(*conv_smi)  # (N,128)
        
        
        self.cat_dropout = nn.Dropout(0.2)
        
        self.classifier = nn.Sequential(
            nn.Linear(256+smi_oc, 128),
            nn.Dropout(0.5),
            nn.PReLU(),
            nn.Linear(128, 64),
            nn.Dropout(0.5),
            nn.PReLU(),
            nn.Linear(64,1),
            nn.PReLU())
        

    def forward(self, pro, smi):
      
    # assert pro.shape ==(N, L)
        pro_out = self.protein(pro)
        pro_out = pro_out.view([-1])

        # assert smi.shape == (N, L)
        smi_embed = self.smi_embed(smi)  # (N,L,32)
        smi_embed = torch.transpose(smi_embed, 1, 2)
        smi_conv = self.conv_smi(smi_embed)  # (N,128)
        
        #print(pro_out.size())
        #print(smi_conv.size())
        
        cat = torch.cat([pro_out, smi_conv], dim=0)  # (N,128*3)
        cat = self.cat_dropout(cat)
        
        output = self.classifier(cat)
        return output

In [15]:
model = DeepDTAF()
print(model)

DeepDTAF(
  (smi_embed): Embedding(64, 128)
  (protein): Sequential(
    (0): Linear(in_features=2048, out_features=1024, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): PReLU(num_parameters=1)
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): Dropout(p=0.5, inplace=False)
    (5): PReLU(num_parameters=1)
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): PReLU(num_parameters=1)
  )
  (conv_smi): Sequential(
    (0): DilatedParllelResidualBlockB(
      (c1): Conv1d(128, 8, kernel_size=(1,), stride=(1,))
      (br1): Sequential(
        (0): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): PReLU(num_parameters=1)
      )
      (d1): CDilated(
        (conv): Conv1d(8, 8, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      )
      (d2): CDilated(
        (conv): Conv1d(8, 8, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,), bias=False)
      )
      (d4): CDilated(
       

In [36]:
state_dict = torch.load('../DeepDTAF/runs/DeepDTAF_20200818104105_33927/best_model.pt',map_location='cpu')
model.load_state_dict(state_dict=state_dict, strict=False)

_IncompatibleKeys(missing_keys=['protein.0.weight', 'protein.0.bias', 'protein.2.weight', 'protein.3.weight', 'protein.3.bias', 'protein.5.weight', 'protein.6.weight', 'protein.6.bias', 'protein.7.weight'], unexpected_keys=['seq_embed.weight', 'seq_embed.bias', 'conv_seq.0.c1.weight', 'conv_seq.0.c1.bias', 'conv_seq.0.br1.0.weight', 'conv_seq.0.br1.0.bias', 'conv_seq.0.br1.0.running_mean', 'conv_seq.0.br1.0.running_var', 'conv_seq.0.br1.0.num_batches_tracked', 'conv_seq.0.br1.1.weight', 'conv_seq.0.d1.conv.weight', 'conv_seq.0.d2.conv.weight', 'conv_seq.0.d4.conv.weight', 'conv_seq.0.d8.conv.weight', 'conv_seq.0.d16.conv.weight', 'conv_seq.0.br2.0.weight', 'conv_seq.0.br2.0.bias', 'conv_seq.0.br2.0.running_mean', 'conv_seq.0.br2.0.running_var', 'conv_seq.0.br2.0.num_batches_tracked', 'conv_seq.0.br2.1.weight', 'conv_seq.1.c1.weight', 'conv_seq.1.c1.bias', 'conv_seq.1.br1.0.weight', 'conv_seq.1.br1.0.bias', 'conv_seq.1.br1.0.running_mean', 'conv_seq.1.br1.0.running_var', 'conv_seq.1.br1

In [17]:
def train(model: nn.Module, test_loader, loss_function, device, show):
    # number of epochs to train the model
n_epochs = 2

## TODO complete epoch and training batch loops
## These loops should update the classifier-weights of this model
## And track (and print out) the training loss over time

train_loss = 0 

for epoch in range(n_epochs):
    for image, label in train_loader:
        if train_on_gpu:
            image, label = image.cuda(), label.cuda()
            
        optimizer.zero_grad()
        output = vgg16(image)
        loss = criterion(output, label)
        train_loss += loss.item()*image.size(0)
        loss.backward()
        optimizer.step()
    

    print("Epoch: {} Loss: {}".format(epoch, train_loss/len(train_loader)))

IndentationError: expected an indented block (534157444.py, line 3)

In [24]:
import numpy as np
import sklearn.metrics as m
from scipy.stats import pearsonr

from numba import njit

@njit
def c_index(y_true, y_pred):
    summ = 0
    pair = 0

    for i in range(1, len(y_true)):
        for j in range(0, i):
            pair += 1
            if y_true[i] > y_true[j]:
                summ += 1 * (y_pred[i] > y_pred[j]) + 0.5 * (y_pred[i] == y_pred[j])
            elif y_true[i] < y_true[j]:
                summ += 1 * (y_pred[i] < y_pred[j]) + 0.5 * (y_pred[i] == y_pred[j])
            else:
                pair -= 1

    if pair is not 0:
        return summ / pair
    else:
        return 0


def RMSE(y_true, y_pred):
    return np.sqrt(m.mean_squared_error(y_true, y_pred))


def MAE(y_true, y_pred):
    return m.mean_absolute_error(y_true, y_pred)


def CORR(y_true, y_pred):
    return pearsonr(y_true, y_pred)[0]


def SD(y_true, y_pred):
    from sklearn.linear_model import LinearRegression
    y_pred = y_pred.reshape((-1,1))
    lr = LinearRegression().fit(y_pred,y_true)
    y_ = lr.predict(y_pred)
    return np.sqrt(np.square(y_true - y_).sum() / (len(y_pred) - 1))

In [65]:

best_epoch = -1
best_val_loss = 100000000
train_loss = 0
writer = SummaryWriter()
save_best_epoch = 0
save_path = '../DeepDTAF/runs/best_modelnew.pt'

def train(model: nn.Module, training_loader, loss_function, device, best_val_loss, best_epoch, save_best_epoch, save_path):
    from pathlib import Path
     
    best_val_loss = best_val_loss
    best_epoch = best_epoch
    for epoch in range(1, n_epoch + 1):
        train_loss = 0
        tbar = enumerate(data_loaders['training'])
        for idx, (*x, y) in tbar:
            model.train()

            for i in range(len(x)):
                x[i] = x[i].to(device)
            y = y.to(device)

            optimizer.zero_grad()
            output = model(*x)
            loss = loss_function(output.view(-1), y.view(-1))

            # fp16
            #with amp.scale_loss(loss, optimizer) as scaled_loss:
                #scaled_loss.backward()
            loss.backward() 
            train_loss += loss.item()
            optimizer.step()
            scheduler.step()
            
        print(train_loss)
        print(len(y))
        print("Epoch: {} Loss: {}".format(epoch, train_loss/ len(data_loaders['training'])))
            #tbar.set_description(f' * Train Epoch {epoch} Loss={loss.item() / len(y):.3f}')

        for _p in ['training', 'validation']:
            performance = test(model, data_loaders[_p], loss_function, device, False)
            for i in performance:
                writer.add_scalar(f'{_p} {i}', performance[i], global_step=epoch)
            if _p=='validation' and epoch>=save_best_epoch and performance['loss']<best_val_loss:
                best_val_loss = performance['loss']
                best_epoch = epoch
                print("best validation loss is {}".format(best_val_loss))
                torch.save(model.state_dict(), save_path)
            

In [66]:
model = DeepDTAF()
state_dict = torch.load('../DeepDTAF/runs/DeepDTAF_20200818104105_33927/best_model.pt',map_location='cpu')
model.load_state_dict(state_dict=state_dict, strict=False)
print(model)
for param in model.conv_smi.parameters():
    param.requires_grad = False
for param in model.classifier.parameters():
    param.requires_grad = False



n_epoch = 20
optimizer = optim.AdamW(model.parameters())
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=5e-3, epochs=n_epoch,
                                          steps_per_epoch=len(data_loaders['training']))
loss_function = nn.MSELoss(reduction='sum')
device = torch.device("cpu")

DeepDTAF(
  (smi_embed): Embedding(64, 128)
  (protein): Sequential(
    (0): Linear(in_features=2048, out_features=1024, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): PReLU(num_parameters=1)
    (3): Linear(in_features=1024, out_features=512, bias=True)
    (4): Dropout(p=0.5, inplace=False)
    (5): PReLU(num_parameters=1)
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): PReLU(num_parameters=1)
  )
  (conv_smi): Sequential(
    (0): DilatedParllelResidualBlockB(
      (c1): Conv1d(128, 8, kernel_size=(1,), stride=(1,))
      (br1): Sequential(
        (0): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): PReLU(num_parameters=1)
      )
      (d1): CDilated(
        (conv): Conv1d(8, 8, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      )
      (d2): CDilated(
        (conv): Conv1d(8, 8, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,), bias=False)
      )
      (d4): CDilated(
       

In [67]:

train(model, data_loaders, loss_function, device, best_val_loss, best_epoch, save_best_epoch, save_path)

40152.561992417875
1
Epoch: 1 Loss: 3.3724644710581115
best validation loss is 2.431045028606081
41244.455873661085
1
Epoch: 2 Loss: 3.464174019289525
74822.01210899638
1
Epoch: 3 Loss: 6.2843954400299324
543134.8591642381
1
Epoch: 4 Loss: 45.61858383707694
740191.2309678928
1
Epoch: 5 Loss: 62.16959776313563
1951573.9810121742
1
Epoch: 6 Loss: 163.91516722763095
11128951.402185421
1
Epoch: 7 Loss: 934.7347053742164
2035873.7241756537
1
Epoch: 8 Loss: 170.99560928738902
10989053.469117656
1
Epoch: 9 Loss: 922.9845010177772
707072.0648275424
1
Epoch: 10 Loss: 59.38787710629451
819828.7922757787
1
Epoch: 11 Loss: 68.85845727160917
3100021756.417689
1
Epoch: 12 Loss: 260374.7485652351
2932353.6594907367
1
Epoch: 13 Loss: 246.29209301954785
897202.57135559
1
Epoch: 14 Loss: 75.35717884726945
407647.84304665535
1
Epoch: 15 Loss: 34.23885797468968
61711.8445478018
1
Epoch: 16 Loss: 5.1832558834034765
48051.60464297531
1
Epoch: 17 Loss: 4.035915054844223
39239.06079882465
1
Epoch: 18 Loss: 3.

In [28]:
def test(model: nn.Module, test_loader, loss_function, device, show):
    model.eval()
    test_loss = 0
    outputs = []
    targets = []
    with torch.no_grad():
        for idx, (*x, y) in enumerate(test_loader):
            for i in range(len(x)):
                x[i] = x[i].to(device)
            y = y.to(device)

            y_hat = model(*x)

            test_loss += loss_function(y_hat.view(-1), y.view(-1)).item()
            outputs.append(y_hat.cpu().numpy().reshape(-1))
            targets.append(y.cpu().numpy().reshape(-1))

    targets = np.concatenate(targets).reshape(-1)
    outputs = np.concatenate(outputs).reshape(-1)

    test_loss /= len(test_loader.dataset)

    evaluation = {
        'loss': test_loss,
        #'c_index': metrics.c_index(targets, outputs),
        c_index(targets, outputs)
        #'RMSE': metrics.RMSE(targets, outputs),
        #'MAE': metrics.MAE(targets, outputs),
        #'SD': metrics.SD(targets, outputs),
        #'CORR': metrics.CORR(targets, outputs),
    }

    return evaluation