In [2]:
import pandas as pd
import argparse
from utils import set_seed
import numpy as np
import wandb
import math
import re

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
from torch.cuda.amp import GradScaler

from model import GPT, GPTConfig
from trainer import Trainer, TrainerConfig

from seq_embedd import SmilesDataset
import selfies as sf
from PyBioMed.PyProtein import CTD

import json

In [3]:
# Seed the run via the project helper and choose the experiment name
# (run_name is reused for the wandb run and the checkpoint path).
SEED = 42
set_seed(SEED)

run_name = "Transport_seq"

In [4]:
# Start a Weights & Biases run in the "DTproject" project, named after run_name.
wandb.init(project="DTproject", name=run_name)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzoey_chen[0m (use `wandb login --relogin` to force relogin)


In [5]:
# Load the drug/SELFIES table, discard rows with missing values, renumber the
# index and lower-case every column name.
data = (
    pd.read_csv('../datasets/chemb_drug_selfies.csv')
    .dropna(axis=0)
    .reset_index(drop=True)
    .rename(columns=str.lower)
)

In [6]:
# Drop rows with missing values and renumber the index, then preview the first rows.
data = data.dropna(axis=0).reset_index(drop=True)
data.head()  

Unnamed: 0,dt,smiles,selfies,split
0,dt_P08183,CC(=O)O[C@H]1C(=O)[C@@]2(C)C([C@@H](OC(=O)c3cc...,"['[C]', '[C]', '[=Branch1]', '[C]', '[=O]', '[...",train
1,dt_O15245,COc1ccc2cc3[n+](cc2c1OCCOCCOCCOCCOc1c(OC)ccc2c...,"['[C]', '[O]', '[C]', '[=C]', '[C]', '[=C]', '...",train
2,dt_Q9UNQ0,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@@H](OC(=O)CCCC(=O...,"['[C]', '[C]', '[=Branch1]', '[C]', '[=O]', '[...",train
3,dt_P48065,C[N+](C)(C)CCO,"['[C]', '[N+1]', '[Branch1]', '[C]', '[C]', '[...",train
4,dt_Q9Y6L6,CC[C@@H](C)C(=O)O[C@H]1C[C@@H](C)C=C2C=C[C@H](...,"['[C]', '[C]', '[C@@H1]', '[Branch1]', '[C]', ...",train


In [7]:
# Load the tab-separated protein sequence table, discard incomplete rows and
# rename the "uniprot_id" column to "dt" so it can be used as the join key
# against the drug table.
pro_seq = (
    pd.read_csv("../datasets/transport_pro_seq.txt", sep='\t')
    .dropna(axis=0)
    .reset_index(drop=True)
    .rename(columns={"uniprot_id": "dt"})
)

In [8]:
# Preview the first rows of the protein sequence table.
pro_seq.head()

Unnamed: 0,dt,seq
0,dt_P08183,MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...
1,dt_Q8TCC7,MTFSEILDRVGSMGHFQFLHVAILGLPILNMANHNLLQIFTAATPV...
2,dt_Q9NPD5,MDQHQHLNKTAESASSEKKKTRRCNGFKMFLAALSFSYIAKALGGI...
3,dt_O15244,MPTTVDDVLEHGGEFHFFQKQMFFLLALLSATFAPIYVGIVFLGFT...
4,dt_Q9Y6L6,MDQNQHLNKTAEAQPSENKKTRYCNGLKMFLAALSLSFIAKTLGAI...


In [9]:
# Attach the protein sequence to every drug row via a right join on "dt"
# (row order follows pro_seq), then discard rows left with missing values
# and renumber the index.
merge_data = (
    data.merge(pro_seq, how="right", on="dt")
    .dropna(axis=0)
    .reset_index(drop=True)
)

In [10]:
# Display the merged drug/protein table.
merge_data

Unnamed: 0,dt,smiles,selfies,split,seq
0,dt_P08183,CC(=O)O[C@H]1C(=O)[C@@]2(C)C([C@@H](OC(=O)c3cc...,"['[C]', '[C]', '[=Branch1]', '[C]', '[=O]', '[...",train,MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...
1,dt_P08183,Cc1c2oc3c(C)ccc(C(=O)NC4C(=O)NC(C(C)C)C(=O)N5C...,"['[C]', '[C]', '[=C]', '[O]', '[C]', '[=C]', '...",train,MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...
2,dt_P08183,CC(=O)O[C@@]12CO[C@@H]1C[C@H](O)[C@@]1(C)C(=O)...,"['[C]', '[C]', '[=Branch1]', '[C]', '[=O]', '[...",train,MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...
3,dt_P08183,C/C(=C\C=C\C=C(/C)\C=C\C=C(/C)\C(=O)C[C@]12[C@...,"['[C]', '[/C]', '[=Branch2]', '[Ring2]', '[N]'...",train,MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...
4,dt_P08183,CC(C)OC(=O)OCOP(=O)([O-])CO[C@H](C)Cn1cnc2c(N)...,"['[C]', '[C]', '[Branch1]', '[C]', '[C]', '[O]...",train,MDLEGDRNGGAKKKNFFKLNNKSEKDKKEKKPTVSVFSMFRYSNWL...
...,...,...,...,...,...
24049,dt_Q13183,[O-]S(=O)(=O)[O-],"['[O-1]', '[S]', '[=Branch1]', '[C]', '[=O]', ...",train,MATCWQALWAYRSYLIVFFVPILLLPLPILVPSKEAYCAYAIILMA...
24050,dt_Q13183,O=S(=O)([O-])[O-].[Ca+2],"['[O]', '[=S]', '[=Branch1]', '[C]', '[=O]', '...",train,MATCWQALWAYRSYLIVFFVPILLLPLPILVPSKEAYCAYAIILMA...
24051,dt_Q13183,C(C(=O)O)C(=O)O,"['[C]', '[Branch1]', '[=Branch1]', '[C]', '[=B...",test,MATCWQALWAYRSYLIVFFVPILLLPLPILVPSKEAYCAYAIILMA...
24052,dt_Q13183,O=S(=O)([O-])[O-].[Zn+2],"['[O]', '[=S]', '[=Branch1]', '[C]', '[=O]', '...",test,MATCWQALWAYRSYLIVFFVPILLLPLPILVPSKEAYCAYAIILMA...


In [11]:
# Split the merged table by its "split" column: rows tagged 'train' form the
# training set and rows tagged 'test' form the validation set.
split_col = merge_data['split']
is_train = split_col == 'train'
is_val = split_col == 'test'

train_data = merge_data.loc[is_train].reset_index(drop=True)
val_data = merge_data.loc[is_val].reset_index(drop=True)

# Stringified SELFIES token lists for each subset.
selfies_list = train_data['selfies'].tolist()
vselfies_list = val_data['selfies'].tolist()

# Report the subset sizes, one per line.
print(len(selfies_list), len(vselfies_list), sep="\n")

17146
6908


In [12]:
# Collect the full token vocabulary from every SELFIES entry in `data`.

from torchtext.legacy import data as d
from torchtext.vocab import Vectors


def tokenizer(x):
    """Split a string on whitespace."""
    return x.split()


BLANK_WORD = "<blank>"
TGT = d.Field(tokenize=tokenizer, pad_token=BLANK_WORD)

all_selfies = data['selfies'].to_list()
src = []       # one token list per row
src_len = []   # matching token counts
for raw in all_selfies:
    # Each entry is a stringified list: strip the leading "['" and trailing
    # "']", collapse doubled backslashes, then split on the "', '" separator.
    tokens = raw[2:-2].replace("\\\\","\\").split("', '")
    src.append(tokens)
    src_len.append(len(tokens))

TGT.build_vocab(src)

# Every key of the built vocabulary's stoi mapping, in iteration order.
whole_string = list(TGT.vocab.stoi.keys())
print(len(whole_string))

128


In [13]:
#Get All charsets index
stoi = json.load(open(f'../datasets/drug_selfies_stoi.json', 'r'))
itos = dict(zip(stoi.values(), stoi.keys()))

In [14]:
# Longest SELFIES token list in the full dataset (src_len holds one token
# count per row); used below as the reference length when padding sequences.

max_len = max(src_len)
max_len

359

In [15]:
# Pad every SELFIES token list to one common length so the model always
# receives inputs of identical size.
import ast

BLANK_WORD = '<blank>'


def pad_selfies(raw_lists, target_len, pad_token=BLANK_WORD):
    """Parse stringified token lists and right-pad each one to ``target_len``.

    Args:
        raw_lists: iterable of strings, each the Python-literal form of a
            list of tokens (e.g. "['[C]', '[O]']").
        target_len: minimum length of every returned list; lists that are
            already this long or longer are returned unpadded.
        pad_token: token appended until ``target_len`` is reached.

    Returns:
        A list of token lists, in the same order as ``raw_lists``.

    Raises:
        ValueError / SyntaxError: if an entry is not a valid Python literal.
    """
    padded = []
    for raw in raw_lists:
        # literal_eval only accepts plain literals, so text read from the CSV
        # cannot execute arbitrary code the way eval() could.
        tokens = list(ast.literal_eval(raw))
        # A negative repeat count yields an empty list, so long inputs are untouched.
        tokens.extend([pad_token] * (target_len - len(tokens)))
        padded.append(tokens)
    return padded


# Pad to max_len + 1 so even the longest sequence ends with at least one blank
# token ("in case the end information is lost").
selfies = pad_selfies(selfies_list, max_len + 1)
vselfies = pad_selfies(vselfies_list, max_len + 1)

In [16]:
# Protein sequences used as the conditioning input: the "seq" column of the
# training and validation subsets, row-aligned with their SELFIES lists.

pro = train_data["seq"]
vpro = val_data["seq"]

# Length of the protein embedding, passed to GPTConfig as pro_len.
# NOTE(review): 147 is hard-coded; presumably it matches the size of the
# protein descriptor built inside seq_embedd — confirm against that module.
pro_len=147

In [17]:
# Build the training and validation datasets from the padded token lists, the
# vocabulary (whole_string, stoi, itos), max_len and the matching protein sequences.
# NOTE(review): aug_prob=0 presumably disables augmentation — confirm in seq_embedd.
train_dataset = SmilesDataset(selfies,whole_string,stoi,itos,max_len,aug_prob=0,pro=pro)
valid_dataset = SmilesDataset(vselfies,whole_string,stoi,itos,max_len,aug_prob=0,pro=vpro)

data has 17146 smiles, 128 unique characters.
data has 6908 smiles, 128 unique characters.


In [30]:
#parameters
# Model size settings (passed to GPTConfig).
n_layer = 8
n_head = 8
n_embd = 256

# Training settings (passed to TrainerConfig).
max_epochs = 10
batch_size = 16
learning_rate = 6e-4

In [31]:
# Model configuration: vocabulary size and maximum length come from the training
# dataset, pro_len is the protein embedding length, and lstm is switched off.
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.max_len, pro_len=pro_len,  # args.num_props,
                        n_layer=n_layer, n_head=n_head, n_embd=n_embd,
                        lstm=False)

In [32]:
# Instantiate the GPT model from the configuration.
model = GPT(mconf)

In [33]:
# Token counts handed to TrainerConfig as warmup_tokens / final_tokens, both
# derived from the number of training rows and max_len.
n_train = len(train_data)
warmup_tokens = 0.1*n_train*max_len      # evaluated left to right: (0.1*n_train)*max_len
final_tokens = max_epochs*n_train*max_len

# Trainer configuration: learning-rate decay enabled, no worker processes,
# checkpoint written under ../result/models/ using the run name.
tconf = TrainerConfig(
    max_epochs=max_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    lr_decay=True,
    warmup_tokens=warmup_tokens,
    final_tokens=final_tokens,
    num_workers=0,
    ckpt_path=f'../result/models/{run_name}.pt',
    block_size=train_dataset.max_len,
    generate=False,
)

In [34]:
# Create the trainer from the model, both datasets, the training configuration
# and the token<->index maps held by the training dataset.
trainer = Trainer(model, train_dataset, valid_dataset,
                  tconf, train_dataset.stoi, train_dataset.itos)

In [35]:
# Run training, handing the wandb module to the trainer; the return value is kept in df.
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory error while
# PyTorch reported only ~2.4 GiB allocated of 10.73 GiB, so other processes may have been
# holding GPU memory — check GPU usage before re-running.
df = trainer.train(wandb)

  0%|          | 0/2144 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 10.73 GiB total capacity; 2.39 GiB already allocated; 12.19 MiB free; 2.41 GiB reserved in total by PyTorch)