Author: Zachary Strasser and William Funkbusch

Date: 11-2-2020

Import necessary modules

In [1]:
import pandas as pd
import numpy as np
import itertools
import math as m
from matplotlib import pyplot as plt

Read tsv file from BindingDB website

In [2]:
tsv_file= '/content/sample_data/BindingDB_PDSPKi3.tsv'
tsv_file = pd.read_table(tsv_file, sep='\t', error_bad_lines=False)

b'Skipping line 1717: expected 49 fields, saw 85\nSkipping line 1718: expected 49 fields, saw 85\nSkipping line 1719: expected 49 fields, saw 85\nSkipping line 1720: expected 49 fields, saw 85\nSkipping line 1721: expected 49 fields, saw 85\nSkipping line 3561: expected 49 fields, saw 85\nSkipping line 3562: expected 49 fields, saw 85\nSkipping line 3563: expected 49 fields, saw 85\nSkipping line 3564: expected 49 fields, saw 85\nSkipping line 3565: expected 49 fields, saw 85\nSkipping line 3566: expected 49 fields, saw 85\nSkipping line 3567: expected 49 fields, saw 85\nSkipping line 3568: expected 49 fields, saw 85\nSkipping line 3569: expected 49 fields, saw 85\nSkipping line 3570: expected 49 fields, saw 85\nSkipping line 3571: expected 49 fields, saw 85\nSkipping line 3572: expected 49 fields, saw 85\nSkipping line 3573: expected 49 fields, saw 85\nSkipping line 3574: expected 49 fields, saw 85\nSkipping line 4976: expected 49 fields, saw 85\nSkipping line 11152: expected 49 field

Check columns

In [3]:
tsv_file.columns

Index(['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name Assigned by Curator or DataSource',
       'Target Source Organism According to Curator or DataSource', 'Ki (nM)',
       'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)',
       'pH', 'Temp (C)', 'Curation/DataSource', 'Article DOI', 'PMID',
       'PubChem AID', 'Patent Number', 'Authors', 'Institution',
       'Link to Ligand in BindingDB', 'Link to Target in BindingDB',
       'Link to Ligand-Target Pair in BindingDB', 'Ligand HET ID in PDB',
       'PDB ID(s) for Ligand-Target Complex', 'PubChem CID', 'PubChem SID',
       'ChEBI ID of Ligand', 'ChEMBL ID of Ligand', 'DrugBank ID of Ligand',
       'IUPHAR_GRAC ID of Ligand', 'KEGG ID of Ligand', 'ZINC ID of Ligand',
       'Number of Protein Chains in Target (>1 implies a multichain complex)',
       'BindingDB Target Chain  Sequence', 'PDB ID(s) of T

Filter the necessary columns - SMILEs, AA chain, and Ki 

In [4]:
tsv_file_short = tsv_file[['Ligand SMILES', 'BindingDB Target Chain  Sequence', 'Ki (nM)']]

There are 27,712 SMILE and protein sequence pairs with associated Ki values. 

In [5]:
tsv_file_short.head

<bound method NDFrame.head of                                            Ligand SMILES  ... Ki (nM)
0      Cn1c2ncn(CCN3CCC(CC3)C(=O)c3ccc(F)cc3)c2c(=O)n...  ...     4.5
1                               NC(N)=NN=Cc1c(Cl)cccc1Cl  ...   199.5
2      Fc1ccc(cc1)C(=O)C1CCN(CCn2c(=O)[nH]c3ccccc3c2=...  ...     3.2
3      Cc1nc2ccccn2c(=O)c1CCN1CCC(CC1)=C(c1ccc(F)cc1)...  ...       6
4      Cc1nc2sccn2c(=O)c1CCN1CCC(CC1)=C(c1ccc(F)cc1)c...  ...     5.5
...                                                  ...  ...     ...
27707                 CC1C2Cc3ccc(O)cc3C1(C)CCN2CC=C(C)C  ...    1000
27708                 CC1C2Cc3ccc(O)cc3C1(C)CCN2CC=C(C)C  ...    1000
27709                 CC1C2Cc3ccc(O)cc3C1(C)CCN2CC=C(C)C  ...    1000
27710          CN(C1CCCCC1N1CCCC1)C(=O)Cc1ccc(Cl)c(Cl)c1  ...    1000
27711          CN(C1CCCCC1N1CCCC1)C(=O)Cc1ccc(Cl)c(Cl)c1  ...    1000

[27712 rows x 3 columns]>

Check to see if an rows within SMILE column have NaN

In [6]:
tsv_file_short[['Ligand SMILES']].isnull().values.any()

False

No rows have NaN in SMILEs, now check in the AA row

In [7]:
tsv_file_short[['BindingDB Target Chain  Sequence']].isnull().values.any()

False

Check final column for null values. None found

In [8]:
tsv_file_short[['Ki (nM)']].isnull().values.any()

False

Convert PANDA into np.array

In [9]:
DBBind = tsv_file_short.to_numpy()

Remove all numbers from SMILES

In [10]:
value = len(DBBind[:,0])
for x in range((value)):
  DBBind[x,0] = ''.join([i for i in DBBind[x,0] if not i.isdigit()])

First we want to cycle through the SMILES that have two symbols back to back that are single entity and convert this to one symbol. Br-> B, Cl-> K, @@->X.

Subsitute B for Br

In [24]:
for x in range(len(DBBind[:,0])):
  s = DBBind[x,0]
  for i in range(0, len(s)-1):
    if s[i:i+2]=="Br":
      s = s[:i]+'B' + s[i+2:]
  DBBind[x,0] = s

Substitute K for Cl

In [25]:
for x in range(len(DBBind[:,0])):
  s = DBBind[x,0]
  for i in range(0, len(s)-1):
    if s[i:i+2]=="Cl":
      s = s[:i]+'K' + s[i+2:]
  DBBind[x,0] = s

Substitute X for @@

In [26]:
for x in range(len(DBBind[:,0])):
  s = DBBind[x,0]
  for i in range(0, len(s)-1):
    if s[i:i+2]=="@@":
      s = s[:i]+'X' + s[i+2:]
  DBBind[x,0] = s

Check the length of each of the SMILES. Starting with the minimum and maximum

In [27]:
min([len(x) for x in DBBind[:,0].tolist()])

5

Minimum SMILE is a length of 5

In [28]:
max([len(x) for x in DBBind[:,0].tolist()])

1132

Maximum SMILE is a length of 1132

Now check minimum and maximum of the protein

In [29]:
min([len(x) for x in DBBind[:,1].tolist()])

11

Minimum protein AA is 11

In [30]:
max([len(x) for x in DBBind[:,1].tolist()])

4303

Maximum protein AA is 4303

The vast majority of the ligands fall between 20 and 75 length. Therefore we removed any combinations with a SMILE length greater than 90.

In [31]:
value = len(DBBind[:,0])
place_holder = []
for x in range((value)):
  if len(DBBind[x,0]) > 90:
    place_holder.append(x)

In [32]:
DBBind = np.delete(DBBind, place_holder, axis=0)

Now we remove all proteins greater than 990 AA, which is about 100 pairs

In [33]:
value = len(DBBind[:,0])
place_holder = []
for x in range((value)):
  if len(DBBind[x,1]) > 990:
    place_holder.append(x)

In [34]:
DBBind = np.delete(DBBind, place_holder, axis=0)

Our new shape is (23,109 by 3) representing 23,109 pairs

In [35]:
DBBind.shape

(23109, 3)

For now we added 0s to get the ligand sizes to all equal 800

We then add on 0s to every protein AA sequence to get it to 2400 AA's. Also remove > sign and convert Ki to float

In [36]:
for x in range(len(DBBind[:,0])):
  DBBind[x,0] = DBBind[x,0][::-1]
  DBBind[x,0] = DBBind[x,0].zfill(100)       #fill ligand to 100
  DBBind[x,0] = DBBind[x,0][::-1]
  DBBind[x,1] = DBBind[x,1][::-1]
  DBBind[x,1] = DBBind[x,1].zfill(1000)     #fill protein to 2400
  DBBind[x,1] = DBBind[x,1][::-1]  
  DBBind[x,2] = (DBBind[x,2]).strip()        #strip sides
  if '>' == DBBind[x,2][0] :            #if Ki >10000 treat as 10000, Ki >70000 treat as 10000, ect.
    DBBind[x,2] = DBBind[x,2][1:]
  DBBind[x,2] = float(DBBind[x,2])        #convert Ki to int

Check the head

In [37]:
DBBind[0:3]

array([['Cncncn(CCNCCC(CC)C(=O)cccc(F)cc)cc(=O)n(C)c=O0000000000000000000000000000000000000000000000000000000',
        'MEILCEDNISLSSIPNSLMQLGDGPRLYHNDFNSRDANTSEASNWTIDAENRTNLSCEGYLPPTCLSILHLQEKNWSALLTTVVIILTIAGNILVIMAVSLEKKLQNATNYFLMSLAIADMLLGFLVMPVSMLTILYGYRWPLPSKLCAIWIYLDVLFSTASIMHLCAISLDRYVAIQNPIHHSRFNSRTKAFLKIIAVWTISVGISMPIPVFGLQDDSKVFKEGSCLLADDNFVLIGSFVAFFIPLTIMVITYFLTIKSLQKEATLCVSDLSTRAKLASFSFLPQSSLSSEKLFQRSIHREPGSYAGRRTMQSISNEQKACKVLGIVFFLFVVMWCPFFITNIMAVICKESCNENVIGALLNVFVWIGYLSSAVNPLVYTLFNKTYRSAFSRYIQCQYKENRKPLQLILVNTIPALAYKSSQLQVGQKKNSQEDAEQTVDDCSMVTLGKQQSEENCTDNIETVNEKVSCV000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

Check the tail

In [38]:
DBBind[-3:]

array([['CCCCcccc(O)ccC(C)CCNCC=C(C)C000000000000000000000000000000000000000000000000000000000000000000000000',
        'MESLFPAPFWEVLYGSHFQGNLSLLNETVPHHLLLNASHSAFLPLGLKVTIVGLYLAVCIGGLLGNCLVMYVILRHTKMKTATNIYIFNLALADTLVLLTLPFQGTDILLGFWPFGNALCKTVIAIDYYNMFTSTFTLTAMSVDRYVAICHPIRALDVRTSSKAQAVNVAIWALASVVGVPVAIMGSAQVEDEEIECLVEIPAPQDYWGPVFAICIFLFSFIIPVLIISVCYSLMIRRLRGVRLLSGSREKDRNLRRITRLVLVVVAVFVGCWTPVQVFVLVQGLGVQPGSETAVAILRFCTALGYVNSCLNPILYAFLDENFKACFRKFCCASALHREMQVSDRVRSIAKDVGLGCKTSETVPRPA00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

Switch back from numpy to PANDAS

In [39]:
curated_dataframe = pd.DataFrame(data=DBBind)

Rename the column titles

In [40]:
curated_dataframe.columns = ['SMILES', "Protein", "Ki"]

Print to an excel file

In [41]:
curated_dataframe.to_excel("curated_df.xlsx")