Author: Zachary Strasser and William Funkbusch

Date: 11-20-2020

Import necessary modules

In [3]:
import pandas as pd
import numpy as np
import itertools
import math as m
from matplotlib import pyplot as plt

Read tsv file from BindingDB website

In [4]:
# Path to the BindingDB PDSP Ki export (tab-separated); adjust for your environment.
tsv_path = '/content/sample_data/BindingDB_PDSPKi3.tsv'
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` (pandas >= 1.3) keeps the same behaviour: rows with an
# unexpected number of fields are skipped and a warning is emitted for each one.
tsv_file = pd.read_table(tsv_path, sep='\t', on_bad_lines='warn')

b'Skipping line 1717: expected 49 fields, saw 85\nSkipping line 1718: expected 49 fields, saw 85\nSkipping line 1719: expected 49 fields, saw 85\nSkipping line 1720: expected 49 fields, saw 85\nSkipping line 1721: expected 49 fields, saw 85\nSkipping line 3561: expected 49 fields, saw 85\nSkipping line 3562: expected 49 fields, saw 85\nSkipping line 3563: expected 49 fields, saw 85\nSkipping line 3564: expected 49 fields, saw 85\nSkipping line 3565: expected 49 fields, saw 85\nSkipping line 3566: expected 49 fields, saw 85\nSkipping line 3567: expected 49 fields, saw 85\nSkipping line 3568: expected 49 fields, saw 85\nSkipping line 3569: expected 49 fields, saw 85\nSkipping line 3570: expected 49 fields, saw 85\nSkipping line 3571: expected 49 fields, saw 85\nSkipping line 3572: expected 49 fields, saw 85\nSkipping line 3573: expected 49 fields, saw 85\nSkipping line 3574: expected 49 fields, saw 85\nSkipping line 4976: expected 49 fields, saw 85\nSkipping line 11152: expected 49 field

Check columns

In [5]:
# Display the column labels of the loaded table to locate the fields needed below.
tsv_file.columns

Index(['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name Assigned by Curator or DataSource',
       'Target Source Organism According to Curator or DataSource', 'Ki (nM)',
       'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)',
       'pH', 'Temp (C)', 'Curation/DataSource', 'Article DOI', 'PMID',
       'PubChem AID', 'Patent Number', 'Authors', 'Institution',
       'Link to Ligand in BindingDB', 'Link to Target in BindingDB',
       'Link to Ligand-Target Pair in BindingDB', 'Ligand HET ID in PDB',
       'PDB ID(s) for Ligand-Target Complex', 'PubChem CID', 'PubChem SID',
       'ChEBI ID of Ligand', 'ChEMBL ID of Ligand', 'DrugBank ID of Ligand',
       'IUPHAR_GRAC ID of Ligand', 'KEGG ID of Ligand', 'ZINC ID of Ligand',
       'Number of Protein Chains in Target (>1 implies a multichain complex)',
       'BindingDB Target Chain  Sequence', 'PDB ID(s) of T

Select the necessary columns: ligand SMILES, target amino-acid chain sequence, and Ki.

In [7]:
# Keep only the three fields of interest: ligand SMILES, target chain sequence, and Ki.
needed_columns = ['Ligand SMILES', 'BindingDB Target Chain  Sequence', 'Ki (nM)']
tsv_file_short = tsv_file[needed_columns]

Convert the pandas DataFrame into a NumPy array

In [8]:
# Convert the three-column DataFrame into a 2-D NumPy array; column 0 holds the SMILES
# strings that the following cells edit element by element.
DBBind = tsv_file_short.to_numpy()

Remove all numbers from SMILES

In [9]:
# Strip every digit character from each SMILES string (column 0), writing the result back in place.
for row in range(DBBind.shape[0]):
  smiles = DBBind[row, 0]
  DBBind[row, 0] = ''.join(ch for ch in smiles if not ch.isdigit())

First we want to cycle through the SMILES and convert each two-character token that represents a single entity into one symbol: Br -> B, Cl -> !, @@ -> X, Si -> *.

Substitute B for Br

In [10]:
# Collapse the two-character token "Br" into the single character 'B' in every
# SMILES string (column 0), so the token occupies exactly one position.
# str.replace performs the same left-to-right, non-overlapping substitution as the
# previous hand-written index loop, without rebuilding the string on every match.
# NOTE(review): 'B' is also the SMILES symbol for boron, so bromine and boron become
# indistinguishable after this step — confirm this is acceptable.
for x in range(len(DBBind[:,0])):
  DBBind[x,0] = DBBind[x,0].replace("Br", "B")

Substitute ! for Cl

In [11]:
# Collapse the two-character token "Cl" into the single character '!' in every
# SMILES string (column 0), so the token occupies exactly one position.
# str.replace performs the same left-to-right, non-overlapping substitution as the
# previous hand-written index loop, without rebuilding the string on every match.
for x in range(len(DBBind[:,0])):
  DBBind[x,0] = DBBind[x,0].replace("Cl", "!")

Substitute X for @@

In [12]:
# Collapse the two-character token "@@" into the single character 'X' in every
# SMILES string (column 0). A lone '@' is left untouched.
# str.replace performs the same left-to-right, non-overlapping substitution as the
# previous hand-written index loop, without rebuilding the string on every match.
for x in range(len(DBBind[:,0])):
  DBBind[x,0] = DBBind[x,0].replace("@@", "X")

Substitute * for Si

In [13]:
# Collapse the two-character token "Si" into the single character '*' in every
# SMILES string (column 0), so the token occupies exactly one position.
# str.replace performs the same left-to-right, non-overlapping substitution as the
# previous hand-written index loop, without rebuilding the string on every match.
# NOTE(review): '*' is the wildcard-atom symbol in SMILES; if the input can contain
# wildcards they become indistinguishable from silicon — confirm this is acceptable.
for x in range(len(DBBind[:,0])):
  DBBind[x,0] = DBBind[x,0].replace("Si", "*")

The vast majority of the ligands have a SMILES length between 20 and 75. Therefore we removed any entries with a SMILES length greater than 90.

In [14]:
# Collect the row indices of every ligand whose SMILES string is longer than 90 characters.
value = len(DBBind[:,0])
place_holder = [row for row in range(value) if len(DBBind[row, 0]) > 90]

In [15]:
# Drop the rows collected in place_holder (axis=0 removes whole rows) and rebind DBBind
# to the filtered copy. Re-running this cell without first re-running the previous one
# would apply the stale indices to the already-filtered array.
DBBind = np.delete(DBBind, place_holder, axis=0)

In [16]:
# Number of rows remaining after the length filter.
len(DBBind)

23746

In [20]:
# (rows, columns) of the filtered array.
DBBind.shape

(23746, 3)

In [22]:
# Amino-acid sequence (one-letter codes) of the single target protein that every ligand
# is paired with in the cells below.
# NOTE(review): the source/accession of this sequence is not recorded here — TODO add it.
proteinase = "SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTSEDMLNPNYEDLLIRKSNHNFLVQAGNVQLRVIGHSMQNCVLKLKVDTANPKTPKYKFVRIQPGQTFSVLACYNGSPSGVYQCAMRPNFTIKGSFLNGSCGSVGFNIDYDCVSFCYMHHMELPTGVHAGTDLEGNFYGPFVDRQTAQAAGTDTTITVNVLAWLYAAVINGDRWFLNRFTTTLNDFNLVAMKYNYEPLTQDHVDILGPLSAQTGIAVLDMCASLKELLQNGMNGRTILGSALLEDEFTPFDVVRQCSGVTFQ"

Create a small molecule array

In [42]:
# Slice (rather than index) column 0 so the result stays two-dimensional: shape (n_rows, 1).
small_molecule_array = DBBind[:, :1]

Create an array that is the same length as the small molecule array but in which every entry is the sequence of the protein

In [40]:
# One copy of the protein sequence per ligand row, as an (n_rows, 1) column.
# The row count is taken from small_molecule_array instead of the previously
# hard-coded 23746, so the cell stays correct if the input file or the length
# filter changes.
proteinase_array = np.full((small_molecule_array.shape[0], 1), proteinase)

Ensure dimensions are appropriate

In [43]:
# Expect (number of ligands, 1).
small_molecule_array.shape

(23746, 1)

In [44]:
# Should match the shape of small_molecule_array so the two can be stacked column-wise.
proteinase_array.shape

(23746, 1)

Stack the two

In [47]:
# Place the ligand column and the protein column side by side, one pair per row.
paired_columns = [small_molecule_array, proteinase_array]
new_array = np.hstack(paired_columns)

Ensure the right dimensions

In [48]:
# Expect two columns: ligand string and protein sequence.
new_array.shape

(23746, 2)

Change name

In [52]:
# Rebind DBBind to the stacked (ligand, protein) array. This is a plain assignment,
# not a copy, so later in-place edits made through DBBind also change new_array.
DBBind = new_array

For now we pad each ligand string with trailing 0s so that all ligand strings have length 100

We then pad every protein AA sequence with trailing 0s to a length of 1000. (Removing the > sign and converting Ki to float is not done in this cell: the Ki column is no longer part of the array at this point.)

In [53]:
# Right-pad each ligand string (column 0) to 100 characters and each protein
# sequence (column 1) to 1000 characters with '0'.
# str.ljust replaces the previous reverse -> zfill -> reverse trick: zfill treats a
# leading '+' or '-' as a sign and inserts the zeros after it, so a string ending in
# '+' or '-' ended up with the padding in front of its last character.
# Strings already at or beyond the target width are left unchanged, as before.
LIGAND_WIDTH = 100
PROTEIN_WIDTH = 1000
for x in range(len(DBBind[:,0])):
  DBBind[x,0] = DBBind[x,0].ljust(LIGAND_WIDTH, '0')    # fill ligand to 100
  DBBind[x,1] = DBBind[x,1].ljust(PROTEIN_WIDTH, '0')   # fill protein to 1000

Turned array into dataframe

In [54]:
# Wrap the two-column array in a DataFrame; column labels are assigned in the next cell.
curated_dataframe = pd.DataFrame(DBBind)

Renamed columns

In [55]:
# Label the two columns: padded ligand strings and padded protein sequences.
curated_dataframe.columns = ['Small_Molecule', "Proteinase"]

Converted dataframe into Excel file

In [57]:
# Write the curated table to an Excel workbook; the relative path resolves against the
# current working directory. The DataFrame index is written as the first column
# (pandas default, index=True).
curated_dataframe.to_excel("curated_df_for_testing.xlsx")