# Binding site detection using DoGSiteScorer

## Imports

In [3]:
import time

import pandas as pd
import requests

In [2]:
# Lift pandas' display limits (None = no limit) so that wide and long
# DataFrames are rendered in full instead of being truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Jaime's DoGSiteScorer functions

In [55]:
def dogsite_scorer_submit_with_pdbid(pdb_code, chain_id, ligand=''):
    """
    Submit a DoGSiteScorer job to proteins.plus for a structure given by PDB code.

    This is the official API; it only accepts PDB codes, not custom structures.

    Parameters
    ----------
    pdb_code : str
        PDB code of the structure to analyse.
    chain_id : str
        Chain identifier, sent as the ``chain`` field of the job.
    ligand : str, optional
        Sent as the ``ligand`` field of the job (empty string by default).

    Returns
    -------
    str
        The ``location`` entry of the JSON response, i.e. the URL that has to
        be queried for updates on the calculation.

    Raises
    ------
    requests.HTTPError
        If the server answers the submission with an error status.
    """
    job_parameters = {
        "pdbCode": pdb_code,
        "analysisDetail": "1",
        "bindingSitePredictionGranularity": "1",
        "ligand": ligand,
        "chain": chain_id,
    }
    json_headers = {'Content-type': 'application/json', 'Accept': 'application/json'}

    # Submit job to proteins.plus
    response = requests.post(
        "https://proteins.plus/api/dogsite_rest",
        json={"dogsite": job_parameters},
        headers=json_headers,
    )
    response.raise_for_status()

    # The job runs asynchronously: hand back the URL to poll for the result
    return response.json()['location']

In [56]:
def _parse_pocket_center_and_radius(pdb_text):
    """Extract the geometric pocket center and max radius from a residues PDB file."""
    for line in pdb_text.splitlines():
        line = line.strip()
        if line.startswith('HEADER') and 'Geometric pocket center at' in line:
            # Expected shape of the line:
            # HEADER  Geometric pocket center at  13.65  29.95   8.29 with max radius 20.00
            fields = line.split()
            center = [float(x) for x in fields[5:8]]
            radius = float(fields[-1])
            return center, radius
    raise ValueError(
        "Could not find the 'Geometric pocket center at' header in the DoGSiteScorer residues file!"
    )


def dogsite_scorer_guess_binding_site(protein, chain_id='', ligand=''):
    """
    Use proteins.plus' DoGSiteScorer to retrieve most probable binding site in protein.

    Parameters
    ----------
    protein : str
        PDB code (4 characters). Paths to custom PDB files are not supported yet.
    chain_id : str, optional
        Chain identifier forwarded to the job submission. Defaults to an empty
        string. NOTE(review): an empty chain is presumably accepted by the API
        as "no chain restriction" -- confirm against the proteins.plus docs.
    ligand : str, optional
        Ligand field forwarded to the job submission (empty string by default).

    Returns
    -------
    center : list of float
        The three coordinates of the geometric center of the best scored pocket.
    radius : float
        Max radius of that pocket.

    Raises
    ------
    ValueError
        If `protein` is not a 4-character PDB code, if the job reports no
        pocket files, or if the pocket file lacks the geometry header.
    requests.HTTPError
        If submitting or polling the job fails.
    """
    if len(protein) == 4:  # pdb code
        job_location = dogsite_scorer_submit_with_pdbid(protein, chain_id, ligand)
    # TODO: support custom .pdb files once an upload-based submit function exists
    else:
        raise ValueError("`protein` must be a PDB ID or a path to a .pdb file!")

    # Check when the calculation has finished
    while True:
        result = requests.get(job_location)
        result.raise_for_status()  # if it fails, it will stop here
        if result.status_code == 202:  # still running
            time.sleep(5)
            continue
        break

    # the residues files contain the geometric center and radius as a comment in the PDB file
    # first file (residues[0]) is the best scored pocket
    residue_files = result.json()['residues']
    if not residue_files:
        raise ValueError("DoGSiteScorer did not report any pocket for this structure!")
    pdb_residues = requests.get(residue_files[0]).text

    # this is what we need for our Vina calculation
    return _parse_pocket_center_and_radius(pdb_residues)

## Try out API

In [5]:
pdb_code='3w32'

In [6]:
chain_id='A'

### POST request

In [47]:
# Submit a DoGSiteScorer job for `pdb_code` / `chain_id` by hand; the raw
# response `r` is inspected step by step in the following cells.
r = requests.post(
    "https://proteins.plus/api/dogsite_rest",
    headers={'Content-type': 'application/json', 'Accept': 'application/json'},
    json={
        "dogsite": {
            "pdbCode": pdb_code,
            "analysisDetail": "1",
            "bindingSitePredictionGranularity": "1",
            "ligand": "",
            "chain": chain_id,
        },
    },
)

In [48]:
r.raise_for_status()

In [54]:
r.json()['location']

'https://proteins.plus/api/dogsite_rest/Yn2WyiSZp5wPh6oF6TjnKxag'

In [57]:
def dogsite_scorer_post_request(pdb_code, chain):
    """
    Send a DoGSiteScorer job submission and return the raw response.

    The response is returned without any status check, so error answers
    can be inspected by the caller.

    Parameters
    ----------
    pdb_code : str
        PDB code sent as the ``pdbCode`` field.
    chain : str
        Chain identifier sent as the ``chain`` field.

    Returns
    -------
    requests.Response
        The unchecked response of the POST request.
    """
    job_parameters = {
        "pdbCode": pdb_code,
        "analysisDetail": "1",
        "bindingSitePredictionGranularity": "1",
        "ligand": "",
        "chain": chain,
    }
    json_headers = {'Content-type': 'application/json', 'Accept': 'application/json'}
    return requests.post(
        "https://proteins.plus/api/dogsite_rest",
        json={"dogsite": job_parameters},
        headers=json_headers,
    )

In [44]:
r = dogsite_scorer_post_request('3w32', 'B')

In [46]:
r.text

'{"status_code":400,"error":"Bad Request","message":"Invalid chain"}'

### GET request

In [36]:
rr = requests.get("https://proteins.plus/api/dogsite_rest")

In [37]:
rr.raise_for_status()

HTTPError: 404 Client Error: Not Found for url: https://proteins.plus/api/dogsite_rest

### Short intro to classes

In [23]:
class Person:
    """
    A person with a name and an age.

    Parameters
    ----------
    name : str
        Person's name.
    age : int
        Person's age.

    Attributes
    ----------
    name : str
        Person's name, as passed to the constructor.
    age : int
        Person's age, as passed to the constructor.

    """

    def __init__(self, name, age):
        # Store the constructor arguments unchanged on the instance
        self.age = age
        self.name = name

    def is_older_than_30(self):
        """Return True if the person's age is strictly greater than 30."""
        older = self.age > 30
        return older
        

In [24]:
person1 = Person('Abishek', 23)

In [25]:
person1.name

'Abishek'

In [26]:
person1.is_older_than_30()

False

### Get job location

In [5]:
job_location = dogsite_scorer_submit_with_pdbid('3w32', 'A')

In [6]:
job_location

'https://proteins.plus/api/dogsite_rest/Yn2WyiSZp5wPh6oF6TjnKxag'

### Get result

In [7]:
result = requests.get(job_location)

In [8]:
result

<Response [200]>

In [9]:
result.json()

{'status_code': 200,
 'result_table': 'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_desc.txt',
 'residues': ['https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_0_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_0_0_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_0_1_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_0_2_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_0_3_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_0_4_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_0_5_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_1_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_1_0_res.pdb',
  'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_P_1_1_res.pdb',
  'h

In [10]:
result.json().keys()

dict_keys(['status_code', 'result_table', 'residues', 'pockets', 'descriptor_explanation', 'parameters'])

### Check out result table

In [11]:
result.json()['result_table']

'https://proteins.plus/results/dogsite/Yn2WyiSZp5wPh6oF6TjnKxag/3w32_desc.txt'

In [12]:
result_table = requests.get(result.json()['result_table']).text

In [13]:
result_table

'name\tlig_cov\tpoc_cov\tlig_name\tvolume\tenclosure\tsurface\tdepth\tsurf/vol\tlid/hull\tellVol\tell c/a\tell b/a\tsiteAtms\taccept\tdonor\thydrophobic_interactions\thydrophobicity\tmetal\tCs\tNs\tOs\tSs\tXs\tnegAA\tposAA\tpolarAA\tapolarAA\tALA\tARG\tASN\tASP\tCYS\tGLN\tGLU\tGLY\tHIS\tILE\tLEU\tLYS\tMET\tPHE\tPRO\tSER\tTHR\tTRP\tTYR\tVAL\tsimpleScore\tdrugScore\nP_0\t0.00\t0.00\t""\t1422.66\t0.10\t1673.75\t19.26\t1.176493329397045\t-\t-\t0.13\t0.67\t288\t  86\t  40\t  71\t0.36\t   0\t 198\t  45\t  41\t   4\t   0\t0.10\t0.13\t0.24\t0.53\t   4\t   5\t   2\t   5\t   2\t   2\t   1\t   5\t   0\t   3\t  12\t   3\t   2\t   3\t   3\t   1\t   2\t   1\t   1\t   5\t0.63\t0.810023\nP_0_0\t0.00\t0.00\t""\t599.23\t0.06\t540.06\t17.51\t0.9012566126529045\t-\t-\t0.14\t0.22\t131\t  35\t  13\t  25\t0.34\t   0\t  95\t  16\t  17\t   3\t   0\t0.03\t0.10\t0.28\t0.59\t   1\t   2\t   1\t   1\t   2\t   1\t   0\t   2\t   0\t   2\t   7\t   1\t   2\t   2\t   1\t   0\t   2\t   0\t   0\t   2\t0.59\t0.620201\nP_0_

In [14]:
# Split string into list of lists (=table): one inner list per line, one item per
# tab-separated field. `splitlines()` copes with the trailing newline by itself, so
# the last value is never truncated when the text does not end with a newline.
result_table_split = [line.split('\t') for line in result_table.splitlines()]

In [15]:
result_table_split

[['name',
  'lig_cov',
  'poc_cov',
  'lig_name',
  'volume',
  'enclosure',
  'surface',
  'depth',
  'surf/vol',
  'lid/hull',
  'ellVol',
  'ell c/a',
  'ell b/a',
  'siteAtms',
  'accept',
  'donor',
  'hydrophobic_interactions',
  'hydrophobicity',
  'metal',
  'Cs',
  'Ns',
  'Os',
  'Ss',
  'Xs',
  'negAA',
  'posAA',
  'polarAA',
  'apolarAA',
  'ALA',
  'ARG',
  'ASN',
  'ASP',
  'CYS',
  'GLN',
  'GLU',
  'GLY',
  'HIS',
  'ILE',
  'LEU',
  'LYS',
  'MET',
  'PHE',
  'PRO',
  'SER',
  'THR',
  'TRP',
  'TYR',
  'VAL',
  'simpleScore',
  'drugScore'],
 ['P_0',
  '0.00',
  '0.00',
  '""',
  '1422.66',
  '0.10',
  '1673.75',
  '19.26',
  '1.176493329397045',
  '-',
  '-',
  '0.13',
  '0.67',
  '288',
  '  86',
  '  40',
  '  71',
  '0.36',
  '   0',
  ' 198',
  '  45',
  '  41',
  '   4',
  '   0',
  '0.10',
  '0.13',
  '0.24',
  '0.53',
  '   4',
  '   5',
  '   2',
  '   5',
  '   2',
  '   2',
  '   1',
  '   5',
  '   0',
  '   3',
  '  12',
  '   3',
  '   2',
  '   3',
  '

In [16]:
# Remove every space from every cell (padding around the numbers).
# Note: spaces inside header names go away as well, e.g. 'ell c/a' -> 'ellc/a'.
result_table_split = [
    [cell.replace(' ', '') for cell in row]
    for row in result_table_split
]

In [17]:
result_table_split

[['name',
  'lig_cov',
  'poc_cov',
  'lig_name',
  'volume',
  'enclosure',
  'surface',
  'depth',
  'surf/vol',
  'lid/hull',
  'ellVol',
  'ellc/a',
  'ellb/a',
  'siteAtms',
  'accept',
  'donor',
  'hydrophobic_interactions',
  'hydrophobicity',
  'metal',
  'Cs',
  'Ns',
  'Os',
  'Ss',
  'Xs',
  'negAA',
  'posAA',
  'polarAA',
  'apolarAA',
  'ALA',
  'ARG',
  'ASN',
  'ASP',
  'CYS',
  'GLN',
  'GLU',
  'GLY',
  'HIS',
  'ILE',
  'LEU',
  'LYS',
  'MET',
  'PHE',
  'PRO',
  'SER',
  'THR',
  'TRP',
  'TYR',
  'VAL',
  'simpleScore',
  'drugScore'],
 ['P_0',
  '0.00',
  '0.00',
  '""',
  '1422.66',
  '0.10',
  '1673.75',
  '19.26',
  '1.176493329397045',
  '-',
  '-',
  '0.13',
  '0.67',
  '288',
  '86',
  '40',
  '71',
  '0.36',
  '0',
  '198',
  '45',
  '41',
  '4',
  '0',
  '0.10',
  '0.13',
  '0.24',
  '0.53',
  '4',
  '5',
  '2',
  '5',
  '2',
  '2',
  '1',
  '5',
  '0',
  '3',
  '12',
  '3',
  '2',
  '3',
  '3',
  '1',
  '2',
  '1',
  '1',
  '5',
  '0.63',
  '0.810023'],

In [18]:
# First row holds the column names; in every following row the first item
# is the row (index) name and the remaining items are the table body.
column_names = result_table_split[0]
index_names = [row[0] for row in result_table_split[1:]]
table = [row[1:] for row in result_table_split[1:]]

In [19]:
table

[['0.00',
  '0.00',
  '""',
  '1422.66',
  '0.10',
  '1673.75',
  '19.26',
  '1.176493329397045',
  '-',
  '-',
  '0.13',
  '0.67',
  '288',
  '86',
  '40',
  '71',
  '0.36',
  '0',
  '198',
  '45',
  '41',
  '4',
  '0',
  '0.10',
  '0.13',
  '0.24',
  '0.53',
  '4',
  '5',
  '2',
  '5',
  '2',
  '2',
  '1',
  '5',
  '0',
  '3',
  '12',
  '3',
  '2',
  '3',
  '3',
  '1',
  '2',
  '1',
  '1',
  '5',
  '0.63',
  '0.810023'],
 ['0.00',
  '0.00',
  '""',
  '599.23',
  '0.06',
  '540.06',
  '17.51',
  '0.9012566126529045',
  '-',
  '-',
  '0.14',
  '0.22',
  '131',
  '35',
  '13',
  '25',
  '0.34',
  '0',
  '95',
  '16',
  '17',
  '3',
  '0',
  '0.03',
  '0.10',
  '0.28',
  '0.59',
  '1',
  '2',
  '1',
  '1',
  '2',
  '1',
  '0',
  '2',
  '0',
  '2',
  '7',
  '1',
  '2',
  '2',
  '1',
  '0',
  '2',
  '0',
  '0',
  '2',
  '0.59',
  '0.620201'],
 ['0.00',
  '0.00',
  '""',
  '201.73',
  '0.08',
  '381.07',
  '11.36',
  '1.8890100629554356',
  '-',
  '-',
  '0.17',
  '0.25',
  '51',
  '17',
  

In [20]:
# Build the DataFrame from the table body; the first column name belongs to
# the index column and is therefore left out of the column labels.
result_table_df = pd.DataFrame(data=table, index=index_names, columns=column_names[1:])

In [21]:
# Label the index so that the row names are shown under the heading 'name'
result_table_df.index.name = 'name'

In [22]:
result_table_df

Unnamed: 0_level_0,lig_cov,poc_cov,lig_name,volume,enclosure,surface,depth,surf/vol,lid/hull,ellVol,ellc/a,ellb/a,siteAtms,accept,donor,hydrophobic_interactions,hydrophobicity,metal,Cs,Ns,Os,Ss,Xs,negAA,posAA,polarAA,apolarAA,ALA,ARG,ASN,ASP,CYS,GLN,GLU,GLY,HIS,ILE,LEU,LYS,MET,PHE,PRO,SER,THR,TRP,TYR,VAL,simpleScore,drugScore
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
P_0,0.0,0.0,"""""",1422.66,0.1,1673.75,19.26,1.176493329397045,-,-,0.13,0.67,288,86,40,71,0.36,0,198,45,41,4,0,0.1,0.13,0.24,0.53,4,5,2,5,2,2,1,5,0,3,12,3,2,3,3,1,2,1,1,5,0.63,0.810023
P_0_0,0.0,0.0,"""""",599.23,0.06,540.06,17.51,0.9012566126529044,-,-,0.14,0.22,131,35,13,25,0.34,0,95,16,17,3,0,0.03,0.1,0.28,0.59,1,2,1,1,2,1,0,2,0,2,7,1,2,2,1,0,2,0,0,2,0.59,0.620201
P_0_1,0.0,0.0,"""""",201.73,0.08,381.07,11.36,1.889010062955436,-,-,0.17,0.25,51,17,9,10,0.28,0,36,6,7,2,0,0.08,0.17,0.25,0.5,1,1,0,1,1,1,0,0,0,0,3,1,1,0,0,0,1,0,0,1,0.17,0.174816
P_0_2,0.0,0.0,"""""",185.6,0.17,282.0,9.35,1.519396551724138,-,-,0.45,0.55,48,17,8,12,0.32,0,31,8,8,1,0,0.17,0.25,0.08,0.5,0,2,0,1,0,0,1,1,0,0,2,1,1,1,0,0,0,0,0,2,0.13,0.195695
P_0_3,0.0,0.0,"""""",175.3,0.15,297.42,9.29,1.6966343411294922,-,-,0.23,0.37,48,16,8,14,0.37,0,32,8,8,0,0,0.14,0.14,0.36,0.36,1,1,1,2,0,0,0,3,0,0,1,1,0,1,1,1,0,0,0,1,0.13,0.168845
P_0_4,0.0,0.0,"""""",170.37,0.08,390.1,11.99,2.289722368961672,-,-,0.16,0.2,47,14,7,17,0.45,0,34,7,6,0,0,0.0,0.18,0.18,0.64,2,2,0,0,0,1,0,0,0,1,3,0,0,0,0,0,0,1,1,0,0.15,0.223742
P_0_5,0.0,0.0,"""""",90.43,0.24,177.5,6.24,1.9628441888753727,-,-,0.7,0.89,26,8,6,5,0.26,0,16,6,4,0,0,0.12,0.25,0.25,0.38,0,1,1,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,2,0.0,0.165232
P_1,0.0,0.0,"""""",708.99,0.13,1030.19,14.32,1.4530388298847656,-,-,0.14,0.59,140,44,13,34,0.37,0,98,17,25,0,0,0.14,0.11,0.36,0.39,3,1,1,0,0,0,4,4,0,1,4,2,0,1,1,2,2,0,1,1,0.46,0.755915
P_1_0,0.0,0.0,"""""",496.9,0.11,739.17,12.72,1.4875628899174884,-,-,0.14,0.18,103,34,8,22,0.34,0,74,12,17,0,0,0.18,0.09,0.32,0.41,2,1,1,0,0,0,4,1,0,1,4,1,0,0,1,2,2,0,1,1,0.49,0.465489
P_1_1,0.0,0.0,"""""",212.1,0.16,454.31,11.03,2.141961338991042,-,-,0.15,0.34,59,18,7,20,0.44,0,40,7,12,0,0,0.17,0.17,0.33,0.33,2,1,0,0,0,0,2,3,0,0,1,1,0,1,0,0,0,0,1,0,0.19,0.24299


### Check out file with pocket residues

In [23]:
pdb_residues = requests.get(result.json()['residues'][0]).text

In [24]:
pdb_residues

'HEADER\tOutput of DoGSiteScorer by A. Volkamer\nHEADER\tPocket 0 with 288 binding site atoms written.\nHEADER\tReferences: \nHEADER\tA. Volkamer et al. Analyzing the topology of active sites: on the prediction of pockets and subpockets. J. Chem. Inf. Model. 2010,50(11), 2041-52\nHEADER\tA. Volkamer et al. Combining global and local measures for structure-based druggability predictions. J. Chem. Inf. Model. 2012,52,360-372\nHEADER\tGeometric pocket center at  13.65  29.95   8.29 with max radius 20.00\nATOM      2  CA  GLN A 701      -0.291  31.978  -3.835  0.00  0.00           C\nATOM      3  C   GLN A 701       0.946  31.062  -3.957  0.00  0.00           C\nATOM      4  O   GLN A 701       0.876  29.863  -3.659  0.00  0.00           O\nATOM      6  CG  GLN A 701      -2.441  30.619  -3.562  0.00  0.00           C\nATOM     12  C   ALA A 702       3.775  30.322  -3.161  0.00  0.00           C\nATOM     13  O   ALA A 702       3.538  30.924  -2.110  0.00  0.00           O\nATOM     15  

## Try out other things

In [25]:
# Read a JSON file written in pandas' 'split' orientation, i.e. with separate
# entries for the index, the columns and the data.
pd.read_json(
    'data/test.json', 
    orient='split'
)

Unnamed: 0,col 1,col 2
row 1,a,b
row 2,c,d
