In [2]:
import concurrent.futures
import logging
import os
import pickle
import subprocess
import tempfile
import urllib.request
from collections import defaultdict

import prody
import numpy as np
import requests
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score
from sklearn.model_selection import KFold, GroupShuffleSplit
from tqdm import tqdm


# Logger for this module/notebook, registered under the current ``__name__``.
logger = logging.getLogger(__name__)

In [2]:
class ToughM1:
    """
    TOUGH-M1 dataset by Govindaraj and Brylinski
    https://osf.io/6ngbs/wiki/home/

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Root directory that contains the ``TOUGH-M1`` folder. When omitted,
        the value of the ``DATA_DIR`` environment variable is used.

    Attributes
    ----------
    tough_data_dir : str
        ``<data_dir>/TOUGH-M1``.

    Raises
    ------
    RuntimeError
        If ``data_dir`` is not given and ``DATA_DIR`` is not set.
    """
    def __init__(self, data_dir=None):
        if data_dir is None:
            data_dir = os.environ.get('DATA_DIR')
        if data_dir is None:
            # Without this guard os.path.join(None, ...) fails with an opaque
            # "expected str, bytes or os.PathLike object, not NoneType".
            raise RuntimeError(
                "The DATA_DIR environment variable is not set; set it or pass "
                "data_dir explicitly to locate the TOUGH-M1 dataset."
            )
        self.tough_data_dir = os.path.join(data_dir, 'TOUGH-M1')

    
    


In [10]:

# Names of all entries directly under the TOUGH-M1 data folder.
# NOTE(review): entries appear to be named <4-char PDB id><chain letter>
# (e.g. "11asA") — confirm there are no stray files in this directory.
directory_names = os.listdir('../../TOUGH-M1/data/')

# Pair the first four characters of each entry name with its last character.
subdirectory_tuples = []
for entry in directory_names:
    subdirectory_tuples.append((entry[:4], entry[-1]))

print(subdirectory_tuples)


[('1wnb', 'C'), ('5hws', 'D'), ('4i8q', 'A'), ('1jgi', 'A'), ('3r1r', 'C'), ('4y9d', 'A'), ('1fnz', 'A'), ('4b2y', 'C'), ('1dmy', 'B'), ('3ryr', 'B'), ('3nfz', 'A'), ('4dq6', 'A'), ('2y55', 'C'), ('4mm0', 'A'), ('3a5z', 'G'), ('4d6e', 'A'), ('2dlc', 'B'), ('4iqg', 'C'), ('1w5e', 'A'), ('4nz3', 'A'), ('4pw3', 'D'), ('2c40', 'A'), ('1jb9', 'A'), ('4jr0', 'B'), ('4q42', 'B'), ('4rk1', 'C'), ('3g00', 'A'), ('1h4x', 'B'), ('4oxd', 'A'), ('3sz3', 'A'), ('1v72', 'A'), ('3ixl', 'A'), ('2qcd', 'B'), ('1vbp', 'A'), ('4rsl', 'A'), ('2a73', 'B'), ('3tsd', 'A'), ('1jjv', 'A'), ('2z6m', 'F'), ('1p42', 'B'), ('1dap', 'A'), ('3r77', 'B'), ('3fbu', 'A'), ('5ees', 'A'), ('3lv1', 'B'), ('3rpd', 'B'), ('2zof', 'B'), ('2btd', 'A'), ('5dj4', 'E'), ('4gqt', 'A'), ('3k0z', 'B'), ('4m0r', 'B'), ('4jix', 'A'), ('3hmz', 'A'), ('2cm4', 'A'), ('3oa2', 'B'), ('2afx', 'B'), ('3ipc', 'A'), ('1t1n', 'A'), ('4cki', 'A'), ('2wa2', 'B'), ('4ysh', 'A'), ('1a4e', 'D'), ('3hwk', 'E'), ('3vof', 'A'), ('3clh', 'B'), ('2p0w', 

In [11]:
# Gather every distinct element symbol that occurs in any pocket structure.
element_set = set()

for name in tqdm(directory_names):
    # Each entry is expected to hold a "<name>_pocket.pdb" file.
    pocket = prody.parsePDB(f'../../TOUGH-M1/data/{name}/{name}_pocket.pdb')
    element_set.update(pocket.getElements())

  0%|          | 0/7524 [00:00<?, ?it/s]@> 331 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 260 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 603 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 549 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 136 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 227 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 72 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 240 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 142 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 152 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 178 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 143 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 184 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 365 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 361 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 227 atoms and 1 coordinate set(s) were parsed in 0.00s.
@> 139 atoms and 

In [15]:
print(element_set)

# Look at one example pocket: which residue names occur in it, and how many
# distinct ones there are.
protein = prody.parsePDB('../../TOUGH-M1/data/11asA/11asA_pocket.pdb')
residue_names = set(protein.getResnames())

print(residue_names)
print(len(residue_names))

@> 357 atoms and 1 coordinate set(s) were parsed in 0.00s.


{'N', 'C', 'S', 'O'}
{'ILE', 'GLU', 'ALA', 'SER', 'ASP', 'PRO', 'LYS', 'VAL', 'GLY', 'ARG', 'GLN', 'TYR', 'HIS', 'LEU'}
14


In [11]:
import pandas as pd
import csv

# CSV with one row per sequence entry; only the 'name' and 'clust30' columns
# are read here.
path = '../../KinomeScripts/paper_data/pdb_sequence_info.csv'

# newline='' lets the csv module do its own line-ending handling, which is
# required for quoted fields containing embedded newlines to parse correctly.
with open(path, mode='r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # List of (name, clust30) pairs, in file order.
    csv_data = [(row['name'], row['clust30']) for row in reader]

def chain_exists_in_name(pdb_id, chain, name):
    """Check whether a CSV ``name`` refers to the given PDB id and chain.

    The test is purely textual and case-sensitive: ``pdb_id`` must occur
    anywhere in ``name``, and ``chain`` must occur in the second
    underscore-separated field of ``name``.

    Parameters
    ----------
    pdb_id : str
        PDB identifier to look for.
    chain : str
        Chain identifier to look for.
    name : str
        Entry name from the CSV table.

    Returns
    -------
    bool
        True if both conditions hold. False otherwise, including when
        ``name`` has no underscore (previously this raised ``IndexError``).

    Notes
    -----
    NOTE(review): both checks are substring matches, so unrelated names that
    merely contain the id text can match, and ids differing only in letter
    case never match. Confirm the exact format of ``name`` and tighten this
    if needed.
    """
    if pdb_id not in name:
        return False
    fields = name.split('_')
    if len(fields) < 2:
        # No chain field present, so there is nothing to match against.
        return False
    return chain in fields[1]

# seq: (pdb_id + chain, clust30) for every entry with a matching CSV row.
# red: pdb_id + chain for every entry without any matching CSV row.
seq, red = [], []

for pdb_id, chain in tqdm(subdirectory_tuples):
    entry_id = pdb_id + chain
    for name, clust30 in csv_data:
        if chain_exists_in_name(pdb_id, chain, name):
            # Only the first matching row is recorded.
            seq.append((entry_id, clust30))
            break
    else:
        # Inner loop finished without a break: no CSV row matched.
        red.append(entry_id)

100%|██████████| 7525/7525 [02:22<00:00, 52.71it/s]

[('3i6iA', '662'), ('1h3hA', '662'), ('2h3hA', '22761'), ('3l4lA', '662'), ('1v4xA', '662'), ('3i3lA', '662'), ('3d3fB', '662'), ('4i1iA', '22761'), ('3e4eA', '1050'), ('3k3oA', '662'), ('4b4dA', '21'), ('1h2hA', '22761'), ('2b3bC', '22761'), ('1v4xB', '662'), ('1h4hC', '662'), ('1e3eA', '1050'), ('4n5nA', '1050'), ('3f8fA', '1050'), ('3p4pE', '1050')]





In [14]:
# Sizes of: matched entries, unmatched entries, all entries.
print(*map(len, (seq, red, subdirectory_tuples)))

19 7506 7525
