Это ноутбук для подготовки датасета из PDBbind (предварительно в директорию /mnt/storage/vladislove2020/masif_na/PN был скачан датасет с белок-нуклеиновыми комплексами за 2020 год с сайта https://pdbbind-plus.org.cn/download)

In [81]:
from typing import List, Set, Dict, Tuple, Optional
from pathlib import Path

import tqdm

import os
import glob

import numpy as np
import pandas as pd

import prody

In [6]:
# Root directory of the downloaded PDBbind protein-nucleic acid (PN) dataset.
SOURCE_PATH = Path('/mnt/storage/vladislove2020/masif_na/PN')

In [9]:
# Show which index files are available in the dataset's 'index' directory.
os.listdir(SOURCE_PATH / 'index')

['2020_index.lst',
 'INDEX_refined_name.2020',
 'INDEX_general_NL.2020',
 'INDEX_general_PN.2020',
 'INDEX_structure.2020',
 'INDEX_general_PL_data.2020',
 'INDEX_refined_set.2020',
 'INDEX_general_PL_name.2020',
 'INDEX_general_PP.2020',
 'INDEX_refined_data.2020',
 'INDEX_general_PL.2020']

In [35]:
# Load the protein-nucleic acid index file as a list of raw lines
# (every line keeps its trailing newline).
index_file = SOURCE_PATH / 'index' / 'INDEX_general_PN.2020'
with index_file.open('r') as handle:
    contents = list(handle)

In [21]:
# Preview the first 12 lines of the index file; the lines already end with
# newlines, so they are joined as-is and no extra newline is appended.
print(''.join(contents[:12]), end='')

# List of protein-nucleic acid complexes with binding data in PDBbind v.2020
# 1052 complexes in total, sorted by their release year
# Latest update: July 2021
# PDB code, resolution, release year, binding data, reference, ligand name
1hvo   NMR  1994  Kd=11uM       // 1hvn.pdf (DNA) Zn(HIV1-Fl):d(TTTGTTT)
1hvn   NMR  1994  Kd=5uM        // 1hvn.pdf (DNA) Zn(HIV1-Fl):d(ACGCC)
1ytf  2.50  1996  Kd=20pM       // 1ytf.pdf (YEAST TFIIA) YEAST TFIIA/TBP/DNA COMPLEX, Kd=20pM
1wet  2.60  1997  Kd=1.5uM      // 1wet.pdf (DNA OPERATOR) THE PURR-GUANINE-PURF OPERATOR ternary complex
1ign  2.25  1997  Kd=13pM       // 1ign.pdf (DNA) The specific Kd for a ribosomal gene binding site
1bdi  3.00  1997  Kd=2.5nM      // 1bdi.pdf (HPA) Kd is for DNA and protein, Kd is 900nM for protein K55A (from other references)


In [19]:
# Column names are taken from the comma-separated header comment stored at
# index 4 of the file's lines.
header_line = contents[4]
columns = list(map(str.strip, header_line.split(', ')))
columns

['# PDB code',
 'resolution',
 'release year',
 'binding data',
 'reference',
 'ligand name']

In [36]:
# Parse every data line into a {column name: value} record.
#
# A data line has whitespace-separated fixed fields, then '//', then a
# free-text tail, e.g.
#     1hvo   NMR  1994  Kd=11uM       // 1hvn.pdf (DNA) Zn(HIV1-Fl):d(TTTGTTT)
# The line is partitioned on '//' first so that the free-text tail stays in
# one piece even when it contains consecutive spaces; only the fixed fields
# are split on whitespace.
contents_processed = []

for line in contents[6:]:
    # Ignore blank lines and stray comment lines instead of turning them
    # into bogus records.
    if not line.strip() or line.startswith('#'):
        continue

    fixed_part, separator, free_text = line.partition('//')
    values = fixed_part.split()
    if separator:
        # The whole tail ("<file>.pdf <description>\n") goes into the next
        # column ('reference'); the trailing newline is deliberately kept,
        # it is split and stripped in later steps.
        values.append(free_text.lstrip(' '))

    # zip() stops at the shorter sequence, so columns without a value are
    # simply absent from the record.
    contents_processed.append(dict(zip(columns, values)))

In [37]:
# Build a table with one row per parsed index record and display it.
df = pd.DataFrame(contents_processed)
df

Unnamed: 0,# PDB code,resolution,release year,binding data,reference,ligand name
0,1hvo,NMR,1994,Kd=11uM,1hvn.pdf (DNA) Zn(HIV1-Fl):d(TTTGTTT)\n,
1,1hvn,NMR,1994,Kd=5uM,1hvn.pdf (DNA) Zn(HIV1-Fl):d(ACGCC)\n,
2,1ytf,2.50,1996,Kd=20pM,1ytf.pdf (YEAST TFIIA) YEAST TFIIA/TBP/DNA COM...,
3,1wet,2.60,1997,Kd=1.5uM,1wet.pdf (DNA OPERATOR) THE PURR-GUANINE-PURF ...,
4,1ign,2.25,1997,Kd=13pM,1ign.pdf (DNA) The specific Kd for a ribosomal...,
...,...,...,...,...,...,...
1047,6o16,2.88,2019,Kd=9.83nM,6o16.pdf (10-mer)\n,
1048,6on0,1.60,2019,Kd=90nM,6on0.pdf (17-mer)\n,
1049,6qfd,2.13,2019,Kd=91.59nM,6qfd.pdf (28-mer) SELEX experiment\n,
1050,6qh0,2.44,2019,Kd=99.46nM,6qfd.pdf (28-mer) SELEX experiment\n,


In [38]:
# The raw 'reference' value has the form "<file>.pdf <free-text description>".
# Split it once at the first space: the head stays in 'reference', the rest
# becomes 'ligand name'.
reference_pieces = df['reference'].apply(lambda s: s.split(' ', 1))
df['ligand name'] = reference_pieces.apply(lambda pieces: pieces[1])
df['reference'] = reference_pieces.apply(lambda pieces: pieces[0])
df

Unnamed: 0,# PDB code,resolution,release year,binding data,reference,ligand name
0,1hvo,NMR,1994,Kd=11uM,1hvn.pdf,(DNA) Zn(HIV1-Fl):d(TTTGTTT)\n
1,1hvn,NMR,1994,Kd=5uM,1hvn.pdf,(DNA) Zn(HIV1-Fl):d(ACGCC)\n
2,1ytf,2.50,1996,Kd=20pM,1ytf.pdf,"(YEAST TFIIA) YEAST TFIIA/TBP/DNA COMPLEX, Kd=..."
3,1wet,2.60,1997,Kd=1.5uM,1wet.pdf,(DNA OPERATOR) THE PURR-GUANINE-PURF OPERATOR ...
4,1ign,2.25,1997,Kd=13pM,1ign.pdf,(DNA) The specific Kd for a ribosomal gene bin...
...,...,...,...,...,...,...
1047,6o16,2.88,2019,Kd=9.83nM,6o16.pdf,(10-mer)\n
1048,6on0,1.60,2019,Kd=90nM,6on0.pdf,(17-mer)\n
1049,6qfd,2.13,2019,Kd=91.59nM,6qfd.pdf,(28-mer) SELEX experiment\n
1050,6qh0,2.44,2019,Kd=99.46nM,6qfd.pdf,(28-mer) SELEX experiment\n


In [51]:
def split_relation(text):
    """Split a binding-data string such as ``'Kd=11uM'`` into its parts.

    Args:
        text: String of the form ``<constant><relation><number><units>``,
            e.g. ``'Kd=11uM'``, ``'Ki<=1.5nM'`` or ``'Kd~20pM'``.

    Returns:
        A tuple ``(const, rel, value, units)`` where ``const`` is everything
        before the first of ``= ~ < >``, ``rel`` is the run of characters from
        there up to the first digit, ``value`` is the number as a ``float``
        (a decimal comma is accepted and treated as a decimal point) and
        ``units`` is whatever follows the number (possibly empty).

    Raises:
        ValueError: If ``text`` contains no relation operator or no numeric
            value, or if the numeric part is not a valid float.
    """
    length = len(text)

    # Constant name: everything up to the first relation character.
    rel_start = 0
    while rel_start < length and text[rel_start] not in '=~<>':
        rel_start += 1

    # Relation: everything from there up to the first digit.
    val_start = rel_start
    while val_start < length and not text[val_start].isdigit():
        val_start += 1

    # Numeric value: digits plus decimal separators. The bounds check lets
    # the string end right after the number (no unit suffix).
    val_end = val_start
    while val_end < length and (text[val_end].isdigit() or text[val_end] in '.,'):
        val_end += 1

    const = text[:rel_start]
    rel = text[rel_start:val_start]
    val = text[val_start:val_end].replace(',', '.')
    units = text[val_end:]

    if not rel or not val:
        raise ValueError(
            f'Cannot parse binding data {text!r}: '
            'expected <constant><relation><number><units>'
        )

    return const, rel, float(val), units

In [53]:
# Parse every 'binding data' string and spread the resulting 4-tuples over
# four new columns.
parsed_binding = df['binding data'].apply(split_relation)
const_col, relation_col, value_col, units_col = zip(*parsed_binding)
df['const'] = const_col
df['relation'] = relation_col
df['value'] = value_col
df['units'] = units_col
df

Unnamed: 0,# PDB code,resolution,release year,binding data,reference,ligand name,const,relation,value,units
0,1hvo,NMR,1994,Kd=11uM,1hvn.pdf,(DNA) Zn(HIV1-Fl):d(TTTGTTT)\n,Kd,=,11.00,uM
1,1hvn,NMR,1994,Kd=5uM,1hvn.pdf,(DNA) Zn(HIV1-Fl):d(ACGCC)\n,Kd,=,5.00,uM
2,1ytf,2.50,1996,Kd=20pM,1ytf.pdf,"(YEAST TFIIA) YEAST TFIIA/TBP/DNA COMPLEX, Kd=...",Kd,=,20.00,pM
3,1wet,2.60,1997,Kd=1.5uM,1wet.pdf,(DNA OPERATOR) THE PURR-GUANINE-PURF OPERATOR ...,Kd,=,1.50,uM
4,1ign,2.25,1997,Kd=13pM,1ign.pdf,(DNA) The specific Kd for a ribosomal gene bin...,Kd,=,13.00,pM
...,...,...,...,...,...,...,...,...,...,...
1047,6o16,2.88,2019,Kd=9.83nM,6o16.pdf,(10-mer)\n,Kd,=,9.83,nM
1048,6on0,1.60,2019,Kd=90nM,6on0.pdf,(17-mer)\n,Kd,=,90.00,nM
1049,6qfd,2.13,2019,Kd=91.59nM,6qfd.pdf,(28-mer) SELEX experiment\n,Kd,=,91.59,nM
1050,6qh0,2.44,2019,Kd=99.46nM,6qfd.pdf,(28-mer) SELEX experiment\n,Kd,=,99.46,nM


In [55]:
# Check which unit suffixes occur, so the conversion table can cover them all.
df['units'].unique()

array(['uM', 'pM', 'nM', 'fM', 'mM'], dtype=object)

In [57]:
# Multiplier for each concentration unit suffix found in the index; used to
# express binding values in molar (M).
powers_mapping = dict(mM=1e-3, uM=1e-6, nM=1e-9, pM=1e-12, fM=1e-15)

In [60]:
# Assemble the cleaned table with normalized column names.
# Rows whose 'resolution' field reads 'NMR' (case-insensitive) have no
# numeric resolution.
is_nmr = df['resolution'].str.lower() == 'nmr'

df_new = pd.DataFrame()
df_new['pdb_id'] = df['# PDB code']
df_new['nmr'] = is_nmr
# Blank out NMR entries (-> NaN) and convert the remaining text values to
# float, so the column is numeric rather than a mix of strings and NaN.
df_new['resolution'] = df['resolution'].mask(is_nmr).astype(float)
df_new['binding_const'] = df['const']
df_new['binding_relation'] = df['relation']
# Convert every value to molar. The explicit dict lookup raises KeyError on
# an unknown unit instead of silently producing a missing value.
df_new['binding_value_M'] = [
    value * powers_mapping[unit]
    for value, unit in zip(df['value'], df['units'])
]
# The free-text description still carries the trailing newline from the
# index file; strip surrounding whitespace here.
df_new['ligand_info'] = df['ligand name'].str.strip()
df_new

Unnamed: 0,pdb_id,nmr,resolution,binding_const,binding_relation,binding_value_M,ligand_info
0,1hvo,True,,Kd,=,1.100000e-05,(DNA) Zn(HIV1-Fl):d(TTTGTTT)
1,1hvn,True,,Kd,=,5.000000e-06,(DNA) Zn(HIV1-Fl):d(ACGCC)
2,1ytf,False,2.50,Kd,=,2.000000e-11,"(YEAST TFIIA) YEAST TFIIA/TBP/DNA COMPLEX, Kd=..."
3,1wet,False,2.60,Kd,=,1.500000e-06,(DNA OPERATOR) THE PURR-GUANINE-PURF OPERATOR ...
4,1ign,False,2.25,Kd,=,1.300000e-11,(DNA) The specific Kd for a ribosomal gene bin...
...,...,...,...,...,...,...,...
1047,6o16,False,2.88,Kd,=,9.830000e-09,(10-mer)
1048,6on0,False,1.60,Kd,=,9.000000e-08,(17-mer)
1049,6qfd,False,2.13,Kd,=,9.159000e-08,(28-mer) SELEX experiment
1050,6qh0,False,2.44,Kd,=,9.946000e-08,(28-mer) SELEX experiment


In [82]:
# Fetch the structure of every complex into ./structures.
# prody.parsePDB is called with a bare PDB id from inside that directory, so
# the working directory is switched temporarily.
os.makedirs('structures', exist_ok=True)
previous_cwd = os.getcwd()
os.chdir('structures')
try:
    for pdb_id in tqdm.tqdm(df_new['pdb_id']):
        prody.parsePDB(pdb_id)
finally:
    # Always return to the original directory, even if a download or parse
    # fails; otherwise re-running this cell would create a nested
    # structures/structures directory.
    os.chdir(previous_cwd)

100%|██████████| 1052/1052 [23:23<00:00,  1.33s/it]
