In [1]:
"""Here, we scrape PSCDB (protein Structural Change Database http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/index.html"""

'Here, we scrape PSCDB (protein Structural Change Database http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/index.html'

In [2]:
import pandas as pd
import rdkit
from Bio.PDB import *

ModuleNotFoundError: No module named 'rdkit'

In [3]:
coupled_domain_motion = pd.read_html("http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/cd.html", header=0)[0]
coupled_domain_motion['motion_type'] = 'coupled_domain_motion'

independent_domain_motion = pd.read_html("http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/id.html", header=0)[0]
independent_domain_motion['motion_type'] = 'independent_domain_motion'

coupled_local_motion = pd.read_html("http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/cl.html", header=0)[0]
coupled_local_motion['motion_type'] = 'coupled_local_motion'

independent_local_motion = pd.read_html("http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/il.html", header=0)[0]
independent_local_motion['motion_type'] = 'independent_local_motion'

burying_ligand_motion = pd.read_html("http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/b.html", header=0)[0]
burying_ligand_motion['motion_type'] = 'burying_ligand_motion'

no_significant_motion = pd.read_html("http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/n.html", header=0)[0]
no_significant_motion['motion_type'] = 'no_significant_motion'

other_motion = pd.read_html("http://idp1.force.cs.is.nagoya-u.ac.jp/pscdb/b.html", header=0)[0]
other_motion['motion_type'] = 'other_motion'

structural_rearrangement_data = pd.concat([coupled_domain_motion, independent_domain_motion,
                                           coupled_local_motion, independent_local_motion,
                                           burying_ligand_motion, no_significant_motion, other_motion]).reset_index()
print(structural_rearrangement_data)

     index  PSCID                      Protein Name Free form Bound form  \
0        0   CD.1  HYPOTHETICAL OXIDOREDUCTASE YIAK   1nxu_AB    1s20_AB   
1        1   CD.2                  ADENYLATE KINASE    4ake_A     2eck_A   
2        2   CD.3                       GLUCOKINASE   1q18_AB    1sz2_AB   
3        3   CD.4                       LACTOFERRIN    1lfh_A     1lfi_A   
4        4   CD.5               ELONGATION FACTOR 2    1n0v_D     1n0u_A   
..     ...    ...                               ...       ...        ...   
903     99  B.100   325AA LONG HYPOTHETICAL PROTEIN   2dec_AB    2df8_AB   
904    100  B.101                         LIPOCALIN    3bu9_A     3bu1_A   
905    101  B.102      AECTYLCITRULLINE DEACETYLASE   2f8h_AA    2f7v_AA   
906    102  B.103                 LIPOYLTRANSFERASE    2qht_A     2qhv_A   
907    103  B.104      RIBOSE 5-PHOSPHATE ISOMERASE   1uj4_AA    1uj5_AA   

         Ligands  Classification(?)            motion_type  
0    2xNAD,2xTLA          

In [4]:
# Split into chains
free_structures = structural_rearrangement_data['Free form'].str.split('_', expand=True)
bound_structures = structural_rearrangement_data['Bound form'].str.split('_', expand=True)

# Add columns
structural_rearrangement_data['Free PDB'] = free_structures[0]
structural_rearrangement_data['Free Chains'] = free_structures[1]
structural_rearrangement_data['Bound PDB'] = bound_structures[0]
structural_rearrangement_data['Bound Chains'] = bound_structures[1]

# Drop obsolete structures
obsolete = ['1m80', '1cmw', '1g40', '2ihi', '1hl0', '2gkq', '2glb', '2g2j', '2dpo', '2h98', '2gu9', '2bg1', '1q4o', '1il5', '3cey', '1yks']

structural_rearrangement_data = structural_rearrangement_data.loc[~structural_rearrangement_data['Free PDB'].isin(obsolete)]
structural_rearrangement_data = structural_rearrangement_data.loc[~structural_rearrangement_data['Bound PDB'].isin(obsolete)]

print(structural_rearrangement_data)
#structural_rearrangement_data.reset_index(inplace=True)
structural_rearrangement_data.to_csv('structural_rearrangement_data.csv')

     index  PSCID                      Protein Name Free form Bound form  \
0        0   CD.1  HYPOTHETICAL OXIDOREDUCTASE YIAK   1nxu_AB    1s20_AB   
1        1   CD.2                  ADENYLATE KINASE    4ake_A     2eck_A   
2        2   CD.3                       GLUCOKINASE   1q18_AB    1sz2_AB   
3        3   CD.4                       LACTOFERRIN    1lfh_A     1lfi_A   
4        4   CD.5               ELONGATION FACTOR 2    1n0v_D     1n0u_A   
..     ...    ...                               ...       ...        ...   
903     99  B.100   325AA LONG HYPOTHETICAL PROTEIN   2dec_AB    2df8_AB   
904    100  B.101                         LIPOCALIN    3bu9_A     3bu1_A   
905    101  B.102      AECTYLCITRULLINE DEACETYLASE   2f8h_AA    2f7v_AA   
906    102  B.103                 LIPOYLTRANSFERASE    2qht_A     2qhv_A   
907    103  B.104      RIBOSE 5-PHOSPHATE ISOMERASE   1uj4_AA    1uj5_AA   

         Ligands  Classification(?)            motion_type Free PDB  \
0    2xNAD,2xTLA

In [5]:
# Download PDBs

In [6]:
pdbl = PDBList()
pdb_list = pd.concat([structural_rearrangement_data['Free PDB'], structural_rearrangement_data['Bound PDB']]).unique()

pdbl.download_pdb_files(pdb_list, obsolete=False, pdir='pdbs/', file_format='pdb', overwrite=True)

#Rename PDB files from .ent to .pdb\n
!cd pdbs/; for f in *.ent; do mv -- "$f" "${f%.ent}.pdb"; done
# Remove pdb prefix from PDB file names (e.g. pdb20jj.ent -> 20jj.pdb)\n",
!cd pdbs/; for x in *; do mv $x `echo $x | cut -c 4-`; done

NameError: name 'PDBList' is not defined

In [7]:
# Download Ligands

In [8]:
## Make Graphs
import os

In [9]:
structural_rearrangement_data['Free PDB'].loc[structural_rearrangement_data['Free PDB'].isin(f)]

NameError: name 'f' is not defined

In [10]:
f = [f[:-4] for f in os.listdir('pdbs')]
structural_rearrangement_data['Free PDB'].loc[~structural_rearrangement_data['Bound PDB'].isin(f)]

Series([], Name: Free PDB, dtype: object)