# Convert pdb using ProDy (failed..)

In [1]:
from prody import parseMMCIF, parsePDB, writePDB
import numpy as np

In [2]:
def mmcif2pdb(pdb_id: str, chain: str, output_path: str = None):
    mol = parseMMCIF(pdb_id)
    chids = mol.getChids()
    sel_mol = mol.select('chain {}'.format(chain[: 2]))
    assert sel_mol is not None
    if output_path is None:
        output_path = pdb_id + '_' + chain + '.pdb'
    if len(chain) > 1:
        sel_mol.setChids(chain[0])
    writePDB(output_path, sel_mol)
    with open(output_path, 'r') as f:
        lines = f.readlines()
    with open(output_path, 'w') as f:
        def del_res_prefix(line):
            if line[: 4] == 'ATOM' or line[: 6] == 'HETATM':
                line_str_list = list(line)
                line_str_list[16] = ' '
                line = ''.join(line_str_list)
            return line

        lines = list(map(lambda line: del_res_prefix(line), lines))
        f.writelines(lines)
    return sel_mol, parsePDB(output_path)

In [11]:
# チェイン名が1文字の場合
pdb_id = '3j7p'
chain = 'A'
mmcif2pdb(pdb_id, chain)

@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 3j7p downloaded (3j7p.cif)
@> PDB download via FTP completed (1 downloaded, 0 failed).
@> 221686 atoms and 1 coordinate set(s) were parsed in 2.21s.
@> 1868 atoms and 1 coordinate set(s) were parsed in 0.01s.


(<Selection: 'chain A' from 3j7p (1868 atoms)>,
 <AtomGroup: 3j7p_A (1868 atoms)>)

In [13]:
# チェイン名が1文字の場合
pdb_id = '3j7p'
chain = 'S'
mmcif2pdb(pdb_id, chain)

@> 221686 atoms and 1 coordinate set(s) were parsed in 2.14s.
@> 1454 atoms and 1 coordinate set(s) were parsed in 0.01s.


(<Selection: 'chain S' from 3j7p (1454 atoms)>,
 <AtomGroup: 3j7p_S (1454 atoms)>)

In [14]:
mol = parseMMCIF('3j7p')
mol.select('chain 5')

@> 221686 atoms and 1 coordinate set(s) were parsed in 2.14s.


<Selection: 'chain 5' from 3j7p (78604 atoms)>

In [12]:
# チェイン名が1文字の場合 かつ　数字の場合　バグがある
pdb_id = '3j7p'
chain = '5'
mmcif2pdb(pdb_id, chain)

@> 221686 atoms and 1 coordinate set(s) were parsed in 2.18s.


PDBParseError: invalid or missing coordinate(s) at line 2

In [7]:
# チェイン名が3文字だが，2文字までしかロードされていない．
mol = parseMMCIF('6zzr')
mol.getChids()

@> 20901 atoms and 1 coordinate set(s) were parsed in 0.22s.


array(['AA', 'AA', 'AA', ..., 'HH', 'HH', 'HH'], dtype='<U2')

In [8]:
pdb_id = '6zzr'
chain = 'AAA'
sel_mol, pdb_mol = mmcif2pdb(pdb_id, chain)

@> 20901 atoms and 1 coordinate set(s) were parsed in 0.22s.
@> 2667 atoms and 1 coordinate set(s) were parsed in 0.01s.


In [6]:
pdb_id = '1vvj'
chain = 'QQ'
sel_mol, pdb_mol = mmcif2pdb(pdb_id, chain)

@> 291123 atoms and 1 coordinate set(s) were parsed in 2.90s.
@> 834 atoms and 1 coordinate set(s) were parsed in 0.00s.


In [9]:
# チェイン名が2文字の場合
# チェイン名のせいで出力pdbの列がずれてしまうため，parseできない(修正済み)
mmcif2pdb('4v52', 'BC')

@> 284172 atoms and 1 coordinate set(s) were parsed in 2.72s.
@> 2087 atoms and 1 coordinate set(s) were parsed in 0.01s.


(<Selection: 'chain BC' from 4v52 (2087 atoms)>,
 <AtomGroup: 4v52_BC (2087 atoms)>)

In [10]:
mmcif2pdb('3jai', 'SS')

@> 226453 atoms and 1 coordinate set(s) were parsed in 2.16s.
@> 1139 atoms and 1 coordinate set(s) were parsed in 0.01s.


(<Selection: 'chain SS' from 3jai (1139 atoms)>,
 <AtomGroup: 3jai_SS (1139 atoms)>)

# Convert pdb using biopython

In [15]:
from Bio.PDB import MMCIFParser, PDBParser, PDBIO, Select

In [16]:
parser = MMCIFParser()

In [17]:
structure = parser.get_structure('6zzr', '6zzr.cif')



In [18]:
structure.header

{'name': 'The Crystal Structure of human LDHA from Wuxi Biortus.',
 'head': 'OXIDOREDUCTASE',
 'idcode': '6ZZR',
 'deposition_date': '2020-08-05',
 'structure_method': 'X-RAY DIFFRACTION',
 'resolution': 2.65}

In [19]:
io = PDBIO()
io.set_structure(structure)

In [48]:
class ChainSelect(Select):
    def __init__(self, chain_name):
        self.chain_name = chain_name
    def accept_chain(self, chain):
        print(type(chain))
        if chain.get_id() == self.chain_name:
            return 1
        else:
            return 0

In [15]:
structure.get_chains()

<generator object Structure.get_chains at 0x125f58dd0>

In [49]:
io.save('6zzr_A_bio.pdb', ChainSelect('AAA'))

<class 'Bio.PDB.Chain.Chain'>


TypeError: %c requires int or char

In [15]:
list(structure.get_chains())

[<Chain id=AAA>,
 <Chain id=BBB>,
 <Chain id=CCC>,
 <Chain id=DDD>,
 <Chain id=EEE>,
 <Chain id=FFF>,
 <Chain id=GGG>,
 <Chain id=HHH>]

In [1]:
import urllib

rcsb_download_url = 'https://files.rcsb.org/download/'

def download_file_from_url(url: str, output_path: str):
    urllib.request.urlretrieve(url, output_path)

def download_pdb(pdb_id: str, output_path: str = None):
    pdb_url = rcsb_download_url + pdb_id.upper() + '.pdb'
    if output_path is None:
        output_path = pdb_id.upper() + '.pdb'
    download_file_from_url(pdb_url, output_path)

def download_mmcif(pdb_id: str, output_path: str = None):
    mmcif_url = rcsb_download_url + pdb_id.upper() + '.cif.gz'
    if output_path is None:
        output_path = pdb_id.upper() + '.cif.gz'
    download_file_from_url(mmcif_url, output_path)

In [4]:
import urllib
import gzip
rcsb_download_url = 'https://files.rcsb.org/download/'

def download_and_decompress_file_from_url(url: str, output_path: str):
    urllib.request.urlretrieve(url, output_path)
    with gzip.open(output_path, 'rt') as f:
        lines = f.readlines()
    with open(output_path, 'w') as f:
        f.writelines(lines)

def download_mmcif(pdb_id,output_path = None):
    mmcif_url = rcsb_download_url + pdb_id.upper() + '.cif.gz'
    if output_path is None:
        output_path = pdb_id.upper() + '.cif'
    download_and_decompress_file_from_url(mmcif_url, output_path)

In [45]:
from Bio.PDB import MMCIFParser, PDBIO, Select, PDBExceptions
from prody import parsePDB
import warnings
warnings.simplefilter('ignore', PDBExceptions.PDBConstructionWarning)

class ChainSelect(Select):
    def __init__(self, chain_name):
        self.chain_name = chain_name

    def accept_chain(self, chain):
        print(type(chain))
        if chain.get_id() == self.chain_name:
            return 1
        else:
            return 0

import copy

def biopython_mmcif2pdb(pdb_id, chain):
    out_path = 'bio_' + pdb_id + '_' + chain + '.pdb'
    parser = MMCIFParser()
    structure = parser.get_structure("", pdb_id + '.cif')
    sel_structure = structure[0][chain]
    if len(chain) > 1:
        if chain[0] in [chain.id for chain in structure.get_chains()]:
            structure[0][chain[0]].id = None
        sel_structure.id = chain[0]
    io = PDBIO()
    io.set_structure(sel_structure)
    io.save(out_path)
    return parsePDB(out_path)

In [46]:
type(structure)

Bio.PDB.Structure.Structure

In [47]:
type(structure[0])

Bio.PDB.Model.Model

In [27]:
parser = MMCIFParser()
structure = parser.get_structure('', '5MYJ.cif')

In [10]:
download_mmcif('5MYJ')

In [42]:
biopython_mmcif2pdb('5MYJ', 'AQ')

@> 675 atoms and 1 coordinate set(s) were parsed in 0.00s.


<AtomGroup: bio_5MYJ_AQ (675 atoms)>

In [19]:
download_mmcif('6V39')

In [21]:
biopython_mmcif2pdb('6V39', 'q')

@> 631 atoms and 1 coordinate set(s) were parsed in 0.00s.


<AtomGroup: bio_6V39_q (631 atoms)>

In [43]:
mol = biopython_mmcif2pdb('1VVJ', 'QQ')

@> 834 atoms and 1 coordinate set(s) were parsed in 0.01s.


In [9]:
path = 'bio_1VVJ_QQ.pdb'
parsePDB(path, chain='Q')

@> 834 atoms and 1 coordinate set(s) were parsed in 0.02s.


<AtomGroup: bio_1VVJ_QQQ (834 atoms)>

In [66]:
def biopython_parseMMCIF(pdb_id):
    parser = MMCIFParser()
    structure = parser.get_structure(pdb_id, pdb_id + '.cif')
    return structure

In [90]:
structure = biopython_parseMMCIF('6zzr')
sel_structure = structure[0]['AAA']



In [93]:
biopython_mmcif2pdb('6zzr', 'AAA')

@> 2667 atoms and 1 coordinate set(s) were parsed in 0.03s.


<AtomGroup: bio_6zzr_AAA (2667 atoms)>

In [67]:
structure = biopython_parseMMCIF('3j7p')



In [94]:
biopython_mmcif2pdb('3j7p', 'S')

@> 1454 atoms and 1 coordinate set(s) were parsed in 0.02s.


<AtomGroup: bio_3j7p_S (1454 atoms)>

In [95]:
biopython_mmcif2pdb('3j7p', '5')

@> 78604 atoms and 1 coordinate set(s) were parsed in 0.38s.


<AtomGroup: bio_3j7p_5 (78604 atoms)>