In [6]:
# # (ONLY run this cell if running notebook from BROWSER. Run cell below instead if running from IDE):
# import sys, os, glob, json
# from time import time
# from Bio.PDB.MMCIF2Dict import MMCIF2Dict
#
# proj_root = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
# if proj_root not in sys.path:
#     sys.path.insert(0, proj_root)
# from utils import api_callr as api
# from utils import cif_parsr as cp
# relpath_data = os.path.join('..', 'data')

#### IMPORTS

In [1]:
# (ONLY run this cell if running notebook from IDE. Run cell above instead if running from BROWSER):
import os, glob, json
from time import time
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from src.utils import api_callr as api
from src.utils import cif_parsr as cp

#### 1. Get mmCIFs (SOLUTION NMR) - both homomeric & heteromeric proteins:

In [None]:
# start = time()
# relpath_raw_cifs = os.path.join('..', 'data', 'NMR', 'raw_cifs')
# _meric = 'homomeric'
# relpath_raw_cifs_meric = os.path.join(relpath_raw_cifs, _meric)
# os.makedirs(relpath_raw_cifs_meric, exist_ok=True)
# sol_nmr_homo_686_pdbids = api.call_rcsb_for_pdbids_of_solution_nmr_homomeric(number=686)
#
# for pdbid in sol_nmr_homo_686_pdbids:
#     response = api.call_rcsb_for_cif(pdbid)
#     with open(os.path.join(relpath_raw_cifs_meric, f'{pdbid}.cif'), 'w') as cif_file:
#         cif_file.write(response.text)
#
# _meric = 'heteromeric'
# relpath_raw_cifs_meric = os.path.join(relpath_raw_cifs, _meric)
# os.makedirs(relpath_raw_cifs_meric, exist_ok=True)
# sol_nmr_hetero_1038_pdbids = api.call_rcsb_for_pdbids_of_solution_nmr_heteromeric(number=1038)
#
# for pdbid in sol_nmr_hetero_1038_pdbids:
#     response = api.call_rcsb_for_cif(pdbid)
#     with open(os.path.join(relpath_raw_cifs_meric, f'{pdbid}.cif'), 'w') as cif_file:
#         cif_file.write(response.text)
# print(f'Completed {len(sol_nmr_homo_686_pdbids)} homomeric PDBs and {len(sol_nmr_hetero_1038_pdbids)} heteromeric PDBs in {round((time() - start) / 60)} minutes')

#### 2. Parse 686 homomeric proteins, write to ssvs:
#### 3. Parse 1038 heteromeric proteins, write to ssvs:

In [8]:
start = time()
_meric = 'homomeric'
# _meric = 'heteromeric'
relpath_raw_cifs_meric = os.path.join('..', 'data', 'NMR', 'raw_cifs', _meric)
relpath_token_cifs = os.path.join('..', 'data', 'NMR', 'tokenised_cifs', _meric)
os.makedirs(relpath_token_cifs, exist_ok=True)

relpath_cifs = glob.glob(os.path.join(relpath_raw_cifs_meric, f'*.cif'))
no_CA_pdbids = list()

for relpath_cif in relpath_cifs:
    cif_dict = MMCIF2Dict(relpath_cif)
    pdbid = os.path.basename(relpath_cif).removesuffix('.cif')
    cif_pdfs_per_chain = cp.parse_cif(pdb_id=pdbid, mmcif_dict=cif_dict)  # same function from MSc project, but alpha-carbons only.

    for pdf_chain in cif_pdfs_per_chain:
        with open(os.path.join('..', 'data', 'enumeration', 'residues.json'), 'r') as json_f:
            residues_enumerated = json.load(json_f)
        pdf_chain = pdf_chain.copy()
        pdf_chain.loc[:, 'aa_label_num'] = pdf_chain['S_mon_id'].map(residues_enumerated).astype('Int64')

        if not pdf_chain.empty:
            chain = pdf_chain['S_asym_id'].iloc[0]
        else:
            print(f'{pdf_chain} is empty, i.e. it must have no alpha-carbon in its parsed cif, so cannot include it.')
            no_CA_pdbids.append(pdbid)
            continue

        pdf_chain = pdf_chain[['A_pdbx_PDB_model_num', 'S_seq_id', 'S_mon_id', 'aa_label_num',
                               'A_id', 'A_Cartn_x', 'A_Cartn_y', 'A_Cartn_z']]

        expected_num_of_cols = 8
        assert len(pdf_chain.columns) == expected_num_of_cols, \
        (f'Dataframe should have {expected_num_of_cols} columns. '
         f'But this has {len(pdf_chain.columns)}')

        pdf_chain.to_csv(path_or_buf=os.path.join(relpath_token_cifs, f'{pdbid}_{chain}.ssv'), sep=' ', index=False)

        with open(os.path.join(relpath_token_cifs, '_no_CA_pdbids.txt'), 'w') as f:
            if no_CA_pdbids:
                for pdbid in no_CA_pdbids:
                    f.write(pdbid + '\n')
            else:
                f.write('There are no pdbids that have zero CAs in all chains')
print(f'Completed {len(relpath_cifs)} {_meric} PDBs in {round((time() - start) / 60)} minutes')

Parse 1CO0
Parse 2MRN
Parse 2ADN
Parse 1HS5
Parse 1VL3
Parse 2M3W
Parse 8DSB
Parse 1X9V
Parse 2JO4
Parse 1OLG
Parse 1X9A
Parse 7LOH
Parse 9D9C
Parse 2K1N
Parse 7ZE0
Parse 2L7H
Parse 1WRT
Parse 2LZ3
Parse 2N54
Parse 2RP4
Parse 2PEA
Parse 1KLC
Parse 1R6R
Parse 1E52
Parse 2N97
Parse 6E4H
Parse 2RP5
Parse 2N5T
Parse 2LZS
Parse 8IMH
Parse 2L9U
Parse 1NEI
Parse 2K1O
Parse 9D9B
Parse 1NG7
Parse 2B9Z
Parse 2L7I
Parse 1WJD
Parse 2KDD
Parse 2FXP
Parse 7LOI
Parse 2JO5
Parse 8DPX
Parse 6TWR
Parse 2M0M
Parse 2JUZ
Parse 2JWE
Parse 2JV7
Parse 8HPB
Parse 1COP
Parse 7RPM
Parse 2JXH
Parse 2MDW
Parse 1ZZF
Parse 2HYN
Parse 2MFH
Parse 6TV5
Parse 2C06
Parse 6V4T
Parse 2MK9
Parse 2NWT
Parse 2KJZ
Parse 1L3N
Parse 2LYJ
Parse 2LZF
Parse 2N9B
Parse 1R48
Parse 1NRU
Parse 2WC2
Parse 1NS1
Parse 7A0O
Parse 1KLA
Parse 2NAO
Parse 2LYK
Parse 2NBT
Parse 2LO0
Parse 2K1L
Parse 9D9A
Parse 6EVI
Parse 6BP9
Parse 1QEY
Parse 2MDA
Parse 2M20
Parse 2M3B
Parse 2J5D
Parse 2MDV
Parse 2ADL
Parse 2AF2
Parse 2JXI
Parse 2JZ7
Parse 5OGU

KeyboardInterrupt: 

In [None]:
start = time()
# _meric = 'homomeric'
_meric = 'heteromeric'
relpath_raw_cifs_meric = os.path.join('..', 'data', 'NMR', 'raw_cifs', _meric)
relpath_token_cifs = os.path.join('..', 'data', 'NMR', 'tokenised_cifs', _meric)
os.makedirs(relpath_token_cifs, exist_ok=True)

relpath_cifs = glob.glob(os.path.join(relpath_raw_cifs_meric, f'*.cif'))
no_CA_pdbids = list()

for relpath_cif in relpath_cifs:
    cif_dict = MMCIF2Dict(relpath_cif)
    pdbid = os.path.basename(relpath_cif).removesuffix('.cif')
    # pdbid = '1CO0'
    cif_pdfs_per_chain = cp.parse_cif(pdb_id=pdbid, mmcif_dict=cif_dict)  # same function from MSc project, but alpha-carbons only.

    for pdf_chain in cif_pdfs_per_chain:
        with open(os.path.join('..', 'data', 'enumeration', 'residues.json'), 'r') as json_f:
            residues_enumerated = json.load(json_f)
        pdf_chain = pdf_chain.copy()
        pdf_chain.loc[:, 'aa_label_num'] = pdf_chain['S_mon_id'].map(residues_enumerated).astype('Int64')

        if not pdf_chain.empty:
            chain = pdf_chain['S_asym_id'].iloc[0]
        else:
            print(f'{pdf_chain} is empty, i.e. it must have no alpha-carbon in its parsed cif, so cannot include it.')
            no_CA_pdbids.append(pdbid)
            continue

        pdf_chain = pdf_chain[['A_pdbx_PDB_model_num', 'S_seq_id', 'S_mon_id', 'aa_label_num',
                               'A_id', 'A_Cartn_x', 'A_Cartn_y', 'A_Cartn_z']]

        expected_num_of_cols = 8
        assert len(pdf_chain.columns) == expected_num_of_cols, \
        (f'Dataframe should have {expected_num_of_cols} columns. '
         f'But this has {len(pdf_chain.columns)}')

        pdf_chain.to_csv(path_or_buf=os.path.join(relpath_token_cifs, f'{pdbid}_{chain}.ssv'), sep=' ', index=False)

        with open(os.path.join(relpath_token_cifs, '_no_CA_pdbids.txt'), 'w') as f:
            if no_CA_pdbids:
                for pdbid in no_CA_pdbids:
                    f.write(pdbid + '\n')
            else:
                f.write('There are no pdbids that have zero CAs in all chains')
print(f'Completed {len(relpath_cifs)} {_meric} PDBs in {round((time() - start) / 60)} minutes')

#### 4. Write list of those with more than 1 model:

In [9]:
# THIS (AND NEXT) CELL USE THE PDBS FROM THE TOKENISED DIR, HENCE ALREADY FILTERED OUT THOSE WITH NO CA ATOMS:

import pandas as pd
start = time()

# _meric = 'homomeric'
_meric = 'heteromeric'
relpath_token_cifs = os.path.join('..', 'data', 'NMR', 'tokenised_cifs', _meric)

ssv_files = glob.glob(os.path.join(relpath_token_cifs, '*.ssv'))

multimodel_pdbids = []
single_model_pdbids = []

for file_path in ssv_files:
    try:
        df = pd.read_csv(file_path, sep=' ', dtype=str)
        unique_models = df['A_pdbx_PDB_model_num'].unique()

        if len(unique_models) > 1:
            multimodel_pdbids.append(os.path.basename(file_path))
        else:
            single_model_pdbids.append(os.path.basename(file_path))

    except Exception as e:
        print(f"Error reading {file_path}: {e}")

print(f"{len(single_model_pdbids)} {_meric} pdb files with only 1 unique model:")

for fname in single_model_pdbids:
    print(fname.removesuffix('.ssv'), end=' ')

list_dir = os.path.join('..', 'data', 'NMR', 'multimodel_PDBids')
os.makedirs(list_dir, exist_ok=True)
output_path = os.path.join(list_dir, f'{_meric}_multimodel_{len(multimodel_pdbids)}_pdbids.txt')

with open(output_path, 'w') as f:
    for fname in multimodel_pdbids:
        f.write(fname.removesuffix('.ssv') + '\n')

print(f"\nSaved {len(multimodel_pdbids)} {_meric} pdbid_chains with >1 unique model to '{output_path}'.")

288 heteromeric pdb files with only 1 unique model:
1FU5_B 2N8R_C 1C17_K 5AIY_L 4BY9_M 4AIY_L 1KQE_B 2LV6_A 1BZV_A 1TCE_A 1HAA_B 1WLP_B 1QG1_B 1KQE_C 4BY9_L 1J4L_A 2MJ5_B 1C17_J 2N8R_B 1FU5_A 1C17_H 4BY9_N 1KQE_A 2LV6_B 1TCE_B 1BZV_B 1HAA_A 1WLP_A 1QG1_A 1J4L_B 2MJ5_A 1C17_I 2N8R_A 3GBQ_B 4BY9_K 4AIY_J 5AIY_J 1C17_M 1KQE_D 2PJH_A 1ILQ_C 1J4Q_A 1ILQ_B 2FFK_B 1CSZ_B 1C17_L 5AIY_K 4BY9_J 4AIY_K 1D6G_B 2N8R_D 3GBQ_A 5AIY_I 4AIY_I 4BY9_H 2PJH_B 1J4Q_B 1ILQ_A 2FFK_A 1CSZ_A 4AIY_H 4BY9_I 5AIY_H 1D6G_A 3AIY_J 1E08_B 1F95_C 1QNZ_C 2MMA_B 2LE9_D 2LGF_A 2MZW_B 1NCP_B 1QNZ_B 2MP0_A 1KLQ_B 1F95_B 1F3R_A 1E08_C 3AIY_K 3AIY_I 1E08_A 2A9H_E 2MMA_A 2LGF_B 1NCP_A 2MZW_A 1QNZ_A 2A9H_D 2MP0_B 1KLQ_A 1F95_A 1F3R_B 3AIY_H 1O4X_C 3AIY_L 1RGJ_B 1IRS_B 2BTX_A 2VER_B 1BXL_A 2LE9_C 2LE9_B 1OO4_A 1SHC_B 2A9H_A 1J4K_B 1F95_D 2A7U_B 1J4P_B 1G5J_A 1O4X_D 1RGJ_A 1IRS_A 2A9H_C 2BTX_B 2VER_A 1BXL_B 2LE9_A 1OO4_B 1SHC_A 2A9H_B 1J4K_A 1J4P_A 2A7U_A 1G5J_B 2JOD_B 3AIY_C 1GBQ_A 2PLD_A 6O22_C 2IXQ_A 1BOM_B 2FYL_A 1AZG_A 2JZ