In [6]:
# # (ONLY run this cell if running notebook from BROWSER. Run cell below instead if running from IDE):
# import sys, os, glob, json
# from time import time
# from Bio.PDB.MMCIF2Dict import MMCIF2Dict
#
# proj_root = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
# if proj_root not in sys.path:
#     sys.path.insert(0, proj_root)
# from utils import api_callr as api
# from utils import cif_parsr as cp
# relpath_data = os.path.join('..', 'data')

#### IMPORTS

In [2]:
# (ONLY run this cell if running notebook from IDE. Run cell above instead if running from BROWSER):
import os, glob, json
from time import time
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from src.utils import api_callr as api
from src.utils import cif_parsr as cp

#### 1. Get mmCIFs (SOLUTION NMR) - both homomeric & heteromeric proteins:

In [None]:
# start = time()
# relpath_raw_cifs = os.path.join('..', 'data', 'NMR', 'raw_cifs')
# _meric = 'homomeric'
# relpath_raw_cifs_meric = os.path.join(relpath_raw_cifs, _meric)
# os.makedirs(relpath_raw_cifs_meric, exist_ok=True)
# sol_nmr_homo_686_pdbids = api.call_rcsb_for_pdbids_of_solution_nmr_homomeric(number=686)
#
# for pdbid in sol_nmr_homo_686_pdbids:
#     response = api.call_rcsb_for_cif(pdbid)
#     with open(os.path.join(relpath_raw_cifs_meric, f'{pdbid}.cif'), 'w') as cif_file:
#         cif_file.write(response.text)
#
# _meric = 'heteromeric'
# relpath_raw_cifs_meric = os.path.join(relpath_raw_cifs, _meric)
# os.makedirs(relpath_raw_cifs_meric, exist_ok=True)
# sol_nmr_hetero_1038_pdbids = api.call_rcsb_for_pdbids_of_solution_nmr_heteromeric(number=1038)
#
# for pdbid in sol_nmr_hetero_1038_pdbids:
#     response = api.call_rcsb_for_cif(pdbid)
#     with open(os.path.join(relpath_raw_cifs_meric, f'{pdbid}.cif'), 'w') as cif_file:
#         cif_file.write(response.text)
# print(f'Completed {len(sol_nmr_homo_686_pdbids)} homomeric PDBs and {len(sol_nmr_hetero_1038_pdbids)} heteromeric PDBs in {round((time() - start) / 60)} minutes')

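#### 2. Parse 686 homomeric proteins, write to space-separated value (.ssv) files (run the cell below with `_meric = 'homomeric'`):
#### 3. Parse 1038 heteromeric proteins, write to .ssv files (re-run the same cell with `_meric = 'heteromeric'`):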

In [2]:
import os, glob, json
from time import time
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from src.utils import cif_parsr as cp
# Tokenise every raw mmCIF of the selected oligomeric class: parse each file into
# per-chain dataframes, attach a numeric residue label, and write one
# space-separated (.ssv) file per chain. PDB ids that yield an empty chain
# dataframe are collected and written to '_no_CA_pdbids.txt' once, at the end.
start = time()
_meric = 'homomeric'
# _meric = 'heteromeric'
relpath_raw_cifs_meric = os.path.join('..', 'data', 'NMR', 'raw_cifs', _meric)
relpath_token_cifs = os.path.join('..', 'data', 'NMR', 'tokenised_cifs', _meric)
os.makedirs(relpath_token_cifs, exist_ok=True)

relpath_cifs = glob.glob(os.path.join(relpath_raw_cifs_meric, '*.cif'))
no_CA_pdbids = list()

# The residue -> integer lookup does not depend on the CIF being parsed,
# so it is read once up front instead of once per file.
with open(os.path.join('..', 'data', 'enumeration', 'residues.json'), 'r') as json_f:
    residues_enumerated = json.load(json_f)

# Columns (and their order) written to every .ssv file.
ssv_columns = ['A_pdbx_PDB_model_num', 'S_seq_id', 'S_mon_id', 'aa_label_num',
               'A_id', 'A_Cartn_x', 'A_Cartn_y', 'A_Cartn_z']
expected_num_of_cols = 8

for relpath_cif in relpath_cifs:
    cif_dict = MMCIF2Dict(relpath_cif)
    pdbid = os.path.basename(relpath_cif).removesuffix('.cif')
    cif_pdfs_per_chain = cp.parse_cif(pdb_id=pdbid, mmcif_dict=cif_dict)  # same function from MSc project, but alpha-carbons only.

    for pdf_chain in cif_pdfs_per_chain:
        # Check for emptiness BEFORE touching any column, so an empty frame
        # can never fail on a column lookup.
        if pdf_chain.empty:
            print(f'{pdbid}: chain dataframe is empty, so cannot include it.')
            # Record each PDB id once, even if several of its chains are empty.
            if pdbid not in no_CA_pdbids:
                no_CA_pdbids.append(pdbid)
            continue

        pdf_chain = pdf_chain.copy()  # work on a copy so the parser's frame is not mutated
        # Residue names missing from the lookup become <NA> (nullable Int64).
        pdf_chain.loc[:, 'aa_label_num'] = pdf_chain['S_mon_id'].map(residues_enumerated).astype('Int64')
        chain = pdf_chain['S_asym_id'].iloc[0]

        pdf_chain = pdf_chain[ssv_columns]

        assert len(pdf_chain.columns) == expected_num_of_cols, \
        (f'Dataframe should have {expected_num_of_cols} columns. '
         f'But this has {len(pdf_chain.columns)}')

        pdf_chain.to_csv(path_or_buf=os.path.join(relpath_token_cifs, f'{pdbid}_{chain}.ssv'), sep=' ', index=False)

# Written once, after all files are processed. Previously this sat inside the
# chain loop, where its loop variable overwrote `pdbid` (mis-naming later .ssv
# files) and the final empty chain could be missed because of the `continue`.
# NOTE(review): an id is recorded when ANY of its chains is empty, although the
# fallback message below says "all chains" - confirm which is intended.
with open(os.path.join(relpath_token_cifs, '_no_CA_pdbids.txt'), 'w') as f:
    if no_CA_pdbids:
        for no_CA_pdbid in no_CA_pdbids:
            f.write(no_CA_pdbid + '\n')
    else:
        f.write('There are no pdbids that have zero CAs in all chains')
print(f'Completed {len(relpath_cifs)} {_meric} PDBs in {round((time() - start) / 60)} minutes')

parsing 1CO0
parsing 2MRN
parsing 2ADN
parsing 1HS5
parsing 1VL3
parsing 2M3W
parsing 8DSB
parsing 1X9V
parsing 2JO4
parsing 1OLG
parsing 1X9A
parsing 7LOH
parsing 9D9C
parsing 2K1N
parsing 7ZE0
parsing 2L7H
parsing 1WRT
parsing 2LZ3
parsing 2N54
parsing 2RP4
parsing 2PEA
parsing 1KLC
parsing 1R6R
parsing 1E52
parsing 2N97
parsing 6E4H
parsing 2RP5
parsing 2N5T
parsing 2LZS
parsing 8IMH
parsing 2L9U
parsing 1NEI
parsing 2K1O
parsing 9D9B
parsing 1NG7
parsing 2B9Z
parsing 2L7I
parsing 1WJD
parsing 2KDD
parsing 2FXP
parsing 7LOI
parsing 2JO5
parsing 8DPX
parsing 6TWR
parsing 2M0M
parsing 2JUZ
parsing 2JWE
parsing 2JV7
parsing 8HPB
parsing 1COP
parsing 7RPM
parsing 2JXH
parsing 2MDW
parsing 1ZZF
parsing 2HYN
parsing 2MFH
parsing 6TV5
parsing 2C06
parsing 6V4T
parsing 2MK9
parsing 2NWT
parsing 2KJZ
parsing 1L3N
parsing 2LYJ
parsing 2LZF
parsing 2N9B
parsing 1R48
parsing 1NRU
parsing 2WC2
parsing 1NS1
parsing 7A0O
parsing 1KLA
parsing 2NAO
parsing 2LYK
parsing 2NBT
parsing 2LO0
parsing 2K1L

#### 4. Write list of those with more than 1 model:

In [10]:
# # THIS (AND NEXT) CELL USE THE PDBS FROM THE TOKENISED DIR, HENCE ALREADY FILTERED OUT THOSE WITH NO CA ATOMS:
#
# import pandas as pd
# start = time()
#
# _meric = 'homomeric'
# # _meric = 'heteromeric'
# relpath_token_cifs = os.path.join('..', 'data', 'NMR', 'tokenised_cifs', _meric)
#
# ssv_files = glob.glob(os.path.join(relpath_token_cifs, '*.ssv'))
#
# multimodel_pdbids = []
# single_model_pdbids = []
#
# for file_path in ssv_files:
#     try:
#         df = pd.read_csv(file_path, sep=' ', dtype=str)
#         unique_models = df['A_pdbx_PDB_model_num'].unique()
#
#         if len(unique_models) > 1:
#             multimodel_pdbids.append(os.path.basename(file_path))
#         else:
#             single_model_pdbids.append(os.path.basename(file_path))
#
#     except Exception as e:
#         print(f"Error reading {file_path}: {e}")
#
# line1 = f"{len(single_model_pdbids)} {_meric} PDB files with only 1 unique model:\n"
# for i, fname in enumerate(single_model_pdbids):
#     newline = '\n' if i==17 else ''
#     singlemods = f'{fname.removesuffix('.ssv')} {newline}'
#
# list_dir = os.path.join('..', 'data', 'NMR', 'multimodel_PDBids')
# os.makedirs(list_dir, exist_ok=True)
# output_path = os.path.join(list_dir, f'{_meric}_multimodel_{len(multimodel_pdbids)}_pdbids.txt')
#
# with open(output_path, 'w') as f:
#     for fname in multimodel_pdbids:
#         f.write(fname.removesuffix('.ssv') + '\n')
#
# print(f"\nSaved {len(multimodel_pdbids)} {_meric} pdbid_chains with >1 unique model to '{output_path}'.")

186 homomeric pdb files with only 1 unique model:
1ARR_A 1YUR_B 1PES_D 1Q6A_B 2MBO_B 1SAL_D 1OLG_D 2PEA_B 1ARR_B 6F3K_G 1YUR_A 1Q6A_A 2MBO_A 2PEA_A 1N3J_A 2MZ6_B 1SAL_C 1OLG_C 1PES_C 1DBD_B 1IHV_A 1JNO_B 2EZO_B 2E8J_A 1DOM_B 2EZO_C 1PES_B 1OVX_B 1OLG_B 1YUT_B 1SAL_B 1RQV_B 2MZ6_A 1N3J_B 1DBD_A 1IHV_B 2EZO_A 1JNO_A 2E8J_B 1DOM_A 1PES_A 1OVX_A 1SAL_A 1YUT_A 1OLG_A 1RQV_A 1ZZF_B 2LZS_B 1NIQ_B 2MUZ_C 1QTG_A 1J4V_B 2MUZ_B 1YFB_B 1VL3_B 2LZS_C 1SAK_A 1ZZF_A 1SAK_C 2LZS_A 6R8N_L 1NIQ_A 1QTG_B 1J4V_A 2MUZ_A 1VL3_A 1YFB_A 1SAK_B 6R8N_I 2LZS_D 2MBQ_B 1RQU_B 2PE9_A 2MUZ_D 1WTU_B 1N9J_B 2JXG_B 6R8N_H 2LZS_E 1UTR_A 2FXP_A 2FXP_C 2LZS_G 6R8N_J 2MBQ_A 1RQU_A 2PE9_B 1WTU_A 2JXG_A 1N9J_A 2LZS_F 1SAK_D 6R8N_K 1UTR_B 2FXP_B 1JO4_B 1NRM_A 1PFS_B 6R8N_F 1HZE_A 1A1U_B 2L9H_D 1WJC_A 1IL8_B 1MSG_B 1WJA_B 2JXI_B 6R8N_G 1JO4_A 1NRM_B 1PFS_A 6R8N_E 2LZS_H 1HZE_B 1A1U_A 1WJC_B 1IL8_A 1MSG_A 1WJA_A 1C7U_D 1JQ1_D 2JXI_A 6R8N_D 2LZS_I 2L9H_B 1B53_B 1L5E_A 1JQ1_A 2L9H_C 1NT5_A 1WCR_A 6R8N_A 6R8N_C 1WCR_C 2L9H_A 1JQ1_