In [3]:
import os, sys, json
import numpy as np

self_dir = os.getcwd()
root_dir = os.path.normpath(self_dir + '/..' * 2)
package_dir = os.path.join(root_dir, 'src')
sys.path.append(package_dir)

from ml_modules.data.retrievers import UniProt_Retriever

kb_dir = '../../data/external/UniProt/kb'

retriever = UniProt_Retriever()

coupling_types = ['codir', 'coord', 'deform']
edge_types = ['contact'] + coupling_types

tag = '20250805'

n_entries_to_process = 150


In [4]:
src_dir = '../20250526-1 true and baseline vs dynamics performance/stats'

src_list = [
    f'{src_dir}/accessions - largest improvement (baseline 0 vs dynamics 4).csv',
    f'{src_dir}/accessions - largest improvement (baseline 1 vs dynamics 4).csv',
    f'{src_dir}/accessions - largest improvement (baseline 2 vs dynamics 4).csv',
]

stats_dir = f'stats'
os.makedirs(stats_dir, exist_ok=True)

accessions = {
    src_file: np.loadtxt(
        os.path.abspath(src_file),
        usecols=0,
        delimiter=',',
        dtype=np.str_
    )[:n_entries_to_process] for src_file in src_list
}
print(accessions)


{'../20250526-1 true and baseline vs dynamics performance/stats/accessions - largest improvement (baseline 0 vs dynamics 4).csv': array(['Q6L225', 'Q72IP3', 'Q72L85', 'Q72I39', 'Q6L2H9', 'P02930',
       'O34687', 'Q6L0G3', 'P0A8U0', 'Q6L2R6', 'P40423', 'Q72GB7',
       'Q72J05', 'Q8MQ13', 'P23715', 'Q6L1M5', 'Q6L2G4', 'Q3U7Z6',
       'P28707', 'P28248', 'P61526', 'Q9RA61', 'Q20574', 'O76075',
       'P32939', 'P0A794', 'P25294', 'Q72IH1', 'P30750', 'Q6KZF2',
       'P61981', 'Q72J82', 'Q9LQK7', 'Q6L2P7', 'P09029', 'Q9NZL9',
       'O04420', 'Q8BPE4', 'Q46893', 'Q72GD0', 'I2HA94', 'Q9XWN1',
       'P09372', 'P0A6J8', 'Q9VX88', 'P0AFE4', 'Q96AD5', 'Q9XTX0',
       'Q6L1Q5', 'P0ACN7', 'P09734', 'P35659', 'P75853', 'Q72HR4',
       'P61221', 'Q00059', 'Q72JU0', 'Q9DB28', 'O16406', 'Q72GB9',
       'Q9FH05', 'Q3UI84', 'Q93168', 'Q21557', 'Q6L1U0', 'Q9CQ54',
       'Q6L2P6', 'Q8L7B2', 'O80934', 'P36771', 'P53040', 'P02394',
       'P54167', 'P54819', 'Q93WN0', 'Q72HM0', 'Q17352', 'P0A7J3',

In [5]:
# GET INFOR FROM UNIPROTKB

all_available_accessions = set()
for src_file in accessions:
    available_accessions, _ = retriever.batch_retrieve(accessions[src_file])
    all_available_accessions.update(available_accessions)

print('Number of accessions:', len(all_available_accessions))
print(list(all_available_accessions)[:5])


P48036      : 100%|##########| 150/150 [00:12<00:00, 12.05it/s] 
Q5Y5T2      : 100%|##########| 150/150 [01:09<00:00,  2.15it/s]
R4YJC2      : 100%|##########| 150/150 [01:10<00:00,  2.14it/s]

Number of accessions: 325
['Q12338', 'G5EFV5', 'Q966C7', 'Q72IP3', 'O16406']





In [6]:

accessions_with_FT = []
FT_categories = []

for accession in all_available_accessions:
    # print(f' >> {accession}')
    unprotkb_file = f'{kb_dir}/{accession}.txt'

    with open(unprotkb_file, 'r') as f:
        content = f.read().split('\n')

    for line in content:
        if line.startswith('FT'):

            if line[5] != ' ':
                FT_categories.append(line.split()[1])

FT_categories = np.unique(FT_categories)

print('Number of unique categories for FT:', len(FT_categories))
print(FT_categories)


Number of unique categories for FT: 32
['ACT_SITE' 'BINDING' 'CARBOHYD' 'CHAIN' 'COILED' 'COMPBIAS' 'CONFLICT'
 'CROSSLNK' 'DISULFID' 'DNA_BIND' 'DOMAIN' 'HELIX' 'INIT_MET' 'INTRAMEM'
 'LIPID' 'MOD_RES' 'MOTIF' 'MUTAGEN' 'NON_TER' 'PROPEP' 'REGION' 'REPEAT'
 'SIGNAL' 'SITE' 'STRAND' 'TOPO_DOM' 'TRANSIT' 'TRANSMEM' 'TURN' 'VARIANT'
 'VAR_SEQ' 'ZN_FING']


In [7]:

all_FT_info = {}

for accession in available_accessions:
    print(f' >> {accession}')
    unprotkb_file = f'{kb_dir}/{accession}.txt'

    with open(unprotkb_file, 'r') as f:
        content = f.read().split('\n')

    FT_for_accession = {
        FT_category: [] for FT_category in FT_categories
    }
    for FT_category in FT_categories:

        resnums = None
        for line_idx, line in enumerate(content):
            if line.startswith(f'FT   {FT_category}'):
                resnums = line.split()[2]

                annotation = {}
                line_idx += 1
                while content[line_idx].startswith('FT') and content[line_idx][5] == ' ':
                    line = content[line_idx][21:]

                    if line.startswith('/'):
                        key = line.split('=')[0][1:]
                        annotation[key] = line.split('=')[1].strip('"')
                    else:
                        annotation[key] += ' '
                        annotation[key] += line.strip('"')

                    line_idx += 1

            if resnums is not None:
                FT_for_accession[FT_category].append({
                    'resnums': resnums,
                    'annotation': annotation
                })
                resnums = None

    # remove if empty
    FT_for_accession = {
        k: v for k, v in FT_for_accession.items()
        if v
    }

    all_FT_info[accession] = FT_for_accession
    print(list(FT_for_accession))


 >> P23396
['CHAIN', 'COMPBIAS', 'CONFLICT', 'CROSSLNK', 'DOMAIN', 'HELIX', 'INIT_MET', 'MOD_RES', 'MUTAGEN', 'REGION', 'STRAND', 'TURN', 'VAR_SEQ']
 >> P21365
['CHAIN', 'CONFLICT', 'TOPO_DOM', 'TRANSMEM', 'VARIANT']
 >> P03899
['CHAIN', 'CONFLICT', 'HELIX', 'STRAND', 'TRANSMEM']
 >> Q8MQ13
['DOMAIN', 'REGION']
 >> Q6L225
['BINDING', 'CHAIN', 'ZN_FING']
 >> Q9H0U4
['BINDING', 'CHAIN', 'HELIX', 'LIPID', 'MOD_RES', 'MOTIF', 'MUTAGEN', 'REGION', 'STRAND', 'TURN']
 >> P81101
['CHAIN', 'CONFLICT', 'INIT_MET']
 >> P53497
['CHAIN', 'INIT_MET', 'MOD_RES']
 >> P00903
['ACT_SITE', 'CHAIN', 'DOMAIN', 'MUTAGEN']
 >> P33204
['CHAIN']
 >> P0A8U0
['CHAIN', 'HELIX', 'STRAND', 'TURN']
 >> Q7ZU67
['COMPBIAS', 'DOMAIN', 'MOTIF', 'REGION']
 >> P84085
['BINDING', 'CHAIN', 'HELIX', 'INIT_MET', 'LIPID', 'STRAND', 'TURN']
 >> A7MCK9
['DOMAIN', 'TRANSMEM']
 >> P62258
['CHAIN', 'CONFLICT', 'CROSSLNK', 'HELIX', 'MOD_RES', 'MUTAGEN', 'REGION', 'SITE', 'STRAND', 'TURN', 'VAR_SEQ']
 >> Q6Q546
['CHAIN', 'CONFLICT', 

In [8]:
# SAVE STATS

with open(f'{stats_dir}/uniprotkb_info - {tag}.json', 'w') as f:
    json.dump(all_FT_info, f, indent=4)
