# Cataloguing human MHC gene reference data

In [1]:
import sys
import os
# Register the project checkout on the import path and make it the working
# directory, so that the relative 'src/resources/...' output paths used further
# down resolve against it.
project_root = '/home/yutanagano/Projects/graph-gang-databases/'
sys.path.append(project_root)
os.chdir(project_root)

In [2]:
import json
import pandas as pd
import re
from xml.etree import ElementTree as ET

## Cataloguing alternate/deprecated names for HLA genes

In [3]:
# Load the HGNC gene table from its tab-separated export.
hgnc_tsv_path = '/home/yutanagano/UCLOneDrive/graph-gang-databases/data/hgnc/hgnc.tsv'
hgnc = pd.read_csv(hgnc_tsv_path, sep='\t')

In [4]:
# Keep only the rows that have a gene group name, and whose group name
# mentions 'Histocompatibility complex'.
mhc_genes = (
    hgnc
    .dropna(subset=['Gene group name'])
    .loc[lambda frame: frame['Gene group name'].str.contains('Histocompatibility complex')]
)

In [5]:
# One row per (approved symbol, alias): take the genes that list aliases,
# split the comma-separated alias string and explode it into separate rows.
mhc_genes_with_aliases = (
    mhc_genes
    .loc[mhc_genes['Alias symbols'].notna(), ['Approved symbol', 'Alias symbols']]
    .rename(columns={'Alias symbols': 'Synonym'})
    .assign(Synonym=lambda frame: frame['Synonym'].map(lambda joined: joined.split(', ')))
    .explode('Synonym')
)

In [6]:
# One row per (approved symbol, previous symbol): same reshaping as for the
# aliases, but sourced from the 'Previous symbols' column.
mhc_genes_with_depnames = (
    mhc_genes
    .loc[mhc_genes['Previous symbols'].notna(), ['Approved symbol', 'Previous symbols']]
    .rename(columns={'Previous symbols': 'Synonym'})
    .assign(Synonym=lambda frame: frame['Synonym'].map(lambda joined: joined.split(', ')))
    .explode('Synonym')
)

In [7]:
# Pool aliases and previous symbols, strip every 'HLA-' occurrence from both
# columns, then collect the approved symbols each synonym maps to into a list.
mhc_synonyms = (
    pd.concat([mhc_genes_with_aliases, mhc_genes_with_depnames])
    .applymap(lambda symbol: symbol.replace('HLA-', ''))
    .groupby('Synonym')
    .aggregate(lambda approved: approved.tolist())
)

In [8]:
# Discard ambiguous synonyms: keep only those whose list of approved symbols
# has exactly one entry, then unwrap that single entry.
has_single_target = mhc_synonyms['Approved symbol'].map(len) == 1
mhc_synonyms = mhc_synonyms[has_single_target].copy()
mhc_synonyms['Approved symbol'] = mhc_synonyms['Approved symbol'].map(
    lambda approved: approved[0]
)

In [9]:
# Discard redundant items: a synonym (the index) that is identical to the
# approved symbol it maps to carries no information.
is_redundant = mhc_synonyms.index == mhc_synonyms['Approved symbol']
mhc_synonyms = mhc_synonyms[~is_redundant]

In [10]:
# Persist the synonym -> approved symbol mapping as a JSON object.
synonym_to_approved = mhc_synonyms['Approved symbol']
synonym_to_approved.to_json('src/resources/human_mhc_synonyms.json', indent=4)

## Cataloguing all known MHC proteins, G groups and P groups

In [11]:
# Load the allele list CSV, skipping the first six lines of the file.
hla_allele_list_path = '/home/yutanagano/UCLOneDrive/graph-gang-databases/data/hnc_imgt/imgt_hla_allele_list.csv'
hla_alleles = pd.read_csv(hla_allele_list_path, skiprows=6)

In [12]:
# Parse the HLA XML document and keep a handle on its root element, which the
# G/P group extraction below iterates over.
hla_xml_path = '/home/yutanagano/UCLOneDrive/graph-gang-databases/data/hnc_imgt/hla.xml'
gp_xml = ET.parse(hla_xml_path)
alleles_xml = gp_xml.getroot()

In [13]:
# Get all G groups: the distinct 'status' attribute values of every
# hla_g_group element, in sorted order, leaving out the literal 'None'.
g_groups = sorted({
    element.attrib['status']
    for element in alleles_xml.iter('{http://hla.alleles.org/xml}hla_g_group')
    if element.attrib['status'] != 'None'
})

In [14]:
# Get all P groups: the distinct 'status' attribute values of every
# hla_p_group element, in sorted order, leaving out the literal 'None'.
p_groups = sorted({
    element.attrib['status']
    for element in alleles_xml.iter('{http://hla.alleles.org/xml}hla_p_group')
    if element.attrib['status'] != 'None'
})

In [15]:
def decompose_hla(gene_str: str, max_spec_field_depth: int = 2):
    """
    Split an allele/group name of the form ``<gene>*<f1>:<f2>:...`` into a tuple.

    The gene part must be a single capital letter, or ``D`` followed by a
    letter in ``M``-``R``, then ``A`` or ``B``, then an optional digit. The part
    after ``*`` may contain digits, colons and the letters ``G``/``P``, and may
    be followed by one trailing letter out of ``LSCAQN`` (which is dropped).

    Args:
        gene_str: The name to decompose, e.g. ``'A*01:01:01:01'``.
        max_spec_field_depth: Maximum number of colon-separated fields to keep.

    Returns:
        ``(gene, field_1, ..., field_k)`` with ``k <= max_spec_field_depth``,
        or ``None`` (after printing an "Ignoring ..." message) when the name
        does not match the expected pattern.
    """
    parsed = re.match(r'^([A-Z]|D[M-R][AB]\d?)\*([\dGP:]+)[LSCAQN]?$', gene_str)

    if parsed is not None:
        locus, designation = parsed.groups()
        fields = designation.split(':')
        return (locus, *fields[:max_spec_field_depth])

    # exclude all strictly non-HLA gene allele data (e.g. HFE, MICA/B)
    print(f'Ignoring {gene_str}...')
    return None

In [16]:
# Decompose every G group, P group and allele name, dropping the names that
# decompose_hla rejects (it returns None for them). Each name is parsed exactly
# once via the inner generator; the previous form called decompose_hla twice
# for every accepted name (once in the filter and once for the value).
g_groups_decomposed = [
    fields
    for fields in (decompose_hla(g_group, 4) for g_group in g_groups)
    if fields is not None
]
p_groups_decomposed = [
    fields
    for fields in (decompose_hla(p_group, 4) for p_group in p_groups)
    if fields is not None
]
# NOTE(review): "'N' not in allele" is a substring test, so it skips any allele
# name containing the letter N anywhere (not only a trailing N suffix) --
# confirm this is the intended filter.
proteins_decomposed = [
    fields
    for fields in (
        decompose_hla(allele)
        for allele in hla_alleles['Allele']
        if 'N' not in allele
    )
    if fields is not None
]

Ignoring HFE*001:01:01...
Ignoring HFE*001:01:02...
Ignoring HFE*001:01:03...
Ignoring HFE*002...
Ignoring HFE*003...
Ignoring HFE*004...
Ignoring MICA*001:01...
Ignoring MICA*001:02...
Ignoring MICA*002:01:01...
Ignoring MICA*002:01:02...
Ignoring MICA*002:01:03...
Ignoring MICA*002:01:04...
Ignoring MICA*002:01:05...
Ignoring MICA*002:01:06...
Ignoring MICA*002:01:07...
Ignoring MICA*002:01:08...
Ignoring MICA*002:01:09...
Ignoring MICA*002:01:10...
Ignoring MICA*002:01:11...
Ignoring MICA*002:01:12...
Ignoring MICA*002:01:13Q...
Ignoring MICA*002:01:14...
Ignoring MICA*002:01:15...
Ignoring MICA*002:01:16...
Ignoring MICA*002:02...
Ignoring MICA*002:03...
Ignoring MICA*002:04...
Ignoring MICA*002:05...
Ignoring MICA*002:06...
Ignoring MICA*002:07...
Ignoring MICA*002:08...
Ignoring MICA*002:09...
Ignoring MICA*002:10...
Ignoring MICA*002:11...
Ignoring MICA*002:12...
Ignoring MICA*004:01:01...
Ignoring MICA*004:01:02...
Ignoring MICA*004:01:03...
Ignoring MICA*004:01:04...
Ignoring 

In [17]:
# Merge the three decomposed collections into one sorted list of unique tuples.
combined_decomposed = sorted(
    set(g_groups_decomposed + p_groups_decomposed + proteins_decomposed)
)

In [18]:
def make_hla_tree(current_root: dict, token_lists: list) -> None:
    """
    Recursively populate ``current_root`` with a nested-dict prefix tree.

    Each distinct first token among ``token_lists`` becomes a key of
    ``current_root`` (inserted in sorted order) mapped to a fresh dict, which
    is then filled in the same way from the remainders of the sequences that
    start with that token. A sequence with no tokens left contributes no
    further children, so leaves end up as empty dicts.

    Args:
        current_root: Dict to populate in place. An existing entry under one
            of the first tokens is replaced by a fresh subtree.
        token_lists: Non-empty, sliceable token sequences (e.g. tuples of
            strings). An empty sequence raises ``IndexError``.

    Returns:
        None; the tree is built by mutating ``current_root``.
    """
    # Bucket the remainders by first token in a single pass, instead of
    # rescanning the whole of token_lists once per distinct first token.
    tails_by_token = {}
    for token_list in token_lists:
        tails = tails_by_token.setdefault(token_list[0], [])
        # Exhausted sequences still register their token, but add no child.
        if len(token_list) > 1:
            tails.append(token_list[1:])

    for token in sorted(tails_by_token):
        current_root[token] = {}
        make_hla_tree(current_root[token], tails_by_token[token])

In [19]:
# Start from an empty root and let make_hla_tree fill it in place with the
# nested gene -> field -> field ... structure.
hla_tree = dict()

make_hla_tree(current_root=hla_tree, token_lists=combined_decomposed)

In [20]:
# Serialise the allele tree to the resources directory as indented JSON.
with open('src/resources/human_mhc_alleles_exhaustive.json', 'w') as json_file:
    json_file.write(json.dumps(hla_tree, indent=4))