In [1]:
import sys
import os
from pathlib import Path

# Resolve the project root only once: after the chdir below, re-running this
# cell would otherwise climb one directory higher on every execution.
if 'PROJECT_PATH' not in globals():
    PROJECT_PATH = Path.cwd().parent.resolve()

# sys.path entries must be plain strings; other types (such as Path objects)
# are ignored by the import system. The membership check keeps re-runs from
# appending the same entry repeatedly.
if str(PROJECT_PATH) not in sys.path:
    sys.path.append(str(PROJECT_PATH))

# Make relative paths used later in the notebook resolve against the project root.
os.chdir(PROJECT_PATH)

In [2]:
import json
import re
from pathlib import Path
from typing import Optional

import pandas as pd

## Load and clean Mus musculus MHC data from IMGT

In [3]:
# Read the raw gene table from the OpenDocument spreadsheet (needs the 'odf' engine).
raw = pd.read_excel(
    Path('data', 'musmusculus_mhc_imgt.ods'),
    engine='odf',
)

In [4]:
# Forward-fill blanks so that every row carries its own class/subclass/gene
# (presumably the sheet leaves repeated values empty -- confirm against the file).
cleaned = raw.ffill()
cleaned.columns = ['class', 'subclass', 'gene', 'synonym']

# Remove bracketed / parenthesised annotations and surrounding whitespace from
# every cell. The quantifiers are non-greedy so each annotation is removed on
# its own; a greedy `.+` would also swallow any text sitting between the first
# opening and the last closing bracket of a cell (e.g. a synonym listed between
# two annotations). Column-wise Series.map is used instead of
# DataFrame.applymap, which is deprecated since pandas 2.1.
cleaned = cleaned.apply(
    lambda column: column.map(
        lambda e: re.sub(r'\[.+?\]|\(.+?\)', '', e).strip()
    )
)

# Split the comma-separated synonym list, dropping all whitespace inside each
# synonym, then emit one row per synonym (the row index is repeated by explode).
cleaned['synonym'] = cleaned['synonym'].map(
    lambda s: [''.join(syn.split()) for syn in s.split(',')]
)
cleaned = cleaned.explode('synonym')

In [5]:
# Preview the first five rows of the cleaned, one-synonym-per-row table.
cleaned.head(n=5)

Unnamed: 0,class,subclass,gene,synonym
0,IIa,H2-A,MH2-AA,H2-Aa
0,IIa,H2-A,MH2-AA,H-2Aa
0,IIa,H2-A,MH2-AA,H2-IA-alpha
0,IIa,H2-A,MH2-AA,I-AD-A
0,IIa,H2-A,MH2-AA,I-AK-A


In [6]:
def parse_gene(org: str) -> tuple[str, str]:
    """Split a gene name into its prefix and gene designation.

    The name must consist of one of the prefixes ``H1``, ``H2``, ``MH1`` or
    ``MH2``, a hyphen, and a designation made of uppercase letters, digits
    and hyphens (e.g. ``'MH2-AA'`` -> ``('MH2', 'AA')``).

    Args:
        org: The gene name exactly as it should be parsed (no surrounding
            whitespace, already uppercase).

    Returns:
        A ``(base, gene)`` tuple.

    Raises:
        RuntimeError: If ``org`` does not match the expected pattern.
    """
    # fullmatch (rather than match with ^...$) also rejects a trailing newline,
    # which `$` would otherwise silently tolerate.
    m = re.fullmatch(r'(M?H[12])-([A-Z0-9\-]+)', org)

    if m is None:
        raise RuntimeError(f'Cannot parse gene {org}.')

    return (m.group(1), m.group(2))

def parse_synonym(syn: str) -> tuple[Optional[str], str]:
    """Normalise a gene synonym and split it into a prefix and designation.

    Normalisation uppercases the text, removes every hyphen and abbreviates
    ``ALPHA``/``BETA`` to ``A``/``B``. If the normalised text is ``H2``
    followed by letters, digits or dots, the prefix is ``'H2'`` and the
    remainder is the designation. Otherwise, if it consists solely of
    letters, digits, dots, slashes and apostrophes, the whole normalised text
    is returned as the designation with no prefix.

    Args:
        syn: The synonym as listed in the source table.

    Returns:
        A ``(base, gene)`` tuple where ``base`` is ``'H2'`` or ``None``.

    Raises:
        RuntimeError: If the normalised synonym matches neither pattern.
    """
    syn = syn.upper()
    syn = syn.replace('-', '')
    syn = syn.replace('ALPHA', 'A')
    syn = syn.replace('BETA', 'B')

    # fullmatch (rather than match with ^...$) rejects a trailing newline; with
    # `$` the fallback branch below would return the newline as part of the gene.
    if m := re.fullmatch(r'H2([A-Z0-9\.]+)', syn):
        return ('H2', m.group(1))

    if re.fullmatch(r'[A-Z0-9\.\/\']+', syn):
        return (None, syn)

    raise RuntimeError(f'Cannot parse synonym {syn}.')

In [7]:
# Add tuple-valued columns: each gene / synonym string is parsed into a
# (base, gene) pair. Both parsers raise RuntimeError on input they do not
# recognise, so this cell fails as soon as an entry does not fit the expected
# patterns.
cleaned['parsed gene'] = cleaned['gene'].map(parse_gene)
cleaned['parsed synonym'] = cleaned['synonym'].map(parse_synonym)

In [8]:
def make_tree(parsed: list) -> dict:
    """Group ``(base, gene)`` pairs by their base.

    Args:
        parsed: An iterable of two-item ``(base, gene)`` pairs.

    Returns:
        A dict mapping each base to the list of its genes, in the order the
        pairs were encountered (repeated pairs are kept).
    """
    grouped = {}

    for prefix, name in parsed:
        # Create the bucket on first sight of a prefix, then append to it.
        grouped.setdefault(prefix, []).append(name)

    return grouped

In [9]:
# Build the base -> [genes] mapping from the distinct parsed gene tuples.
alleles = make_tree(
    parsed=cleaned['parsed gene'].unique()
)

In [10]:
# Keep one row per distinct (gene, synonym) pairing.
unamb_syns = cleaned[['parsed gene', 'parsed synonym']].drop_duplicates()
# Collect, for every synonym, the list of genes it was listed under.
unamb_syns = unamb_syns.groupby('parsed synonym').aggregate(lambda x: x.tolist())
# A synonym is unambiguous when it is listed under exactly one gene.
unamb_syns = unamb_syns[unamb_syns['parsed gene'].map(len) == 1]
# Unwrap the single-element lists into the gene tuple itself.
unamb_syns['parsed gene'] = unamb_syns['parsed gene'].map(lambda l: l.pop())
# Drop entries whose synonym is identical to the gene it maps to.
# NOTE(review): this compares the tuple-valued group index against a column of
# tuples -- confirm pandas evaluates it element-wise as intended.
unamb_syns = unamb_syns[unamb_syns.index != unamb_syns['parsed gene']]

# Nested lookup: syns[base][gene] -> the (base, gene) tuple of the gene that
# the synonym refers to. `base` is None for synonyms without an H2 prefix.
syns = {}

for (base, gene), row in unamb_syns.iterrows():
    real = row['parsed gene']
    # Create the inner dict the first time a base is seen, then record the synonym.
    syns.setdefault(base, {})[gene] = real

## Export the list of all MHC genes

In [11]:
# Write the base -> [genes] mapping to the package resources as indented JSON.
with open(
    Path('src', 'tidytcells', 'resources', 'mhc_alleles_musmusculus.json'),
    'w',
) as out_file:
    json.dump(alleles, out_file, indent=4)

## Export the dictionary mapping unambiguous synonyms to gene names

In [12]:
# Write the nested synonym lookup to the package resources as indented JSON.
with open(
    Path('src', 'tidytcells', 'resources', 'mhc_synonyms_musmusculus.json'),
    'w',
) as out_file:
    json.dump(syns, out_file, indent=4)