In [1]:
import sys
import os
from pathlib import Path

# Resolve the project root once per kernel session. The guard matters because the
# cell changes the working directory below: without it, re-running the cell would
# compute the parent of the *new* cwd and walk one level further up each time.
if 'PROJECT_PATH' not in globals():
    PROJECT_PATH = Path.cwd().parent.resolve()

# sys.path entries must be plain strings; non-string entries (such as Path objects)
# are skipped by the import machinery. Skip the append if it is already present so
# that re-running the cell does not pile up duplicates.
if str(PROJECT_PATH) not in sys.path:
    sys.path.append(str(PROJECT_PATH))

# Every relative path used later in the notebook is resolved against the project root.
os.chdir(PROJECT_PATH)

In [35]:
import json
import pandas as pd
from pathlib import Path
import re

## Load and clean Mus musculus MHC data from IMGT

In [10]:
# Read the OpenDocument spreadsheet (path is relative to the current working
# directory); the 'odf' engine is the reader pandas uses for .ods files.
raw = pd.read_excel(
    Path('data', 'musmusculus_mhc_imgt.ods'),
    engine='odf',
)

In [23]:
# Fill empty cells with the value from the row above (presumably the class/subclass
# labels are only written on the first row of each group — TODO confirm).
# NOTE(review): ffill is applied to every column, so a row whose synonyms cell is
# empty would inherit the previous row's synonyms — confirm the sheet has no such rows.
cleaned = raw.ffill()
cleaned.columns = ['class', 'subclass', 'gene', 'synonyms']

# Remove [...] and (...) annotations from every cell, then trim the whitespace the
# removal leaves behind (e.g. 'NAME [note]' -> 'NAME' rather than 'NAME ').
# Series.map is applied per column instead of DataFrame.applymap, which is deprecated
# since pandas 2.1; this form behaves the same on older and newer pandas.
# NOTE(review): '.+' is greedy, so a cell with several bracket groups loses everything
# from the first opener to the last closer — confirm this is intended for the data.
cleaned = cleaned.apply(
    lambda col: col.map(lambda e: re.sub(r'\[.+\]|\(.+\)', '', e).strip())
)

# Split the comma-separated synonyms into a list of trimmed names, skipping entries
# that are empty after trimming (e.g. from a trailing comma or a removed annotation).
cleaned['synonyms'] = cleaned['synonyms'].map(
    lambda s: [syn for syn in (part.strip() for part in s.split(',')) if syn]
)

In [26]:
# Preview the first five rows of the cleaned table.
cleaned.head(n=5)

Unnamed: 0,class,subclass,gene,synonyms
0,IIa,H2-A,MH2-AA,"[H2-Aa, H-2Aa, H2-IA-alpha, I-A D-A, I-A K-A, ..."
1,IIa,H2-A,H2-AB,"[H2-Ab, H-2Ab, H2-Ab1, H2-IA-beta, H2 I-A beta..."
2,Ia,H2-D,H2-D1,"[H-2 D-B, H-2 D-D, H-2D, Db, Dd]"
3,Ia,H2-D,H2-D2,[D2d]
4,Ia,H2-D,H2-D3,[D3d]


## Export list of all MHC genes

In [27]:
# Plain Python list of every value in the 'gene' column, in row order.
mhc_alleles_musmusculus = cleaned.loc[:, 'gene'].tolist()

In [36]:
# Write the gene list as pretty-printed JSON under src/tidytcells/resources
# (path is relative to the current working directory).
with Path('src', 'tidytcells', 'resources', 'mhc_alleles_musmusculus.json').open('w') as f:
    f.write(json.dumps(mhc_alleles_musmusculus, indent=4))

## Export dict mapping synonyms to gene names

In [52]:
# Build a synonym -> gene lookup that keeps only synonyms identifying exactly one gene.
# Exploding gives one row per (gene, synonym) pair; the rows are then grouped by synonym.
genes_per_synonym = (
    cleaned[['gene', 'synonyms']]
    .explode('synonyms')
    .groupby('synonyms')['gene']
)

# Count *distinct* genes per synonym: a synonym listed more than once for the same
# gene is still unambiguous and must be kept, whereas a synonym shared by several
# different genes is dropped. The result is a Series indexed by synonym whose values
# are gene names.
synonyms = genes_per_synonym.first()[genes_per_synonym.nunique() == 1]

In [54]:
# Write the synonym -> gene mapping as pretty-printed JSON under
# src/tidytcells/resources (path is relative to the current working directory).
synonyms.to_json(
    Path('src', 'tidytcells', 'resources', 'mhc_synonyms_musmusculus.json'),
    indent=4,
)