# Catalogue MHC genes (Mus musculus)

## Setup

In [16]:
import sys
import os
from pathlib import Path

# Resolve the project root (the parent of the starting working directory) only
# once: because this cell also changes the working directory, recomputing it
# on a re-run would climb one directory further up each time.
if "PROJECT_PATH" not in globals():
    PROJECT_PATH = Path.cwd().parent.resolve()

# sys.path entries must be plain strings -- the import system ignores any other
# type, so appending the Path object itself would have no effect. The
# membership check keeps re-runs of this cell from adding duplicates.
if str(PROJECT_PATH) not in sys.path:
    sys.path.append(str(PROJECT_PATH))
os.chdir(PROJECT_PATH)

In [17]:
import json
import pandas as pd
import re

In [18]:
# Load the curated gene table; the two unnamed spreadsheet columns are dropped
# and the remaining four columns are given explicit names.
df = pd.read_excel(Path("data") / "musmusculus_mhc.ods").drop(
    columns=["Unnamed: 2", "Unnamed: 4"]
)
df.columns = ["group", "subgroup", "gene name", "synonym"]

# Carry the last seen group/subgroup label down into the blank cells
# (presumably the sheet only labels the first row of each group -- confirm).
# `.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
df["group"] = df["group"].ffill()
df["subgroup"] = df["subgroup"].ffill()

# Remove bracketed annotations -- "[...]" or "(...)" -- one at a time. A single
# greedy `.*` would run from the first opening bracket to the LAST closing one
# and silently swallow every synonym listed between two annotations.
df["synonym"] = df["synonym"].map(
    lambda s: re.sub(r"\[[^\]]*\]|\([^)]*\)", "", s)
)
# Split the comma-separated synonyms and normalise each one: drop all
# whitespace, upper-case, and remove hyphens.
df["synonym"] = df["synonym"].map(
    lambda s: ["".join(syn.split()).upper().replace("-", "") for syn in s.split(",")]
)
# One row per (gene, synonym) pair.
df = df.explode("synonym", ignore_index=True)

# Trim stray whitespace in every cell. A per-column `Series.map` is used in
# place of the deprecated `DataFrame.applymap`; `str.strip` still raises on a
# non-string cell, so unexpected blanks are not passed over silently.
df = df.apply(lambda column: column.map(str.strip))
# Normalisation can make distinct spellings collapse to the same synonym.
df = df.drop_duplicates()

In [19]:
# Preview the first five rows of the cleaned table (one row per gene/synonym pair).
df.head(n=5)

Unnamed: 0,group,subgroup,gene name,synonym
0,MH2,MH2-A,MH2-AA,H2AA
3,MH2,MH2-A,MH2-AA,H2IAALPHA
4,MH2,MH2-A,MH2-AA,IADA
5,MH2,MH2-A,MH2-AA,IAKA
6,MH2,MH2-A,MH2-AA,IAA


## List all MHC genes

In [20]:
# Distinct approved gene names, kept in order of first appearance in the table.
genes = df["gene name"].drop_duplicates().tolist()

In [21]:
# Write the gene list to the package resources as an indented JSON array.
genes_path = Path("src", "tidytcells", "_resources", "musmusculus_mhc.json")
with genes_path.open("w") as f:
    json.dump(genes, f, indent=4)

## Get deprecated names/synonyms

In [22]:
# Index by normalised synonym; each row holds the list of gene names that the
# synonym is recorded against.
mhc_synonyms = (
    df[["gene name", "synonym"]]
    .groupby("synonym")
    .agg(lambda names: names.tolist())
)

In [23]:
# Keep only synonyms that point at exactly one gene, then unwrap that single
# gene name from its one-element list.
has_single_gene = mhc_synonyms["gene name"].map(len).eq(1)
mhc_synonyms = mhc_synonyms.loc[has_single_gene].copy()
mhc_synonyms["gene name"] = mhc_synonyms["gene name"].map(
    lambda names: names.pop()
)

In [24]:
# Drop entries whose synonym is literally the approved symbol it maps to.
# NOTE(review): synonyms had their hyphens stripped during cleaning while gene
# names did not, so this only matches hyphen-free gene names -- confirm that
# is the intent.
differs_from_symbol = mhc_synonyms["gene name"] != mhc_synonyms.index
mhc_synonyms = mhc_synonyms.loc[differs_from_symbol]

In [25]:
# Discard synonyms that are also the approved name of some gene, so a lookup
# of a valid gene name is never redirected to a different gene.
# `Index.isin` does the membership test in one vectorised call and returns a
# plain boolean NumPy mask, instead of scanning the `genes` list once per
# synonym and producing an Index of Python bools.
mhc_synonyms = mhc_synonyms[~mhc_synonyms.index.isin(genes)]

In [26]:
# Write the synonym -> approved gene name mapping to the package resources as
# an indented JSON object.
synonyms_path = Path(
    "src", "tidytcells", "_resources", "musmusculus_mhc_synonyms.json"
)
mhc_synonyms["gene name"].to_json(synonyms_path, indent=4)