# Catalogueing Mus musculus TCR gene reference data

In [4]:
import sys
import os
from pathlib import Path

if not 'PROJECT_PATH' in globals():
    PROJECT_PATH = Path.cwd().parent.resolve()

sys.path.append(PROJECT_PATH)
os.chdir(PROJECT_PATH)

In [5]:
import json
import pandas as pd
import re

## Catalogueing a table of all known TCR genes/alleles

In [7]:
travs = pd.read_csv('data/musmusculus_trav_imgt.csv')
trajs = pd.read_csv('data/musmusculus_traj_imgt.csv')
trbvs = pd.read_csv('data/musmusculus_trbv_imgt.csv')
trbjs = pd.read_csv('data/musmusculus_trbj_imgt.csv')

In [8]:
def compile_tcr(base, num1, num2, p, or92, d_designation) -> str:
    compiled = 'TR' + base + num1

    if num2:
        compiled = compiled + '-' + num2

    if p:
        compiled = compiled + 'P'

    if d_designation:
        compiled = compiled + 'DV' + d_designation
    
    if or92:
        compiled = compiled + 'OR9-2'

    return compiled

def decompose_v(df: pd.DataFrame, chain: str) -> pd.DataFrame:
    gene_base_to_d_designation = {}

    def _decompose_row(row, chain: str) -> tuple:
        m = re.match(r'^(\d+)(-(\d+))?(/DV(\d+))?(/OR9-2)?$', row['gene_name'])

        if m is None:
            print(row['gene_name'])

        num1 = m.group(1)
        num2 = m.group(3)
        or92 = True if m.group(6) else False
        d_designation = m.group(5)

        gene_str = compile_tcr(
            chain+'V',
            num1,
            num2,
            False,
            or92,
            d_designation
        )

        if d_designation:
            gene_base_to_d_designation[
                f'TR{chain}V{num1}' if num2 is None
                else f'TR{chain}V{num1}-{num2}'
            ] = d_designation

        allele_num = row['allele_name'].split('*')[-1]

        return (gene_str, allele_num)
    
    result = pd.DataFrame()
    result[['approved_symbol', 'alleles']] = df.apply(
        lambda row: _decompose_row(row, chain),
        axis=1,
        result_type='expand'
    )
    
    return result, gene_base_to_d_designation

def decompose_j(df: pd.DataFrame) -> pd.DataFrame:
    def _decompose_row(row) -> tuple:
        m = re.match(r'^TR([AB]J)(\d+)(-(\d+)(P)?)?\*(\d+)$', row['allele_name'])

        if m is None:
            print(row['allele_name'])

        base = m.group(1)
        num1 = m.group(2)
        num2 = m.group(4)
        p = True if m.group(5) else False

        gene_str = compile_tcr(
            base,
            num1,
            num2,
            p,
            False,
            None
        )

        allele_num = m.group(6)

        return (gene_str, allele_num)
    
    result = pd.DataFrame()
    result[['approved_symbol', 'alleles']] = df.apply(
        lambda row: _decompose_row(row),
        axis=1,
        result_type='expand'
    )
    
    return result

In [9]:
travs_decomposed, trav_ds = decompose_v(travs, 'A')
trbvs_decomposed, trbv_ds = decompose_v(trbvs, 'B')

trajs_decomposed = decompose_j(trajs)
trbjs_decomposed = decompose_j(trbjs)

tcr_alleles_exhaustive = pd.concat(
    [travs_decomposed, trbvs_decomposed, trajs_decomposed, trbjs_decomposed]
).groupby('approved_symbol').aggregate(lambda x: x.tolist())

AttributeError: 'NoneType' object has no attribute 'group'

In [17]:
tcr_alleles_exhaustive['alleles'].to_json(
    'src/resources/human_tcr_alleles_exhaustive.json',
    indent=4
)

In [18]:
with open('src/resources/human_tcr_d_designations.json', 'w') as f:
    json.dump({**trav_ds, **trbv_ds},f,indent=4)