# Cataloguing Mus musculus TCR gene reference data

In [1]:
import sys
import os
from pathlib import Path

# Resolve the project root once; a re-run of this cell keeps the value that
# is already defined instead of stepping up another directory level.
if 'PROJECT_PATH' not in globals():
    PROJECT_PATH = Path.cwd().parent.resolve()

# sys.path entries must be strings: the import system ignores any other
# type, so a bare Path object here would silently have no effect.
if str(PROJECT_PATH) not in sys.path:
    sys.path.append(str(PROJECT_PATH))

# Make the relative 'data/...' and 'src/...' paths used below resolve
# against the project root.
os.chdir(PROJECT_PATH)

In [2]:
import json
import pandas as pd
import re

## Cataloguing a table of all known TCR genes/alleles

In [3]:
# Load the four gene tables (TRAV, TRAJ, TRBV, TRBJ) from the CSV files
# under data/. The decompose_* functions below read the 'gene_name' and
# 'allele_name' columns of these tables.
travs, trajs, trbvs, trbjs = map(
    pd.read_csv,
    (
        'data/musmusculus_trav_imgt.csv',
        'data/musmusculus_traj_imgt.csv',
        'data/musmusculus_trbv_imgt.csv',
        'data/musmusculus_trbj_imgt.csv',
    ),
)

In [4]:
def compile_tcr(base, num1, num1_d, num2, dv_1, dv_1_d, dv_2) -> str:
    """Assemble a TCR gene symbol from its parsed components.

    The symbol has the shape
    ``TR<base><num1>[D][-<num2>][/DV<dv_1>][D][-<dv_2>]``; each bracketed
    piece is emitted only when the corresponding argument is truthy.

    Args:
        base: Text placed directly after ``TR`` (e.g. ``'AV'``).
        num1: First number of the gene name, as a string.
        num1_d: Whether to append ``D`` after ``num1``.
        num2: Optional second number, appended after a hyphen.
        dv_1: Optional first number of the ``/DV`` part.
        dv_1_d: Whether to append ``D`` after the ``/DV`` number.
        dv_2: Optional second number of the ``/DV`` part, after a hyphen.

    Returns:
        The assembled gene symbol.
    """
    pieces = ['TR', base, num1]

    if num1_d:
        pieces.append('D')
    if num2:
        pieces.extend(('-', num2))
    if dv_1:
        pieces.extend(('/DV', dv_1))
    if dv_1_d:
        pieces.append('D')
    if dv_2:
        pieces.extend(('-', dv_2))

    return ''.join(pieces)

def compile_tcr_without_dv(base, num1, num1_d, num2):
    """Assemble a TCR gene symbol with no ``/DV`` part.

    Produces ``TR<base><num1>[D][-<num2>]``, where ``D`` is included when
    ``num1_d`` is truthy and ``-<num2>`` when ``num2`` is truthy.
    """
    d_marker = 'D' if num1_d else ''
    second_number = '-' + num2 if num2 else ''
    return 'TR' + base + num1 + d_marker + second_number

def decompose_v(df: pd.DataFrame, chain: str) -> tuple:
    """Split V gene rows into approved gene symbols and allele numbers.

    Args:
        df: Table with a ``gene_name`` column (optionally prefixed with
            ``TRAV``/``TRBV``) and an ``allele_name`` column whose text after
            the last ``*`` is the allele number.
        chain: Chain letter inserted after ``TR`` when rebuilding the symbol
            (called with ``'A'`` and ``'B'`` in this notebook). Any prefix
            present in ``gene_name`` is ignored in favour of this value.

    Returns:
        A ``(result, gene_base_to_d_designation)`` tuple. ``result`` is a
        DataFrame with columns ``approved_symbol`` and ``alleles`` (one row
        per input row). ``gene_base_to_d_designation`` maps the symbol
        without its ``/DV`` part to a dict with keys ``dv_1``, ``dv_1_d`` and
        ``dv_2`` for every gene that carries a ``/DV`` designation.

    Raises:
        ValueError: If a ``gene_name`` does not match the expected V gene
            naming pattern.
    """
    # Compiled once rather than per row. Every numeric group accepts one or
    # more digits, including the final /DV sub-number.
    v_gene_pattern = re.compile(
        r'^(TR[AB]V)?(\d+)(D)?(-(\d+))?(/DV(\d+)(D)?(-(\d+))?)?$'
    )
    gene_base_to_d_designation = {}

    def _decompose_row(row) -> tuple:
        m = v_gene_pattern.match(row['gene_name'])

        if m is None:
            # Fail with the offending name instead of an opaque
            # AttributeError on m.group() further down.
            raise ValueError(
                f"Unrecognised V gene name: {row['gene_name']!r}"
            )

        num1 = m.group(2)
        num1_d = bool(m.group(3))
        num2 = m.group(5)
        dv_1 = m.group(7)
        dv_1_d = bool(m.group(8))
        dv_2 = m.group(10)

        gene_str = compile_tcr(
            chain + 'V',
            num1,
            num1_d,
            num2,
            dv_1,
            dv_1_d,
            dv_2
        )

        if dv_1:
            # Record the /DV designation keyed by the symbol without it.
            gene_without_dv = compile_tcr_without_dv(
                chain + 'V',
                num1,
                num1_d,
                num2
            )
            gene_base_to_d_designation[gene_without_dv] = {
                'dv_1': dv_1,
                'dv_1_d': dv_1_d,
                'dv_2': dv_2
            }

        allele_num = row['allele_name'].split('*')[-1]

        return (gene_str, allele_num)

    result = pd.DataFrame()
    result[['approved_symbol', 'alleles']] = df.apply(
        _decompose_row,
        axis=1,
        result_type='expand'
    )

    return result, gene_base_to_d_designation

def decompose_j(df: pd.DataFrame) -> pd.DataFrame:
    """Split J gene rows into approved gene symbols and allele numbers.

    Args:
        df: Table with a ``gene_name`` column of the form
            ``TR<A|B>J<num>[-<num>]`` and an ``allele_name`` column whose
            text after the last ``*`` is the allele number.

    Returns:
        A DataFrame with columns ``approved_symbol`` and ``alleles`` (one
        row per input row).

    Raises:
        ValueError: If a ``gene_name`` does not match the expected J gene
            naming pattern.
    """
    # Compiled once rather than per row.
    j_gene_pattern = re.compile(r'^TR([AB]J)(\d+)(-(\d+))?$')

    def _decompose_row(row) -> tuple:
        m = j_gene_pattern.match(row['gene_name'])

        if m is None:
            # Fail with the offending names instead of an opaque
            # AttributeError on m.group() further down.
            raise ValueError(
                f"Unrecognised J gene name: {row['gene_name']!r} "
                f"(allele {row['allele_name']!r})"
            )

        base = m.group(1)
        num1 = m.group(2)
        num2 = m.group(4)

        # J gene names matched by the pattern carry no 'D' marker or /DV
        # part, so those components are passed as absent.
        gene_str = compile_tcr(
            base,
            num1,
            False,
            num2,
            None,
            False,
            None
        )

        allele_num = row['allele_name'].split('*')[-1]

        return (gene_str, allele_num)

    result = pd.DataFrame()
    result[['approved_symbol', 'alleles']] = df.apply(
        _decompose_row,
        axis=1,
        result_type='expand'
    )

    return result

In [5]:
# V genes: decompose_v also returns the /DV designation lookup for each chain.
travs_decomposed, trav_ds = decompose_v(df=travs, chain='A')
trbvs_decomposed, trbv_ds = decompose_v(df=trbvs, chain='B')

# J genes: only the symbol/allele table is produced.
trajs_decomposed, trbjs_decomposed = decompose_j(trajs), decompose_j(trbjs)

# Stack all four tables and collect, per approved symbol, the list of its
# allele numbers.
tcr_alleles_exhaustive = (
    pd.concat(
        objs=[
            travs_decomposed,
            trbvs_decomposed,
            trajs_decomposed,
            trbjs_decomposed,
        ]
    )
    .groupby(by='approved_symbol')
    .agg(lambda column: column.tolist())
)

In [6]:
# Write the symbol -> allele-list mapping as indented JSON into the package
# resources directory.
tcr_alleles_exhaustive.loc[:, 'alleles'].to_json(
    path_or_buf='src/tidytcells/resources/tcr_alleles_musmusculus.json',
    indent=4,
)

In [7]:
# Merge the TRAV and TRBV /DV designation lookups (on a shared key the TRBV
# entry would take precedence) and write them as indented JSON.
with open(
    'src/tidytcells/resources/tcr_d_designations_musmusculus.json',
    mode='w',
) as f:
    json.dump(obj={**trav_ds, **trbv_ds}, fp=f, indent=4)