# Cataloguing human TCR gene reference data

In [1]:
import sys
import os
# Make the project root importable and use it as the working directory, so the
# relative output paths used later in this notebook resolve against it.
PROJECT_ROOT = '/home/yutanagano/Projects/graph-gang-databases/'
sys.path.append(PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

In [2]:
import json
import pandas as pd
from pathlib import Path
import re

## Cataloguing alternate/deprecated names for TCR genes

In [3]:
# Load the tab-separated HGNC gene table.
hgnc = pd.read_csv(
    '/home/yutanagano/UCLOneDrive/graph-gang-databases/data/hgnc/hgnc.tsv',
    sep='\t',
)

In [4]:
# Keep rows whose locus type mentions 'T cell receptor gene', then narrow to
# those whose approved symbol contains 'TRA' or 'TRB'.
is_tcr_gene = hgnc['Locus type'].str.contains('T cell receptor gene')
tcr_genes = hgnc[is_tcr_gene]

approved_symbols = tcr_genes['Approved symbol']
is_alpha_or_beta = approved_symbols.str.contains('TRA') | approved_symbols.str.contains('TRB')
tcr_genes = tcr_genes[is_alpha_or_beta]

In [5]:
# One row per (approved symbol, alias) pair, for genes that list any alias.
# 'Alias symbols' holds a ', '-separated string, which is split and exploded.
tcr_genes_with_aliases = tcr_genes.loc[
    tcr_genes['Alias symbols'].notna(),
    ['Approved symbol', 'Alias symbols'],
].rename(columns={'Alias symbols': 'Synonym'})
tcr_genes_with_aliases['Synonym'] = tcr_genes_with_aliases['Synonym'].map(
    lambda names: names.split(', ')
)
tcr_genes_with_aliases = tcr_genes_with_aliases.explode('Synonym')

In [6]:
# One row per (approved symbol, previous symbol) pair, for genes that list any
# previous symbol. 'Previous symbols' holds a ', '-separated string.
tcr_genes_with_depnames = tcr_genes.loc[
    tcr_genes['Previous symbols'].notna(),
    ['Approved symbol', 'Previous symbols'],
].rename(columns={'Previous symbols': 'Synonym'})
tcr_genes_with_depnames['Synonym'] = tcr_genes_with_depnames['Synonym'].map(
    lambda names: names.split(', ')
)
tcr_genes_with_depnames = tcr_genes_with_depnames.explode('Synonym')

In [7]:
# Pool aliases and previous symbols, then index by synonym so that each row
# holds the list of approved symbols that the synonym may refer to.
all_synonym_pairs = pd.concat([tcr_genes_with_aliases, tcr_genes_with_depnames])
tcr_synonyms = all_synonym_pairs.groupby('Synonym').agg(
    lambda symbols: symbols.tolist()
)

In [8]:
# Synonyms that map to more than one approved symbol; the candidates are
# flattened into a single ', '-joined string.
maps_to_many = tcr_synonyms['Approved symbol'].map(len) > 1
ambiguous_tcr_synonyms = tcr_synonyms[maps_to_many].copy()
ambiguous_tcr_synonyms['Approved symbol'] = (
    ambiguous_tcr_synonyms['Approved symbol'].map(', '.join)
)

In [9]:
# Synonyms that map to exactly one approved symbol; unwrap the one-element
# list so the column holds the symbol string itself.
maps_to_one = tcr_synonyms['Approved symbol'].map(len) == 1
tcr_synonyms = tcr_synonyms[maps_to_one].copy()
tcr_synonyms['Approved symbol'] = tcr_synonyms['Approved symbol'].map(
    lambda symbols: symbols.pop()
)

### Save simple csv tables of ambiguous and nonambiguous synonyms for reference

In [10]:
# Write both synonym tables (unambiguous first, then ambiguous) as CSV.
for synonym_table, csv_destination in (
    (tcr_synonyms, 'docs/human_tcr_synonyms.csv'),
    (ambiguous_tcr_synonyms, 'docs/ambiguous_human_tcr_synonyms.csv'),
):
    synonym_table.to_csv(csv_destination)

### Decompose the nonambiguous translations and save data as json for use in standardisation programme

In [11]:
def decompose_translation(row: pd.Series) -> tuple:
    """Split the approved TCR gene symbol held in ``row`` into its components.

    Parameters
    ----------
    row : pd.Series
        Row whose first element is the approved symbol string, e.g.
        ``'TRAV14DV4'``, ``'TRBV20OR9-2'`` or ``'TRBV5-2P'``.

    Returns
    -------
    tuple
        ``(base, num1, num2, d_designation, or92, p)`` where ``base`` is the
        two-letter chain/segment code (e.g. ``'AV'``), ``num1`` and ``num2``
        are the digits before and after the hyphen, ``d_designation`` is the
        number following ``DV``, and ``or92`` / ``p`` are booleans flagging
        the ``OR9-2`` and ``P`` suffixes. Components that are absent from the
        symbol are ``None``.

    Raises
    ------
    ValueError
        If the symbol does not follow the expected pattern.
    """
    symbol = row.iloc[0]
    m = re.match(r'^TR([AB][CDVJ])?(\d+)(-(\d+))?(DV(\d+))?(OR9-2)?(P)?$', symbol)

    if m is None:
        # Fail with the offending symbol in the message instead of an opaque
        # AttributeError from calling .group() on None.
        raise ValueError(f'Cannot decompose TCR gene symbol: {symbol!r}')

    base = m.group(1)
    num1 = m.group(2)
    num2 = m.group(4)
    d_designation = m.group(6)
    or92 = bool(m.group(7))
    p = bool(m.group(8))

    return (base, num1, num2, d_designation, or92, p)

In [12]:
# Decompose every unambiguous approved symbol into its components and store
# them alongside the symbol itself.
component_columns = ['base', 'num1', 'num2', 'd_designation', 'OR9-2', 'P']
decomposed_components = tcr_synonyms.apply(
    decompose_translation,
    axis=1,
    result_type='expand',
)

tcr_synonyms_decomposed = tcr_synonyms.copy()
# NOTE(review): 'approved_symol' looks like a typo for 'approved_symbol', but it
# becomes a key in the exported JSON - confirm with consumers before renaming.
tcr_synonyms_decomposed.columns = ['approved_symol']
tcr_synonyms_decomposed[component_columns] = decomposed_components

In [13]:
# Export the decomposed synonym table as JSON keyed by synonym (the index).
tcr_synonyms_decomposed.to_json(
    'src/resources/human_tcr_synonyms.json',
    orient='index',
    indent=4,
)

## Cataloguing a table of all known TCR genes/alleles

In [14]:
# Load the four IMGT reference tables (TRAV, TRAJ, TRBV, TRBJ), in that order.
travs, trajs, trbvs, trbjs = (
    pd.read_csv(imgt_csv_path)
    for imgt_csv_path in (
        '/home/yutanagano/UCLOneDrive/graph-gang-databases/data/hnc_imgt/human_trav_imgt.csv',
        '/home/yutanagano/UCLOneDrive/graph-gang-databases/data/hnc_imgt/human_traj_imgt.csv',
        '/home/yutanagano/UCLOneDrive/graph-gang-databases/data/hnc_imgt/human_trbv_imgt.csv',
        '/home/yutanagano/UCLOneDrive/graph-gang-databases/data/hnc_imgt/human_trbj_imgt.csv',
    )
)

In [15]:
def compile_tcr(base, num1, num2, p, or92, d_designation) -> str:
    """Assemble a TCR gene symbol string from its components.

    The pieces are appended in a fixed order: ``'TR'``, ``base``, ``num1``,
    then ``'-' + num2``, ``'P'``, ``'DV' + d_designation`` and ``'OR9-2'``,
    each optional part being included only when its argument is truthy.
    """
    pieces = ['TR', base, num1]

    if num2:
        pieces.extend(('-', num2))

    if p:
        pieces.append('P')

    if d_designation:
        pieces.extend(('DV', d_designation))

    if or92:
        pieces.append('OR9-2')

    return ''.join(pieces)

def decompose_v(df: pd.DataFrame, chain: str) -> tuple:
    """Convert a table of V gene alleles into approved symbols and allele numbers.

    Parameters
    ----------
    df : pd.DataFrame
        Table with a ``gene_name`` column holding the part of the gene name
        after the ``TR<chain>V`` prefix (e.g. ``'14/DV4'``, ``'20/OR9-2'``,
        ``'5-1'``) and an ``allele_name`` column whose text after the last
        ``'*'`` is the allele number.
    chain : str
        Chain letter inserted into the symbol, e.g. ``'A'`` or ``'B'``.

    Returns
    -------
    tuple
        ``(result, gene_base_to_d_designation)``. ``result`` is a DataFrame
        with columns ``approved_symbol`` and ``alleles`` (one row per input
        row). ``gene_base_to_d_designation`` maps the symbol without its
        ``DV`` suffix (e.g. ``'TRAV14'``) to the DV number, for every gene
        that carries one.

    Raises
    ------
    ValueError
        If a ``gene_name`` does not follow the expected pattern.
    """
    gene_base_to_d_designation = {}

    def _decompose_row(row) -> tuple:
        gene_name = row['gene_name']
        m = re.match(r'^(\d+)(-(\d+))?(/DV(\d+))?(/OR9-2)?$', gene_name)

        if m is None:
            # Name the offending value rather than failing on None.group().
            raise ValueError(f'Cannot decompose TR{chain}V gene name: {gene_name!r}')

        num1 = m.group(1)
        num2 = m.group(3)
        d_designation = m.group(5)
        or92 = bool(m.group(6))

        gene_str = compile_tcr(
            chain + 'V',
            num1,
            num2,
            False,
            or92,
            d_designation
        )

        if d_designation:
            # Record the DV number under the symbol as written without it.
            gene_base = (
                f'TR{chain}V{num1}' if num2 is None
                else f'TR{chain}V{num1}-{num2}'
            )
            gene_base_to_d_designation[gene_base] = d_designation

        allele_num = row['allele_name'].split('*')[-1]

        return (gene_str, allele_num)

    result = pd.DataFrame()
    result[['approved_symbol', 'alleles']] = df.apply(
        _decompose_row,
        axis=1,
        result_type='expand'
    )

    return result, gene_base_to_d_designation

def decompose_j(df: pd.DataFrame) -> pd.DataFrame:
    """Convert a table of J gene alleles into approved symbols and allele numbers.

    Parameters
    ----------
    df : pd.DataFrame
        Table with an ``allele_name`` column holding full allele names such as
        ``'TRAJ8*01'`` or ``'TRBJ2-2P*01'``.

    Returns
    -------
    pd.DataFrame
        One row per input row, with columns ``approved_symbol`` (the gene
        symbol without the allele suffix) and ``alleles`` (the digits after
        ``'*'``).

    Raises
    ------
    ValueError
        If an ``allele_name`` does not follow the expected pattern.
    """
    def _decompose_row(row) -> tuple:
        allele_name = row['allele_name']
        m = re.match(r'^TR([AB]J)(\d+)(-(\d+)(P)?)?\*(\d+)$', allele_name)

        if m is None:
            # Name the offending value rather than failing on None.group().
            raise ValueError(f'Cannot decompose J allele name: {allele_name!r}')

        base = m.group(1)
        num1 = m.group(2)
        num2 = m.group(4)
        p = bool(m.group(5))

        gene_str = compile_tcr(
            base,
            num1,
            num2,
            p,
            False,
            None
        )

        allele_num = m.group(6)

        return (gene_str, allele_num)

    result = pd.DataFrame()
    result[['approved_symbol', 'alleles']] = df.apply(
        _decompose_row,
        axis=1,
        result_type='expand'
    )

    return result

In [16]:
# Decompose the V tables (which also yield the DV-designation lookups) and
# the J tables.
travs_decomposed, trav_ds = decompose_v(travs, 'A')
trbvs_decomposed, trbv_ds = decompose_v(trbvs, 'B')
trajs_decomposed = decompose_j(trajs)
trbjs_decomposed = decompose_j(trbjs)

# Stack all four tables and collect, per approved symbol, the list of its
# allele numbers.
all_alleles = pd.concat(
    [travs_decomposed, trbvs_decomposed, trajs_decomposed, trbjs_decomposed]
)
tcr_alleles_exhaustive = all_alleles.groupby('approved_symbol').agg(
    lambda allele_nums: allele_nums.tolist()
)

In [17]:
# Export the per-gene allele lists as JSON.
allele_lists = tcr_alleles_exhaustive['alleles']
allele_lists.to_json('src/resources/human_tcr_alleles_exhaustive.json', indent=4)

In [18]:
# Export the merged TRAV and TRBV DV-designation lookups as JSON; on a key
# clash the TRBV entry overrides the TRAV one.
with open('src/resources/human_tcr_d_designations.json', 'w') as f:
    d_designations = {**trav_ds, **trbv_ds}
    json.dump(d_designations, f, indent=4)