# Catalogue TCR genes (Homo sapiens)

In [18]:
import sys
import os
from pathlib import Path

# Resolve the project root once; re-running this cell keeps the first value
# (after the chdir below, Path.cwd().parent would point one level too high).
if 'PROJECT_PATH' not in globals():
    PROJECT_PATH = Path.cwd().parent.resolve()

# sys.path entries must be strings: the import system ignores any other type,
# so a bare Path object would silently have no effect.
if str(PROJECT_PATH) not in sys.path:
    sys.path.append(str(PROJECT_PATH))

# Work from the project root so the relative 'data/...' and 'src/...' paths resolve.
os.chdir(PROJECT_PATH)

In [19]:
import json
import pandas as pd

## List all TCR genes and alleles

In [20]:
# Nested lookup built from the FASTA headers:
#   alleles[gene][allele designation] -> functionality
alleles = {}

with open(Path('data')/'homosapiens_tcr.fasta', 'r') as f:
    for header in f:
        # Sequence lines carry no metadata; only '>' header lines are parsed.
        if not header.startswith('>'):
            continue

        # Header fields are pipe-separated: field 1 holds 'GENE*ALLELE',
        # field 3 holds the functionality label.
        header_fields = header.split('|')
        name_parts = header_fields[1].split('*')
        gene_name = name_parts[0]
        designation = name_parts[1]

        # Drop any surrounding round or square brackets from the label.
        label = header_fields[3].strip('()[]')

        alleles.setdefault(gene_name, {})[designation] = label

In [21]:
# Persist the gene/allele table as an indented JSON resource inside the package.
with Path('src', 'tidytcells', 'resources', 'homosapiens_tcr.json').open('w') as f:
    f.write(json.dumps(alleles, indent=4))

## Get deprecated names/synonyms

In [22]:
# Load the tab-separated HGNC table from the data directory.
hgnc = pd.read_csv(Path('data', 'hgnc.tsv'), sep='\t')

In [23]:
# Get only TCR genes.
# na=False: rows with a missing locus type are treated as "not a TCR gene"
# instead of producing NaN in the mask (which boolean indexing rejects).
# regex=False: the pattern is a plain substring, not a regular expression.
tcr_genes = hgnc[
    hgnc['Locus type'].str.contains('T cell receptor gene', regex=False, na=False)
].copy()

In [24]:
# Re-insert the slash in front of DV and OR designations
# (e.g. 'TRAV14DV4' -> 'TRAV14/DV4'). The negative lookbehind keeps symbols
# that begin with 'TRDV' unchanged.
tcr_genes['Approved symbol'] = (
    tcr_genes['Approved symbol']
    .str.replace(r'(?<!TR)DV', '/DV', regex=True)
    .str.replace(r'OR', '/OR', regex=True)
)

In [25]:
# Only keep genes whose 'approved symbols' are in our IMGT list.
# isin() is a vectorised membership test against the gene names (the keys of
# `alleles`) and always yields a boolean mask.
tcr_genes = tcr_genes[tcr_genes['Approved symbol'].isin(list(alleles))].copy()

In [26]:
# Long-format table of (approved symbol, alias): genes without aliases are
# skipped, and each comma-separated alias ends up on its own row.
tcr_genes_with_aliases = (
    tcr_genes.loc[tcr_genes['Alias symbols'].notna(), ['Approved symbol', 'Alias symbols']]
    .rename(columns={'Alias symbols': 'Synonym'})
    .assign(Synonym=lambda table: table['Synonym'].map(lambda names: names.split(', ')))
    .explode('Synonym')
)

In [27]:
# Long-format table of (approved symbol, previous symbol): genes without
# previous symbols are skipped, and each comma-separated name gets its own row.
tcr_genes_with_depnames = (
    tcr_genes.loc[tcr_genes['Previous symbols'].notna(), ['Approved symbol', 'Previous symbols']]
    .rename(columns={'Previous symbols': 'Synonym'})
    .assign(Synonym=lambda table: table['Synonym'].map(lambda names: names.split(', ')))
    .explode('Synonym')
)

In [28]:
# Combine both tables
tcr_synonyms = pd.concat([tcr_genes_with_aliases, tcr_genes_with_depnames])

# Normalise case before comparing or grouping, so that synonyms differing only
# by case are treated as one name. Otherwise they would slip past the
# ambiguity check and collide as duplicate keys once upper-cased for export.
tcr_synonyms['Synonym'] = tcr_synonyms['Synonym'].str.upper()

# A name listed for the same gene as both an alias and a previous symbol is
# not ambiguous; collapse such repeats so it is not grouped as two targets.
tcr_synonyms = tcr_synonyms.drop_duplicates()

# Remove any names that are now redundant (approved symbol and synonym are the same)
tcr_synonyms = tcr_synonyms[tcr_synonyms['Approved symbol'] != tcr_synonyms['Synonym']]

# Group together by synonym: each synonym maps to the list of approved
# symbols it could refer to
tcr_synonyms = tcr_synonyms.groupby('Synonym').agg(list)

In [29]:
# Keep only the synonyms that point at exactly one approved symbol
tcr_synonyms = tcr_synonyms[tcr_synonyms['Approved symbol'].map(len).eq(1)].copy()

# Unwrap each single-element list into the symbol it contains
tcr_synonyms['Approved symbol'] = tcr_synonyms['Approved symbol'].map(lambda genes: genes[0])

# Synonym keys are stored upper-cased
tcr_synonyms.index = tcr_synonyms.index.str.upper()

In [30]:
# Write the synonym -> approved symbol mapping as an indented JSON resource
# inside the package.
tcr_synonyms['Approved symbol'].to_json(
    Path('src', 'tidytcells', 'resources', 'homosapiens_tcr_synonyms.json'),
    indent=4,
)