# Practical test of standardiser coverage on IEDB data

## Setup

In [1]:
import sys
import os
from pathlib import Path

# Resolve the project root only once: this cell changes the working directory
# below, so recomputing it on a re-run would walk one directory further up.
if 'PROJECT_PATH' not in globals():
    PROJECT_PATH = Path.cwd().parent.resolve()

# sys.path entries must be strings (other types are ignored by the import
# system), and the membership check keeps re-runs from adding duplicates.
if str(PROJECT_PATH) not in sys.path:
    sys.path.append(str(PROJECT_PATH))
os.chdir(PROJECT_PATH)

In [2]:
import gzip
from itertools import product
import json
import pandas as pd
import re
import tidytcells

In [3]:
# Load the gzipped JSON lookup that is used further down to decode the
# 'Chain 1 Species' IDs.
species_decoder_path = Path('data') / 'iedb_species_decoder.json.gz'
with gzip.open(species_decoder_path, 'r') as decoder_file:
    species_decoder = json.load(decoder_file)

In [4]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning on load.
df = pd.read_csv(Path('data')/'iedb.csv.zip', low_memory=False)
# Keep only the rows whose response type is 'T cell'.
df = df[df['Response Type'] == 'T cell']

  df = pd.read_csv(Path('data')/'iedb.csv.zip')


In [5]:
# Preview the first rows of the T cell-filtered IEDB table.
df.head()

Unnamed: 0,Group Receptor ID,Receptor ID,Reference IRI,Epitope IRI,Description,Antigen,Organism,Response Type,Assay IDs,MHC Allele Names,...,Chain 2 CDR1 Start Curated,Chain 2 CDR1 End Curated,Chain 2 CDR1 Start Calculated,Chain 2 CDR1 End Calculated,Chain 2 CDR2 Curated,Chain 2 CDR2 Calculated,Chain 2 CDR2 Start Curated,Chain 2 CDR2 End Curated,Chain 2 CDR2 Start Calculated,Chain 2 CDR2 End Calculated
145,47,57,http://www.iedb.org/reference/1004539,http://www.iedb.org/epitope/69921,VMAPRTLIL,"HLA class I histocompatibility antigen, Cw-3 a...",Homo sapiens (human),T cell,"1548960, 1583178","HLA-E*01:01, HLA-E*01:03",...,,,25.0,29.0,,FVKESK,,,47.0,52.0
146,47,57,http://www.iedb.org/reference/1004539,http://www.iedb.org/epitope/69921,VMAPRTLIL,"HLA class I histocompatibility antigen, Cw-3 a...",Homo sapiens (human),T cell,1583178,HLA-E*01:03,...,,,25.0,29.0,,FVKESK,,,47.0,52.0
147,8493,58,http://www.iedb.org/reference/1004580,http://www.iedb.org/epitope/16878,FLRGRAYGL,nuclear antigen EBNA-3,Human herpesvirus 4 (Epstein Barr virus),T cell,"1814845, 1814846, 1814847",HLA-B8,...,,,25.0,29.0,,FQNEAQ,,,47.0,52.0
148,8493,58,http://www.iedb.org/reference/1004580,http://www.iedb.org/epitope/144889,FLRGRFYGL,,,T cell,1831737,HLA-B8,...,,,25.0,29.0,,FQNEAQ,,,47.0,52.0
149,8493,58,http://www.iedb.org/reference/1017865,http://www.iedb.org/epitope/142137,EEYLQAFTY,ATP-binding cassette sub-family D member 3,Homo sapiens (human),T cell,1778798,HLA-B*44:05,...,,,25.0,29.0,,FQNEAQ,,,47.0,52.0


In [6]:
def rename_species(species):
    """Map a species label to the short name used in this notebook.

    Parameters
    ----------
    species : object
        A species label. Anything that is not a string (for example a
        missing value) is treated as unknown.

    Returns
    -------
    str or pd.NA
        'HomoSapiens' if the label contains 'sapiens' and 'MusMusculus' if it
        contains 'musculus' (both case-insensitive, checked in that order),
        the label unchanged for any other string, and pd.NA for non-strings.
    """
    # isinstance also accepts str subclasses, which an exact type comparison
    # would wrongly classify as missing.
    if not isinstance(species, str):
        return pd.NA

    lowered = species.lower()

    if 'sapiens' in lowered:
        return 'HomoSapiens'

    if 'musculus' in lowered:
        return 'MusMusculus'

    return species

# Decode the species IDs (species_decoder is keyed by the integer ID rendered
# as a string), leaving missing values as pd.NA, then collapse the decoded
# labels to the short names returned by rename_species.
decoded_species = df['Chain 1 Species'].map(
    lambda species_id: pd.NA if pd.isna(species_id) else species_decoder[str(int(species_id))]
)
df['Chain 1 Species'] = decoded_species.map(rename_species)

In [7]:
# Every gene column is named '<source> <chain> <segment>'; enumerate all eight
# combinations in the same order itertools.product yields them.
gene_columns = [
    ' '.join(parts)
    for parts in product(
        ('Curated', 'Calculated'),
        ('Chain 1', 'Chain 2'),
        ('V Gene', 'J Gene')
    )
]

# Stack the columns into one long (gene, species) table. Each gene is paired
# with the 'Chain 1 Species' value of its row, and the original row index is
# kept, so index labels repeat across the stacked pieces.
gene_frames = []
for gene_column in gene_columns:
    gene_frame = df[[gene_column, 'Chain 1 Species']]
    gene_frames.append(
        gene_frame.rename(columns={gene_column: 'gene', 'Chain 1 Species': 'species'})
    )
tcr_genes = pd.concat(gene_frames)

# Drop the entries that have no gene recorded.
has_gene = tcr_genes['gene'].notna()
tcr_genes = tcr_genes[has_gene]

In [8]:
# Show the long-format (gene, species) table of TCR gene entries.
tcr_genes

Unnamed: 0,gene,species
145,TRAV26-1*01,HomoSapiens
415,TRAV21*01,HomoSapiens
896,TRAV26-2*01,HomoSapiens
897,TRAV26-2*01,HomoSapiens
1524,TCRAV1-2,HomoSapiens
...,...,...
209499,TRBJ1-2*01,HomoSapiens
209500,TRBJ2-7*01,HomoSapiens
209501,TRBJ1-1*01,HomoSapiens
209502,TRBJ2-3*01,HomoSapiens


In [9]:
# Build a long (gene, species) table of MHC alleles: rows without an allele
# annotation are dropped, and a cell listing several alleles separated by ', '
# becomes one row per allele (the row index is repeated for each of them).
mhc_genes = (
    df[['MHC Allele Names', 'Chain 1 Species']]
    .rename(columns={'MHC Allele Names': 'gene', 'Chain 1 Species': 'species'})
    .loc[lambda frame: frame['gene'].notna()]
    .assign(gene=lambda frame: frame['gene'].str.split(', '))
    .explode('gene')
)

In [10]:
# Show the long-format (gene, species) table of MHC allele entries.
mhc_genes

Unnamed: 0,gene,species
145,HLA-E*01:01,HomoSapiens
145,HLA-E*01:03,HomoSapiens
146,HLA-E*01:03,HomoSapiens
147,HLA-B8,HomoSapiens
148,HLA-B8,HomoSapiens
...,...,...
211479,HLA-DRB1*15:01,
211480,HLA-DRB1*15:01,
211481,HLA-DRB1*15:01,
211482,HLA-DRB1*15:01,


## Test TCR standardisation

In [11]:
def _standardise_tcr_row(row):
    """Return the standardiser's output for one (gene, species) row, or pd.NA if the gene is missing."""
    if pd.isna(row['gene']):
        return pd.NA
    return tidytcells.tcr.standardise(row['gene'], species=row['species'])


tcr_standardised = tcr_genes.apply(_standardise_tcr_row, axis=1)

# Coverage: fraction of TCR gene entries for which the standardiser returned a
# non-missing value.
tcr_standardised.notna().sum() / len(tcr_genes)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


0.9472613689293347

## Test MHC standardisation

In [12]:
def _standardise_mhc_row(row):
    """Return the standardiser's output for one (gene, species) row, or pd.NA if the gene is missing."""
    gene = row['gene']
    return pd.NA if pd.isna(gene) else tidytcells.mhc.standardise(gene, species=row['species'])


mhc_standardised = mhc_genes.apply(_standardise_mhc_row, axis=1)

# Coverage: fraction of MHC allele entries for which the standardiser returned
# a non-missing value.
mhc_standardised.notna().sum() / len(mhc_genes)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


0.9976352335631737