### Setup

In [1]:
import os
from pathlib import Path
import sys

def set_current_directory_of_notebook_interpreter_to_project_root():
    """Make the project root the working directory and put it on ``sys.path``.

    The project root is taken to be two levels above the working directory at
    the time of the first call. It is cached in the module-level name
    ``__project_dir__`` so that calling this function again (for example by
    re-running the cell) reuses the same directory instead of climbing two
    more levels up from the already-changed working directory.
    """
    # Without this declaration the assignment below would make the name local
    # to the function: the cache would never be written, and a pre-existing
    # global value would trigger an UnboundLocalError further down.
    global __project_dir__

    if "__project_dir__" not in globals():
        __project_dir__ = Path.cwd().parents[1].resolve()

    # Only strings are honoured on sys.path; other objects are ignored by the
    # import machinery, so convert the path before appending it.
    project_dir_as_string = str(__project_dir__)
    if project_dir_as_string not in sys.path:
        sys.path.append(project_dir_as_string)

    os.chdir(__project_dir__)

# Change the kernel's working directory to the project root and add that
# directory to sys.path.
set_current_directory_of_notebook_interpreter_to_project_root()

In [2]:
import math
import pandas as pd
from pandas import DataFrame

### Load data

In [3]:
# Read the two preprocessed TCR tables.
# NOTE(review): both paths are absolute and specific to one machine; consider
# building them from a configurable data directory instead.
tanno_dataframe = pd.read_csv(
    "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/preprocessed/tanno/combined.csv"
)
minervina_dataframe = pd.read_csv(
    "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/preprocessed/minervina/preprocessed.csv"
)

In [4]:
# Preview the first rows of the Tanno table.
tanno_dataframe.head()

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,clone_count
0,TRAV1-1,CAAANQAGTALIF,TRAJ15,TRBV6-1,CASSDGVAETQYF,TRBJ2-5,,,,1
1,TRAV1-1,CAAANTGFQKLVF,TRAJ8,TRBV27,CASSPLQGYEQYF,TRBJ2-7,,,,1
2,TRAV1-1,CAAAPGFQKLVF,TRAJ8,TRBV7-2,CASSPTVSQETQYF,TRBJ2-5,,,,1
3,TRAV1-1,CAAAQAGTALIF,TRAJ15,TRBV30,CAWRDVAGAEKLFF,TRBJ1-4,,,,1
4,TRAV1-1,CAAAQGGKLIF,TRAJ23,TRBV7-9,CASTGSNYGYTF,TRBJ1-2,,,,1


In [5]:
# Preview the first rows of the Minervina table.
minervina_dataframe.head()

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,clone_count
0,TRAV1-1,CAAEMNFNKFYF,TRAJ21,TRBV27,CASSSTVTLAKNIQYF,TRBJ2-4,VYF,HLA-A*24,,1
1,TRAV1-1,CAGLEALLNDMRF,TRAJ43,TRBV28,CASERANTGELFF,TRBJ2-2,TTD,HLA-A*01,,1
2,TRAV1-1,CAGTDTGGFKTIF,TRAJ9,TRBV5-1,CASSSTDGNTGELFF,TRBJ2-2,FTS,HLA-A*01,,1
3,TRAV1-1,CAMPTDKLIF,TRAJ34,TRBV15,CATSPGLAGGETGELFF,TRBJ2-2,VYF,HLA-A*24,,1
4,TRAV1-1,CAPDSNYQLIW,TRAJ33,TRBV7-9,CASSLESADRAHYGYTF,TRBJ1-2,VYI,HLA-A*24,,1


### Paired chain exact coincidences

In [6]:
def get_exact_coincidence_probability(dataframe: DataFrame) -> float:
    """Return the share of same-epitope TCR pairs that are exact coincidences.

    The numerator counts pairs drawn from within a single row (weighted by
    ``clone_count``); the denominator counts all pairs that can be drawn
    within each ``Epitope`` group. Each row is treated as one distinct
    TCR/epitope combination.
    """
    coincident_pairs = get_num_exact_coincidences_without_replacement(dataframe)
    candidate_pairs = get_total_num_pairs_without_replacement(dataframe)
    return coincident_pairs / candidate_pairs


def get_total_num_pairs_without_replacement(dataframe: DataFrame) -> int:
    """Count the unordered TCR pairs that share an ``Epitope`` value.

    ``clone_count`` is summed per epitope group (rows with a missing epitope
    form a group of their own) and ``n choose 2`` is accumulated over the
    group totals.
    """
    clone_totals_per_epitope = dataframe.groupby("Epitope", dropna=False)["clone_count"].sum()
    pairs_per_epitope = clone_totals_per_epitope.map(
        lambda group_size: math.comb(group_size, 2)
    )
    return pairs_per_epitope.sum()


def get_num_exact_coincidences_without_replacement(dataframe: DataFrame) -> int:
    """Count the unordered pairs of identical TCRs, row by row.

    Every row contributes ``clone_count choose 2`` pairs; rows are not merged
    first, so each row is treated as its own TCR/epitope combination.
    """
    clone_counts = dataframe["clone_count"]
    coincidences_per_row = clone_counts.map(lambda count: math.comb(count, 2))
    return coincidences_per_row.sum()

In [7]:
# Exact-coincidence probability (Pc) of the paired-chain tables, and the ratio
# of the Minervina value to the Tanno value.
tanno_ab_pc, minervina_ab_pc = (
    get_exact_coincidence_probability(tanno_dataframe),
    get_exact_coincidence_probability(minervina_dataframe),
)
ab_enrichment = minervina_ab_pc / tanno_ab_pc

print(
    f"tanno AB Pc:     {tanno_ab_pc}",
    f"minervina AB Pc: {minervina_ab_pc}",
    f"AB enrichment:   {ab_enrichment}",
    sep="\n",
)

tanno AB Pc:     7.307721481634151e-08
minervina AB Pc: 3.01908947048422e-05
AB enrichment:   413.13690978396346


### Beta chain only exact coincidences

In [8]:
def generate_unique_beta_only_dataframe_from_paired_dataframe(paired_dataframe: DataFrame) -> DataFrame:
    """Collapse a paired-chain table to one row per beta chain and epitope.

    Rows sharing the same ``TRBV``, ``CDR3B``, ``TRBJ`` and ``Epitope`` are
    merged and their ``clone_count`` values are added up. Missing values in
    the key columns are kept as a group rather than dropped.
    """
    beta_chain_and_epitope_columns = ["TRBV", "CDR3B", "TRBJ", "Epitope"]
    rows_per_beta_chain = paired_dataframe.groupby(
        by=beta_chain_and_epitope_columns,
        as_index=False,
        dropna=False,
    )
    return rows_per_beta_chain.agg({"clone_count": "sum"})

In [9]:
# Reduce both paired-chain tables to beta-chain-only tables.
tanno_beta_dataframe, minervina_beta_dataframe = (
    generate_unique_beta_only_dataframe_from_paired_dataframe(tanno_dataframe),
    generate_unique_beta_only_dataframe_from_paired_dataframe(minervina_dataframe),
)

In [10]:
# Preview the first rows of the beta-chain-only Tanno table.
tanno_beta_dataframe.head()

Unnamed: 0,TRBV,CDR3B,TRBJ,Epitope,clone_count
0,TRBV10-1,CAETAGQGVYEQYF,TRBJ2-7,,1
1,TRBV10-1,CAEVGGRSYEQYF,TRBJ2-7,,1
2,TRBV10-1,CAFFGAGANVLTF,TRBJ2-6,,1
3,TRBV10-1,CAIFRDGHQDGKLFF,TRBJ1-4,,1
4,TRBV10-1,CAIILAGPSYEQYF,TRBJ2-7,,1


In [11]:
# Preview the first rows of the beta-chain-only Minervina table.
minervina_beta_dataframe.head()

Unnamed: 0,TRBV,CDR3B,TRBJ,Epitope,clone_count
0,TRBV10-1,CASASELADTDTQYF,TRBJ2-3,TTD,1
1,TRBV10-1,CASSDRGLAGTSSYEQYF,TRBJ2-7,VYF,1
2,TRBV10-1,CASSDSGFGYGYTF,TRBJ1-2,LTD,1
3,TRBV10-1,CASSDVSGENNSPLHF,TRBJ1-6,ALS,1
4,TRBV10-1,CASSEFQGESTDTQYF,TRBJ2-3,TTD,1


In [12]:
# Exact-coincidence probability (Pc) of the beta-chain-only tables, and the
# ratio of the Minervina value to the Tanno value.
tanno_b_pc, minervina_b_pc = (
    get_exact_coincidence_probability(tanno_beta_dataframe),
    get_exact_coincidence_probability(minervina_beta_dataframe),
)
b_enrichment = minervina_b_pc / tanno_b_pc

print(
    f"tanno B Pc:     {tanno_b_pc}",
    f"minervina B Pc: {minervina_b_pc}",
    f"B enrichment:   {b_enrichment}",
    sep="\n",
)

tanno B Pc:     1.732526944107104e-07
minervina B Pc: 0.0003111984531114504
B enrichment:   1796.2113326429874


### Pairing signal within Minervina

In [13]:
def get_exact_coincidence_probability_disregarding_epitopes(dataframe: DataFrame) -> float:
    """Return the share of all TCR pairs that are exact coincidences.

    Unlike the epitope-aware variant, pairs are drawn from the whole table:
    the denominator is every pair among all TCRs, and the numerator merges
    rows that differ only in their epitope before counting identical pairs.
    """
    coincident_pairs = get_num_exact_coincidences_without_replacement_disregarding_epitopes(dataframe)
    candidate_pairs = get_total_num_pairs_without_replacement_disregarding_epitopes(dataframe)
    return coincident_pairs / candidate_pairs


def get_total_num_pairs_without_replacement_disregarding_epitopes(dataframe: DataFrame) -> int:
    """Count the unordered pairs among all TCRs in the table.

    The table size is the sum of ``clone_count`` over every row; the result is
    that total choose 2.
    """
    return math.comb(dataframe["clone_count"].sum(), 2)


def get_num_exact_coincidences_without_replacement_disregarding_epitopes(dataframe: DataFrame) -> int:
    """Count the unordered pairs of identical TCRs, ignoring the epitope.

    Rows sharing all six chain columns are merged (adding up ``clone_count``)
    regardless of their ``Epitope``; each merged TCR then contributes
    ``clone_count choose 2`` pairs.
    """
    tcr_identity_columns = ["TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ"]
    clone_totals_per_tcr = dataframe.groupby(
        by=tcr_identity_columns,
        as_index=False,
        dropna=False,
    ).agg({"clone_count": "sum"})
    coincidences_per_tcr = clone_totals_per_tcr["clone_count"].map(
        lambda count: math.comb(count, 2)
    )
    return coincidences_per_tcr.sum()

In [17]:
# Ratio of the epitope-aware paired-chain Pc to the Pc computed over the whole
# Minervina table with epitope labels ignored.
minervina_ab_pc_disregarding_epitopes = get_exact_coincidence_probability_disregarding_epitopes(
    minervina_dataframe
)

print(minervina_ab_pc / minervina_ab_pc_disregarding_epitopes)

4.524993724557368


6.672030182272894e-06