# **PART 3:** TCRex results & statistics

In [141]:
import os
# Run all following cells from the root of the repository
repository_dir = "/home/sebastiaan/PhD/Repositories/book_chapter/"
os.chdir(repository_dir)

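Once again, we will need the `pandas` library for handling the data. In addition, we import `scipy` and the `multipletests` function from `statsmodels` for the statistical tests.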

In [None]:
import pandas as pd
import scipy

from statsmodels.stats.multitest import multipletests

### Concatenate results

In [144]:
def read_results(folder, file):
    
    """
    Read in a TCRex results file as a pandas dataframe. 
    Ignores meta data information preceded with a '#' sign.
    
    Args:
    - folder: The folder where the TCRex results file is located
    - file: The name of the TCRex results file

    Returns:
    - A pandas DataFrame holding the tab-separated contents of the file.
    """
    # Build the path from the `file` argument itself, so the function does not
    # depend on a variable that happens to exist in the calling scope.
    path = os.path.join(folder, file)
    return pd.read_csv(path, sep = "\t", comment = "#")


def concatenate_data(folder, name):
    
    """
    Concatenate all TCRex results files in a folder into one tab-separated file.
    
    The combined table is written to '../results/parsed_results/<name>.tsv',
    resolved relative to the current working directory.
    
    Args:
    - folder: Path to the folder holding the TCRex results files that need to be concatenated.
    - name: Name (without extension) of the file the concatenated results are written to.
    
    Raises:
    - ValueError: If the folder does not contain any results files.
    """

    # Only keep regular, non-hidden files: os.listdir also returns sub-folders
    # and hidden entries (e.g. '.ipynb_checkpoints'), which cannot be parsed
    # as results files. Sorting makes the row order reproducible.
    files = sorted(
        fn for fn in os.listdir(folder)
        if not fn.startswith('.') and os.path.isfile(os.path.join(folder, fn))
        )

    # Fail with a clear message instead of the generic pandas concat error
    if not files:
        raise ValueError(f"No TCRex results files found in '{folder}'")
        
    # Read every results file and stack the dataframes
    all_results = pd.concat(
        objs = [read_results(folder, fn) for fn in files]
        )

    # Save concatenated dataframe in a new folder
    new_folder = '../results/parsed_results'
    new_file = os.path.join(new_folder, '.'.join([name,'tsv']))
    
    # Create the output folder, including missing parent folders;
    # this is a no-op when the folder already exists
    os.makedirs(new_folder, exist_ok = True)
    
    # Write results to a new file
    all_results.to_csv(new_file, sep = '\t', index = False)
    

In [146]:
# Merge every TCRex results file in the 'results/test' folder into one
# table, written out under the name 'filename'.
# NOTE(review): `base_dir` is not defined in the visible cells - confirm where it is set.
test_results_dir = os.path.join(base_dir, 'results/test')
concatenate_data(test_results_dir, 'filename')

### Get metrics

In [None]:
def identification_rate(nr_identified, repertoire_size):
    
    """
    Express the identified epitope-specific TCRs as a percentage of the repertoire.
    
    Args:
    - nr_identified: The number of identified epitope-specific TCRs 
    - repertoire_size: The number of TCRs in the original repertoire, reported on the TCRex results page
    
    Returns:
    - The share of the repertoire that was identified, on a 0-100 scale.
    """
    # Fraction of the repertoire that is epitope-specific ...
    fraction_identified = nr_identified / repertoire_size
    # ... converted to a percentage
    return fraction_identified * 100

In [1]:
def enrichment_analysis(
    nr_identified: int, 
    repertoire_size: int, 
    threshold: float = 0.001
    ) -> float:
    
    """
    Calculate the p value of a one sided binomial test.
    
    Tests whether the number of identified epitope-specific TCRs is greater
    than expected when every TCR has a probability `threshold` of being
    identified.
    
    Args:
    - nr_identified: The number of identified epitope-specific TCRs 
    - repertoire_size: The number of TCRs in the original repertoire, reported on the TCRex results page
    - threshold: Probability of success as defined in a binomial test.
    
    Returns:
    - The p value of the one sided ('greater') binomial test.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` is loaded in every SciPy version.
    from scipy import stats

    # `binom_test` was deprecated and later removed from SciPy; prefer its
    # replacement `binomtest` and only fall back on old SciPy versions.
    binomtest = getattr(stats, "binomtest", None)
    if binomtest is not None:
        return binomtest(
            k = nr_identified,
            n = repertoire_size,
            p = threshold,
            alternative = 'greater'
            ).pvalue

    return stats.binom_test(
        x = nr_identified,
        n = repertoire_size,
        p = threshold,
        alternative = 'greater'
        )

In [147]:
# Read the results
results = pd.read_csv(os.path.join('../results/parsed_results','test.tsv'), sep = '\t')

# Calculate the number of identified epitope-specific TCRs
# (every row in the results file is one identified TCR)
nr_identified = results.shape[0]  

# Define the repertoire size
repertoire_size = 100000

# Calculate the identification metrics
p = enrichment_analysis(nr_identified, repertoire_size) # p-value
ir = identification_rate(nr_identified, repertoire_size) # identification rate

# Report the identification metrics
print(f"p value: {p}")
print(f"Identification rate: {ir}")

0.037

In [9]:
import pandas as pd

# Scratch example: a one-row dataframe with the columns "a" and "b"
pd.DataFrame({"a": [0], "b": [1]})

Unnamed: 0,a,b
0,0,1


### For multiple epitopes


In [148]:
def calculate_metrics(results, repertoire_size, threshold):
    """
    Calculate the identification rate and enrichment analysis p value for every epitope in a TCRex results file.
    
    Args:
    - results: Pandas DataFrame containing the TCRex results; must have an 'epitope' column
    - repertoire_size: The number of TCRs in the original repertoire, reported on the TCRex results page
    - threshold: Probability of success as defined in a binomial test.
    
    Returns:
    - Pandas DataFrame indexed by epitope, with the columns
      'identification_rate' and 'p_value'. Epitopes appear in the order
      in which they first occur in `results`.
    """
    cols = ["identification_rate", "p_value"]
    
    # Count the epitope-specific TCRs of every epitope in a single pass.
    # sort=False keeps the order of first appearance, so the output is
    # reproducible; rows without an epitope are left out by groupby.
    counts = results.groupby('epitope', sort = False).size()
    
    # One row of metrics per epitope
    rows = [
        [
            identification_rate(int(nr_identified), repertoire_size),
            enrichment_analysis(int(nr_identified), repertoire_size, threshold),
        ]
        for nr_identified in counts
    ]
    
    # Label every row with its epitope so the metrics can be traced back
    return pd.DataFrame(rows, index = counts.index, columns = cols)

    

In [149]:
# Load the TCRex results file that covers multiple epitopes
results = read_results(os.path.join(base_dir, 'results'), 'multiple.tsv')

# Compute the identification rate and enrichment p value for every epitope
metrics_df = calculate_metrics(results, 10000, 0.0001)

metrics_df.head()

Unnamed: 0,identification_rate,p_value
QYDPVAALF,0.16,1.848163e-14
QIKVRVKMV,0.14,4.484526e-12
YSEHPTFTSQY,0.41,1.041819e-50
NLVPMVATV,27.13,0.0
IPSINVHHY,0.13,6.317656e-11
VTEHDTLLY,0.96,2.394793e-151
TPRVTGGGAM,0.27,3.3911620000000003e-29


#### Multiple testing correction

In [150]:
# Adjust the per-epitope p values for multiple testing ('fdr_bh' method)
correction = multipletests(
    pvals = metrics_df['p_value'],
    method = 'fdr_bh',
    is_sorted = False
    )

# The second element of the returned tuple holds the adjusted p values
p_adj = correction[1]

metrics_df['adjusted_p_value'] = p_adj
metrics_df.head()