In [15]:
import pickle
import pandas as pd
from scipy.stats import fisher_exact
from tqdm import tqdm

Load data

In [2]:
path_to_load = "../../02-Data/clustering/gmm_results.pkl"

with open(path_to_load, 'rb') as file:
    gmm_results = pickle.load(file)
gmm_results[0][:2]

Unnamed: 0,slopes,predictions,percentages,counts,file_id
0,0.074953,1,13.385827,178589,P00030
1,0.044891,0,7.874016,175402,P00078


In [3]:
emerson = pd.read_csv('../../02-Data/data_preparation/processed_files/metadata_merged_full.tsv', sep='\t')
emerson[:1]

Unnamed: 0.1,Unnamed: 0,sample_name,species,locus,product_subtype,hla_class_i,hla_class_ii,sample_amount_ng,sample_cells_mass_estimate,counting_method,...,HLA-DQAB*01:03_03:01,HLA-DQAB*01:02_03:02,HLA-DQAB*02:01_06:09,HLA-DQAB*02:01_06:02,HLA-DQAB*02:01_06:03,HLA-DQAB*01:03_06:01,HLA-DQAB*01:03_06:02,HLA-DPAB*01:03_10:01,HLA-DQAB*01:02_04:02,HLA-DPAB*02:02_05:01
0,0,P00030,Human,TCRB,Deep,,,1165.71997,179341.0,v2,...,False,False,False,False,False,False,False,False,False,False


#### Join each of the points in the clusters with its HLA info


In [4]:
emerson.set_index('sample_name', inplace=True) # Set 'SampleID' as the index to facilitate row-wise operations

# Select the columns from index 18 to 190 (both inclusive). This leaves out the combined TCRs. Change to 287 if want to include them
selected_columns = emerson.iloc[:, 17:190]

patient_hlas = {}
for index, row in selected_columns.iterrows():
    true_columns = row.index[row == True].tolist()
    patient_hlas[index] = true_columns

for df in gmm_results:
    df['HLAs'] = df['file_id'].map(patient_hlas)

gmm_results[0][:2]

Unnamed: 0,slopes,predictions,percentages,counts,file_id,HLAs
0,0.074953,1,13.385827,178589,P00030,"[HLA-A*02:01, HLA-A*24:02, HLA-B*51:01, HLA-B*..."
1,0.044891,0,7.874016,175402,P00078,"[HLA-A*26:01, HLA-A*02:01, HLA-B*49:01, HLA-B*..."


#### Perform fisher exact test for each of the HLAs in each of the clusters

In [18]:

# Assuming gmm_results is a list of DataFrames
# A threshold for statistical significance
p_value_threshold = 0.05

fisher_results = {}

# To store HLAs significantly associated with predictions
significant_hlas = {}

for cluster_index, cluster in tqdm(enumerate(gmm_results), total=len(gmm_results), desc="Processing Clusters"):
    fisher_cluster = {}
    unique_hlas = set()

    # Iterate through each entry in the 'HLAs' column
    for hla_list in cluster['HLAs']:
        # Add each HLA in the list to the set of unique HLAs
        unique_hlas.update(hla_list)

    for chosen_hla in tqdm(unique_hlas, desc=f"Processing HLAs for cluster {cluster_index}", leave=False):
        contingency_table = [[0, 0], [0, 0]]  # [[HLA present & prediction 0, HLA present & prediction 1], [HLA absent & prediction 0, HLA absent & prediction 1]]

        for index, row in cluster.iterrows():
            hla_present = chosen_hla in row['HLAs']
            prediction = row['predictions']
            
            if hla_present:
                if prediction == 0:
                    contingency_table[0][0] += 1
                else:
                    contingency_table[0][1] += 1
            else:
                if prediction == 0:
                    contingency_table[1][0] += 1
                else:
                    contingency_table[1][1] += 1
        
        # Perform Fisher's exact test on the contingency table
        _, p_value = fisher_exact(contingency_table)
        
        # Store the results
        fisher_cluster[chosen_hla] = p_value

        # Check for significance and store if significant
        if p_value < p_value_threshold:
            if cluster_index not in significant_hlas:
                significant_hlas[cluster_index] = []
            significant_hlas[cluster_index].append((chosen_hla, p_value))

    # Use the cluster index as the key
    fisher_results[cluster_index] = fisher_cluster

# Print significant HLAs
for cluster_index, hla_list in significant_hlas.items():
    print(f"Cluster {cluster_index}: Significant HLAs with p-value < {p_value_threshold}")
    for hla, p_value in hla_list:
        print(f"  HLA: {hla}, p-value: {p_value}")

# To print all results, you can uncomment the line below
# print(fisher_results)

Processing Clusters: 100%|██████████| 20/20 [02:08<00:00,  6.44s/it]

Cluster 0: Significant HLAs with p-value < 0.05
  HLA: HLA-DPB1*02:02, p-value: 0.03414490325800958
  HLA: HLA-DQB1*06:03, p-value: 0.00499743171306426
  HLA: HLA-DQA1*01:03, p-value: 0.03581774908369885
  HLA: HLA-A*30:02, p-value: 0.010645454064427224
  HLA: HLA-DRB1*13:01, p-value: 0.010486453728465413
Cluster 1: Significant HLAs with p-value < 0.05
  HLA: HLA-C*05:01, p-value: 0.04854656687123618
  HLA: HLA-B*51:01, p-value: 0.027518579648552338
  HLA: HLA-C*14:02, p-value: 0.003551528368457256
  HLA: HLA-DRB1*11:01, p-value: 0.04649364593964033
  HLA: HLA-DRB1*08:01, p-value: 0.012605137194238497
  HLA: HLA-DQB1*03:03, p-value: 0.013463777054119481
  HLA: HLA-DQA1*04:01, p-value: 0.027934283707440347
  HLA: HLA-A*11:01, p-value: 0.019202599535451313
Cluster 2: Significant HLAs with p-value < 0.05
  HLA: HLA-DRB1*13:01, p-value: 0.04663436111963842
Cluster 3: Significant HLAs with p-value < 0.05
  HLA: HLA-C*16:01, p-value: 0.0393941854303108
  HLA: HLA-DQA1*01:03, p-value: 0.00809




In [15]:
# Just as info
hla_columns = ["HLA-A*33:01", "HLA-A*33:03", "HLA-A*26:01", "HLA-A*31:01", "HLA-A*11:01",
               "HLA-A*23:01", "HLA-A*03:01", "HLA-A*24:03", "HLA-A*34:01", "HLA-A*03:02",
               "HLA-A*25:01", "HLA-A*32:01", "HLA-A*01:01", "HLA-A*29:02", "HLA-A*29:01",
               "HLA-A*66:01", "HLA-A*02:01", "HLA-A*24:02", "HLA-A*02:05", "HLA-A*02:06",
               "HLA-A*30:01", "HLA-A*30:02", "HLA-A*68:01", "HLA-A*68:02", "HLA-B*55:01",
               "HLA-B*38:01", "HLA-B*38:02", "HLA-B*15:17", "HLA-B*50:01", "HLA-B*45:01",
               "HLA-B*51:01", "HLA-B*57:01", "HLA-B*49:01", "HLA-B*14:02", "HLA-B*52:01",
               "HLA-B*48:01", "HLA-B*58:01", "HLA-B*40:02", "HLA-B*40:01", "HLA-B*40:06",
               "HLA-B*18:01", "HLA-B*27:05", "HLA-B*15:03", "HLA-B*39:06", "HLA-B*15:01",
               "HLA-B*13:02", "HLA-B*39:01", "HLA-B*15:07", "HLA-B*14:01", "HLA-B*44:03",
               "HLA-B*44:02", "HLA-B*08:01", "HLA-B*15:18", "HLA-B*07:02", "HLA-B*41:01",
               "HLA-B*41:02", "HLA-B*35:08", "HLA-B*07:05", "HLA-B*35:03", "HLA-B*35:02",
               "HLA-B*35:01", "HLA-B*56:01", "HLA-B*37:01", "HLA-B*53:01", "HLA-C*12:02",
               "HLA-C*12:03", "HLA-C*06:02", "HLA-C*05:01", "HLA-C*14:02", "HLA-C*04:01",
               "HLA-C*03:04", "HLA-C*03:02", "HLA-C*03:03", "HLA-C*01:02", "HLA-C*15:05",
               "HLA-C*15:02", "HLA-C*08:03", "HLA-C*08:02", "HLA-C*08:01", "HLA-C*16:01",
               "HLA-C*16:02", "HLA-C*02:02", "HLA-C*07:02", "HLA-C*17:01", "HLA-C*07:01",
               "HLA-C*07:04", "HLA-DRB1*12:01", "HLA-DRB1*12:02", "HLA-DRB1*11:01", "HLA-DRB1*11:03",
               "HLA-DRB1*11:02", "HLA-DRB1*11:04", "HLA-DRB1*04:03", "HLA-DRB1*04:02", "HLA-DRB1*04:01",
               "HLA-DRB1*14:01", "HLA-DRB1*04:07", "HLA-DRB1*14:07", "HLA-DRB1*14:04", "HLA-DRB1*04:04",
               "HLA-DRB1*04:08", "HLA-DRB1*09:01", "HLA-DRB1*03:01", "HLA-DRB1*13:03", "HLA-DRB1*14:02",
               "HLA-DRB1*01:01", "HLA-DRB1*01:02", "HLA-DRB1*01:03", "HLA-DRB1*13:05", "HLA-DRB1*15:01",
               "HLA-DRB1*13:02", "HLA-DRB1*13:01", "HLA-DRB1*15:02", "HLA-DRB1*08:04", "HLA-DRB1*08:03",
               "HLA-DRB1*15:03", "HLA-DRB1*08:01", "HLA-DRB1*16:01", "HLA-DRB1*16:02", "HLA-DRB1*07:01",
               "HLA-DRB1*04:05", "HLA-DRB1*10:01", "HLA-DRB1*14:54", "HLA-DRB1*08:02", "HLA-DQA1*01:04",
               "HLA-DQA1*01:05", "HLA-DQA1*04:01", "HLA-DQA1*01:01", "HLA-DQA1*01:02", "HLA-DQA1*01:03",
               "HLA-DQA1*06:01", "HLA-DQA1*03:02", "HLA-DQA1*03:03", "HLA-DQA1*03:01", "HLA-DQA1*02:01",
               "HLA-DQA1*05:01", "HLA-DQA1*05:05", "HLA-DQB1*04:02", "HLA-DQB1*04:01", "HLA-DQB1*05:03",
               "HLA-DQB1*06:09", "HLA-DQB1*06:04", "HLA-DQB1*03:05", "HLA-DQB1*03:02", "HLA-DQB1*03:03",
               "HLA-DQB1*06:03", "HLA-DQB1*06:02", "HLA-DQB1*03:01", "HLA-DQB1*06:01", "HLA-DQB1*02:01",
               "HLA-DQB1*05:01", "HLA-DQB1*05:02", "HLA-DQB1*02:02", "HLA-DPA1*02:01", "HLA-DPA1*01:04",
               "HLA-DPA1*02:02", "HLA-DPA1*01:03", "HLA-DPB1*04:02", "HLA-DPB1*04:01", "HLA-DPB1*14:01",
               "HLA-DPB1*01:01", "HLA-DPB1*17:01", "HLA-DPB1*15:01", "HLA-DPB1*02:02", "HLA-DPB1*06:01",
               "HLA-DPB1*13:01", "HLA-DPB1*03:01", "HLA-DPB1*02:01", "HLA-DPB1*104:01", "HLA-DPB1*16:01",
               "HLA-DPB1*11:01", "HLA-DPB1*05:01", "HLA-DPB1*10:01"]