# Create MHC + CH status dataframe  
- Barbara Walkowiak (bw450)
- 2024-06-19, modified 2024-09-07

- This script is used to generate a dataframe with MHC and CH data for all individuals screened for CH to be used for later analysis (main + supplementary figures)

- Dataframe contents
    - Person ID for everyone screened for CH
    - their MHC genotype (df is filtered to only include people w/ confidently imputed MHC genotype)
    - MHC-associated parameters (heterozygosity status for each locus, overall, nr of alleles) 
    - CH status (gene_var = variant they carry, or NaN if they don't carry a variant) 
    - VAF for the variant they carry (or NaN if no CH) + read depth (total) + var read depth (nr reads with the variant)
    - MHCI-based score for each variant (40 examined variants for now)
    - MHCII-based score for each variant (40 examined variants for now) 
    - median score for each variant (the same for everyone because there is only one median score for a given variant)
    - group for each of the 40 variants (top binding vs bottom binding; population split in half) 

# Set up

In [39]:
# IMPORTS

import random
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib.patches import Polygon
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from matplotlib import cm
import scipy.special
from scipy import integrate
import scipy.integrate as it
from scipy.interpolate import interp1d
from scipy.stats import kde
import copy
import glob, os
import re
# from sklearn import datasets, linear_model
import pandas as pd
from decimal import *
from operator import itemgetter    
from collections import OrderedDict
import timeit
import time 
import csv
import seaborn as sns 
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from scipy.stats import kruskal
from scipy.stats import mannwhitneyu
from matplotlib.backends.backend_pdf import PdfPages
import statsmodels.api as sm
import plotly.express as px
import kaleido


In [40]:
# Plotting font: render all sans-serif text in Helvetica
plt.rcParams['font.sans-serif'] = 'Helvetica'

# Silence every warning for the rest of the notebook
# NOTE(review): this blanket filter also hides pandas warnings (e.g. SettingWithCopyWarning) raised by later cells
import warnings
warnings.filterwarnings(action="ignore")

# Date stamp of the current run, formatted YYYYMMDD
timestr = time.strftime("%Y%m%d")

# GOAL: df with everyone screened for CH: ID, MHC I / II, CH status, age

# Determine IDs of participants who were screened for CH
- here, find IDs of EVERYONE screened to determine if they had CH or not

In [41]:

# Collect the IDs of everyone screened for CH, one TSV file per screening batch
folder_path = '/Users/barbarawalkowiak/Desktop/msc_thesis/data/ukb_hotspot_calls/batch_ids'  # Path to ukb files from Hamish 

# Identify all tsv files in the folder
files_ids = glob.glob(os.path.join(folder_path, '*.tsv'))  

# Read each batch file into a dict keyed by its file path
indexes = {}

for file in files_ids:

    # `batch_ids` rather than `id`, so the builtin id() is not shadowed for the rest of the notebook
    batch_ids = pd.read_csv(file, sep = '\t')
    batch_ids = batch_ids.rename(columns={'batch ID': 'sample_ID'})
    # keep only the part of the identifier before the first underscore
    batch_ids['sample_ID'] = batch_ids['sample_ID'].str.split('_', n = 1).str[0]
    indexes[file] = batch_ids

# Expose each batch dataframe as a module-level variable named after its file
for file_name, df in indexes.items():

    # take the file name from the path itself instead of a hard-coded position in the path,
    # so the cell keeps working if the data folder is moved to a different depth
    variable_name = os.path.basename(file_name).split('.')[0] # get rid of tsv 
    globals()[variable_name] = df 

print('Number of data batches screened for CH:', len(indexes)) # should be 51, as 51 batches were screened

# Align all batches on sample_ID; the union of the indexes is everyone who was screened
indexes = { k: v.set_index('sample_ID') for k, v in indexes.items()}

indexes_df = pd.concat(indexes, axis=1)
indexes_df.columns = indexes_df.columns.droplevel(-1)
indexes_df = indexes_df.reset_index()

# Person IDs as integers so they can be matched against the numeric Person_ID column of the other tables
indexes_list = [int(sample_id) for sample_id in indexes_df['sample_ID'].tolist()]
print('Number of people who were screened for CH:', len(indexes_list))
print('Person_IDs:', indexes_list[0:4]) # check that Person IDs are in the correct format (numeric)


Number of data batches screened for CH: 51
Number of people who were screened for CH: 454800
Person_IDs: [1876197, 1866600, 1891748, 1874461]


# Load the CH variant data
- add CH status (CH-positive if >= 2 reads support a variant at one of the pre-defined hotspot positions; the TP53 calls use a minimum of 3 reads)
- presence of > 1 hotspot mutation is allowed (i.e., possible for someone to carry 2 or more variants)

In [42]:

# Load every call at a CH hotspot position (singletons and higher read numbers, all changes at an affected position)

# Input tables (files received from Hamish MacGregor)
df_all_variants = pd.read_csv(
    '/Users/barbarawalkowiak/Desktop/msc_thesis/data/ukb_healthy_curated_variants_not_excluding_multi_individuals.tsv',
    sep='\t',
)
# TP53 was called separately and lives in its own file
df_tp53_variants = pd.read_csv(
    '/Users/barbarawalkowiak/Desktop/msc_thesis/data/TP53_putative_somatic.tsv',
    sep='\t',
)

# Keep only calls whose 'overlap' flag is False, i.e. drop individuals with mCAs (most are in JAK2)
df_all_variants = df_all_variants.loc[df_all_variants['overlap'].eq(False)]

# Sanity check on the read-support filter already applied upstream:
# >= 2 variant reads in the main table; TP53 uses 3 because the prior on a call being real CH is lower
print('Minimum number of reads in the sample:', min(df_all_variants['var_depth']))
print('Minimum number of reads in the sample (TP53):', min(df_tp53_variants['var_depth']))

# Number of SAMPLES (rows) and INDIVIDUALS (distinct sample_ID) with a change at a hotspot POSITION
for label, calls in (('CH', df_all_variants), ('TP53', df_tp53_variants)):
    print(f'Number of samples with >=2 reads with a change in {label}-hotspot POSITION:', len(calls))
    print(f'Number of individuals with >=2 reads with a change in {label}-hotspot POSITION:', calls['sample_ID'].nunique(dropna=False))

Minimum number of reads in the sample: 2
Minimum number of reads in the sample (TP53): 3
Number of samples with >=2 reads with a change in CH-hotspot POSITION: 11310
Number of individuals with >=2 reads with a change in CH-hotspot POSITION: 10931
Number of samples with >=2 reads with a change in TP53-hotspot POSITION: 226
Number of individuals with >=2 reads with a change in TP53-hotspot POSITION: 226


In [43]:
# Filtering: only keep TP53 variants that are common enough to analyse

# Count carriers per TP53 variant.
# The columns are named explicitly ('varID', 'count') so the lookups below do not depend on
# how the installed pandas version labels the output of value_counts().
tp53_counts = (
    df_tp53_variants['varID']
    .value_counts()
    .rename_axis('varID')
    .reset_index(name='count')
)

# NOTE(review): this threshold is strictly greater than 15 carriers, whereas the non-TP53 filter
# later in the notebook uses >= 15 - confirm which cut-off is intended
tp53_hotspots = tp53_counts.loc[tp53_counts['count'] > 15, 'varID'].tolist()

# subset to only include individuals positive for the common hotspots  
df_tp53_hotspots = df_tp53_variants[df_tp53_variants['varID'].isin(tp53_hotspots)]


In [44]:
# The varID column of df_all_variants holds the variant annotation (e.g. 'DNMT3A R882H')

# Check which variants were identified (non-TP53 variants plus the common TP53 hotspots)
print('Number of CH variants identified:', len(df_all_variants['varID'].unique()) + len(tp53_hotspots))
print('All CH variants identified in the dataset:', df_all_variants['varID'].sort_values().unique(), tp53_hotspots)

# Table with the number of calls per variant (not including TP53 here - this has already been filtered).
# The columns are named explicitly ('varID', 'count') so that later lookups of ch_var_counts['count']
# do not depend on how the installed pandas version labels the output of value_counts().
ch_var_counts = (
    df_all_variants['varID']
    .value_counts()
    .rename_axis('varID')
    .reset_index(name='count')
)
ch_var_counts

Number of CH variants identified: 76
All CH variants identified in the dataset: ['DNMT3A L859F' 'DNMT3A L859L' 'DNMT3A P904L' 'DNMT3A P904Q'
 'DNMT3A P904R' 'DNMT3A R320*' 'DNMT3A R320G' 'DNMT3A R320R'
 'DNMT3A R326C' 'DNMT3A R326G' 'DNMT3A R326S' 'DNMT3A R598*'
 'DNMT3A R598G' 'DNMT3A R598R' 'DNMT3A R729G' 'DNMT3A R729R'
 'DNMT3A R729W' 'DNMT3A R736C' 'DNMT3A R736G' 'DNMT3A R736H'
 'DNMT3A R736L' 'DNMT3A R736P' 'DNMT3A R736S' 'DNMT3A R771*'
 'DNMT3A R771G' 'DNMT3A R771R' 'DNMT3A R882C' 'DNMT3A R882G'
 'DNMT3A R882H' 'DNMT3A R882L' 'DNMT3A R882P' 'DNMT3A R882S'
 'DNMT3A Y735C' 'DNMT3A Y735F' 'DNMT3A Y735S' 'GNB1 K57*' 'GNB1 K57E'
 'IDH1 R132H' 'IDH1 R132L' 'IDH2 R140L' 'IDH2 R140P' 'IDH2 R140Q'
 'IDH2 R172K' 'IDH2 R172M' 'JAK2 V617F' 'JAK2 V617I' 'KIT D816G'
 'KIT D816N' 'KIT D816V' 'KRAS G12A' 'KRAS G12C' 'KRAS G12D' 'KRAS G12R'
 'KRAS G12S' 'KRAS G12V' 'MPL W515*' 'MPL W515L' 'MPL W515S' 'NPM1 L287L'
 'NPM1 W288G' 'NPM1 W288R' 'NRAS G12A' 'NRAS G12C' 'NRAS G12D' 'NRAS G12R'
 'NRAS G1

Unnamed: 0,varID,count
0,DNMT3A R882H,1860
1,DNMT3A R882C,1109
2,DNMT3A Y735C,873
3,DNMT3A R736H,647
4,DNMT3A P904L,516
...,...,...
69,NRAS G12R,1
70,KIT D816G,1
71,DNMT3A L859L,1
72,SF3B1 K666K,1


In [45]:
# Synonymous changes at hotspot positions: they were called because of a nucleotide-level difference,
# but are clearly not hotspots / drivers, so they can be excluded
syn_variants = [
    'DNMT3A L859L',
    'DNMT3A R320R',
    'DNMT3A R598R',
    'DNMT3A R729R',
    'DNMT3A R771R',
    'NPM1 L287L',
    'SF3B1 K666K',
]
# calls carrying one of the synonymous variants
df_syn_variants = df_all_variants.loc[df_all_variants['varID'].isin(syn_variants)]

In [46]:
# Common variants: at least 15 calls in the dataset and not synonymous
common_ch_variants = [
    variant
    for variant in ch_var_counts.loc[ch_var_counts['count'] >= 15, 'varID']
    if variant not in syn_variants
]
# all calls carrying one of the common variants
df_common_variants = df_all_variants.loc[df_all_variants['varID'].isin(common_ch_variants)]

print('Number of common (>= 15 cases) CH variants identified in the dataset:', df_common_variants['varID'].nunique(dropna=False))
print('All common (>= 15 cases) CH variants identified in the dataset:', df_common_variants['varID'].sort_values().unique())

df_common_variants

Number of common (>= 15 cases) CH variants identified in the dataset: 38
All common (>= 15 cases) CH variants identified in the dataset: ['DNMT3A P904L' 'DNMT3A P904Q' 'DNMT3A P904R' 'DNMT3A R320*'
 'DNMT3A R326C' 'DNMT3A R326G' 'DNMT3A R326S' 'DNMT3A R598*'
 'DNMT3A R729G' 'DNMT3A R729W' 'DNMT3A R736C' 'DNMT3A R736G'
 'DNMT3A R736H' 'DNMT3A R736L' 'DNMT3A R736S' 'DNMT3A R771*'
 'DNMT3A R882C' 'DNMT3A R882H' 'DNMT3A R882L' 'DNMT3A R882P'
 'DNMT3A R882S' 'DNMT3A Y735C' 'DNMT3A Y735F' 'DNMT3A Y735S' 'GNB1 K57E'
 'IDH1 R132H' 'IDH2 R140Q' 'IDH2 R172K' 'JAK2 V617F' 'KRAS G12D'
 'KRAS G12S' 'MPL W515L' 'NRAS G12D' 'SF3B1 K666N' 'SF3B1 K700E'
 'SRSF2 P95H' 'SRSF2 P95L' 'SRSF2 P95R']


Unnamed: 0,sample_ID,chromosome,position,end_position,ref,alt,depth,var_depth,VAF,variant_type,...,VARIANT_CLASS,varID,overlap,Sex_v0,Age.when.attended.assessment.centre_v0,first.diagnosed.cancer.date1,first.diagnosed.cancer.age1,first.diagnosed.cancer.type1,Date.of.death_v0,Age.at.death_v0
0,1000130,chr2,25234373,25234373,C,T,81,2,0.024691,SNV,...,SNV,DNMT3A R882H,False,Female,41,,,,,
1,1000559,chr2,25234374,25234374,G,A,56,2,0.035714,SNV,...,SNV,DNMT3A R882C,False,Male,68,,,,,
2,1000724,chr2,25240313,25240313,G,A,98,2,0.020408,SNV,...,SNV,DNMT3A R771*,False,Female,62,,,,,
3,1001048,chr2,25247629,25247629,G,C,88,2,0.022727,SNV,...,SNV,DNMT3A R326G,False,Female,43,,,,,
4,1001150,chr2,25247647,25247647,G,A,74,2,0.027027,SNV,...,SNV,DNMT3A R320*,False,Male,67,2012-12-04,72.5,C92,2013-06-12,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11492,6022096,chr2,25244214,25244214,G,A,113,2,0.017699,SNV,...,SNV,DNMT3A R598*,False,Male,41,,,,,
11493,6024372,chr9,5073770,5073770,G,T,20,2,0.100000,SNV,...,SNV,JAK2 V617F,False,Male,47,,,,,
11495,6025111,chr2,25244214,25244214,G,A,89,2,0.022472,SNV,...,SNV,DNMT3A R598*,False,Female,62,,,,,
11496,6025356,chr2,25247629,25247629,G,T,152,2,0.013158,SNV,...,SNV,DNMT3A R326S,False,Male,55,,,,,


In [47]:

# Summary counts for the common non-TP53 CH variants: samples = rows, individuals = distinct sample_ID
print('Number of samples with common CH variants (>=2 reads, non-TP53):', df_common_variants.shape[0])
print('Number of individuals with common CH variants (>=2 reads, non-TP53):', len(df_common_variants.sample_ID.unique()))

# same counts restricted to calls supported by at least 3 variant reads (subset computed once)
df_common_variants_3reads = df_common_variants[df_common_variants['var_depth'] >= 3]
print('Number of samples with common CH variants (>=3 reads, non-TP53):', df_common_variants_3reads.shape[0])
print('Number of individuals with common CH variants (>=3 reads, non-TP53):', len(df_common_variants_3reads.sample_ID.unique()))

# number of variants carried forward = common non-TP53 variants + common TP53 hotspots
# (computed instead of hard-coded so the message cannot drift from the filters above)
n_variants_examined = len(common_ch_variants) + len(tp53_hotspots)
print('Total number of common variants to look at:', n_variants_examined)
print('Total number of samples with any of these variants:', df_common_variants.shape[0] + df_tp53_hotspots.shape[0])

Number of samples with common CH variants (>=2 reads, non-TP53): 10547
Number of individuals with common CH variants (>=2 reads, non-TP53): 10198
Number of samples with common CH variants (>=3 reads, non-TP53): 5153
Number of individuals with common CH variants (>=3 reads, non-TP53): 4978
Total number of common variants to look at: 40
Total number of samples with any of these variants: 10602


In [48]:
# Stack the common non-TP53 calls and the TP53 hotspot calls: one row per carried variant of interest
variant_tables = [df_common_variants, df_tp53_hotspots]
df_common_variants_all = pd.concat(variant_tables)
print('Number of CH-positive individuals:', df_common_variants_all['sample_ID'].nunique(dropna=False))
print('Number of CH variants in the UKB:', len(df_common_variants_all))

Number of CH-positive individuals: 10250
Number of CH variants in the UKB: 10602


# Load MHC genotyping data 
- add MHC I and II genotyping data 
- add age data (CH is age-dependent, so age is a useful covariate, e.g. for regression)

In [49]:
# Load the imputed HLA table and keep people with a usable MHC class I / class II genotype

# path to file
file_hla = "/Users/barbarawalkowiak/Desktop/msc_thesis/data/UKBB/ukb_typed_hla_with_ids_07_threshold_processed.txt"

# The first line (column names) is read tab-separated and flattened into a plain list
header = pd.read_csv(file_hla, sep='\t', nrows=1, header=None).values.tolist()
head = []
for header_row in header:
    head.extend(header_row)

# The remaining lines (values) are read space-separated, then labelled with the names from the first line
df = pd.read_csv(file_hla, skiprows=1, sep=' ', header=None)
df.columns = head

# Split into class I (A_, B_, C_) and class II (D*) allele columns, keeping the Person_* column(s) in both
df_hla1 = df.filter(regex='^(Person_|A_|B_|C_)') # 488377 cases 
df_hla2 = df.filter(regex='^(Person_|D)') # 488377 cases 

# Clean-up following Krishna et al., 2024 Science: drop every person with a value of 0.5 or 1.5 in any column
# (presumably these values mark alleles that were not genotyped with sufficient confidence)
uncertain_values = [0.5, 1.5]
df_clean_hla1 = df_hla1[~df_hla1.isin(uncertain_values).any(axis=1)]
df_clean_hla2 = df_hla2[~df_hla2.isin(uncertain_values).any(axis=1)]

print("Number of samples with imputed HLA:", len(df_hla1))
print("Number of samples with confidently imputed MHC I:", len(df_clean_hla1))
print("Number of samples with confidently imputed MHC II:", len(df_clean_hla2))

# People who passed the clean-up for both class I and class II
ids_hla1 = df_clean_hla1['Person_ID'].tolist()
ids_hla2 = df_clean_hla2['Person_ID'].tolist()
ids_correct_genotype = list(set(ids_hla1).intersection(ids_hla2))
print('Number of UKBB participants with correctly genotyped MHC class I and II:', len(ids_correct_genotype))

# Dataframe restricted to people who have both MHC I and MHC II imputed
# NOTE(review): this is built from df_hla1, so it carries the class I allele columns only
df_clean_hla1_2 = df_hla1[df_hla1['Person_ID'].isin(ids_correct_genotype)]
print('Number of UKBB participants with correctly genotyped MHC class I and II:', len(df_clean_hla1_2)) # to check that the # people is the same 


Number of samples with imputed HLA: 488377
Number of samples with confidently imputed MHC I: 443620
Number of samples with confidently imputed MHC II: 378317
Number of UKBB participants with correctly genotyped MHC class I and II: 346968
Number of UKBB participants with correctly genotyped MHC class I and II: 346968


### Add age data 

In [50]:
# This dataframe has been cleaned up to only include healthy individuals
# therefore, I will only include those individuals for whom I have age, as for the others, they are likely to have been removed due to a cancer diagnosis
# NOTE: there are some individuals who have sth in the 'first diagnosed cancer' column etc.
# these are not excluded because they are not treated with chemo (which could affect CH) and may have been diagnosed with dysplasia 

age_data = pd.read_csv('/Users/barbarawalkowiak/Desktop/msc_thesis/data/2024-04-24_healthy_pheno_more_cancer_cols.tsv', sep = '\t')

# Keep the ID and age-at-assessment columns and give them the names used by the other tables.
# .rename() returns a properly relabelled frame; writing into `columns.values` edits the Index's
# underlying array in place, which can leave label lookups stale and fails when that array is read-only.
age_df = age_data[['ID_v0', 'Age.when.attended.assessment.centre_v0']].rename(
    columns={
        'ID_v0': 'Person_ID',
        'Age.when.attended.assessment.centre_v0': 'age',
    }
)

print('Number of individuals for whom age data is available:', age_df.shape[0])


Number of individuals for whom age data is available: 424089


In [51]:
# Attach age to each HLA table with the default (inner) merge: people without an age record are dropped
# (age must NOT be missing - if it is, this is likely to be due to a cancer diagnosis)
df_clean_hla1_age = df_clean_hla1.merge(age_df, on='Person_ID')
df_clean_hla2_age = df_clean_hla2.merge(age_df, on='Person_ID')
df_clean_hla1_2_age = df_clean_hla1_2.merge(age_df, on='Person_ID')

# check the number of individuals is correct 
print("Number of samples with confidently imputed MHC I, healthy:", len(df_clean_hla1_age))
print("Number of samples with confidently imputed MHC II, healthy:", len(df_clean_hla2_age))

Number of samples with confidently imputed MHC I, healthy: 384600
Number of samples with confidently imputed MHC II, healthy: 327961


In [53]:
# subset to only include people who were screened for CH
# .copy() makes each result an independent dataframe: later cells add columns to these frames,
# and assigning into the result of a filter is otherwise flagged by pandas as a possible write to a copy
df_clean_hla1_age_screened = df_clean_hla1_age[df_clean_hla1_age['Person_ID'].isin(indexes_list)].copy()
df_clean_hla2_age_screened = df_clean_hla2_age[df_clean_hla2_age['Person_ID'].isin(indexes_list)].copy()
df_clean_hla1_2_age_screened = df_clean_hla1_2_age[df_clean_hla1_2_age['Person_ID'].isin(indexes_list)].copy()
print('Number of individuals with MHC I who were screened for CH:', df_clean_hla1_age_screened.shape[0])
print('Number of individuals with MHC II who were screened for CH:', df_clean_hla2_age_screened.shape[0])
print('Number of individuals with MHC I AND II who were screened for CH:', df_clean_hla1_2_age_screened.shape[0])

Number of individuals with MHC I who were screened for CH: 384600
Number of individuals with MHC II who were screened for CH: 327961
Number of individuals with MHC I AND II who were screened for CH: 300638


In [54]:
# Bring the CH variant table in line with the other dataframes so that it can be merged with them:
# sample_ID becomes Person_ID and the assessment-centre age column becomes age
df_common_variants_all = df_common_variants_all.rename(
    columns={
        'sample_ID': 'Person_ID',
        'Age.when.attended.assessment.centre_v0': 'age',
    }
)
# gene_var = varID with spaces replaced by underscores (same format as the predictions)
df_common_variants_all['gene_var'] = df_common_variants_all['varID'].str.replace(' ', '_')
df_common_variants_all

Unnamed: 0,Person_ID,chromosome,position,end_position,ref,alt,depth,var_depth,VAF,variant_type,...,Giacomelli_LOF_LR,SGR_LR,Personal_and_Family_History_LR_Combined,Breast_Tumor_Pathology_LR,Population_Allele_Frequency_LR,BS2_LR,Comments,Splicing_prediction,ClinVar__March_2021,gene_var
0,1000130,chr2,25234373,25234373.0,C,T,81,2,0.024691,SNV,...,,,,,,,,,,DNMT3A_R882H
1,1000559,chr2,25234374,25234374.0,G,A,56,2,0.035714,SNV,...,,,,,,,,,,DNMT3A_R882C
2,1000724,chr2,25240313,25240313.0,G,A,98,2,0.020408,SNV,...,,,,,,,,,,DNMT3A_R771*
3,1001048,chr2,25247629,25247629.0,G,C,88,2,0.022727,SNV,...,,,,,,,,,,DNMT3A_R326G
4,1001150,chr2,25247647,25247647.0,G,A,74,2,0.027027,SNV,...,,,,,,,,,,DNMT3A_R320*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,5645571,chr17,7673802,,C,T,84,30,0.357143,,...,82.04,3237.15,1.0,1.0,1.0,1.0,,,Pathogenic_Last_reviewed_Aug_28_2019,TP53_R273H
210,5744845,chr17,7675088,,C,T,52,8,0.153846,,...,6.20,22639.68,1.0,1.0,1.0,1.0,,,Pathogenic_Last_reviewed_Jan_7_2020,TP53_R175H
211,5751556,chr17,7673802,,C,T,62,8,0.129032,,...,82.04,3237.15,1.0,1.0,1.0,1.0,,,Pathogenic_Last_reviewed_Aug_28_2019,TP53_R273H
219,5859256,chr17,7673802,,C,T,64,6,0.093750,,...,82.04,3237.15,1.0,1.0,1.0,1.0,,,Pathogenic_Last_reviewed_Aug_28_2019,TP53_R273H


## Add MHC-related parameters (heterozygosity status etc.)

In [55]:
# helper: names of the allele columns that are present (value > 0) in one sample
def get_columns_with_values(row, subset_columns):
    """Return the entries of `subset_columns` whose value in `row` is greater than 0.

    `row` is anything indexable by column name (e.g. one dataframe row); the order of
    `subset_columns` is preserved in the returned list.
    """
    present = []
    for name in subset_columns:
        if row[name] > 0:
            present.append(name)
    return present

# define a function to check if someone is heterozygous for a given class of alleles 
def is_heterozygous_allele(row, col):
    """Classify one locus as heterozygous or homozygous from its comma-separated allele list.

    Parameters
    ----------
    row : mapping indexable by column name (e.g. one dataframe row)
    col : name of the column holding the comma-separated alleles present at the locus
          (e.g. 'A_101, A_201')

    Returns
    -------
    True  if exactly two alleles are listed (heterozygous),
    False if exactly one allele is listed (homozygous),
    None  for anything else, including an empty list, so that unexpected cases can be counted.
    """
    # count the non-empty entries; an empty string means no allele was found and must not
    # be counted as a single (homozygous) allele
    alleles = [entry for entry in row[col].split(',') if entry.strip()]
    count = len(alleles)

    # determine if they are heterozygous or homozygous 
    if count == 2:
        return True # has two alleles so heterozygous
    elif count == 1:
        return False # has 1 allele so homozygous
    else:
        return None # we definitely should not have anything else !!! so return None and see in how many cases this happens

In [56]:
# reminder: filtered data to use: df_clean_hla1_age_screened, df_clean_hla2_age_screened

# For every locus, add a column listing (comma-separated) the alleles present in each person

# class I: allele columns start with A_, B_, C_
subset_A = [col for col in df_clean_hla1_age_screened.columns if col.startswith(('A_'))]
subset_B = [col for col in df_clean_hla1_age_screened.columns if col.startswith(('B_'))]
subset_C = [col for col in df_clean_hla1_age_screened.columns if col.startswith(('C_'))]

subsets_1 = [subset_A, subset_B, subset_C]

for subset in subsets_1:
    
    new_column_name = 'allele_I_' + subset[0][0] # the first letter will indicate class
    df_clean_hla1_age_screened[new_column_name] = df_clean_hla1_age_screened.apply(get_columns_with_values, axis=1, subset_columns=subset).apply(lambda x: ', '.join(x))

# class II: select columns by prefix rather than by substring.
# A substring test such as 'DPA' in col also matches derived columns ('allele_II_DPA', 'het_allele_II_DPA'),
# so re-running the cell would feed string columns into the `> 0` comparison and fail.
subset_DR = [col for col in df_clean_hla2_age_screened.columns if col.startswith('DRB1')] # we are only looking at DRB1, as DRB3/4/5 are mostly missing
subset_DPA = [col for col in df_clean_hla2_age_screened.columns if col.startswith('DPA')]
subset_DPB = [col for col in df_clean_hla2_age_screened.columns if col.startswith('DPB')]
subset_DQA = [col for col in df_clean_hla2_age_screened.columns if col.startswith('DQA')]
subset_DQB = [col for col in df_clean_hla2_age_screened.columns if col.startswith('DQB')]

subsets_2 = [subset_DR, subset_DPA, subset_DPB, subset_DQA, subset_DQB]

for subset in subsets_2:
    
    new_column_name = 'allele_II_' + subset[0][:3] # the first three letters indicate class
    df_clean_hla2_age_screened[new_column_name] = df_clean_hla2_age_screened.apply(get_columns_with_values, axis=1, subset_columns=subset).apply(lambda x: ', '.join(x))

df_clean_hla2_age_screened.head()


Unnamed: 0,Person_ID,DRB5_101,DRB5_102,DRB5_202,DRB5_9901,DRB4_101,DRB4_103,DRB4_9901,DRB3_101,DRB3_202,...,DPA1_201,DPA1_202,DPA1_301,DPA1_401,age,allele_II_DRB,allele_II_DPA,allele_II_DPB,allele_II_DQA,allele_II_DQB
0,4860169,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,46,"DRB1_301, DRB1_1501",DPA1_103,"DPB1_301, DPB1_401","DQA1_102, DQA1_501","DQB1_201, DQB1_602"
1,3381323,0.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,52,"DRB1_101, DRB1_401",DPA1_103,"DPB1_201, DPB1_401","DQA1_101, DQA1_301","DQB1_302, DQB1_501"
2,2805252,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,65,"DRB1_401, DRB1_1501","DPA1_103, DPA1_202","DPB1_101, DPB1_401","DQA1_102, DQA1_301","DQB1_301, DQB1_602"
3,3318036,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,50,DRB1_701,DPA1_201,DPB1_1101,DQA1_201,DQB1_202
4,4120291,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,...,2.0,0.0,0.0,0.0,41,"DRB1_103, DRB1_701",DPA1_201,"DPB1_1001, DPB1_1101","DQA1_101, DQA1_201","DQB1_202, DQB1_501"


In [57]:
# Per-locus heterozygosity flags, derived from the comma-separated allele-list columns

# allele-list columns to inspect for each class
columns_hla1 = ['allele_I_A', 'allele_I_B', 'allele_I_C']
columns_hla2 = ['allele_II_DRB', 'allele_II_DPA', 'allele_II_DPB', 'allele_II_DQA', 'allele_II_DQB']

# for every allele-list column, add a matching 'het_<column>' column to the same dataframe
for frame, allele_columns in (
    (df_clean_hla1_age_screened, columns_hla1),
    (df_clean_hla2_age_screened, columns_hla2),
):
    for col in allele_columns:
        frame[f'het_{col}'] = frame.apply(is_heterozygous_allele, axis=1, col=col)

In [32]:
# Column names of all class I alleles (prefix A_, B_ or C_) and all class II alleles (prefix D)
# NOTE(review): this cell's execution count is out of sequence with its neighbours - re-run the notebook top to bottom
class_I_prefixes = ('A_', 'B_', 'C_')
subset_classI = list(filter(lambda col: col.startswith(class_I_prefixes), df_clean_hla1_age_screened.columns)) # all class I alleles 
subset_classII = list(filter(lambda col: col.startswith('D'), df_clean_hla2_age_screened.columns)) # all class II alleles 


In [58]:

# add total heterozygosity 

# determine the number of unique alleles typed (so if someone has HLA_A0101, HLA_A0101, HLA_B0101, HLA_B0101, HLA_C0101, HLA_C0101, the count is 3)
df_clean_hla1_age_screened['count_class_I'] = df_clean_hla1_age_screened[subset_classI].gt(0).sum(axis=1)
# note: count_class_II is taken over every class II column, including DRB3 / DRB4 / DRB5
df_clean_hla2_age_screened['count_class_II'] = df_clean_hla2_age_screened[subset_classII].gt(0).sum(axis=1)

# this is so we get only counts for the 10 alleles we are looking at (DRB1, DPA, DPB, DQA, DQB)
subset_classII_drb = [col for col in df_clean_hla2_age_screened.columns if col.startswith('DRB1') or col.startswith('DP') or col.startswith(('DQ'))] # class II alleles at the five loci examined
df_clean_hla2_age_screened['count_class_II_drb'] = df_clean_hla2_age_screened[subset_classII_drb].gt(0).sum(axis=1)

# determine the number of alleles typed (so if someone has HLA_A0101, HLA_A0101, HLA_B0101, HLA_B0101, HLA_C0101, HLA_C0101, the sum is 6)
df_clean_hla1_age_screened['sum_class_I'] = df_clean_hla1_age_screened[subset_classI].sum(axis=1)
df_clean_hla2_age_screened['sum_class_II'] = df_clean_hla2_age_screened[subset_classII].sum(axis=1)

# note that some people may not have been fully genotyped > I guess we should think about what to do with these cases 
df_clean_hla1_age_screened['het_all_class_I'] = (df_clean_hla1_age_screened['count_class_I'] == 6) # we dropped people who did not have genotype data available so if they dont have 6 alleles they are homozygous by our definition
# fully heterozygous at class II = 10 distinct alleles across the five loci examined.
# This must use count_class_II_drb: count_class_II also counts DRB3 / DRB4 / DRB5 columns,
# so comparing it with 10 does not test heterozygosity at the five loci.
df_clean_hla2_age_screened['het_all_class_II'] = (df_clean_hla2_age_screened['count_class_II_drb'] == 10) 

# one way to check for total heterozygosity is to see whether you have 'TRUE' in all fields (het_allele A, B, C / het allele DRB, DPA, DPB, DQA, DQB)
df_clean_hla1_age_screened['het_all_class_I_from_allele'] = (df_clean_hla1_age_screened['het_allele_I_A'] & df_clean_hla1_age_screened['het_allele_I_B'] & df_clean_hla1_age_screened['het_allele_I_C'])
df_clean_hla2_age_screened['het_all_class_II_from_allele'] = (df_clean_hla2_age_screened['het_allele_II_DRB'] & df_clean_hla2_age_screened['het_allele_II_DPB'] & df_clean_hla2_age_screened['het_allele_II_DPA'] & df_clean_hla2_age_screened['het_allele_II_DQB'] & df_clean_hla2_age_screened['het_allele_II_DQA'])


## Add CH status (genetic variant, VAF, read depth)

In [59]:

# Attach CH calls to the age + MHC genotype table of everyone screened.
# Left merge: people without a call keep NaN in the CH columns; people with several calls get one row per call
ch_columns = ['Person_ID', 'VAF', 'gene_var', 'depth', 'var_depth']
df_common_variants_sub = df_common_variants_all[ch_columns]
df_hla1_with_ch = df_clean_hla1_age_screened.merge(df_common_variants_sub, on='Person_ID', how='left')
df_hla2_with_ch = df_clean_hla2_age_screened.merge(df_common_variants_sub, on='Person_ID', how='left')

merged_by_class = (('MHC I', df_hla1_with_ch), ('MHC II', df_hla2_with_ch))

# everyone carried through the merge
for label, merged in merged_by_class:
    print(f'Number of individuals with {label} data checked for CH status:', merged['Person_ID'].nunique(dropna=False))

# check number of cases of CH: rows with a variant ...
for label, merged in merged_by_class:
    print(f'Number of CH samples with {label} data:', len(merged[merged['gene_var'].notna()]))

# ... and distinct people with a variant
for label, merged in merged_by_class:
    print(f'Number of CH individuals with {label} data:', merged.loc[merged['gene_var'].notna(), 'Person_ID'].nunique(dropna=False))

# add CH status (overall): 1 if the row carries any CH variant, 0 otherwise
df_hla1_with_ch['ch_status'] = df_hla1_with_ch['gene_var'].notna().astype(int)
df_hla2_with_ch['ch_status'] = df_hla2_with_ch['gene_var'].notna().astype(int)

# in the .head(), all is NaN because most people do not have CH
df_hla1_with_ch.head()

Number of individuals with MHC I data checked for CH status: 384600
Number of individuals with MHC II data checked for CH status: 327961
Number of CH samples with MHC I data: 9624
Number of CH samples with MHC II data: 8212
Number of CH individuals with MHC I data: 9309
Number of CH individuals with MHC II data: 7932


Unnamed: 0,Person_ID,A_101,A_102,A_103,A_201,A_202,A_203,A_205,A_206,A_207,...,het_allele_I_C,count_class_I,sum_class_I,het_all_class_I,het_all_class_I_from_allele,VAF,gene_var,depth,var_depth,ch_status
0,2812213,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,True,5,6.0,False,False,,,,,0
1,4860169,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,5,6.0,False,False,,,,,0
2,3381323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,6,6.0,True,True,,,,,0
3,2805252,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,5,6.0,False,False,,,,,0
4,1118855,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,True,6,6.0,True,True,,,,,0


In [61]:
# Write the class I and class II tables (MHC genotype + CH status) to CSV
# NOTE(review): the date in the file names is hard-coded, although a `timestr` date stamp is defined earlier in the notebook
csv_outputs = (
    (df_hla1_with_ch, '/Users/barbarawalkowiak/Desktop/msc_thesis/results/dataframes/20240907_df_hla1_ch_status.csv'),
    (df_hla2_with_ch, '/Users/barbarawalkowiak/Desktop/msc_thesis/results/dataframes/20240907_df_hla2_ch_status.csv'),
)
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path)

: 