# MHC - CH variants on all data 
- Barbara Walkowiak (bw450)

- 2024-07-10, modified 2024-09-07

- In this script, for each individual screened for CH in the UKB I add MHC binding scores based on their MHC genotype (class II)


# Set up

In [1]:
# IMPORTS

import random
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib.patches import Polygon
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from matplotlib import cm
import scipy.special
from scipy import integrate
import scipy.integrate as it
from scipy.interpolate import interp1d
from scipy.stats import kde
import copy
import glob, os
import re
# from sklearn import datasets, linear_model
import pandas as pd
from decimal import *
from operator import itemgetter    
from collections import OrderedDict
import timeit
import time 
import csv
import seaborn as sns 
import scipy as sp
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from scipy.stats import kruskal
from scipy.stats import mannwhitneyu
from matplotlib.backends.backend_pdf import PdfPages
import statsmodels.api as sm
import plotly.express as px
import kaleido
from itertools import combinations # needed for DP / DQ allele combinations 




In [2]:
# Plot styling in one place: Helvetica for all text (what Caroline recommends :))
# and PDF font type 42, which keeps exported text editable in Illustrator
plt.rcParams.update({'font.family': 'Helvetica', 'pdf.fonttype': 42})

# silence every warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# current local date formatted as YYYYMMDD
timestr = time.strftime("%Y%m%d")

# Read in dataframe with imputed MHC II genotype 

In [3]:
# Participant table: imputed MHC II genotype + CH status (genetic variant carried, VAF) + age etc
hla2_ch_status_csv = '/Users/barbarawalkowiak/Desktop/msc_thesis/results/dataframes/20240907_df_hla2_ch_status.csv'
df_hla2 = pd.read_csv(hla2_ch_status_csv)

In [4]:
# will need to have combinations of DP and DQ alleles: define necessary functions

# I do not want to predict stuff for 2 alpha or 2 beta chains because these are not created so we can remove these columns
def is_valid_column_name(col):
    """Return True if `col` names a pairing of one alpha and one beta chain.

    `col` is expected to be two allele names joined by '-'
    (e.g. 'DPB1_401-DPA1_103'). A part is treated as an alpha chain when it
    contains the letter 'A' and as a beta chain when it contains 'B'; only the
    first two '-'-separated parts are inspected.

    Names without a '-' (e.g. 'Person_ID') are not combinations at all and
    return False instead of raising IndexError.
    """
    parts = col.split('-')
    if len(parts) < 2:
        return False

    first, second = parts[0], parts[1]
    alpha_then_beta = 'A' in first and 'B' in second
    beta_then_alpha = 'B' in first and 'A' in second
    return alpha_then_beta or beta_then_alpha

# we also want to rename columns from XXB-XXA to XXA-XXB
def switch_parts(col):
    """Swap the first two '-'-separated parts of `col`.

    'DPB1_401-DPA1_103' becomes 'DPA1_103-DPB1_401'. Anything after a second
    '-' is dropped, and a name without any '-' raises IndexError.
    """
    pieces = col.split('-')
    return f'{pieces[1]}-{pieces[0]}'

In [5]:
# for DP and DQ alleles, what matters for binding is the combination of alleles

def _alpha_chain_first(col):
    """Return the combination name `col` with the alpha-chain allele first.

    The gene name is the text before the first '_' of an allele name
    (e.g. 'DPB1' in 'DPB1_401'). If the first allele's gene name already
    contains 'A' the name is returned unchanged, otherwise the two parts are
    swapped. This keeps the XXA-XXB naming correct whichever order the allele
    columns appear in.
    """
    first_gene = col.split('-')[0].split('_')[0]
    return col if 'A' in first_gene else switch_parts(col)


def _alpha_beta_combinations(df_locus):
    """Build one column per alpha/beta allele pair of a single locus.

    df_locus: frame whose first column is Person_ID and whose remaining
        columns are per-allele values for one locus (DP or DQ).

    Returns a frame (same index as `df_locus`) with one column per valid
    alpha/beta pair, named 'XXA-XXB'. Each column is the element-wise product
    of the two allele columns, so it is non-zero only where both are non-zero.
    Alpha/alpha and beta/beta pairs are skipped before multiplying rather than
    computed and then thrown away.
    """
    allele_cols = df_locus.columns[1:]  # first column is Person_ID
    pairs = [pair for pair in combinations(allele_cols, 2)
             if is_valid_column_name('-'.join(pair))]
    if not pairs:
        # nothing to combine: return an empty frame aligned to the input rows
        return pd.DataFrame(index=df_locus.index)

    products = pd.concat(
        [df_locus[left].multiply(df_locus[right]) for left, right in pairs],
        axis=1,
    )
    products.columns = [_alpha_chain_first('-'.join(pair)) for pair in pairs]
    return products


# subset dataframes for specific allele classes
df_hla2_dp = df_hla2.filter(regex='^(Person_|DP)')
df_hla2_dq = df_hla2.filter(regex='^(Person_|DQ)')
df_hla2_dr = df_hla2.filter(regex='^(Person_|DR)')

# alpha/beta combinations for DP and DQ (AA and BB pairs are never created)
new_df_dp = _alpha_beta_combinations(df_hla2_dp)
new_df_dq = _alpha_beta_combinations(df_hla2_dq)


In [6]:
# combine with the rest of the columns

def _binarise_and_drop_unseen(df_alleles, df_combinations):
    """Join allele and combination columns, recode 2/4 to 1, drop unseen columns.

    df_alleles: frame with Person_ID plus single-allele columns.
    df_combinations: frame of alpha/beta combination columns on the same index.

    Every column except Person_ID has the values 2 and 4 replaced by 1
    (a combination column holds 2 for 2 * 1 and 4 for 2 * 2). Note that this
    also recodes a 2 in the single-allele columns. Person_ID is deliberately
    left out of the replacement so that an identifier equal to 2 or 4 can never
    be rewritten. Columns whose sum is below 1 (combination never observed in
    the dataset) are removed.
    """
    combined = pd.concat([df_alleles, df_combinations], axis=1)
    ids = combined[['Person_ID']]
    values = combined.drop(columns='Person_ID').replace({2: 1, 4: 1})
    # it's possible that some combinations are never seen in our dataset
    values = values.loc[:, values.sum() >= 1]
    return pd.concat([ids, values], axis=1)


# DP
df_hla2_dp_all = _binarise_and_drop_unseen(df_hla2_dp, new_df_dp)

# DQ
df_hla2_dq_all = _binarise_and_drop_unseen(df_hla2_dq, new_df_dq)
df_hla2_dq_all


Unnamed: 0,Person_ID,DQB1_201,DQB1_202,DQB1_301,DQB1_302,DQB1_303,DQB1_304,DQB1_401,DQB1_402,DQB1_501,...,DQA1_102-DQB1_609,DQA1_103-DQB1_609,DQA1_201-DQB1_609,DQA1_301-DQB1_609,DQA1_302-DQB1_609,DQA1_303-DQB1_609,DQA1_401-DQB1_609,DQA1_501-DQB1_609,DQA1_505-DQB1_609,DQA1_601-DQB1_609
0,4860169,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3381323,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2805252,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3318036,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4120291,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328236,4478244,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328237,4478244,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328238,3573995,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328239,3025735,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# now you need to merge the data (df_hla2, with alleles and CH status including genetic variant carried, VAF etc) with files with combinations
# select combinations: a column is kept if it starts with Person_ or has a hyphen somewhere (indicative of a combination)
df_hla2_dp_comb = df_hla2_dp_all.filter(regex=r'^(Person_|.*-.*)')
df_hla2_dq_comb = df_hla2_dq_all.filter(regex=r'^(Person_|.*-.*)')

# CH / age annotation only: remove the per-allele DQ / DP / DR columns
df_hla2_ch = df_hla2.drop(columns=df_hla2.filter(regex='^(DQ|DP|DR)').columns)

# add HLA genotype data to samples with annotated CH variant and age.
# Person_ID is not unique in df_hla2 (a participant can appear on more than one row),
# so the genotype side is de-duplicated first; otherwise the merge is many-to-many
# and each such participant's rows are multiplied.
df_hla2_dr = pd.merge(df_hla2_ch, df_hla2_dr.drop_duplicates(), on='Person_ID')
df_hla2_dp = pd.merge(df_hla2_ch, df_hla2_dp_comb.drop_duplicates(), on='Person_ID')
df_hla2_dq = pd.merge(df_hla2_ch, df_hla2_dq_comb.drop_duplicates(), on='Person_ID')

# report how many distinct participants each merged table contains
for locus, merged in (('DR', df_hla2_dr), ('DP', df_hla2_dp), ('DQ', df_hla2_dq)):
    print(f'Number of samples with annotated variants CH variant and HLA genotype ({locus}):', len(merged['Person_ID'].unique()))


Number of samples with annotated variants CH variant and HLA genotype (DR): 327961
Number of samples with annotated variants CH variant and HLA genotype (DP): 327961
Number of samples with annotated variants CH variant and HLA genotype (DQ): 327961


In [8]:
# drop fully duplicated rows from each merged table and from both combination tables
df_hla2_dr, df_hla2_dp, df_hla2_dq, df_hla2_dp_comb, df_hla2_dq_comb = (
    frame.drop_duplicates()
    for frame in (df_hla2_dr, df_hla2_dp, df_hla2_dq, df_hla2_dp_comb, df_hla2_dq_comb)
)
df_hla2_dq_comb

Unnamed: 0,Person_ID,DQA1_101-DQB1_201,DQA1_102-DQB1_201,DQA1_103-DQB1_201,DQA1_104-DQB1_201,DQA1_201-DQB1_201,DQA1_301-DQB1_201,DQA1_302-DQB1_201,DQA1_303-DQB1_201,DQA1_401-DQB1_201,...,DQA1_102-DQB1_609,DQA1_103-DQB1_609,DQA1_201-DQB1_609,DQA1_301-DQB1_609,DQA1_302-DQB1_609,DQA1_303-DQB1_609,DQA1_401-DQB1_609,DQA1_501-DQB1_609,DQA1_505-DQB1_609,DQA1_601-DQB1_609
0,4860169,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3381323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2805252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3318036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4120291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328235,1331351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328236,4478244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328238,3573995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328239,3025735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# one table with all HLAs together: DR (with CH annotation), then DP and DQ combinations, joined on Person_ID
df_hla2_drp = df_hla2_dr.merge(df_hla2_dp_comb, on='Person_ID')
df_hla2_drpq = df_hla2_drp.merge(df_hla2_dq_comb, on='Person_ID').drop(columns='Unnamed: 0')
df_hla2_drpq

Unnamed: 0,Person_ID,age,allele_II_DRB,allele_II_DPA,allele_II_DPB,allele_II_DQA,allele_II_DQB,het_allele_II_DRB,het_allele_II_DPA,het_allele_II_DPB,...,DQA1_102-DQB1_609,DQA1_103-DQB1_609,DQA1_201-DQB1_609,DQA1_301-DQB1_609,DQA1_302-DQB1_609,DQA1_303-DQB1_609,DQA1_401-DQB1_609,DQA1_501-DQB1_609,DQA1_505-DQB1_609,DQA1_601-DQB1_609
0,4860169,46,"DRB1_301, DRB1_1501",DPA1_103,"DPB1_301, DPB1_401","DQA1_102, DQA1_501","DQB1_201, DQB1_602",True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3381323,52,"DRB1_101, DRB1_401",DPA1_103,"DPB1_201, DPB1_401","DQA1_101, DQA1_301","DQB1_302, DQB1_501",True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2805252,65,"DRB1_401, DRB1_1501","DPA1_103, DPA1_202","DPB1_101, DPB1_401","DQA1_102, DQA1_301","DQB1_301, DQB1_602",True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3318036,50,DRB1_701,DPA1_201,DPB1_1101,DQA1_201,DQB1_202,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4120291,41,"DRB1_103, DRB1_701",DPA1_201,"DPB1_1001, DPB1_1101","DQA1_101, DQA1_201","DQB1_202, DQB1_501",True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328236,4478244,55,"DRB1_1104, DRB1_1501",DPA1_103,"DPB1_301, DPB1_402","DQA1_102, DQA1_501","DQB1_301, DQB1_602",True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328237,4478244,55,"DRB1_1104, DRB1_1501",DPA1_103,"DPB1_301, DPB1_402","DQA1_102, DQA1_501","DQB1_301, DQB1_602",True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328238,3573995,62,DRB1_301,DPA1_103,DPB1_401,DQA1_501,DQB1_201,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
328239,3025735,51,"DRB1_404, DRB1_1101",DPA1_103,"DPB1_201, DPB1_401","DQA1_301, DQA1_501","DQB1_301, DQB1_402",True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load binding predictions for MHC II from NetMHC II

In [10]:

# we need to move the format of the DP/DQ predictions to something that matches our UKBB genotyping and everything else 
def transform_format_DPQ(input_string):
    """Reformat a DP/DQ allele-pair name from the predictions to the UKBB column format.

    Example: 'HLA-DPA10103-DPB10101' -> 'DPA1_103-DPB1_101'.

    The input must start with 'HLA-' followed by two alleles joined by '-',
    each written as a 3-character gene name, one digit, and an allele code of
    at least four digits. The 'HLA-' prefix is dropped and leading zeros of the
    allele code are removed. Codes longer than four digits are kept whole;
    matching exactly four digits would silently truncate the second allele
    (the match is not anchored at the end) and reject the first.

    Returns the reformatted string, or the integer 0 if the input does not
    match the expected pattern.
    """
    found = re.match(r'HLA-(\w{3})(\d)(\d{4,})-(\w{3})(\d)(\d{4,})', input_string)

    # no match: signal with 0 (callers receive 0, not the original string)
    if found is None:
        return 0

    gene_a, locus_a, code_a, gene_b, locus_b, code_b = found.groups()

    # int() strips the leading zeros of the allele code
    return f'{gene_a}{int(locus_a)}_{int(code_a)}-{gene_b}{int(locus_b)}_{int(code_b)}'

def transform_format_DR(input_string):
    """Strip leading zeros from the allele code of a DR allele name.

    Example: 'DRB1_0101' -> 'DRB1_101'.

    The input must start with a 3-character gene name, one digit, an
    underscore and one or more digits.

    Returns the reformatted string, or the integer 0 if the input does not
    match that pattern (the original string is NOT returned).
    """
    found = re.match(r'(\w{3})(\d{1})_(\d+)', input_string)

    # unmatched names are flagged with 0
    if found is None:
        return 0

    gene, locus_digit, allele_code = found.groups()

    # int() removes zeros at the start of the allele code
    return f'{gene}{int(locus_digit)}_{int(allele_code)}'

In [11]:

# Import NetMHC II scores (has scores for all variants)
# each file has: EL scores, BA scores, EL rank, BA rank, affinity prediction
pred_file_dr = '/Users/barbarawalkowiak/Desktop/msc_thesis/netMHCII_out/scores/20240816_NetMHC_HLA_UKBB_with_affinities_DR_bestscores_allvariants.csv'
pred_file_dpq = '/Users/barbarawalkowiak/Desktop/msc_thesis/netMHCII_out/scores/20240816_NetMHC_HLA_UKBB_with_affinities_DPQ_bestscores_allvariants.csv'

# identify file name (without extension). Taken from the last path component:
# indexing the '/'-split path at a fixed position returns a directory name
# when the path is absolute.
pred_filename_dr = os.path.basename(pred_file_dr).split('.')[0]
pred_filename_dpq = os.path.basename(pred_file_dpq).split('.')[0]

# identify method used to obtain predictions: the name of the '<method>_out'
# directory without its '_out' suffix and without the leading directories
pred_method_dr = os.path.basename(pred_file_dr.split('_out')[0])

# load the csvs (have separate DR and DPQ)
pred_df_dr = pd.read_csv(pred_file_dr).rename(columns={'Affinity (nM)': 'Aff_nM'})
pred_df_dpq = pd.read_csv(pred_file_dpq).rename(columns={'Affinity (nM)': 'Aff_nM'})

# specify HLA allele naming format (names that do not match the expected pattern become 0)
pred_df_dpq['HLA_formatted'] = pred_df_dpq['HLA'].map(transform_format_DPQ)
pred_df_dr['HLA_formatted'] = pred_df_dr['HLA'].map(transform_format_DR)

# combine all predictions (DPQ rows first, then DR rows)
pred_df_all = pd.concat([pred_df_dpq, pred_df_dr], axis = 0)

# replace variants with STOP codons (STOP > *); regex=False so that 'STOP' is
# treated as literal text under every pandas version
pred_df_all['gene_var'] = pred_df_all['gene_var'].str.replace('STOP', '*', regex=False)
pred_df_all = pred_df_all.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis = 1)


In [12]:
# select required columns and sort values 
pred_columns = ['HLA_formatted', 'Peptide', '%Rank_EL', 'Score_EL', '%Rank_BA', 'Score_BA', 'Aff_nM', 'gene_var', 'genotype']
pred_df_all = pred_df_all[pred_columns].sort_values(by=['HLA_formatted', 'gene_var', 'genotype'])

# identify gene variants 
pred_df_all['gene_var_gt'] = pred_df_all['gene_var'] + '_' + pred_df_all['genotype'] # add complete genotype data
pred_df_all['varID'] = pred_df_all['gene_var'] # this is a column where the variant ID is in the same format as in the CH cases dataframe (in case I need to use different ones)
scores_pred_df_all = pred_df_all[['HLA_formatted', 'Score_EL', '%Rank_EL', 'Score_BA', '%Rank_BA', 'Aff_nM', 'varID', 'gene_var', 'gene_var_gt']] # select columns of interest

# Print the results 
print('Number of alleles (combinations) for which predictions are available (NetMHC II):', len(pred_df_all.HLA_formatted.unique()))

# Find MHC alleles which have been typed in the UKBB: every column of df_hla2_drpq whose name contains a digit.
# The pattern is a raw string because '\d' in a normal string literal is an invalid escape sequence.
hla_ukbb = df_hla2_drpq.filter(regex=r'\d').columns # HLA from all UKBB
print('Number of HLA alleles which have been identified in the UK BioBank:', len(hla_ukbb))

# Now look at variants
print('Number of unique variants with predictions available (NetMHC II):',  len(pred_df_all.gene_var.unique()))

Number of alleles (combinations) for which predictions are available (NetMHC II): 593
Number of HLA alleles which have been identified in the UK BioBank: 446
Number of unique variants with predictions available (NetMHC II): 54


In [16]:
# # specify parameter to obtain predictions for 
# param = '%Rank_EL'

# # select df with DR / DPQ alleles
# pred_df_dr = pred_df_all[pred_df_all['HLA_formatted'].str.contains('DR')]
# pred_df_dpq = pred_df_all[(pred_df_all['HLA_formatted'].str.contains('DP'))|(pred_df_all['HLA_formatted'].str.contains('DQ'))]

# # DR
# pred_sub_dr = pred_df_dr[['HLA_formatted', 'gene_var_gt', param]]
# pred_sub_dr_wide = pd.pivot(pred_sub_dr, index='gene_var_gt', columns='HLA_formatted', values=param)
# pred_sub_dr_wide = pred_sub_dr_wide.reset_index() # this is to make sure that you have the gene_var column in there too

# hla_ukbb_dr = df_hla2_dr.filter(regex='\d').columns.tolist() # relevant HLAs
# hla_intersect_dr = pred_sub_dr_wide.columns[pred_sub_dr_wide.columns.isin(hla_ukbb_dr)] # HLA in the UKBB which I have predictions for 
# hla_intersect_dr_list = hla_intersect_dr.tolist() 

# # prepare gene variants names to match names in the Patient file 
# # pred_sub_dr = pred_sub_dr_wide[hla_intersect_dr_list + pred_sub_dr_wide.columns[pred_sub_dr_wide.columns.str.contains('gene_var')].tolist()]
# pred_sub_dr = pred_sub_dr[pred_sub_dr['gene_var_gt'].str.contains('_ch', regex=True)] # retain CH scores only 
# pred_sub_dr['gene_var'] = pred_sub_dr['gene_var_gt'].str.replace('_ch', '') # remove the ch / refseq annotation
# pred_sub_dr['gene_var'] = pred_sub_dr['gene_var'].str.replace('_refseq', '') # remove refseq if present 
# pred_sub_dr['gene_var'] = pred_sub_dr['gene_var'].astype(str)

# ch_hla_sub_dr = df_hla2_dr[hla_intersect_dr_list + df_hla2_dr.columns[df_hla2_dr.columns.str.contains('gene_var')].tolist()]
# ch_hla_sub_dr = pd.concat([df_hla2_dr["Person_ID"], ch_hla_sub_dr], axis=1) # add CH cases 
# ch_hla_sub_dr['score'] = ch_hla_sub_dr.apply(find_best_score_for_variant_carried, df=pred_sub_dr, param=param, axis=1) # add score for the parameter
# ch_hla_scores_dr = ch_hla_sub_dr.dropna() # remove NA (incorrectly annotated cases)

# # merge scores with VAF and age
# age_vaf_dr = df_hla2_dr[['Person_ID', 'VAF', 'var_depth', 'depth', 'age', 'gene_var']]
# ch_hla_merge_dr = pd.merge(ch_hla_scores_dr, age_vaf_dr, on = ['Person_ID', 'gene_var'])

# # now add the columns with VAF and age 
# col_to_select = ['Person_ID', 'gene_var', 'score', 'age', 'var_depth', 'VAF'] # subset the data 
# ch_hla_merge_sub_dr = ch_hla_merge_dr[col_to_select]
# ch_hla_merge_sub_dr['log_score'] = -1*np.log10(ch_hla_merge_sub_dr['score']) # convert score to -log10(score)
# ch_hla_merge_sub_dr['allele_type'] = 'DR'

# # DP
# pred_sub_dpq = pred_df_dpq[['HLA_formatted', 'gene_var_gt', param]]
# pred_sub_dpq = pred_sub_dpq[~pred_sub_dpq.duplicated()] # remove duplicate columns 
# pred_sub_dpq_reset = pred_sub_dpq.reset_index()
# pred_sub_dpq_wide = pd.pivot(pred_sub_dpq_reset, index='gene_var_gt', columns='HLA_formatted', values=param)
# pred_sub_dpq_wide = pred_sub_dpq_wide.reset_index() # this is to make sure that you have the gene_var column in there too

# hla_ukbb_dp = df_hla2_dp.filter(regex='\d').columns.tolist() # relevant HLAs
# hla_intersect_dp = pred_sub_dpq_wide.columns[pred_sub_dpq_wide.columns.isin(hla_ukbb_dp)] # HLA in the UKBB which I have predictions for 
# hla_intersect_dp_list = hla_intersect_dp.tolist() 

# # prepare gene variants names to match names in the Patient file 
# pred_sub_dp = pred_sub_dpq_wide[hla_intersect_dp_list + pred_sub_dpq_wide.columns[pred_sub_dpq_wide.columns.str.contains('gene_var')].tolist()]
# pred_sub_dp = pred_sub_dp[pred_sub_dp['gene_var_gt'].str.contains('_ch', regex=True)] # retain CH scores only 
# pred_sub_dp['gene_var'] = pred_sub_dp['gene_var_gt'].str.replace('_ch', '') # remove the ch / refseq annotation
# pred_sub_dp['gene_var'] = pred_sub_dp['gene_var'].str.replace('_refseq', '') # remove refseq if present 
# pred_sub_dp['gene_var'] = pred_sub_dp['gene_var'].astype(str)

# # look at predictions for a specific allele class  
# ch_hla_sub_dp = df_hla2_dp[hla_intersect_dp_list + df_hla2_dp.columns[df_hla2_dp.columns.str.contains('gene_var')].tolist()]
# ch_hla_sub_dp = pd.concat([df_hla2_dp["Person_ID"], ch_hla_sub_dp], axis=1) # add CH cases 
# ch_hla_sub_dp['score'] = ch_hla_sub_dp.apply(find_best_score_for_variant_carried, df=pred_sub_dp, param=param, axis=1) # add score for the parameter
# ch_hla_scores_dp = ch_hla_sub_dp.dropna() # remove NA (incorrectly annotated cases)

# # merge scores with VAF and age
# age_vaf_dp = df_hla2_dp[['Person_ID', 'VAF', 'var_depth', 'age', 'gene_var']]
# ch_hla_merge_dp = pd.merge(ch_hla_scores_dp, age_vaf_dp, on = ['Person_ID', 'gene_var'])

# # now add the columns with VAF and age 
# col_to_select = ['Person_ID', 'gene_var', 'score', 'age', 'var_depth', 'VAF'] # subset the data 
# ch_hla_merge_sub_dp = ch_hla_merge_dp[col_to_select]
# ch_hla_merge_sub_dp['log_score'] = -1*np.log10(ch_hla_merge_sub_dp['score']) # convert score to -log10(score)
# ch_hla_merge_sub_dp['allele_type'] = 'DP'

# # DQ
# hla_ukbb_dq = df_hla2_dq.filter(regex='\d').columns.tolist() # relevant HLAs
# hla_intersect_dq = pred_sub_dpq_wide.columns[pred_sub_dpq_wide.columns.isin(hla_ukbb_dq)] # HLA in the UKBB which I have predictions for 
# hla_intersect_dq_list = hla_intersect_dq.tolist() 

# # prepare gene variants names to match names in the Patient file 
# pred_sub_dq = pred_sub_dpq_wide[hla_intersect_dq_list + pred_sub_dpq_wide.columns[pred_sub_dpq_wide.columns.str.contains('gene_var')].tolist()]
# pred_sub_dq = pred_sub_dq[pred_sub_dq['gene_var_gt'].str.contains('_ch', regex=True)] # retain CH scores only 
# pred_sub_dq['gene_var'] = pred_sub_dq['gene_var_gt'].str.replace('_ch', '') # remove the ch / refseq annotation
# pred_sub_dq['gene_var'] = pred_sub_dq['gene_var'].str.replace('_refseq', '') # remove refseq if present 
# pred_sub_dq['gene_var'] = pred_sub_dq['gene_var'].astype(str)

# # subset batch_gene_age_hla file 
# ch_hla_sub_dq = df_hla2_dq[hla_intersect_dq_list + df_hla2_dq.columns[df_hla2_dq.columns.str.contains('gene_var')].tolist()]
# ch_hla_sub_dq = pd.concat([df_hla2_dq["Person_ID"], ch_hla_sub_dq], axis=1) # add CH cases 
# ch_hla_sub_dq['score'] = ch_hla_sub_dq.apply(find_best_score_for_variant_carried, df=pred_sub_dq, param=param, axis=1) # add score for the parameter
# ch_hla_scores_dq = ch_hla_sub_dq.dropna() # remove NA (incorrectly annotated cases)

# # merge scores with VAF and age
# age_vaf_dq = df_hla2_dq[['Person_ID', 'VAF', 'var_depth', 'age', 'gene_var']]
# ch_hla_merge_dq = pd.merge(ch_hla_scores_dq, age_vaf_dq, on = ['Person_ID', 'gene_var'])

# # now add the columns with VAF and age 
# col_to_select = ['Person_ID', 'gene_var', 'score', 'age', 'var_depth', 'VAF'] # subset the data 
# ch_hla_merge_sub_dq = ch_hla_merge_dq[col_to_select]
# ch_hla_merge_sub_dq['log_score'] = -1*np.log10(ch_hla_merge_sub_dq['score']) # convert score to -log10(score)
# ch_hla_merge_sub_dq['allele_type'] = 'DQ'

# # we can actually combine all these scores now 
# ch_hla_merge_sub_all = pd.concat([ch_hla_merge_sub_dr, ch_hla_merge_sub_dp, ch_hla_merge_sub_dq], axis = 0)

# # filter so this only includes variants with > 15 samples present
# gene_counts = ch_hla_merge_sub_all.gene_var.value_counts().reset_index()
# variants_to_examine = gene_counts[gene_counts['count'] > 15].gene_var.tolist()

# ch_hla_merge_sub_all = ch_hla_merge_sub_all[ch_hla_merge_sub_all['gene_var'].isin(variants_to_examine)]
# ch_hla_merge_sub_dr = ch_hla_merge_sub_dp[ch_hla_merge_sub_dp['gene_var'].isin(variants_to_examine)]
# ch_hla_merge_sub_dp = ch_hla_merge_sub_dq[ch_hla_merge_sub_dq['gene_var'].isin(variants_to_examine)]
# ch_hla_merge_sub_dq = ch_hla_merge_sub_dr[ch_hla_merge_sub_dr['gene_var'].isin(variants_to_examine)]


NameError: name 'find_best_score_for_variant_carried' is not defined

In [17]:

# # define function to find the best score for the variant that is carried (ie present in the person)
# def find_best_score_for_variant_carried_all_hla(row, df, param):

#     '''
#     This function is applied to the CH dataset
#     The df is the dataset with predictions for given allele and genetic variant 
#     parameter is what to base this prediction on (here will only be using %Rank_EL)
#     '''
    
#     # these are the values of rows to search for 
#     row_values = pd.to_numeric(row[1:-1], errors='coerce')
    
#     # find alleles which are present (NOTE this is changed from the previous function bc formatting is different)
#     hla = row.index[1:-1][row_values >= 1]  
    
#     # find values corresponding to these alleles 
#     vals = df.loc[df['gene_var'] == row['gene_var'], hla].values.flatten() 
    
#     # it can be that nothing was found e.g., bc there were no predictions made for this variant
#     if vals.size == 0: 
#         value = None
    
#     else: 
    
#         # note here we will only be using '%Rank_EL'
#         if param == "Aff_nM":
#             value = min(vals) # highest affinity corresponds to lowest value (see plots above)
#         elif param == "%Rank_EL":
#             value = min(vals) # the best rank is the lowest one (indicates peptide in top x % of binders)
#         elif param == "Score_EL":
#             value = max(vals) # the best score is the highest one 
#         elif param == "%Rank_BA":
#             value = min(vals) # the best rank is the lowest one (indicates peptide in top x % of binders)
#         elif param == "Score_BA":
#             value = max(vals) # the best score is the highest one
#         else:
#             print('Incorrect parameter provided') 
    
#     return value  

In [19]:
# # specify parameter to obtain predictions for 
# param = '%Rank_EL'

# # add a single score based on all of the MHC class II alleles someone has (not breaking down into DR, DP, DQ)
# pred_sub_all = pred_df_all[['HLA_formatted', 'gene_var_gt', param]]
# pred_sub_all = pred_sub_all[~pred_sub_all.duplicated()] # remove duplicate columns 
# pred_sub_all_reset = pred_sub_all.reset_index()
# pred_sub_all_wide = pd.pivot(pred_sub_all_reset, index='gene_var_gt', columns='HLA_formatted', values=param).reset_index()

# hla_ukbb_all = df_hla2_drpq.columns[22:].tolist() # this includes both separate DP / DQ and in combinations
# hla_intersect_all = pred_sub_all_wide.columns[pred_sub_all_wide.columns.isin(hla_ukbb_all)] # HLA in the UKBB which I have predictions for 
# hla_intersect_all_list = hla_intersect_all.tolist() 

# # prepare gene variants names to match names in the Patient file 
# pred_sub_all = pred_sub_all_wide[hla_intersect_all_list + pred_sub_all_wide.columns[pred_sub_all_wide.columns.str.contains('gene_var')].tolist()]
# pred_sub_all = pred_sub_all[pred_sub_all['gene_var_gt'].str.contains('_ch', regex=True)] # retain CH scores only 
# pred_sub_all['gene_var'] = pred_sub_all['gene_var_gt'].str.replace('_ch', '') # remove the ch / refseq annotation
# pred_sub_all['gene_var'] = pred_sub_all['gene_var'].str.replace('_refseq', '') # remove refseq if present 
# pred_sub_all['gene_var'] = pred_sub_all['gene_var'].astype(str)

# # subset batch_gene_age_hla file 
# ch_hla_sub_all = df_hla2_drpq[hla_intersect_all_list + df_hla2_drpq.columns[df_hla2_drpq.columns.str.contains('gene_var')].tolist()]
# ch_hla_sub_all = pd.concat([df_hla2_drpq["Person_ID"], ch_hla_sub_all], axis=1) # add CH cases 
# ch_hla_sub_all['score'] = ch_hla_sub_all.apply(find_best_score_for_variant_carried_all_hla, df=pred_sub_all, param=param, axis=1) # add score for the parameter
# ch_hla_scores_all = ch_hla_sub_all.dropna() # remove NA (incorrectly annotated cases)

# # merge scores with VAF and age
# age_vaf_all = df_hla2_drpq[['Person_ID', 'VAF', 'var_depth', 'depth', 'age', 'gene_var']]
# ch_hla_merge_all = pd.merge(ch_hla_scores_all, age_vaf_all, on = ['Person_ID', 'gene_var'])

# # now add the columns with VAF and age 
# col_to_select = ['Person_ID', 'gene_var', 'score', 'age', 'var_depth', 'depth', 'VAF'] # subset the data 
# ch_hla_merge_sub_all = ch_hla_merge_all[col_to_select]
# ch_hla_merge_sub_all['log_score'] = -1*np.log10(ch_hla_merge_sub_all['score']) # convert score to -log10(score)

# # filter out rare variants
# gene_counts = ch_hla_merge_sub_all.gene_var.value_counts().reset_index()
# variants_to_examine = gene_counts[gene_counts['count'] > 15].gene_var.tolist()
# ch_hla_merge_sub_all = ch_hla_merge_sub_all[ch_hla_merge_sub_all['gene_var'].isin(variants_to_examine)]



: 

## Add scores for all variants 

In [13]:
# need to do something to assign, for everyone, the score for every variant we have predictions for based on their MHC II genotype

# define the function to find best scores (we can technically do this for different parameters, realistically I think will just do it for %Rank_EL)

def find_best_score_for_all_variants(row, df, param):

    '''
    Return, for one participant, the best prediction score for every variant in df.

    For each variant, the best score is taken across the HLA alleles which the
    participant carries (allele count >= 1).

    row = function is applied to each row of the participant dataframe (ie run through each participant);
          the first 7 entries must be the non-allele columns (Person ID, gene_var, VAF, var_depth,
          depth, CH status, age), every later entry is an allele count labelled with the allele name

    df = dataframe with prediction scores (like netmhc) in wide format: a 'gene_var' column plus
         one column per HLA allele; if a variant appears in more than one row, its first row is used

    Allowed parameters (param) are:
    Aff_nM - affinity (raw number); the minimum across alleles is taken
    Score_BA - binding affinity score; the maximum across alleles is taken
    Score_EL - elution score; the maximum across alleles is taken
    %Rank_BA - %Rank of binding affinity cf a set of random peptides; the minimum across alleles is taken
    %Rank_EL - %Rank of elution cf a set of random peptides; the minimum across alleles is taken

    Returns a pd.Series with one entry per variant, labelled 'score_<gene_var>'.
    All entries are NaN if the participant carries none of the alleles; missing
    predictions for individual alleles are skipped when taking the min / max.

    Raises ValueError if param is not one of the allowed parameters.
    '''

    # for each allowed parameter: is the best value the minimum (True) or the maximum (False)?
    lowest_is_best = {
        "Aff_nM": True,
        "Score_BA": False,
        "Score_EL": False,
        "%Rank_BA": True,
        "%Rank_EL": True,  # we will likely be choosing this option
    }
    if param not in lowest_is_best:
        # fail loudly instead of silently returning None for a mistyped parameter
        raise ValueError(f"Unknown param {param!r}; allowed parameters are: {', '.join(lowest_is_best)}")

    # get HLAs for each person: select alleles which each Person (row) carries
    allele_counts = row.iloc[7:]
    carried = (allele_counts >= 1).to_numpy(dtype=bool)
    hlas = allele_counts.index[carried]

    # one row of allele scores per variant (first occurrence wins)
    per_variant = df.drop_duplicates(subset='gene_var', keep='first')
    labels = [f'score_{var}' for var in per_variant['gene_var']]

    # nothing to pick from if the person carries no allele: report missing scores
    if len(hlas) == 0:
        return pd.Series(float('nan'), index=labels)

    # pick the best value across the carried alleles, for all variants at once
    allele_scores = per_variant[hlas]
    if lowest_is_best[param]:
        best = allele_scores.min(axis=1)
    else:
        best = allele_scores.max(axis=1)

    return pd.Series(best.to_numpy(), index=labels)


### check all variants screened for are present 

In [14]:
# gene_var is NaN for participants without CH: nunique() / dropna() exclude NaN explicitly,
# rather than assuming exactly one NaN entry is present and subtracting 1
print('Number of variants identified in the UKBB from CH screening:', df_hla2.gene_var.nunique())
print('Number or variants I have predictions for (NetMHC):',  len(pred_df_all.gene_var.unique()))

variants_in_ukbb = df_hla2.gene_var.dropna().unique().tolist()
print('List of variants identified in UKBB:', sorted(variants_in_ukbb))
print('List of variants with NetMHC preds:',  sorted(pred_df_all.gene_var.unique())) # there is more because I also made predictions for variants which ended up not being common enough 

Number of variants identified in the UKBB from CH screening: 40
Number or variants I have predictions for (NetMHC): 54
List of variants identified in UKBB: ['DNMT3A_P904L', 'DNMT3A_P904Q', 'DNMT3A_P904R', 'DNMT3A_R320*', 'DNMT3A_R326C', 'DNMT3A_R326G', 'DNMT3A_R326S', 'DNMT3A_R598*', 'DNMT3A_R729G', 'DNMT3A_R729W', 'DNMT3A_R736C', 'DNMT3A_R736G', 'DNMT3A_R736H', 'DNMT3A_R736L', 'DNMT3A_R736S', 'DNMT3A_R771*', 'DNMT3A_R882C', 'DNMT3A_R882H', 'DNMT3A_R882L', 'DNMT3A_R882P', 'DNMT3A_R882S', 'DNMT3A_Y735C', 'DNMT3A_Y735F', 'DNMT3A_Y735S', 'GNB1_K57E', 'IDH1_R132H', 'IDH2_R140Q', 'IDH2_R172K', 'JAK2_V617F', 'KRAS_G12D', 'KRAS_G12S', 'MPL_W515L', 'NRAS_G12D', 'SF3B1_K666N', 'SF3B1_K700E', 'SRSF2_P95H', 'SRSF2_P95L', 'SRSF2_P95R', 'TP53_R175H', 'TP53_R273H']
List of variants with NetMHC preds: ['DNMT3A_P904L', 'DNMT3A_P904Q', 'DNMT3A_P904R', 'DNMT3A_R320*', 'DNMT3A_R326C', 'DNMT3A_R326G', 'DNMT3A_R326S', 'DNMT3A_R598*', 'DNMT3A_R598G', 'DNMT3A_R729G', 'DNMT3A_R729W', 'DNMT3A_R736C', 'DNMT3A_R

In [15]:
# use the same notation for stop-gained variants ('*' instead of 'STOP') in both naming columns
for name_col in ('gene_var', 'gene_var_gt'):
    pred_df_all[name_col] = pred_df_all[name_col].str.replace('STOP', '*')

# display the harmonised names
pred_df_all['gene_var_gt'].unique()

array(['DNMT3A_P904L_ch', 'DNMT3A_P904L_refseq', 'DNMT3A_P904Q_ch',
       'DNMT3A_P904Q_refseq', 'DNMT3A_P904R_ch', 'DNMT3A_P904R_refseq',
       'DNMT3A_R320*_ch', 'DNMT3A_R320*_refseq', 'DNMT3A_R326C_ch',
       'DNMT3A_R326C_refseq', 'DNMT3A_R326G_ch', 'DNMT3A_R326G_refseq',
       'DNMT3A_R326S_ch', 'DNMT3A_R326S_refseq', 'DNMT3A_R598*_ch',
       'DNMT3A_R598*_refseq', 'DNMT3A_R598G_ch', 'DNMT3A_R598G_refseq',
       'DNMT3A_R729G_ch', 'DNMT3A_R729G_refseq', 'DNMT3A_R729W_ch',
       'DNMT3A_R729W_refseq', 'DNMT3A_R736C_ch', 'DNMT3A_R736C_refseq',
       'DNMT3A_R736G_ch', 'DNMT3A_R736G_refseq', 'DNMT3A_R736H_ch',
       'DNMT3A_R736H_refseq', 'DNMT3A_R736L_ch', 'DNMT3A_R736L_refseq',
       'DNMT3A_R736P_ch', 'DNMT3A_R736P_refseq', 'DNMT3A_R736S_ch',
       'DNMT3A_R736S_refseq', 'DNMT3A_R771*_ch', 'DNMT3A_R771*_refseq',
       'DNMT3A_R771G_ch', 'DNMT3A_R771G_refseq', 'DNMT3A_R882C_ch',
       'DNMT3A_R882C_refseq', 'DNMT3A_R882G_ch', 'DNMT3A_R882G_refseq',
       'DNMT3A_R882H

In [16]:
# reshape the NetMHC predictions to wide format: one row per gene_var_gt, one column per HLA allele
param = '%Rank_EL'
pred_df_sub = pred_df_all.loc[:, ['HLA_formatted', 'gene_var_gt', param]]
pred_df_sub_wide = (
    pred_df_sub
    .pivot(index='gene_var_gt', columns='HLA_formatted', values=param)
    .reset_index()  # turn gene_var_gt back into a regular column
)
pred_df_sub_wide

HLA_formatted,gene_var_gt,DPA1_103-DPB1_1001,DPA1_103-DPB1_101,DPA1_103-DPB1_1101,DPA1_103-DPB1_1301,DPA1_103-DPB1_1401,DPA1_103-DPB1_1501,DPA1_103-DPB1_1601,DPA1_103-DPB1_1701,DPA1_103-DPB1_1801,...,DRB3_101,DRB3_202,DRB3_210,DRB3_224,DRB3_301,DRB4_101,DRB4_103,DRB5_101,DRB5_102,DRB5_202
0,DNMT3A_P904L_ch,20.84,7.30,6.75,5.05,36.28,21.29,8.93,17.73,17.99,...,64.62,54.75,54.75,47.57,60.38,57.86,57.86,1.54,1.54,2.05
1,DNMT3A_P904L_refseq,8.04,7.35,0.99,0.67,25.15,9.09,3.76,7.01,4.96,...,49.11,23.74,23.74,16.93,31.32,30.11,30.11,0.82,0.82,1.17
2,DNMT3A_P904Q_ch,12.28,13.90,22.69,11.32,31.15,11.51,3.75,9.87,5.75,...,52.89,41.59,41.59,33.98,46.25,37.95,37.95,0.81,0.89,1.19
3,DNMT3A_P904Q_refseq,8.04,7.35,0.99,0.67,25.15,9.09,3.76,7.01,4.96,...,49.11,23.74,23.74,16.93,31.32,30.11,30.11,0.82,0.82,1.17
4,DNMT3A_P904R_ch,11.95,13.49,26.51,14.69,30.02,10.98,3.43,10.44,6.18,...,51.32,53.38,53.38,42.37,42.22,49.26,49.26,1.21,1.28,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,SRSF2_P95R_refseq,11.10,36.72,19.38,3.89,33.17,20.87,12.42,6.89,16.39,...,42.34,5.37,5.37,2.43,12.47,0.84,0.84,2.82,1.04,2.18
104,TP53_R175H_ch,13.89,19.00,30.48,29.72,24.22,53.28,10.49,11.97,39.93,...,50.00,36.00,36.00,34.15,58.28,49.46,49.46,13.89,22.95,10.08
105,TP53_R175H_refseq,14.10,17.70,54.38,27.30,22.04,53.69,9.09,11.85,53.04,...,49.16,35.86,35.86,22.26,44.61,39.17,39.17,16.28,24.90,13.51
106,TP53_R273H_ch,3.52,54.74,12.27,16.16,28.71,36.98,5.89,2.62,21.58,...,49.16,17.57,17.57,8.10,24.17,43.39,43.39,12.18,24.21,15.39


In [17]:
# the allele columns start at position 22 of df_hla2_drpq
first_allele_col = 22
hla_ukbb_all = df_hla2_drpq.columns[first_allele_col:]
hla_ukbb_all

Index(['DRB5_101', 'DRB5_102', 'DRB5_202', 'DRB5_9901', 'DRB4_101', 'DRB4_103',
       'DRB4_9901', 'DRB3_101', 'DRB3_202', 'DRB3_210',
       ...
       'DQA1_102-DQB1_609', 'DQA1_103-DQB1_609', 'DQA1_201-DQB1_609',
       'DQA1_301-DQB1_609', 'DQA1_302-DQB1_609', 'DQA1_303-DQB1_609',
       'DQA1_401-DQB1_609', 'DQA1_501-DQB1_609', 'DQA1_505-DQB1_609',
       'DQA1_601-DQB1_609'],
      dtype='object', length=446)

In [18]:
# create a list of HLA alleles genotyped in the UKBB for which predictions are available
# NB the list keeps the column order of pred_df_sub_wide: going through set() would make the order
# (and with it the column order of every dataframe / file derived below) change between Python sessions
hla_in_both = pred_df_sub_wide.columns[pred_df_sub_wide.columns.isin(hla_ukbb_all)]
hla_intersect = list(dict.fromkeys(hla_in_both)) # drop duplicates (if any) without changing the order

# create the prediction dataset 
gene_var_cols = pred_df_sub_wide.columns[pred_df_sub_wide.columns.str.contains('gene_var')].tolist()
pred_df_sub = pred_df_sub_wide[hla_intersect + gene_var_cols] # subset netmhc so you only have alleles which are in the UKBB
# retain CH scores only; .copy() so that the gene_var column added below is written to an independent dataframe, not to a slice
pred_df_sub = pred_df_sub[pred_df_sub['gene_var_gt'].str.contains('_ch', regex=True)].copy()
pred_df_sub['gene_var'] = pred_df_sub['gene_var_gt'].str.replace('_ch', '') # remove the ch / refseq annotation
pred_df_sub['gene_var'] = pred_df_sub['gene_var'].str.replace('_refseq', '') # remove refseq if present 

# prepare the participant dataframe to which the scoring function is applied row by row:
# the 7 non-allele columns have to come first, followed by the allele columns (the function relies on this layout)
df_hla2_hlas_drpq = pd.concat([df_hla2_drpq[['Person_ID', 'gene_var', 'VAF', 'var_depth', 'depth', 'ch_status', 'age']], df_hla2_drpq[hla_intersect]], axis = 1)

In [19]:
# for each participant (row), find the best score for each examined CH variant ...
variant_scores = df_hla2_hlas_drpq.apply(
    find_best_score_for_all_variants,
    axis=1,
    df=pred_df_sub,
    param=param,
)

# ... and attach the score columns to the participant data
df_hla2_scores = pd.concat([df_hla2_hlas_drpq, variant_scores], axis=1)

In [20]:
# append the extra MHC columns from df_hla2 to the scores (side by side, matched on the row index)
extra_mhc_cols = [
    'het_allele_II_DRB',
    'het_allele_II_DPA',
    'het_allele_II_DPB',
    'het_allele_II_DQA',
    'het_allele_II_DQB',
    'count_class_II',
    'sum_class_II',
    'het_all_class_II',
]
df_hla2_scores_added = pd.concat([df_hla2_scores, df_hla2[extra_mhc_cols]], axis = 1)
df_hla2_scores_added

Unnamed: 0,Person_ID,gene_var,VAF,var_depth,depth,ch_status,age,DRB1_408,DQA1_301-DQB1_302,DQA1_101-DQB1_605,...,score_TP53_R175H,score_TP53_R273H,het_allele_II_DRB,het_allele_II_DPA,het_allele_II_DPB,het_allele_II_DQA,het_allele_II_DQB,count_class_II,sum_class_II,het_all_class_II
0,4860169,,,,,0,46,0.0,0.0,0.0,...,7.91,3.15,True,False,True,True,True,14,16.0,False
1,3381323,,,,,0,52,0.0,1.0,0.0,...,20.19,21.06,True,False,True,True,True,13,16.0,False
2,2805252,,,,,0,65,0.0,0.0,0.0,...,2.89,3.15,True,True,True,True,True,15,16.0,False
3,3318036,,,,,0,50,0.0,0.0,0.0,...,18.58,23.87,False,False,False,False,False,8,16.0,False
4,4120291,,,,,0,41,0.0,0.0,0.0,...,4.19,4.71,True,False,True,True,True,13,16.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328236,4478244,DNMT3A_P904Q,0.018692,2.0,107.0,1,55,0.0,0.0,0.0,...,6.47,3.15,True,False,True,True,True,14,16.0,False
328237,4478244,DNMT3A_R882H,0.022727,2.0,88.0,1,55,0.0,0.0,0.0,...,6.47,3.15,True,False,True,True,True,14,16.0,False
328238,3573995,,,,,0,62,0.0,0.0,0.0,...,11.96,17.57,False,False,False,False,False,9,16.0,False
328239,3025735,,,,,0,51,0.0,0.0,0.0,...,14.15,8.58,True,False,True,True,True,14,16.0,False


In [21]:
# write the scores to disk (this file is already usable for further analysis)
scores_for_all_var_csv = '/Users/barbarawalkowiak/Desktop/msc_thesis/results/dataframes/20240907_netmhc2_scores_for_all_var.csv'
df_hla2_scores_added.to_csv(scores_for_all_var_csv)


In [5]:
# load the previously saved scores, so that the scoring step above does not have to be re-run
df_hla2_scores_added = pd.read_csv(
    filepath_or_buffer='/Users/barbarawalkowiak/Desktop/msc_thesis/results/dataframes/20240907_netmhc2_scores_for_all_var.csv'
)


### Add label (top / bottom half of binding scores)

In [22]:
# subset the dataframe to only include gene_var, Person ID and scores 
scores_col = [col for col in df_hla2_scores_added.columns if col.startswith('score_')]
df_hla2_scores_sub = pd.concat([df_hla2_scores_added[['Person_ID', 'gene_var', 'var_depth', 'depth', 'VAF']], df_hla2_scores_added[scores_col]], axis = 1)

# melt the dataframe (one row per input row and score column; 'variable' = name of the score column, 'value' = the score)
df_hla2_scores_sub_melted = pd.melt(df_hla2_scores_sub, id_vars = ['gene_var', 'Person_ID', 'var_depth', 'depth', 'VAF'])

# add a column which has the name of the variant (nicely formatted): strip the 'score_' prefix
df_hla2_scores_sub_melted['CH_variant'] = df_hla2_scores_sub_melted['variable'].str[len('score_'):]

# add a column to indicate CH status (either carrier of the variant the score is for, or non-carrier, even if have CH driven by sth else)
# NB compare the two columns directly: the previous argument-less .replace() call did nothing useful here
# and that calling form is deprecated in recent pandas versions
# NB this has to be done before the columns are converted to categories below
df_hla2_scores_sub_melted['CH_status'] = np.where(df_hla2_scores_sub_melted['gene_var'] == df_hla2_scores_sub_melted['CH_variant'], 1, 0)

# you want the %Rank_EL to be in the format of -1*log10(%Rank_EL)
df_hla2_scores_sub_melted['log_score'] = -1 * np.log10(df_hla2_scores_sub_melted['value'])

# add median score for each variant and then order them by median 
df_hla2_scores_sub_melted['median_score'] = df_hla2_scores_sub_melted.groupby('CH_variant')['log_score'].transform('median')

# convert to categories 
df_hla2_scores_sub_melted['gene_var'] = df_hla2_scores_sub_melted['gene_var'].astype('category') 
df_hla2_scores_sub_melted['CH_variant'] = df_hla2_scores_sub_melted['CH_variant'].astype('category') # make sure you convert this to category first of all

In [23]:
# Add group (split participants with values equal to the median)
# For each variant: scores below the median go to the 'bottom half', scores above it to the 'top half',
# and observations exactly equal to the median are split at random so that the bottom half is filled up to half of the observations

# fixed seed so that the random split of the tied observations (and therefore the saved labels) is reproducible
tie_split_seed = 0

variants = df_hla2_scores_sub_melted.CH_variant.unique().tolist()

for var in variants:

    # select df with variant 
    df_variant = df_hla2_scores_sub_melted[df_hla2_scores_sub_melted['CH_variant'] == var]
    
    # get the median (missing scores are ignored)
    median_score = df_variant['log_score'].median()
    
    # find median values 
    median_values = df_variant[df_variant['log_score'] == df_variant['median_score']]
    
    # assign top / bottom if above or below median 
    below_median = df_variant[df_variant['log_score'] < median_score]
    above_median = df_variant[df_variant['log_score'] > median_score]
    
    # figure out how many observations with median values you need to add to the bottom half
    # (count only observations which have a score: rows with a missing score are not assigned to any group,
    # so they must not count towards the size of a half)
    half_length = int(df_variant['log_score'].notna().sum()) // 2
    num_bottom_needed = half_length - len(below_median)
    
    # assign values equal to median to top or bottom
    shuffled_median_values = median_values.sample(frac=1, random_state=tie_split_seed)
    bottom_half_median = shuffled_median_values.iloc[:num_bottom_needed]
    top_half_median = shuffled_median_values.iloc[num_bottom_needed:]
    
    # assign groups
    df_hla2_scores_sub_melted.loc[below_median.index, 'group'] = 'bottom half'
    df_hla2_scores_sub_melted.loc[above_median.index, 'group'] = 'top half'
    df_hla2_scores_sub_melted.loc[bottom_half_median.index, 'group'] = 'bottom half'
    df_hla2_scores_sub_melted.loc[top_half_median.index, 'group'] = 'top half'


In [24]:
# modified saving labels

# write the melted scores together with the top / bottom half labels to disk (this file is already usable for further analysis)
scores_with_labels_csv = '/Users/barbarawalkowiak/Desktop/msc_thesis/results/dataframes/20240907_netmhc2_scores_for_all_var_with_labels.csv'
df_hla2_scores_sub_melted.to_csv(scores_with_labels_csv)


: 