In [3]:
import gseapy as gp
import pandas as pd
import numpy as np
import scipy

import re
from gseapy import Biomart

# CWAS Gene Set Enrichment

In this notebook, we explore whether categories significantly enriched for SVs in cases vs. controls reflect any higher level biological process. One way to do this is with gene set enrichment.

# What is gene set enrichment anyway?

The idea of gene set enrichment is that in a list of genes (i.e. upregulated genes, or genes targeted by a "category"), if it were random, would have a certain degree of overlap with a gene set by pure chance. The idea of gene set enrichment is that we determine overlap with a gene set, and compare this to the background.

This is a bit weirder with these data, since we have actual _SV counts_. The underlying test for gene set enrichment is a fisher's exact test, which works perfectly fine with counts in that way, but I'm not entirely sure it's right to do?

We'll do it anyway I guess.

# Load in the data

We load in the SVs and dosages, which we'll need.

## Read in SVs

In [253]:
# define the folder name for all our results
folder_name = "processed-data-v2.5.2"

# define SVs and dosages for discovery and validation
sv_path = "gs://vanallen-pedsv-analysis/beds/PedSV.v2.5.2.full_cohort.analysis_samples.sites.bed.gz"
dosages_path = "gs://vanallen-pedsv-analysis/beds/PedSV.v2.5.2.full_cohort.analysis_samples.allele_dosages.bed.gz"

# define metadata
metadata_path = "gs://vanallen-pedsv-analysis/sample_info/PedSV.v2.5.2.cohort_metadata.w_control_assignments.tsv.gz"
samples_path = "gs://vanallen-pedsv-analysis/sample_info/PedSV.v2.5.2.final_analysis_cohort.samples.list"

# list of genes that are annotated - drop ensembl IDs
gene_ref = pd.read_csv("ref/gencode_hg38_protein_coding_genes_for_annotation_7_31_23.txt")
gene_ref = gene_ref[~gene_ref['value'].str.startswith('ENSG00')]
gene_ref = gene_ref['value'].tolist()

Load metadata and SVs

In [121]:
metadata = pd.read_csv(
    metadata_path,
    sep="\t",
)

# add a sex label to metadata
metadata["sex"] = (metadata["chrX_CopyNumber"].round() < 2).astype(int)

###############
### Samples ###
###############
nbl_cases = metadata[
    (metadata["neuroblastoma_case"] == True)
]["entity:sample_id"].tolist()

nbl_controls = metadata[
    (metadata["neuroblastoma_control"] == True)
]["entity:sample_id"].tolist()

Now we load the SVs. We'll eventually combine discovery and validation data, but it's easiest to keep them separate for now, since the SVs and dosages are not fully overlapping.

In [6]:
###############
##### SVs #####
###############
svs = pd.read_csv(
    sv_path,
    sep="\t",
)

###############
### Dosages ###
###############
dosages = pd.read_csv(
    dosages_path,
    sep="\t",
    index_col=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


## Read in the category results

I'm going to concatenate all this data so that I'm only dealing with a few files. Focusing on neuroblastoma.

In [102]:
###########
### SVs ###
###########
nbl_coding_svs = pd.read_csv(
    "data/CWAS data for Jett/List of variants by category for each CWAS analysis/neuroblastoma_all_coding_SVs_in_each_category_list_combined_11_3_23.txt",
    sep="\t",
)

nbl_noncoding_svs = pd.read_csv('data/CWAS data for Jett/List of variants by category for each CWAS analysis/neuroblastoma_all_noncoding_SVs_in_each_category_list_combined_BURDEN_TESTING_with_col_names_11_3_23.txt', sep='\t')
nbl_noncoding_svs = nbl_noncoding_svs.rename(columns = {'emd': 'end'})

# combine the SVs
nbl_coding_svs['sv_category'] = 'coding'
nbl_noncoding_svs['sv_category'] = 'non-coding'

nbl_category_svs = pd.concat([nbl_coding_svs, nbl_noncoding_svs])
nbl_category_svs.head(2)

Unnamed: 0,SV,chrom,start,end,category,sv_category
0,PedSV.2.5.2_DUP_chr1_794,chr1,19221626,19301822,DUP.RARE.PREDICTED_COPY_GAIN.lof_constrained.e...,coding
1,PedSV.2.5.2_DUP_chr1_1379,chr1,44731601,44792024,DUP.RARE.PREDICTED_COPY_GAIN.lof_constrained.e...,coding


In [103]:
#########################
### FRAMEWORK RESULTS ###
#########################
nbl_singleton_coding_framework_results = pd.read_csv(
    "data/CWAS data for Jett/CWAS sum stats/neuroblastoma_all_coding_cwas_concatenated_glm_results_SINGLETON_11_3_23.txt",
    sep="\t",
)
nbl_singleton_coding_framework_results[['af_category', 'sv_category']] = ['singleton', 'coding']

nbl_rare_coding_framework_results = pd.read_csv(
    "data/CWAS data for Jett/CWAS sum stats/neuroblastoma_all_coding_cwas_concatenated_glm_results_RARE_11_3_23.txt",
    sep="\t",
)
nbl_rare_coding_framework_results[['af_category', 'sv_category']] = ['rare', 'coding']

nbl_singleton_noncoding_framework_results = pd.read_csv(
    "data/CWAS data for Jett/CWAS sum stats/neuroblastoma_all_noncoding_cwas_concatenated_glm_results_SINGLETON_11_3_23.txt",
    sep="\t",
)
nbl_singleton_noncoding_framework_results[['af_category', 'sv_category']] = ['singleton', 'non-coding']

nbl_rare_noncoding_framework_results = pd.read_csv(
    "data/CWAS data for Jett/CWAS sum stats/neuroblastoma_all_noncoding_cwas_concatenated_glm_results_RARE_11_3_23.txt",
    sep="\t",
)
nbl_rare_noncoding_framework_results[['af_category', 'sv_category']] = ['rare', 'non-coding']

nbl_framework_results = pd.concat([nbl_singleton_coding_framework_results, nbl_rare_coding_framework_results, 
                                   nbl_singleton_noncoding_framework_results, nbl_rare_noncoding_framework_results])

In [104]:
nbl_framework_results.head(2)

Unnamed: 0,point_estimate,std_error,z_score,p_value,SV_counts_cases,SV_counts_cases_max,number_of_cases_with_zero_SVs,total_cases,SV_counts_controls,SV_counts_controls_max,...,number_of_unique_SVs,category_name,sv_type,frequency,mean_SVs_per_case,mean_SVs_per_control,mean_SVs_total,negative_log10_p_value,af_category,sv_category
0,0.271918,0.051227,5.3081,1.11e-07,438,5,336,646,2441,4,...,459,ANY.SINGLETON.PREDICTED_LOF_or_PREDICTED_PARTI...,ANY,SINGLETON,0.678019,0.519362,0.538533,6.955563,singleton,coding
1,0.276065,0.054006,5.111756,3.19e-07,393,5,356,646,2172,4,...,411,DEL.SINGLETON.ANY.ANY.ANY.protein_coding,DEL,SINGLETON,0.608359,0.462128,0.479798,6.495968,singleton,coding


In [105]:
nbl_coding_framework = pd.read_csv(
    "data/CWAS data for Jett/CWAS frameworks/CWAS_categories_neuroblastoma_coding_8_17_23.txt",
    sep="\t",
)

nbl_noncoding_framework = pd.read_csv(
    "data/CWAS data for Jett/CWAS frameworks/CWAS_rare_categories_neuroblastoma_noncoding_10_2_23.txt",
    sep="\t",
)

# Walk through a coding example

Let's extract out all the data that we need to examine the highest result for a single hallmark gene set.

In [278]:
test_framework = nbl_framework_results.query('af_category == "singleton" & sv_category == "coding"').loc[0, ["category_name"]].item()
framework_components = test_framework.split(".")
genic_relationship = framework_components[2]
test_framework, genic_relationship

('ANY.SINGLETON.PREDICTED_LOF_or_PREDICTED_PARTIAL_EXON_DUP.ANY.ANY.protein_coding',
 'PREDICTED_LOF_or_PREDICTED_PARTIAL_EXON_DUP')

In [279]:
nbl_coding_framework.head(5)

Unnamed: 0,sv_type,frequency,genic_relationship,constraint,expression,gene_group
0,DUP,RARE,PREDICTED_COPY_GAIN,lof_constrained,expressed_in_adrenal_gland,protein_coding
1,DEL,SINGLETON,PREDICTED_INTRAGENIC_EXON_DUP,missense_constrained,ANY,cosmic_cancer_genes
2,CPX_or_INV,,PREDICTED_LOF_or_PREDICTED_PARTIAL_EXON_DUP,unconstrained,,germline_CPGs
3,INS_ALL,,ANY,ANY,,base_excision_repair_genes
4,ANY,,,,,chromatin_organization_genes


We identify the SVs that are part of that category.

In [280]:
svs_in_category = nbl_category_svs[(nbl_category_svs['sv_category'] == "coding") & 
                                   (nbl_category_svs["category"] == test_framework)]

# subset the actual SV matrix
svs_in_category = svs[svs["name"].isin(svs_in_category["SV"].tolist())]

svs_in_category.head(2)

Unnamed: 0,#chrom,start,end,name,svtype,AC,AF,ALGORITHMS,AN,BOTHSIDES_SUPPORT,...,trio_POPMAX_FREQ_HOMALT,trio_POPMAX_CN_FREQ,trio_POPMAX_CN_NONREF_FREQ,gnomad_v3.1_sv_POPMAX_AF,gnomad_v3.1_sv_POPMAX_FREQ_HOMREF,gnomad_v3.1_sv_POPMAX_FREQ_HET,gnomad_v3.1_sv_POPMAX_FREQ_HOMALT,gnomad_v3.1_sv_POPMAX_CN_FREQ,gnomad_v3.1_sv_POPMAX_CN_NONREF_FREQ,FILTER
64,chr1,923800,943501,PedSV.2.5.2_DEL_chr1_80,DEL,1,7.7e-05,depth,13038,False,...,0.0,,,0.000107,,,,,,PASS
152,chr1,1240217,1243609,PedSV.2.5.2_DEL_chr1_210,DEL,1,7.4e-05,manta,13462,True,...,0.0,,,1.8e-05,,,,,,PASS


In [281]:
svs_in_category.shape

(3837, 952)

Next, we determine the genes in question. We reference the column where the genes can be found, `genic_relationship`. We have to split this one in half.

In [282]:
genic_relationships = genic_relationship.split("_or_")
genic_relationships

['PREDICTED_LOF', 'PREDICTED_PARTIAL_EXON_DUP']

In [283]:
svs_in_category[genic_relationships].head()

Unnamed: 0,PREDICTED_LOF,PREDICTED_PARTIAL_EXON_DUP
64,SAMD11,
152,C1QTNF12,
162,"ACAP3,INTS11,PUSL1,SCNN1D",
173,CPTP,
198,"TMEM88B,VWA1",


We should be a bit more careful if any results turn up positive, but for now we'll just register an SV as contributing to a count for that gene.

In [291]:
nbl_sv_dosages = (
    dosages
    .set_index("ID")
    .loc[svs_in_category["name"].tolist(), nbl_cases + nbl_controls]
)
nbl_sv_dosages.head(2)

Unnamed: 0_level_0,PT_00QYKRAX,PT_00Y8C0XA,PT_025YMME2,PT_02AE4RSP,PT_02SNWVRF,PT_06Z51EN5,PT_0CKD259J,PT_0GMP9VVY,PT_0MVMPZKX,PT_11XN6CG5,...,ssi_26401,ssi_26409,ssi_26411,ssi_26422,ssi_26442,ssi_26452,ssi_26453,ssi_26458,ssi_26459,ssi_26463
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PedSV.2.5.2_DEL_chr1_80,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PedSV.2.5.2_DEL_chr1_210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [292]:
nbl_sv_dosages.shape

(3837, 5519)

So we now have the dosages for both cases and controls. We analyze these two datasets separately, and will need to determine post-hoc if there is a unique enrichment in cases vs. control. Additionally, we need to drop samples that are poorly genotyped for these SVs. The strategy that Ryan and Riaz used is to drop samples with >5% `NaN` genotyping rate. We do that here.

In [293]:
kept = pd.isnull(nbl_sv_dosages).sum(axis = 0) / len(nbl_sv_dosages) < 0.05

nbl_kept_cases = list(set(nbl_cases) & set(kept[kept].index))
nbl_kept_controls = list(set(nbl_controls) & set(kept[kept].index))

In [294]:
nbl_sv_dosages = nbl_sv_dosages.loc[:, kept]

In [295]:
nbl_sv_dosages.shape

(3837, 5346)

So now we can generate our counts, separately for cases and controls. We can ignore `NaNs`.

In [296]:
sv_counts_cases = nbl_sv_dosages[nbl_kept_cases].fillna(0).sum(axis=1).sort_values()
sv_counts_controls = nbl_sv_dosages[nbl_kept_controls].fillna(0).sum(axis=1).sort_values()
sv_counts_cases.head()

ID
PedSV.2.5.2_DEL_chr1_80       0.0
PedSV.2.5.2_DEL_chr12_7605    0.0
PedSV.2.5.2_DEL_chr12_7621    0.0
PedSV.2.5.2_DEL_chr12_7627    0.0
PedSV.2.5.2_DEL_chr12_7653    0.0
dtype: float64

Note that some SVs have 0 counts, presumably because those SVs are present in non-neuroblastoma samples? Let's just verify that.

In [297]:
test_sv = dosages.set_index("ID").loc["PedSV.2.5.2_DEL_chr1_80"].iloc[3:]
test_sv[test_sv == 1]

SJ042098    1.0
Name: PedSV.2.5.2_DEL_chr1_80, dtype: object

In [298]:
metadata.set_index("entity:sample_id").loc["SJ042098"]

ancestry_short_variant_inferred_or_reported                                       NaN
batch                                          PedSV.v2-wgd_score_1-median_coverage_2
study                                                                          StJude
disease                                                                  osteosarcoma
family_id                                                                         NaN
                                                                ...                  
pancan_control                                                                  False
osteosarcoma_control                                                            False
neuroblastoma_control                                                           False
ewing_control                                                                   False
sex                                                                                 0
Name: SJ042098, Length: 73, dtype: object

Yep. Alright, we can move on and actually count things up. First, how many SVs are we actually dealing with here?

In [299]:
sv_counts_cases.sum(), sv_counts_controls.sum()

(438.0, 2441.0)

We can see that the cases have a higher proportion of counts relative to the proportion of samples, which is what we expect (this category is significant).

Let's see what happens when we convert to gene counts.

In [300]:
len(svs_in_category)

3837

In [301]:
genes_in_svs = svs_in_category[['name'] + genic_relationships].set_index('name')

In [302]:
gene_counts = {'cases': [], 'controls': []}
for cohort, sv_counts in zip(['cases', 'controls'], [sv_counts_cases, sv_counts_controls]):

    sample_genes_in_svs = genes_in_svs.copy()
    sample_genes_in_svs.loc[sv_counts.index, 'count'] = sv_counts.astype(int)

    # simple enough to go through
    cohort_gene_counts = []
    for index, row in sample_genes_in_svs.iterrows():
        if not pd.isnull(row['PREDICTED_LOF']):
            cohort_gene_counts += row['PREDICTED_LOF'].split(',') * row['count']
        if not pd.isnull(row['PREDICTED_PARTIAL_EXON_DUP']):
            cohort_gene_counts += row['PREDICTED_PARTIAL_EXON_DUP'].split(',') * row['count']

    gene_counts[cohort] = pd.DataFrame(np.unique(cohort_gene_counts, return_counts = True), index = ['gene', 'count']).T

In [303]:
gene_counts['cases'].shape, gene_counts['controls'].shape

((511, 2), (2790, 2))

Great. Now we can try merging this with a gene set to test significance. We'll try it with a small, well characterized gene set first.

In [174]:
hallmark = gp.get_library(name='MSigDB_Hallmark_2020')

In [175]:
g2m_checkpoint = hallmark['G2-M Checkpoint']
len(g2m_checkpoint)

200

We need to calculate 4 numbers for our Fisher's exact test:

1. The counts of genes in the gene set and category
2. The counts of genes in the category and not the gene set
3. The counts of genes in the gene set and not the category
4. The counts of genes in neither (~19k)

We then do a fisher's exact test.

In [177]:
for cohort in ['cases', 'controls']:
    
    cohort_gene_counts = gene_counts[cohort]
    
    genes_in_category_and_gs = cohort_gene_counts[cohort_gene_counts['gene'].isin(g2m_checkpoint)]['count'].sum()
    genes_in_category_and_not_gs = cohort_gene_counts[~cohort_gene_counts['gene'].isin(g2m_checkpoint)]['count'].sum()

    genes_not_in_category_and_in_gs = len(set(g2m_checkpoint) - set(cohort_gene_counts['gene']))
    genes_not_in_category_and_not_gs = len(gene_ref) - genes_not_in_category_and_in_gs
    
    cont_table = np.array([[genes_in_category_and_gs, genes_in_category_and_not_gs], 
                       [genes_not_in_category_and_in_gs, genes_not_in_category_and_not_gs]])
    print(scipy.stats.fisher_exact(cont_table))

(0.8991389913899139, 1.0)
(0.9707648735179256, 1.0)


So neither cases nor controls are significantly enriched for G2-M checkpoint genes. But at least this process makes sense. Now we can generalize a bit.

In [194]:
hallmark_results = []
for gs, genes in hallmark.items():
    
    # store the expected frequency
    expected = f'{len(genes)}/{len(gene_ref)}'
    row = [gs, expected]
    
    for cohort in ['cases', 'controls']:
        
        cohort_gene_counts = gene_counts[cohort]
        
        upper_left = cohort_gene_counts[cohort_gene_counts['gene'].isin(genes)]['count'].sum()
        upper_right = cohort_gene_counts[~cohort_gene_counts['gene'].isin(genes)]['count'].sum()

        bottom_left = len(set(genes) - set(cohort_gene_counts['gene']))
        botttom_right = len(gene_ref) - bottom_left

        cont_table = np.array([[upper_left, upper_right], 
                               [bottom_left, botttom_right]])
        res, p = scipy.stats.fisher_exact(cont_table)

        # provide the expected and the data frequencies
        data = f'{upper_left}/{upper_left + upper_right}'
        row += [res, p, data]
        
    hallmark_results.append(row)
    
col_names = ['gene_set', 'expected', 'case_stat', 'case_p', 'case_data', 'control_stat', 'control_p', 'control_data']
hallmark_results = pd.DataFrame(hallmark_results, columns = col_names)

In [196]:
hallmark_results.query('case_p < 0.05')

Unnamed: 0,gene_set,expected,case_stat,case_p,case_data,control_stat,control_p,control_data
11,Adipogenesis,200/19201,2.670747,0.001485,14/547,0.783782,0.316489,24/3334
25,mTORC1 Signaling,200/19201,0.0,0.007451,0/547,0.746616,0.229092,23/3334


Interesting... we'll follow up on that in a second.

# Walk through a non-coding example

Let's extract out all the data that we need to examine the highest result for a single hallmark gene set.

In [204]:
# we'll select a non-tad framework for testing
nontad_test_framework = nbl_framework_results[(nbl_framework_results['af_category'] == "singleton") & 
                      (nbl_framework_results['sv_category'] == "non-coding") &
                      (~nbl_framework_results['category_name'].str.contains('tad'))].iloc[0]['category_name']

framework_components = nontad_test_framework.split(".")
genic_relationship = framework_components[2]
test_framework, genic_relationship

('ANY.SINGLETON.PREDICTED_LOF_or_PREDICTED_PARTIAL_EXON_DUP.ANY.ANY.protein_coding',
 'ANY')

This is good practice - we see that the genic relationship here is `ANY`. In the context of noncoding analysis, this has a specific meaning.

In [205]:
nbl_noncoding_framework.head(6)

Unnamed: 0,sv_type,frequency,functional_intersection,functional_category,genic_relationship,constraint,expression,gene_group
0,DUP,RARE,PREDICTED_NONCODING_BREAKPOINT,neuroblastoma_atac_peaks,PREDICTED_INTERGENIC,lof_constrained,expressed_in_adrenal_gland,protein_coding
1,DEL,SINGLETON,PREDICTED_NONCODING_SPAN,neuroblastoma_chromHMM15_Enh,PREDICTED_INTRONIC,ANY,ANY,cosmic_and_germline_CPGs
2,CPX_or_INV,,ANY,neuroblastoma_chromHMM15_Enh_conserved,PREDICTED_PROMOTER,,,
3,INS_ALL,,,neuroblastoma_chromHMM15_EnhG,PREDICTED_UTR,,,
4,ANY,,,neuroblastoma_chromHMM15_EnhG_conserved,ANY,,,
5,,,,neuroblastoma_H3K27Ac_peak,,,,


So `ANY` really means `PREDICTED_INTERGENIC | PREDICTED INTRONIC | PREDICTED PROMOTER | PREDICTED UTR`

In [206]:
genic_relationships = ['PREDICTED_INTERGENIC', 'PREDICTED_INTRONIC', 'PREDICTED_PROMOTER', 'PREDICTED_UTR']

# a weird feature of these data is that PREDICTED_INTERGENIC is actually boolean, and refers to PREDICTED_NEAREST_TSS
genic_relationships[genic_relationships.index('PREDICTED_INTERGENIC')] = 'PREDICTED_NEAREST_TSS'

Subset down to those SVssvs

In [207]:
svs_in_category = nbl_category_svs[(nbl_category_svs['sv_category'] == "non-coding") & 
                                   (nbl_category_svs["category"] == nontad_test_framework)]

# subset the actual SV matrix
svs_in_category = svs[svs["name"].isin(svs_in_category["SV"].tolist())]

svs_in_category.head(2)

Unnamed: 0,#chrom,start,end,name,svtype,AC,AF,ALGORITHMS,AN,BOTHSIDES_SUPPORT,...,trio_POPMAX_FREQ_HOMALT,trio_POPMAX_CN_FREQ,trio_POPMAX_CN_NONREF_FREQ,gnomad_v3.1_sv_POPMAX_AF,gnomad_v3.1_sv_POPMAX_FREQ_HOMREF,gnomad_v3.1_sv_POPMAX_FREQ_HET,gnomad_v3.1_sv_POPMAX_FREQ_HOMALT,gnomad_v3.1_sv_POPMAX_CN_FREQ,gnomad_v3.1_sv_POPMAX_CN_NONREF_FREQ,FILTER
120,chr1,1116266,1116473,PedSV.2.5.2_DEL_chr1_165,DEL,1,7.4e-05,manta,13462,True,...,0.0,,,0.0,,,,,,PASS
137,chr1,1157302,1157390,PedSV.2.5.2_DEL_chr1_186,DEL,1,7.4e-05,wham,13462,False,...,0.0,,,3.1e-05,,,,,,PASS


In [208]:
svs_in_category.shape

(3021, 952)

In [209]:
svs_in_category[genic_relationships].head()

Unnamed: 0,PREDICTED_NEAREST_TSS,PREDICTED_INTRONIC,PREDICTED_PROMOTER,PREDICTED_UTR
120,,,C1orf159,
137,TTLL10,,,
224,,ATAD3B,,
584,,,C1orf174,
836,HES3,,,


We should be a bit more careful if any results turn up positive, but for now we'll just register an SV as contributing to a count for that gene.

In [210]:
nbl_sv_dosages = (
    dosages
    .set_index("ID")
    .loc[svs_in_category["name"].tolist(), nbl_cases + nbl_controls]
)
nbl_sv_dosages.head(2)

Unnamed: 0_level_0,PT_00QYKRAX,PT_00Y8C0XA,PT_025YMME2,PT_02AE4RSP,PT_02SNWVRF,PT_06Z51EN5,PT_0CKD259J,PT_0GMP9VVY,PT_0MVMPZKX,PT_11XN6CG5,...,ssi_26401,ssi_26409,ssi_26411,ssi_26422,ssi_26442,ssi_26452,ssi_26453,ssi_26458,ssi_26459,ssi_26463
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PedSV.2.5.2_DEL_chr1_165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PedSV.2.5.2_DEL_chr1_186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Again subset our dosage matrix, dropping samples with bad GT rates

In [211]:
kept = pd.isnull(nbl_sv_dosages).sum(axis = 0) / len(nbl_sv_dosages) < 0.05

nbl_kept_cases = list(set(nbl_cases) & set(kept[kept].index))
nbl_kept_controls = list(set(nbl_controls) & set(kept[kept].index))

In [212]:
nbl_sv_dosages = nbl_sv_dosages.loc[:, kept]

In [213]:
nbl_sv_dosages.shape

(3021, 5346)

In [214]:
sv_counts_cases = nbl_sv_dosages[nbl_kept_cases].fillna(0).sum(axis=1).sort_values()
sv_counts_controls = nbl_sv_dosages[nbl_kept_controls].fillna(0).sum(axis=1).sort_values()
sv_counts_cases.head()

ID
PedSV.2.5.2_DEL_chr1_165     0.0
PedSV.2.5.2_DEL_chr12_612    0.0
PedSV.2.5.2_DEL_chr12_615    0.0
PedSV.2.5.2_DEL_chr12_705    0.0
PedSV.2.5.2_DEL_chr12_839    0.0
dtype: float64

First, how many SVs are we actually dealing with here?

In [215]:
sv_counts_cases.sum(), sv_counts_controls.sum()

(339.0, 1962.0)

So maybe that's not so bad? We'll see what happens.

In [216]:
genes_in_svs = svs_in_category[['name'] + genic_relationships].set_index('name')

In [218]:
gene_counts = {'cases': [], 'controls': []}
for cohort, sv_counts in zip(['cases', 'controls'], [sv_counts_cases, sv_counts_controls]):

    sample_genes_in_svs = genes_in_svs.copy()
    sample_genes_in_svs.loc[sv_counts.index, 'count'] = sv_counts.astype(int)

    # simple enough to go through
    cohort_gene_counts = []
    for index, row in sample_genes_in_svs.iterrows():
        for col in genic_relationships:
            if not pd.isnull(row[col]):
                cohort_gene_counts += row[col].split(',') * row['count']

    gene_counts[cohort] = pd.DataFrame(np.unique(cohort_gene_counts, return_counts = True), index = ['gene', 'count']).T

In [219]:
gene_counts['cases'].shape, gene_counts['controls'].shape

((314, 2), (1565, 2))

Calculate with Fisher's exact:

In [220]:
for cohort in ['cases', 'controls']:
    
    cohort_gene_counts = gene_counts[cohort]
    
    genes_in_category_and_gs = cohort_gene_counts[cohort_gene_counts['gene'].isin(g2m_checkpoint)]['count'].sum()
    genes_in_category_and_not_gs = cohort_gene_counts[~cohort_gene_counts['gene'].isin(g2m_checkpoint)]['count'].sum()

    genes_not_in_category_and_in_gs = len(set(g2m_checkpoint) - set(cohort_gene_counts['gene']))
    genes_not_in_category_and_not_gs = len(gene_ref) - genes_not_in_category_and_in_gs
    
    cont_table = np.array([[genes_in_category_and_gs, genes_in_category_and_not_gs], 
                       [genes_not_in_category_and_in_gs, genes_not_in_category_and_not_gs]])
    print(scipy.stats.fisher_exact(cont_table))

(0.8638836275475414, 1.0)
(1.37372476979423, 0.14412787728548032)


In [221]:
hallmark_results = []
for gs, genes in hallmark.items():
    
    # store the expected frequency
    expected = f'{len(genes)}/{len(gene_ref)}'
    row = [gs, expected]
    
    for cohort in ['cases', 'controls']:
        
        cohort_gene_counts = gene_counts[cohort]
        
        top_left = cohort_gene_counts[cohort_gene_counts['gene'].isin(genes)]['count'].sum()
        top_right = cohort_gene_counts[~cohort_gene_counts['gene'].isin(genes)]['count'].sum()

        bottom_left = len(set(genes) - set(cohort_gene_counts['gene']))
        botttom_right = len(gene_ref) - bottom_left

        cont_table = np.array([[upper_left, upper_right], 
                               [bottom_left, botttom_right]])
        res, p = scipy.stats.fisher_exact(cont_table)

        # provide the expected and the data frequencies
        data = f'{top_left}/{top_left + top_right}'
        row += [res, p, data]
        
    hallmark_results.append(row)
    
col_names = ['gene_set', 'expected', 'case_stat', 'case_p', 'case_data', 'control_stat', 'control_p', 'control_data']
hallmark_results = pd.DataFrame(hallmark_results, columns = col_names)

In [223]:
hallmark_results.query('case_p < 0.05')

Unnamed: 0,gene_set,expected,case_stat,case_p,case_data,control_stat,control_p,control_data
0,TNF-alpha Signaling via NF-kB,200/19201,3.034415,0.002708,10/338,2.047719,0.0002785264,36/1981
3,Mitotic Spindle,199/19201,2.412819,0.022825,8/338,1.863254,0.002358557,33/1981
11,Adipogenesis,200/19201,2.400126,0.023432,8/338,1.405359,0.1166004,26/1981
16,Protein Secretion,96/19201,3.118717,0.026645,5/338,1.226758,0.4865993,11/1981
23,Unfolded Protein Response,113/19201,2.629969,0.048101,5/338,0.737422,0.5164636,8/1981
38,UV Response Dn,144/19201,3.863425,0.000918,9/338,3.906664,1.042503e-11,44/1981
43,Bile Acid Metabolism,112/19201,2.65446,0.046622,5/338,1.025317,0.8726269,11/1981
45,Allograft Rejection,200/19201,2.722672,0.008239,9/338,0.775782,0.3962079,15/1981
49,Pancreas Beta Cells,40/19201,4.516025,0.033713,3/338,2.285799,0.05459697,8/1981


# Generalized gene set enrichment

Alright, we've been through two examples. Now let's try generalizing across two axes--categories and gene sets. For now, we'll only neuroblastoma significant categories.

I don't know if these `p_values` are already corrected or not. I'll assume they're not.

## Define the categories for analysis

Here, we'll select which categories we want to examine. We'll stick to neuroblastoma, but we'll examine `singleton` and `rare`, as well as `noncoding` and `coding`.

Worth mentioning that the `noncoding` categories could be quite difficult to interpret.

In [224]:
nbl_analysis_categories = nbl_framework_results.query('negative_log10_p_value > 3.5')
nbl_analysis_categories.shape

(77, 22)

Here, we'll also define a helpful lookup that maps from the "collapsed" genic relationships to all their component relationships.

In [225]:
gr_coding_mapping = {'PREDICTED_LOF_or_PREDICTED_PARTIAL_EXON_DUP': ['PREDICTED_LOF', 'PREDICTED_PARTIAL_EXON_DUP'],
                     'ANY': ['PREDICTED_COPY_GAIN', 'PREDICTED_INTRAGENIC_EXON_DUP', 'PREDICTED_LOF', 'PREDICTED_PARTIAL_EXON_DUP']}

gr_noncoding_mapping = {'ANY': ['PREDICTED_INTERGENIC', 'PREDICTED_INTRONIC', 'PREDICTED_PROMOTER', 'PREDICTED_UTR']}


## Define the gene sets for analysis

Let's highlight some specific gene sets for analysis. We'll do the following:

* `MSigDB_Hallmark_2020`
* `GO_Biological_Process_2023` (this is the default for GO term analysis)
* `Reactome_2022`

We'll begin with these, and then we can add in other specific ones that might be relevant later:

### GO terms

We're using `gseapy` to get our gene sets. Unfortunately, I've discovered that the gene sets procided by `gp.get_library` _do not_ match the GO gene sets fetched by `biomaRt`. The libraries defined here are from `enrichr`, and from digging around a bit, they do not match. Maybe they're some custom file or something?

In [362]:
bm = Biomart()

In [394]:
gene_sets = {}
for db_name in ['MSigDB_Hallmark_2020', 'GO_Biological_Process_2023', 'Reactome_2022']:
    db = gp.get_library(name=db_name)
    gene_sets[db_name] = db

We need to fix the GO terms.

In [363]:
# # fetch the GO ids, and then make them into 50 length chunks
# pattern = r'\((.*?)\)'
# go_id_to_go_term = {re.findall(pattern, g)[0]: g for g in gs['GO_Biological_Process_2023']}
# go_chunks = [list(go_id_to_go_term.keys())[i:i + 50] for i in range(0, len(go_id_to_go_term.keys()), 50)]

# go_results = []
# for i, go_ids in enumerate(go_chunks):
#     print(i, end = ', ')
#     queries ={'go': go_ids}
    
#     # look up the true GO ids from biomart
#     results = bm.query(dataset='hsapiens_gene_ensembl',
#                        attributes=['ensembl_gene_id', 'external_gene_name', 'entrezgene_id', 'go'],
#                        filters=queries)
#     go_results.append(results)
    
# go_results = pd.concat(go_results)
# go_results['go_term'] = go_results['go'].replace(go_id_to_go_term)
# go_results[['go_term', 'external_gene_name']].to_csv('ref/GO_Biological_Process_Gene_Mapping.csv', index=False)

108

In [397]:
go_terms = pd.read_csv('ref/GO_Biological_Process_Gene_Mapping.csv').set_index('go_term')
go_terms.head(2)

Unnamed: 0_level_0,external_gene_name
go_term,Unnamed: 1_level_1
DNA Damage Checkpoint Signaling (GO:0000077),E2F1
DNA Damage Checkpoint Signaling (GO:0000077),DONSON


In [402]:
new_go = {}
for gs, genes in gene_sets['GO_Biological_Process_2023'].items():
    if gs in go_terms.index:
        correct_genes = go_terms.loc[gs]
        new_go[gs] = correct_genes
        
gene_sets['GO_Biological_Process_2023'] = new_go

In [406]:
adrenal_genes = pd.read_csv('ref/adrenal-specific-genes.txt', sep='\t', comment = '#')['Gene Name'].tolist()
adrenal_genes = [g for g in adrenal_genes if g in gene_ref]

gene_sets['custom'] = {'adrenal-specific-exp': adrenal_genes}

In [407]:
gs_count = 0
for db_name, db in gene_sets.items():
    gs_count += len(db.values())
gs_count

6665

Clearly that's going to lead to some false positives, but it is what it is.

## Run the thing

This code will need to be decently adaptable, since it has to handle a few different unique components (noncoding categories, etc).

In [408]:
gse_results = []

for i, (index, row) in enumerate(nbl_analysis_categories.iterrows()):
    print(i, end = ', ')
    cat_name = row['category_name']
    af_category = row['af_category']
    sv_category = row['sv_category']
    p_category = row['p_value']
    
    cat_components = cat_name.split('.')
    
    base_row = [cat_name, af_category, sv_category, p_category]
    
    # here, we define the necessary genic relationships
    # we handle the collapsed categories as well
    if sv_category == 'coding':
        gr = cat_components[2]

        # convert gr to components
        genic_rel = gr_coding_mapping.get(gr, [gr])
        
    elif sv_category == 'non-coding':
        gr = cat_components[4]

        # convert gr to components
        genic_rel = gr_noncoding_mapping.get(gr, [gr])
        
    # swap out intergenic for nearest_tss
    if 'PREDICTED_INTERGENIC' in genic_rel:
        genic_rel[genic_rel.index('PREDICTED_INTERGENIC')] = 'PREDICTED_NEAREST_TSS'
        
    # next, we pull out our SVs in this category
    svs_in_category = nbl_category_svs[(nbl_category_svs['sv_category'] == sv_category) & 
                                       (nbl_category_svs["category"] == cat_name)]
    
    # subset the actual SV matrix
    svs_in_category = svs[svs["name"].isin(svs_in_category["SV"].tolist())]
    
    # extract the dosages for cases and controls
    nbl_sv_dosages = (
        dosages
        .set_index("ID")
        .loc[svs_in_category["name"].tolist(), nbl_cases + nbl_controls]
        )
    
    # drop samples with bad GT rates
    kept = pd.isnull(nbl_sv_dosages).sum(axis = 0) / len(nbl_sv_dosages) < 0.05
    nbl_kept_cases = list(set(nbl_cases) & set(kept[kept].index))
    nbl_kept_controls = list(set(nbl_controls) & set(kept[kept].index))
    
    nbl_sv_dosages = nbl_sv_dosages.loc[:, kept]
    
    # define the number of unique SVs
    sv_counts_cases = nbl_sv_dosages[nbl_kept_cases].fillna(0).sum(axis=1).sort_values()
    sv_counts_controls = nbl_sv_dosages[nbl_kept_controls].fillna(0).sum(axis=1).sort_values()
    
    # define our affected genes
    genes_in_svs = svs_in_category[['name'] + genic_rel].set_index('name')
    
    # for each gene, count up the number of SVs in cases and controls, separately.
    gene_counts = {'cases': [], 'controls': []}
    for cohort, sv_counts in zip(['cases', 'controls'], [sv_counts_cases, sv_counts_controls]):

        sample_genes_in_svs = genes_in_svs.copy()
        sample_genes_in_svs.loc[sv_counts.index, 'count'] = sv_counts.astype(int)

        # count for each genic relationship
        cohort_gene_counts = []
        for index, row in sample_genes_in_svs.iterrows():
            for col in genic_rel:
                if not pd.isnull(row[col]):
                    cohort_gene_counts += row[col].split(',') * row['count']

        gene_counts[cohort] = pd.DataFrame(np.unique(cohort_gene_counts, return_counts = True), index = ['gene', 'count']).T
        
        # store the number of unique SVs for cases and controls
        base_row.append(len(gene_counts[cohort]))
        
    # for each gene set, create a contingency matrix and calculate our
    # fisher's result
    for db_name, db in gene_sets.items():
        for gs_name, gs_genes in db.items():
            
            expected = f'{len(gs_genes)}/{len(gene_ref)}'
            row = base_row + [db_name, gs_name, expected]
            
            for cohort in ['cases', 'controls']:
                cohort_gene_counts = gene_counts[cohort]
            
                top_left = cohort_gene_counts[cohort_gene_counts['gene'].isin(gs_genes)]['count'].sum()
                top_right = cohort_gene_counts[~cohort_gene_counts['gene'].isin(gs_genes)]['count'].sum()

                bottom_left = len(set(gs_genes) - set(cohort_gene_counts['gene']))
                bottom_right = len(gene_ref) - bottom_left
            
                cont_table = np.array([[top_left, top_right], 
                                       [bottom_left, bottom_right]])
                res, p = scipy.stats.fisher_exact(cont_table)

                # calculate the number of unique overlaps. Helps us determine
                # if signal is being driven by a single/few gene(s)
                unique_overlaps = len(set(gs_genes) & set(cohort_gene_counts['gene']))
                data = f'{top_left}/{top_right + top_left}'

                # store our data
                row = row + [unique_overlaps, res, p, data]
        
            gse_results.append(row)
            
names = ['category', 'af_category', 'sv_category', 'category_p', 'num_svs_cases', 
         'num_svs_controls', 'db', 'gs', 'expected', 
         'gs_unique_overlap_cases', 'res_cases', 'p_cases', 'data_cases',
         'gs_unique_overlap_controls', 'res_controls', 'p_controls', 'data_controls']

gse_results = pd.DataFrame(gse_results, columns = names)

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 

In [419]:
gse_results.to_csv('data/cwas-results/cwwas-gene-set-enrichment-results.csv', index=False)

In [421]:
gse_results['category'].iloc[0]

'ANY.SINGLETON.PREDICTED_LOF_or_PREDICTED_PARTIAL_EXON_DUP.ANY.ANY.protein_coding'