In [53]:
import pandas as pd
import pybedtools
import numpy as np
from collections import defaultdict
import pyranges
import scipy

import statsmodels.stats.multitest

# Extract category genes

After some consideration and thought, I've realized that performing naive gene set enrichment on noncoding categories--which are themselves already gene subsets--is confounding two signals. The first signal is the enrichment that accompanies the category itself (an ATAC peak, for example, is likely to be enriched for genes responsible for certain processes). The second is a different kind of enrichment, focused on SVs.

Separating these influences is worth it, I think. To do so requires us to calculate the gene sets that are present within each category.

# Read in our data

First, we read in the frameworks

In [2]:
#########################
### FRAMEWORK RESULTS ###
#########################
def _read_framework_results(path, af_category, sv_category):
    """Load one tab-separated CWAS results file and label every row.

    The two labels are written into new ``af_category`` and ``sv_category``
    columns so the tables can still be told apart after concatenation.
    """
    table = pd.read_csv(path, sep="\t")
    table[['af_category', 'sv_category']] = [af_category, sv_category]
    return table


nbl_singleton_coding_framework_results = _read_framework_results(
    "data/CWAS data for Jett/CWAS sum stats/neuroblastoma_all_coding_cwas_concatenated_glm_results_SINGLETON_11_3_23.txt",
    'singleton', 'coding',
)
nbl_rare_coding_framework_results = _read_framework_results(
    "data/CWAS data for Jett/CWAS sum stats/neuroblastoma_all_coding_cwas_concatenated_glm_results_RARE_11_3_23.txt",
    'rare', 'coding',
)
nbl_singleton_noncoding_framework_results = _read_framework_results(
    "data/CWAS data for Jett/CWAS sum stats/neuroblastoma_all_noncoding_cwas_concatenated_glm_results_SINGLETON_11_3_23.txt",
    'singleton', 'non-coding',
)
nbl_rare_noncoding_framework_results = _read_framework_results(
    "data/CWAS data for Jett/CWAS sum stats/neuroblastoma_all_noncoding_cwas_concatenated_glm_results_RARE_11_3_23.txt",
    'rare', 'non-coding',
)

# Stack the four tables (coding first, then non-coding) under a fresh 0..n-1 index.
nbl_framework_results = pd.concat(
    [
        nbl_singleton_coding_framework_results,
        nbl_rare_coding_framework_results,
        nbl_singleton_noncoding_framework_results,
        nbl_rare_noncoding_framework_results,
    ]
).reset_index(drop = True)

In [3]:
# Total number of category rows across the four concatenated result tables.
len(nbl_framework_results['category_name'])

4939

In [4]:
# Framework definition tables: one column per category component, listing its allowed values.
nbl_coding_framework = pd.read_csv(
    "data/CWAS data for Jett/CWAS frameworks/CWAS_categories_neuroblastoma_coding_8_17_23.txt",
    sep="\t",
)
nbl_noncoding_framework = pd.read_csv(
    "data/CWAS data for Jett/CWAS frameworks/CWAS_rare_categories_neuroblastoma_noncoding_10_2_23.txt",
    sep="\t",
)

# Print the non-missing values of every component, framework by framework.
frameworks_by_effect = {'coding': nbl_coding_framework, 'non-coding': nbl_noncoding_framework}
for effect, framework in frameworks_by_effect.items():
    print(effect)
    print('-' * 40)
    for component in framework.columns:
        allowed_values = framework[component].dropna().values
        print(component, ':', allowed_values)
        print()

coding
----------------------------------------
sv_type : ['DUP' 'DEL' 'CPX_or_INV' 'INS_ALL' 'ANY']

frequency : ['RARE' 'SINGLETON']

genic_relationship : ['PREDICTED_COPY_GAIN' 'PREDICTED_INTRAGENIC_EXON_DUP'
 'PREDICTED_LOF_or_PREDICTED_PARTIAL_EXON_DUP' 'ANY']

constraint : ['lof_constrained' 'missense_constrained' 'unconstrained' 'ANY']

expression : ['expressed_in_adrenal_gland' 'ANY']

gene_group : ['protein_coding' 'cosmic_cancer_genes' 'germline_CPGs'
 'base_excision_repair_genes' 'chromatin_organization_genes'
 'dna_damage_bypass_genes' 'dna_damage_reversal_genes'
 'dna_DSB_repair_genes' 'dna_DSB_response_genes' 'dna_repair_genes'
 'fanconi_genes' 'mismatch_repair_genes'
 'nucleotide_excision_repair_genes' 'oncogenic_MAPK_signaling_genes'
 'signaling_transduction_by_receptor_tyrosine_kinases_genes']

non-coding
----------------------------------------
sv_type : ['DUP' 'DEL' 'CPX_or_INV' 'INS_ALL' 'ANY']

frequency : ['RARE' 'SINGLETON']

functional_intersection : ['PREDICTED

Some of these characteristics impose restrictions on genomic regions that are accessible in the analysis. THESE are the characteristics that we need to use to impose restrictions. They are as follows:

* Coding (other gene groups besides protein_coding are ignored, as they represent gene sets themselves)  
    * lof_constrained
    * missense_constrained
    * unconstrained
    * expressed_in_adrenal_gland  
    * protein_coding  

* Non-coding (other gene groups besides protein_coding are ignored, as they represent gene sets themselves)  
    * functional_categories  
    * lof_constrained
    * expressed_in_adrenal_gland
    * protein_coding

# Read in genes for non-functional categories

These are perhaps the easiest - Riaz has just provided these genes for us

In [5]:
paths = {'lof_constrained': 'data/CWAS data for Jett/lof_constrained_genes_7_31_23 (1).txt',
         'missense_constrained': 'data/CWAS data for Jett/missense_constrained_genes_7_31_23.txt',
         'unconstrained': 'data/CWAS data for Jett/unconstrained_genes_7_31_23.txt',
         'expressed_in_adrenal_gland': 'data/CWAS data for Jett/top_expressed_in_adrenal_gland_7_31_23 (1).txt',
         'protein_coding': 'data/CWAS data for Jett/gencode_hg38_protein_coding_genes_for_annotation_7_31_23 (1).txt',
         'cosmic_and_germline_CPGs': 'data/CWAS data for Jett/cosmic_cancer_genes_tier_one_and_germline_CPGs_10_23_23 (1).txt'}

# Map each restriction label to the list of entries in its file's `value` column,
# skipping entries that start with the Ensembl prefix "ENSG00".
gene_restrictions = {}
for label, path in paths.items():
    gene_table = pd.read_csv(path)
    is_ensembl_id = gene_table['value'].str.startswith('ENSG00')
    gene_restrictions[label] = gene_table.loc[~is_ensembl_id, 'value'].tolist()

In [6]:
# Sanity check: sizes of the adrenal-expression and protein-coding gene lists.
print(len(gene_restrictions['expressed_in_adrenal_gland']))
print(len(gene_restrictions['protein_coding']))

9505
19201


Great.

# Identify genes within functional categories

We need to read in the genomic locations of the categories. Riaz has kindly already compiled these.

In [7]:
# Tab-separated table of annotated non-coding regions; the `annotation` column names each region's category.
category_regions = pd.read_csv('data/CWAS data for Jett/final_noncoding_annotation_file_list_combined_7_18_23.txt', sep='\t')

In [8]:
# Distinct annotation names, captured before the per-row suffix is appended further down.
regions = list(category_regions['annotation'].unique())

In [9]:
# Display the distinct annotation names.
category_regions['annotation'].unique()

array(['recombination_hotspot', 'ewing_ABC_MAX_enhancer',
       'ewing_and_osteosarcoma_atac_peaks',
       'ewing_and_osteosarcoma_tad_boundary', 'ewing_chromHMM15_Enh',
       'ewing_chromHMM15_EnhBiv', 'ewing_chromHMM15_EnhG',
       'ewing_chromHMM15_Quies', 'ewing_chromHMM15_TssAFlnk',
       'ewing_encode_consensus_enhancers', 'ewing_H3K27Ac_peak',
       'fragile_site', 'neuroblastoma_ABC_MAX_enhancer',
       'neuroblastoma_atac_peaks', 'neuroblastoma_chromHMM15_Enh',
       'neuroblastoma_chromHMM15_EnhBiv', 'neuroblastoma_chromHMM15_EnhG',
       'neuroblastoma_chromHMM15_Quies',
       'neuroblastoma_chromHMM15_TssAFlnk',
       'neuroblastoma_encode_consensus_enhancers',
       'neuroblastoma_H3K27Ac_peak', 'neuroblastoma_tad_boundary',
       'osteosarcoma_ABC_MAX_enhancer', 'osteosarcoma_chromHMM15_Enh',
       'osteosarcoma_chromHMM15_EnhBiv', 'osteosarcoma_chromHMM15_EnhG',
       'osteosarcoma_chromHMM15_Quies',
       'osteosarcoma_chromHMM15_TssAFlnk',
       'osteo

Wonderful--this lists all the regions along with their coordinates. Let's give a value for each of the regions so we know which is which.

In [10]:
# Make every region name unique by appending "_<row index>" to its annotation.
# NOTE(review): this overwrites the column in place, so re-running the cell appends a second
# suffix; later cells strip only the final "_"-separated token to recover the category.
category_regions['annotation'] = category_regions['annotation'] + '_' + np.array(category_regions.index).astype(str)

## Download genes

Next, we extract out the genic locations, for cross-reference with our category regions. We use gencode.

In [None]:
# The GENCODE annotation is read straight from the FTP URL (network access required).
# "#" header lines are skipped and the nine tab-separated fields are named explicitly.
GENCODE_GTF_URL = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.basic.annotation.gtf.gz"
GTF_COLUMNS = [
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attribute",
]

gtf = pd.read_csv(GENCODE_GTF_URL, comment="#", sep="\t", header=None, names=GTF_COLUMNS)

In [None]:
# Keep only gene-level records and renumber them from zero.
genes = gtf.query('feature == "gene"').reset_index(drop = True)

# Expand the free-text `attribute` field (`key value; key value; ...`) into one dict per gene.
# When a key occurs more than once in a record, the last occurrence wins.
characteristics = {}
for index, attribute in genes['attribute'].items():
    parsed = {}
    for field in attribute.split('; '):
        tokens = field.split(' ')
        parsed[tokens[0]] = tokens[1].strip(';').strip('"')
    characteristics[index] = parsed

# One row per gene, one column per attribute key.
characteristics = pd.DataFrame(characteristics).T

# Attach the parsed attributes alongside the original GTF columns.
genes = pd.concat([genes, characteristics], axis = 1)

In [None]:
# drop genes in PAR
# Keep protein-coding genes that are not flagged as pseudoautosomal (PAR).
# A GTF record may carry several `tag` attributes, but the parsed `tag` column holds only
# one value per gene, so the PAR flag is looked up in the raw attribute string instead.
is_par = genes['attribute'].str.contains('tag "PAR"', regex=False)
genes = genes[~is_par & (genes['gene_type'] == 'protein_coding')]

In [None]:
# Rows (genes) and columns remaining after the PAR / protein-coding filter.
genes.shape

We subset this to genes that are in our dataset.

In [None]:
# Reference protein-coding list versus the gene names found in the GTF.
gene_ref = gene_restrictions['protein_coding']
gtf_gene_names = set(genes['gene_name'])
missing_genes = set(gene_ref).difference(gtf_gene_names)
print(len(gene_ref), len(missing_genes))

So a few genes are present in our reference dataset but missing from the gtf. What are these genes?

In [None]:
# Alphabetical list of reference genes that have no GTF gene record.
print(sorted(missing_genes))

In [None]:
# Spot check: search every GTF record (not only genes) for one of the missing gene names.
gtf[gtf['attribute'].str.contains('HLA-DRB3')]

It's not at all obvious to me why these genes are missing, but there is truly no mention of them in the GTF (even in non-gene entries). So we move on, further restricting our analysis to these genes.

In [None]:
# Restrict the GTF genes to those named in the reference list, then compare
# the number of rows with the number of distinct names.
in_reference = genes['gene_name'].isin(gene_ref)
gene_subset = genes[in_reference]
n_rows = gene_subset.shape[0]
n_unique_names = len(set(gene_subset['gene_name']))
print(n_rows, n_unique_names)

So there are some duplicates. In looking at them, it's not entirely obvious which is which.

In [None]:
# Count how often each gene name occurs and keep the names seen more than once.
gene_counts = gene_subset['gene_name'].value_counts()
duplicates = gene_counts.loc[gene_counts.gt(1)]

# some heuristics to get proper genes
has_duplicate_name = gene_subset['gene_name'].isin(duplicates.index)
duplicate_gene_subset = gene_subset.loc[has_duplicate_name]
duplicate_gene_subset.shape

In [None]:
# Resolve duplicated gene names by keeping the first record for each name (the drop_duplicates default).
gene_subset = gene_subset.drop_duplicates(subset = ['gene_name'])
gene_subset.shape

## Do the intersection

We intersect the category regions with the genes. This will yield a list of genes that are present in these categories.

Some category regions, however, will not intersect genes. This will yield no gene and be valid for most comparisons. For some noncoding analyses, however, the nearest gene must also be included in the analysis, since `PREDICTED_INTERGENIC` can yield this.

Incorporating this is extremely frustrating, as it's not clear how to optimally do this. For example, consider a region, with a closest gene. There are many possibilities:

1. The closest gene is within the region
2. The closest gene is outside the region, but there is also a gene in the region
3. The closest gene is downstream, but at another point in the region, the closest gene is upstream

Which of these is the case depends very strongly on where the SV is located within the region. However, we can't use the SV locations, as the point of gene set enrichment is to examine _which genes COULD be impacted_. This is... unresolvable.

I suppose the best way forward is to assume the following:

1. If a gene is within a region, it will be closest to all SVs in that region if those SVs are noncoding
2. If a gene is not within a region, the closest gene to the region will be the closest gene to all SVs

In [None]:
# do the intersection with pybedtools.
category_bed = pybedtools.BedTool(
    category_regions.to_csv(sep="\t", index=False, header=False), from_string=True
)

gene_bed = pybedtools.BedTool(
    gene_subset[['seqname', 'start', 'end', 'gene_name']].to_csv(sep="\t", index=False, header=False),
    from_string=True,
)
/
genes_in_regions = category_bed.intersect(gene_bed, wao=True)
genes_in_regions = genes_in_regions.to_dataframe()

In [None]:
# Recover the category by dropping the trailing "_<index>" token from each region name.
genes_in_regions['category'] = [
    '_'.join(region_name.split('_')[:-1]) for region_name in genes_in_regions['name']
]
# The gene name lands in the column that the default BED naming calls `thickEnd`.
genes_in_regions = genes_in_regions.rename(columns = {'thickEnd': 'gene'})

In [None]:
# Preview the region/gene overlap table.
genes_in_regions.head(3)

So now, for regions that do not contain a gene, we examine the closest gene.

In [None]:
# Rows whose gene-side chromosome field (named `score` by the default BED header) is "."
# are regions with no overlapping gene. Keep their coordinates, sorted by chromosome and start.
has_no_gene = genes_in_regions['score'] == "."
regions_without_genes = (
    genes_in_regions
    .loc[has_no_gene, ['chrom', 'start', 'end', 'name']]
    .sort_values(by = ['chrom', 'start'])
)


In [None]:
# do the intersection with pybedtools.
without_genes_bed = pybedtools.BedTool(
    regions_without_genes.to_csv(sep="\t", index=False, header=False), from_string=True
)

gene_bed = pybedtools.BedTool(
    gene_subset[['seqname', 'start', 'end', 'gene_name']].sort_values(by = ['seqname', 'start']).to_csv(sep="\t", index=False, header=False),
    from_string=True,
)

closest_genes = without_genes_bed.closest(gene_bed, d=True)
closest_genes = closest_genes.to_dataframe()

In [None]:
# Same post-processing as for the overlap table: strip the trailing "_<index>" token
# to get the category, and expose the gene-name column under the name `gene`.
closest_genes['category'] = [
    '_'.join(region_name.split('_')[:-1]) for region_name in closest_genes['name']
]
closest_genes = closest_genes.rename(columns = {'thickEnd': 'gene'})

In [None]:
# Preview the closest-gene table.
closest_genes.head(2)

The number of genes identified through this process and the size of our original regions without genes are not the same:

In [None]:
# Compare row counts: one row per gene-less region versus one row per (region, reported gene).
regions_without_genes.shape, closest_genes.shape

What gives?

In [None]:
# Regions that appear more than once in the closest-gene output.
region_counts = closest_genes['name'].value_counts()
duplicate_regions = region_counts[region_counts > 1]

# Inspect one such region by its (hard-coded) unique name.
closest_genes[closest_genes['name'] == 'ewing_chromHMM15_Quies_390260']

I see. This is an issue with pseudogenes, etc. Well, I suppose we have to keep them.

In [None]:
# Drop the placeholder rows of regions that overlap no gene (gene == ".").
genes_in_regions = genes_in_regions.query('gene != "."')

We now convert these to lookup dictionaries for easy access.

In [None]:
# category -> {'within': genes overlapping its regions,
#              'closest': nearest genes of its gene-less regions}
region_gene_lookup = defaultdict(dict)

for region in nbl_noncoding_framework['functional_category']:
    overlaps_region = genes_in_regions['category'] == region
    nearest_to_region = closest_genes['category'] == region

    region_gene_lookup[region].update(
        within=set(genes_in_regions.loc[overlaps_region, 'gene']),
        closest=set(closest_genes.loc[nearest_to_region, 'gene']),
    )

# Create our gene lists

Alright, now for the tricky part. For each category, we need to create a gene list. It's pretty clear to me that the best way to store this is in a sparse matrix. Each row will be a gene, and each column a category. The element will be 1 if that gene is eligible for that category and 0 otherwise.

Each category starts with all of the reference genes being eligible and then progressively shrinks to fit the category's requirements.

In [None]:
# From here on, the reference list is the set of gene names actually matched in the GTF
# (this rebinds `gene_ref`, which previously held the provided protein-coding list).
gene_ref = gene_subset['gene_name'].tolist()
len(gene_ref)

## Some examples

So ~19k genes are eligible as the "master" protein_coding list. As an example, let's consider a few categories:

In [None]:
# Show the name of the first category in the combined results.
nbl_framework_results.loc[0, 'category_name']

This category receives no reductions whatsoever.

In [None]:
# Example non-coding category whose functional category is neuroblastoma_H3K27Ac_peak.
test_cat = 'ANY.SINGLETON.PREDICTED_NONCODING_SPAN.neuroblastoma_H3K27Ac_peak.PREDICTED_INTRONIC.ANY.ANY.protein_coding'

This category receives one reduction--from `neuroblastoma_H3K27Ac_peak`

In [None]:
# Genes that overlap at least one neuroblastoma_H3K27Ac_peak region.
neuroblastoma_H3K27Ac_peak_genes = region_gene_lookup['neuroblastoma_H3K27Ac_peak']['within']
len(neuroblastoma_H3K27Ac_peak_genes)

In [None]:
# How many of those genes are also in the reference list.
len(set(neuroblastoma_H3K27Ac_peak_genes) & set(gene_ref))

So from 19k genes down to 13.5k genes.

In [None]:
# Example non-coding category with functional category neuroblastoma_tad_boundary and genic relationship ANY.
test_cat = 'ANY.SINGLETON.PREDICTED_NONCODING_BREAKPOINT.neuroblastoma_tad_boundary.ANY.ANY.ANY.protein_coding'

This category receives two reductions--one for `neuroblastoma_tad_boundary` and one for `genic_relationship == "ANY"`, which contains within it `genic_relationship == "PREDICTED_INTERGENIC"`. This means we need to include the closest genes as well.

In [None]:
# Overlapping and nearest genes for the TAD-boundary category, then their union.
tad_boundary_lookup = region_gene_lookup['neuroblastoma_tad_boundary']
neuroblastoma_tad_boundary_genes = tad_boundary_lookup['within']
neuroblastoma_tad_boundary_genes_closest = tad_boundary_lookup['closest']
print(len(neuroblastoma_tad_boundary_genes), len(neuroblastoma_tad_boundary_genes_closest))

neuroblastoma_tad_boundary_genes = neuroblastoma_tad_boundary_genes.union(
    neuroblastoma_tad_boundary_genes_closest
)
len(neuroblastoma_tad_boundary_genes)

## Formalize the calculation of gene lists

We define a quick function to make this easier

In [None]:
def category_to_gene_list(category, sv_effect, genes_in_regions, closest_genes, region_gene_lookup, reference_list):
    '''Return the set of genes eligible for a single CWAS category.

    The dot-separated ``category`` name is split into components, which are paired
    positionally with the columns of the matching framework table
    (``nbl_coding_framework`` or ``nbl_noncoding_framework``, both module-level).
    Starting from ``reference_list``, the gene set is intersected with every
    restriction the components imply, using the module-level ``gene_restrictions``.

    Parameters
    ----------
    category : str
        Dot-separated category name.
    sv_effect : str
        Either "coding" or "non-coding"; selects the framework used to name the components.
    genes_in_regions, closest_genes
        Not used by this function; kept so existing calls keep working.
    region_gene_lookup : mapping
        functional category -> {'within': set of genes, 'closest': set of genes}.
    reference_list : iterable of str
        Genes eligible before any restriction is applied.

    Returns
    -------
    set
        Genes from ``reference_list`` that satisfy every restriction of the category.

    Raises
    ------
    ValueError
        If ``sv_effect`` is not one of the two supported values, or if the category's
        gene group is neither 'protein_coding' nor 'cosmic_and_germline_CPGs'.
    '''

    # Select the framework up front so an unknown effect fails with a clear message
    # rather than with an unbound-variable error further down.
    if sv_effect == "coding":
        framework = nbl_coding_framework
    elif sv_effect == "non-coding":
        framework = nbl_noncoding_framework
    else:
        raise ValueError(f'sv_effect {sv_effect} not valid.')

    components = dict(zip(framework.columns, category.split('.')))

    # we only analyze two gene groups
    gene_group = components['gene_group']
    if gene_group not in ['protein_coding', 'cosmic_and_germline_CPGs']:
        raise ValueError(f'gene_group {gene_group} not valid.')

    category_genes = set(reference_list)

    # apply constraints ('ANY' or any other value leaves the set untouched)
    constraint = components['constraint']
    if constraint in ['lof_constrained', 'missense_constrained', 'unconstrained']:
        category_genes = category_genes & set(gene_restrictions[constraint])

    # apply expression
    if components['expression'] == 'expressed_in_adrenal_gland':
        category_genes = category_genes & set(gene_restrictions['expressed_in_adrenal_gland'])

    # if the sv is a non-coding SV, there are still more restrictions
    if sv_effect == "non-coding":

        # apply gene group if necessary
        # NOTE(review): the gene-group restriction is only applied to non-coding categories.
        if gene_group == 'cosmic_and_germline_CPGs':
            category_genes = category_genes & set(gene_restrictions['cosmic_and_germline_CPGs'])

        # then, assess the functional category
        func_cat = components['functional_category']
        func_cat_genes = region_gene_lookup[func_cat]['within']

        # then, we check the genic relationship, which determines whether we need the closest genes as well
        if components['genic_relationship'] in ['ANY', 'PREDICTED_INTERGENIC']:
            cat_closest_genes = region_gene_lookup[func_cat]['closest']
            func_cat_genes = func_cat_genes | cat_closest_genes

        # with this in mind, merge
        category_genes = category_genes & func_cat_genes

    return category_genes

In [None]:
# Run the function on the TAD-boundary example and report how many genes remain eligible.
test_cat = 'ANY.SINGLETON.PREDICTED_NONCODING_BREAKPOINT.neuroblastoma_tad_boundary.ANY.ANY.ANY.protein_coding'
category_genes = category_to_gene_list(test_cat, 'non-coding', genes_in_regions, closest_genes, region_gene_lookup, gene_ref)
print(len(category_genes))

## Create our category-gene matrix

In [None]:
# Only categories whose last component (the gene group) is one of the two analysed groups get a row.
analysed_gene_groups = ['protein_coding', 'cosmic_and_germline_CPGs']
last_component = nbl_framework_results['category_name'].apply(lambda cn: cn.split('.')[-1])
included_categories = nbl_framework_results[last_component.isin(analysed_gene_groups)].reset_index(drop = True)

# Rows: category names; columns: reference genes in sorted order; every cell starts at 0.
category_gene_matrix = pd.DataFrame(
    0,
    index = included_categories['category_name'].tolist(),
    columns = sorted(gene_ref),
)

In [None]:
# Fill the membership matrix row by row: 1 marks a gene that is eligible for the category.
for index, row in included_categories.iterrows():
    # lightweight progress indicator
    if index % 100 == 0:
        print(index, end = ', ')
    category = row['category_name']
    sv_category = row['sv_category']

    # categories outside the two analysed gene groups are skipped
    if category.split('.')[-1] not in ['protein_coding', 'cosmic_and_germline_CPGs']:
        continue

    genes_in_category = category_to_gene_list(category, sv_category, genes_in_regions, closest_genes, region_gene_lookup, gene_ref)

    # .loc rejects a set as an indexer in pandas >= 2.0, so hand it a list instead
    category_gene_matrix.loc[category, list(genes_in_category)] = 1

In [None]:
# Number of eligible genes per category, shown in ascending order.
# NOTE(review): the filter keeps categories with more than one gene, so categories with
# zero or one eligible gene are not displayed here.
counts = category_gene_matrix.sum(axis = 1)
counts[counts > 1].sort_values()

We start to see some of the severe restrictions for some of these categories, and we note that thousands of categories actually contain no genes!

# Export

In [None]:
# Name the index so it is written as a `category` column in the CSV export.
category_gene_matrix.index.name = 'category'

In [None]:
# Persist the category x gene membership matrix.
category_gene_matrix.to_csv('data/cwas-results/category-gene-matrix.csv')

# Run gene set enrichment on the categories themselves

We end up running a ton of gene set enrichments in a variety of contexts for the CWAS results. I want to run the gene set enrichment for the categories at baseline here.

In [12]:
# Reload the exported matrix so this section can run without rebuilding it.
category_gene_matrix = pd.read_csv('data/cwas-results/category-gene-matrix.csv').set_index('category')

We subset to significant categories, because these are the only ones that we care about.

In [13]:
# Categories passing the significance cut-off (-log10 p above 3.5), and their matrix rows.
passes_threshold = nbl_framework_results['negative_log10_p_value'] > 3.5
nbl_analysis_categories = nbl_framework_results[passes_threshold]

significant_names = nbl_analysis_categories['category_name'].tolist()
analysis_category_gene_matrix = category_gene_matrix.loc[significant_names]

In [14]:
# The gene universe for enrichment is the set of matrix columns.
gene_ref = set(category_gene_matrix.columns)

Read in the gene sets

In [15]:
# Each line of the file is: database <TAB> gene-set name <TAB> gene <TAB> gene ...
# Gene sets are keyed by "<database> | <gene-set name>".
gene_sets = {}
with open('ref/gene-sets.txt') as gs_in:
    for line in gs_in:
        fields = line.strip().split('\t')
        key = fields[0] + ' | ' + fields[1]
        gene_sets[key] = fields[2:]

In [16]:
# Number of gene sets loaded.
len(gene_sets)

26983

We convert both our gene sets and categories to sparse matrices.

In [17]:
import scipy.sparse

In [18]:
# Sparse (categories x genes) membership matrix for the significant categories.
cat_gene_sparse_mtx = scipy.sparse.csr_matrix(analysis_category_gene_matrix.values)

In [19]:
# Positional lookups: gene name -> column number, category name -> row number.
n_categories, n_genes = analysis_category_gene_matrix.shape
gene_to_idx = dict(zip(analysis_category_gene_matrix.columns, range(n_genes)))
cat_to_idx = dict(zip(analysis_category_gene_matrix.index, range(n_categories)))

In [20]:
# Build coordinate-format triplets (value, row = gene set, column = gene) for the
# gene-set membership matrix. Every member gene must already be a key of gene_to_idx.
gs_to_idx = {}

values = []
row_indices = []
column_indices = []
for set_number, (set_name, members) in enumerate(gene_sets.items()):
    gs_to_idx[set_name] = set_number

    n_members = len(members)
    column_indices.extend(gene_to_idx[member] for member in members)
    row_indices.extend([set_number] * n_members)
    values.extend([1] * n_members)

values = np.array(values)
row_indices = np.array(row_indices)
column_indices = np.array(column_indices)

In [21]:
# Assemble the (gene sets x genes) matrix from the triplets, then transpose it to
# (genes x gene sets) so it can be right-multiplied onto the category matrix.
gs_gene_sparse_mtx = scipy.sparse.csr_matrix((values, (row_indices, column_indices)), 
                                             shape = (len(gs_to_idx), len(gene_to_idx))).T

We use some nice matrix math to pull out all our elements for the contingency table.

In [22]:
# Contingency-table cells for every (category, gene set) pair, one array per cell.
# Genes in both the category and the gene set: product of the two membership matrices.
cat_and_gs = cat_gene_sparse_mtx.dot(gs_gene_sparse_mtx).todense().A
# Genes in the category but not the gene set: category row sum minus the overlap.
cat_and_not_gs = (cat_gene_sparse_mtx.sum(axis = 1) - cat_and_gs).A
# Genes in the gene set but not the category: gene-set column sum minus the overlap.
not_cat_and_gs = (gs_gene_sparse_mtx.sum(axis = 0) - cat_and_gs).A

# Genes in neither: total gene count (number of columns of the category matrix)
# minus the other three cells.
I = np.ones(shape = (cat_gene_sparse_mtx.shape[0], gs_gene_sparse_mtx.shape[1])) * cat_gene_sparse_mtx.shape[1]
not_cat_and_not_gs = (I - cat_and_not_gs - not_cat_and_gs - cat_and_gs)

We now pass this to our vectorized fishers exact test

In [23]:
# Labels matching the row-major flattening of the (categories x gene sets) arrays:
# each category is repeated once per gene set, and the gene-set names are tiled once per category.
n_categories, n_gene_sets = cat_and_gs.shape
categories = [c for c in analysis_category_gene_matrix.index for _ in range(n_gene_sets)]

# NOTE: this rebinds `gene_sets` from the name -> genes dict to a flat list of names.
gene_sets = list(gs_to_idx.keys()) * n_categories

In [24]:
# Flatten each contingency cell to one integer per (category, gene set) pair.
tp, fp, fn, tn = (
    cell.ravel().astype(int)
    for cell in (cat_and_gs, cat_and_not_gs, not_cat_and_gs, not_cat_and_not_gs)
)

results = pyranges.statistics.fisher_exact(tp, fp, fn, tn, pseudocount = 0)


def _as_fraction(numerators, denominators):
    """Format two integer arrays element-wise as "numerator/denominator" strings."""
    return pd.Series(numerators.astype(str)) + '/' + pd.Series(denominators.astype(str))


# expected: gene-set size over all genes; data: overlap over category size
results['expected'] = _as_fraction(fn + tp, fp + tn + tp + fn)
results['data'] = _as_fraction(tp, tp + fp)

# split the "<database> | <gene-set name>" labels back into their two parts
label_parts = [label.split(' | ') for label in gene_sets]
results['db'] = [parts[0] for parts in label_parts]
results['gs'] = [parts[1] for parts in label_parts]
results['category'] = categories

results = (
    results[['db', 'gs', 'category', 'expected', 'data', 'OR', 'P']]
    .rename(columns = {'OR': 'odds_ratio', 'P': 'p'})
)

  OR = ((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))
  OR = ((tp + pseudocount) / (fp + pseudocount)) / ((fn + pseudocount) / (tn + pseudocount))


In [25]:
# Preview the enrichment results.
results.head(2)

Unnamed: 0,db,gs,category,expected,data,odds_ratio,p
0,MSigDB_Hallmark_2020,TNF-alpha Signaling via NF-kB,ANY.SINGLETON.PREDICTED_LOF_or_PREDICTED_PARTI...,199/19092,199/19092,,1.0
1,MSigDB_Hallmark_2020,Hypoxia,ANY.SINGLETON.PREDICTED_LOF_or_PREDICTED_PARTI...,200/19092,200/19092,,1.0


In [85]:
# Spot check one (hard-coded) row among the neuroblastoma_H3K27Ac_peak categories.
# A raw string is used so that `\.` reaches the regex engine as an escaped literal dot.
results[results['category'].str.contains(r'\.neuroblastoma_H3K27Ac_peak\.')].loc[1160264]

db                                   GO_Biological_Process_Full
gs            positive regulation of behavioral fear respons...
category      DEL.SINGLETON.PREDICTED_NONCODING_BREAKPOINT.n...
expected                                                7/19092
data                                                     1/2758
odds_ratio                                             0.987063
p                                                           1.0
fdr_p                                                       1.0
Name: 1160264, dtype: object

# FDR correction

No analysis is complete without an FDR correction. Thankfully, we get to throw out tests that can't possibly yield a significant result. We've already calculated what we need, so we'll be a bit fancy.

In [58]:
# Benjamini-Hochberg correction, applied separately within each (category, database)
# family of tests. Grouping once avoids re-filtering the whole results table for every pair.
results['fdr_p'] = (
    results
    .groupby(['category', 'db'], sort = False)['p']
    .transform(lambda p: statsmodels.stats.multitest.multipletests(p.to_numpy(), method='fdr_bh')[1])
)

In [68]:
# Write the full enrichment table without the positional index.
results.to_csv('data/cwas-results/category-gene-set-raw-significance.csv', index=False)

# How to handle counts?

In our analysis notebook, we'll need to handle _counts_ rather than just overlaps. Let's make sure we understand how to do this.

In [48]:
# Toy example: M holds counts, N holds binary membership.
M = np.array([[1, 4, 0, 2], [2, 3, 1, 0], [0, 3, 4, 1], [0, 0, 0, 1]])
N = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 0, 1, 1], [0, 0, 0, 1]])

# presence/absence view of M and its overlap with N
M_present = M.astype(bool)
presence_overlap = M_present.dot(N)
N_column_totals = N.sum(axis = 0, keepdims = True)

# top row of the contingency table: built from the counts in M
M_and_N = M.dot(N)
M_and_notN = M.sum(axis = 1, keepdims = True) - M_and_N

# bottom row: built from presence/absence only
notM_and_N = N_column_totals - presence_overlap

I = np.ones(shape = (M.shape[0], N.shape[1])) * M.shape[1]
notM_and_notN = I - M_present.sum(axis = 1, keepdims = True) - N_column_totals + presence_overlap

Essentially, the top row of the contingency table uses counts, but the bottom row uses binary presence/absence. So you just have to be careful with conversion to boolean for bottom row elements.