# Computation of annotation enrichment in a genetic set 

This code excerpt shows how to use the annotation enrichment computation function in a genetic set.
The function returns a dataframe of binomial and hypergeometric p-values (<= 0.05) and, for each p-value, an FDR correction with alpha = 0.05.   

In the second part of the code we compare the results with those obtained from the GREAT web server, and compare the binomial and hypergeometric probabilities.

In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import greatpy as great
import pandas as pd
from math import inf
from numpy import log,nan, int64,cov,corrcoef
from scipy.stats import pearsonr
import os 
import re

import warnings
warnings.filterwarnings('ignore')

In [3]:
import os 
import re
def _find_reference_outputs(output_dir, file_id):
    """Locate the GREAT reference files that belong to one input file.

    Every file in ``output_dir`` whose name starts with ``file_id`` followed
    by ``_`` is considered. A name containing ``output`` is taken as the
    enrichment table; any other matching name is taken as the gene
    association table. The assembly is ``"hg38"`` when the file name contains
    ``hg38`` and ``"hg19"`` otherwise.

    Returns
    -------
    tuple
        ``(assembly, great_out, great_asso)``. ``assembly`` is ``None`` and the
        two paths are empty strings when nothing matched.
    """
    assembly = None
    great_out = ""
    great_asso = ""
    for out_name in os.listdir(output_dir):
        if out_name.split("_")[0] != file_id:
            continue
        assembly = "hg38" if "hg38" in out_name else "hg19"
        out_path = os.path.join(output_dir, out_name)
        if "output" in out_name:
            great_out = out_path
        else:
            great_asso = out_path
    return assembly, great_out, great_asso


def _paired_p_values(enrichment, great_webserver):
    """Pair greatpy and GREAT p-values term by term.

    ``enrichment`` is indexed by term id; ``great_webserver`` carries the term
    id in its ``id`` column. The GREAT rows are re-ordered to follow the row
    order of ``enrichment`` so that position ``k`` of every column refers to
    the same term.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(binom, hyper)`` with columns ``binom_greatpy``/``binom_great`` and
        ``hyper_greatpy``/``hyper_great``.
    """
    # reindex() needs a unique source index, hence the drop_duplicates().
    reference = (
        great_webserver.drop_duplicates(subset="id")
        .set_index("id")
        .reindex(enrichment.index)
    )
    # Each column is built from its own array: the four series must never
    # share a single container, otherwise both axes of the plot are identical.
    binom = pd.DataFrame(
        {
            "binom_greatpy": enrichment["binom_p_value"].to_numpy(dtype=float),
            "binom_great": reference["binom_p_value"].to_numpy(dtype=float),
        }
    )
    hyper = pd.DataFrame(
        {
            "hyper_greatpy": enrichment["hypergeom_p_value"].to_numpy(dtype=float),
            "hyper_great": reference["hyper_p_value"].to_numpy(dtype=float),
        }
    )
    return binom, hyper


def all_comparison(scatterplot=False, data_dir="../../data"):
    """Compare greatpy against the GREAT reference results for every test input.

    For each file in ``<data_dir>/tests/test_data/input`` the matching GREAT
    files are looked up in ``<data_dir>/tests/test_data/output``, the greatpy
    enrichment is computed, and two summaries are accumulated: the number of
    terms before/after restricting both results to their shared term ids, and
    the overlap between the gene associations of the two tools.

    Input file names are expected to start with a two-character id, one
    separator character, then the dataset name (e.g. ``01_random.bed``).

    Parameters
    ----------
    scatterplot : bool, default False
        When True, draw greatpy-vs-GREAT scatterplots of the binomial and
        hypergeometric p-values for every dataset.
    data_dir : str, default "../../data"
        Root directory holding ``tests/test_data/{input,output}`` and
        ``human/``.

    Returns
    -------
    tuple of pandas.DataFrame or bool
        ``(sizes, associations)``. ``False`` is returned as soon as an input
        file has no enrichment or no association reference file.
    """
    input_dir = os.path.join(data_dir, "tests", "test_data", "input")
    output_dir = os.path.join(data_dir, "tests", "test_data", "output")
    annotation = os.path.join(data_dir, "human", "ontologies.csv")

    pp = {
        "name": [],
        "before_pp_greatpy_size": [],
        "before_pp_great_size": [],
        "after_pp_greatpy_size": [],
        "after_pp_great_size": [],
    }
    asso = {
        "name": [],
        "Number of gene association present in greatpy and great": [],
        "Number of gene association present in great and not in greatpy": [],
        "Number of gene association present in greatpy and not in great": [],
    }

    input_files = os.listdir(input_dir)
    nb = len(input_files)
    for path in input_files:
        stem = path.split(".")[0]
        file_id = stem[:2]
        name = stem[3:]

        # Resolve the reference files BEFORE building any assembly-dependent
        # path, so a missing reference can neither raise NameError nor reuse
        # the assembly of the previous dataset.
        assembly, great_out, great_asso = _find_reference_outputs(output_dir, file_id)
        if great_out == "" or great_asso == "":
            return False

        pp["name"].append(name)
        test = os.path.join(input_dir, path)
        regdom = os.path.join(data_dir, "human", assembly, "regulatory_domain.bed")
        size = os.path.join(data_dir, "human", assembly, "chr_size.bed")

        enrichment_tot = great.tl.GREAT.enrichment(
            test_file=test,
            regdom_file=regdom,
            chr_size_file=size,
            annotation_file=annotation,
            binom=True,
            hypergeom=True,
        )
        enrichment_tot = great.tl.GREAT.set_bonferroni(enrichment_tot, 0.05)
        enrichment_tot = great.tl.GREAT.set_fdr(enrichment_tot, 0.05)

        great_webserver = (
            pd.read_csv(
                great_out,
                sep="\t",
                comment="#",
                names=["ontologie", "term_name", "ID", "binom_p_value", "binom_bonferroni", "binom_fdr", "hyper_p_value", "hyper_bonferroni", "hyper_fdr"],
                index_col=False,
                dtype={"term_name": "object", "ID": "object", "binom_p_value": "float64", "binom_bonferroni": "float64", "binom_fdr": "float64", "hyper_p_value": "float64", "hyper_bonferroni": "float64", "hyper_fdr": "float64"},
            )
            .rename(columns={"ID": "id"})
            .drop(columns=["ontologie", "term_name"])
        )

        # Keep only the terms reported by both tools, recording sizes on the way.
        pp["before_pp_greatpy_size"].append(enrichment_tot.shape[0])
        enrichment_tot = enrichment_tot[enrichment_tot.index.isin(list(great_webserver["id"]))]
        pp["after_pp_greatpy_size"].append(enrichment_tot.shape[0])

        pp["before_pp_great_size"].append(great_webserver.shape[0])
        great_webserver = great_webserver[great_webserver["id"].isin(list(enrichment_tot.index))]
        pp["after_pp_great_size"].append(great_webserver.shape[0])

        if scatterplot:
            binom, hyper = _paired_p_values(enrichment_tot, great_webserver)
            great.pl.scatterplot(binom, colname_x="binom_greatpy", colname_y="binom_great", title=f"binom comparison for {name}")
            great.pl.scatterplot(hyper, colname_x="hyper_greatpy", colname_y="hyper_great", title=f"hypergeom comparison for {name}")

        gene_asso_great = pd.read_csv(great_asso, sep="\t", comment="#", names=["ontologies", "gene"], index_col=False, dtype={"ontologies": "object", "gene": "object"}, usecols=["gene"])
        gene_asso_greatpy = great.tl.get_association(
            test=pd.read_csv(test, sep="\t", comment="#", usecols=[0, 1, 2], names=["Chr", "Chr_Start", "Chr_End"], dtype={"Chr": "object", "Chr_Start": "int64", "Chr_End": "int64"}),
            regdom=pd.read_csv(regdom, sep="\t", comment="#", names=["Chr", "Chr_Start", "Chr_End", "Name", "tss", "Strand"], dtype={"Chr": "object", "Chr_Start": "int64", "Chr_End": "int64", "Name": "object", "tss": "int64", "Strand": "object"}),
        )

        # NOTE(review): assumes get_association returns an iterable of gene
        # names (list-like) - confirm against greatpy.
        greatpy_genes = list(gene_asso_greatpy)
        great_genes = list(gene_asso_great["gene"])
        # Sets give constant-time membership tests; the lists are kept for
        # iteration so repeated genes are still counted once per occurrence.
        greatpy_lookup = set(greatpy_genes)
        great_lookup = set(great_genes)

        in_in = gene_asso_great[gene_asso_great["gene"].isin(greatpy_lookup)].shape[0]
        in_out = [gene for gene in great_genes if gene not in greatpy_lookup]
        out_in = [gene for gene in greatpy_genes if gene not in great_lookup]

        asso["name"].append(name)
        # Counts are stored as strings, as the returned table always has been.
        asso["Number of gene association present in greatpy and great"].append(str(in_in))
        asso["Number of gene association present in great and not in greatpy"].append(str(len(in_out)))
        asso["Number of gene association present in greatpy and not in great"].append(str(len(out_in)))

        n = len(asso["name"])
        print(f"{name} finished, still {nb-n} more files to calculate")

    return pd.DataFrame(pp), pd.DataFrame(asso)
# Run the comparison on every test input without plotting:
# `a` receives the term-count table, `b` the gene-association table.
a,b = all_comparison(scatterplot=False)

random finished, still 9 more files to calculate
height_snps_hg19 finished, still 8 more files to calculate
ultra_hg38 finished, still 7 more files to calculate
MAX finished, still 6 more files to calculate
ultra_hg19 finished, still 5 more files to calculate
srf_hg38 finished, still 4 more files to calculate
height_snps_hg38 finished, still 3 more files to calculate
ERF finished, still 2 more files to calculate
FOXO3 finished, still 1 more files to calculate
srf_hg19 finished, still 0 more files to calculate


In [4]:
# Number of enriched terms per dataset, before and after keeping only the
# term ids present in both the greatpy and the GREAT results.
a

Unnamed: 0,name,before_pp_greatpy_size,before_pp_great_size,after_pp_greatpy_size,after_pp_great_size
0,random,581,197,117,117
1,height_snps_hg19,2299,2063,1167,1167
2,ultra_hg38,3267,2175,1393,1393
3,MAX,3058,2395,887,887
4,ultra_hg19,3019,2186,1360,1360
5,srf_hg38,4812,2681,1854,1854
6,height_snps_hg38,2287,2103,1141,1141
7,ERF,7314,2410,1022,1022
8,FOXO3,2892,2328,680,680
9,srf_hg19,4162,2063,1464,1464


In [5]:
# Per-dataset counts of gene associations found by both tools, only by GREAT,
# and only by greatpy.
b

Unnamed: 0,name,Number of gene association present in greatpy and great,Number of gene association present in great and not in greatpy,Number of gene association present in greatpy and not in great
0,random,57,0,1
1,height_snps_hg19,287,0,0
2,ultra_hg38,496,0,2
3,MAX,155,273,276
4,ultra_hg19,474,6,1
5,srf_hg38,923,0,4
6,height_snps_hg38,277,0,1
7,ERF,369,1087,1398
8,FOXO3,119,281,328
9,srf_hg19,791,0,3
