# rGreat local vs greatpy

This notebook compares the enrichment results of rGREAT (online service and local analysis) with those of the enrichment function from greatpy.

r-base == 3.6.1

python version == 3.8 

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and source edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import rpy2
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects import r as r
pandas2ri.activate()
import pandas as pd

import os 
import matplotlib.pyplot as plt
import greatpy as great 
from scipy.stats import pearsonr
import re
import time 

In [3]:
# Display the rpy2 version this notebook was run with (for reproducibility).
rpy2.__version__

'3.5.2'

In [4]:
# Load the R packages used by the comparison function below.
# rGREAT: the return value is discarded; presumably it is loaded so that its functions
# can be looked up through `rpy2.robjects.r[...]` — TODO confirm.
importr('rGREAT')
# GenomicRanges: kept as `ranges` because `ranges.makeGRangesFromDataFrame` is called directly.
ranges = importr('GenomicRanges')

In [5]:
def _online_go_column(res_online, column):
    """Concatenate one column of the online rGREAT result across the three GO tables.

    The tables are always read in the same order (Molecular Function,
    Biological Process, Cellular Component) so that lists built from
    different columns stay row-aligned with each other.
    """
    values = []
    for ontology in ("GO Molecular Function", "GO Biological Process", "GO Cellular Component"):
        values.extend(res_online.rx2(ontology).rx2(column))
    return values


def rgreat_online_vs_local_vs_greatpy():
    """Run online rGREAT, local rGREAT and greatpy on the test input and collect their p-values.

    For every file listed in ``test_file`` the function:

    1. submits the file to the online GREAT service through rGREAT,
    2. runs the local rGREAT analysis (``great`` with ``msigdb:C5``),
    3. runs ``great.tl.GREAT.enrichment`` from greatpy,
    4. converts the online and local R results into pandas DataFrames.

    The function pauses between the steps (30 s three times, then 90 s), so
    each input file costs at least 180 s of waiting on top of the computation.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(local, online, greatpy)`` for the LAST file of ``test_file``:

        * ``local``   -- columns ``name``, ``binom_p_val``, ``hyper_p_val``
        * ``online``  -- columns ``id``, ``name``, ``binom_p_val``, ``hyper_p_val``
        * ``greatpy`` -- the frame returned by greatpy, with ``go_term`` lower-cased
    """
    # Other available inputs:
    # ["01_random.bed","06_height_snps_hg38.bed","07_height_snps_hg19.bed","10_MAX.bed"]
    test_file = ["06_height_snps_hg38.bed"]

    for name in test_file:
        # The assembly is encoded in the file name; anything not tagged hg19 is treated as hg38.
        assembly = "hg19" if "hg19" in name else "hg38"
        input_path = f"../../data/tests/test_data/input/{name}"

        # --- online rGREAT ---
        res_online = rpy2.robjects.r['submitGreatJob'](input_path, species=assembly, help=False)
        res_online = rpy2.robjects.r['getEnrichmentTables'](res_online)

        time.sleep(30)

        # --- local rGREAT ---
        # Preprocessing: read the regions and turn them into a GRanges object.
        regions = r["read.csv"](input_path, sep='\t')
        # Candidate column names under which the chromosome may be stored.
        seqname = rpy2.robjects.StrVector(
            ["seqnames", "seqname", "chromosome", "X.Chr", "chr", "chromosome_name", "seqid"]
        )
        regions = ranges.makeGRangesFromDataFrame(regions, seqnames_field=seqname)

        # GREAT calculation.
        res_local = rpy2.robjects.r['great'](regions, "msigdb:C5", f"txdb:{assembly}", verbose=False)
        res_local = rpy2.robjects.r['getEnrichmentTables'](res_local)

        time.sleep(30)

        # --- greatpy ---
        greatpy_df = great.tl.GREAT.enrichment(
            test_file=input_path,
            regdom_file=f"../../data/human/{assembly}/regulatory_domain.bed",
            chr_size_file=f"../../data/human/{assembly}/chr_size.bed",
            annotation_file="../../data/human/ontologies.csv",
            binom=True,
            hypergeom=True,
        )
        time.sleep(30)

        # --- build the DataFrames ---
        # Online: stack the three GO tables; names are lower-cased for comparison.
        # (Stored in its own variable so the loop variable `name` is not overwritten.)
        online_names = [term.lower() for term in _online_go_column(res_online, "name")]
        online_df = pd.DataFrame({
            "id": _online_go_column(res_online, "ID"),
            "name": online_names,
            "binom_p_val": _online_go_column(res_online, "Binom_Raw_PValue"),
            "hyper_p_val": _online_go_column(res_online, "Hyper_Raw_PValue"),
        })

        # Local: ids are "_"-separated; drop the first token and join the rest
        # with spaces to obtain a lower-case term name.
        local_names = [" ".join(term.lower().split("_")[1:]) for term in list(res_local.rx2("id"))]
        local_df = pd.DataFrame({
            "name": local_names,
            "binom_p_val": list(res_local.rx2("p_value")),
            "hyper_p_val": list(res_local.rx2("p_value_hyper")),
        })

        # greatpy: lower-case the term names as well.
        greatpy_df["go_term"] = greatpy_df["go_term"].str.lower()

        # Long pause before the next file so the machine can cool down.
        time.sleep(90)

    return local_df, online_df, greatpy_df

In [6]:
# Run the three analyses. This takes several minutes: the function sleeps
# 30 + 30 + 30 + 90 seconds per input file in addition to the computation itself.
local,online,greatpy = rgreat_online_vs_local_vs_greatpy()

R[write to console]: The default enrichment tables contain no associated genes for the input
regions.You can set `download_by = 'tsv'` to download the complete
table,but note only the top 500 regions can be retreived. See the
following link:

https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport

R[write to console]: * check whether TxDb package 'TxDb.Hsapiens.UCSC.hg38.knownGene' is installed.

R[write to console]: * gene ID type in the extended TSS is 'Entrez Gene ID'.

R[write to console]: * restrict chromosomes to 'chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10,
    chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX,
    chrY, chrM'.

R[write to console]: * 18208/29644 protein-coding genes left.

R[write to console]: * update seqinfo to the selected chromosomes.

R[write to console]: * TSS extension mode is 'basalPlusExt'.

R[write to console]: * construct the basal domains by extending 5000bp t

In [7]:
# Local rGREAT results: term name with binomial and hypergeometric p-values.
local

Unnamed: 0,name,binom_p_val,hyper_p_val
0,proteoglycan metabolic process,3.44454e-09,9.69768e-04
1,chondroitin sulfate proteoglycan metabolic pro...,6.60888e-09,1.25454e-02
2,positive regulation of glycoprotein metabolic ...,8.50096e-08,5.22013e-02
3,muscle cell migration,2.35739e-07,1.96183e-05
4,replacement ossification,2.81099e-07,7.21346e-04
...,...,...,...
1830,inorganic ion transmembrane transport,9.95850e-01,9.78588e-01
1831,synapse organization,9.97051e-01,6.84269e-01
1832,neuron projection,9.98345e-01,7.85672e-01
1833,synapse,9.98970e-01,7.43814e-01


In [8]:
# Online rGREAT results: GO id, term name, binomial and hypergeometric raw p-values.
online

Unnamed: 0,id,name,binom_p_val,hyper_p_val
0,GO:0017116,single-stranded dna-dependent atp-dependent dn...,3.68373e-11,4.36089e-02
1,GO:0043141,atp-dependent 5'-3' dna helicase activity,5.41251e-11,7.16235e-02
2,GO:0003680,at dna binding,8.83561e-11,2.49782e-04
3,GO:0042975,peroxisome proliferator activated receptor bin...,1.10785e-10,9.02449e-03
4,GO:0043139,5'-3' dna helicase activity,2.02135e-10,1.12120e-01
...,...,...,...,...
19104,GO:1990909,wnt signalosome,1.00000e+00,1.00000e+00
19105,GO:1990913,sperm head plasma membrane,1.00000e+00,1.00000e+00
19106,GO:1990917,ooplasm,1.00000e+00,1.00000e+00
19107,GO:1990923,pet complex,1.00000e+00,1.00000e+00


In [9]:
# greatpy results, with the `go_term` column lower-cased by the comparison function.
greatpy

Unnamed: 0,go_term,binom_p_value,hypergeom_p_value
GO:0031936,negative regulation of chromatin silencing,1.00612e-14,1.40000e-307
GO:0017116,single-stranded dna-dependent atp-dependent dn...,4.80098e-12,3.00000e-308
GO:0042974,retinoic acid receptor binding,1.25139e-11,1.50000e-307
GO:0003680,at dna binding,4.80863e-11,1.50000e-307
GO:0043141,atp-dependent 5'-3' dna helicase activity,5.19613e-11,8.00000e-308
...,...,...,...
GO:0005886,plasma membrane,9.94321e-01,2.22000e-306
GO:0043025,neuronal cell body,9.95491e-01,2.76000e-306
GO:0007420,brain development,9.97816e-01,2.13000e-306
GO:0030424,axon,9.99016e-01,2.77000e-306


In [15]:
# Export the three result tables as tab-separated files, three directories above
# the notebook. Only the greatpy table is written with its index.
for frame, path, keep_index in (
    (local, "../../../local.csv", False),
    (online, "../../../online.csv", False),
    (greatpy, "../../../greatpy.csv", True),
):
    frame.to_csv(path, sep="\t", header=True, index=keep_index)

In [None]:
# NOTE(review): this cell has no execution count (it was never run in the saved
# notebook) and works on seaborn's "brain_networks" example dataset rather than on
# the GREAT results above — looks like a pasted clustermap example; confirm whether
# it should be adapted to `local` / `online` / `greatpy` or removed.
# NOTE(review): it also rebinds the global name `df` and re-imports pandas/seaborn
# mid-notebook instead of in the import cell.
import pandas as pd
import seaborn as sns
sns.set_theme()

# Load the brain networks example dataset
df = sns.load_dataset("brain_networks", header=[0, 1, 2], index_col=0)

# Select a subset of the networks
used_networks = [1, 5, 6, 7, 8, 12, 13, 17]
used_columns = (df.columns.get_level_values("network")
                          .astype(int)
                          .isin(used_networks))
df = df.loc[:, used_columns]

# Create a categorical palette to identify the networks
network_pal = sns.husl_palette(8, s=.45)
network_lut = dict(zip(map(str, used_networks), network_pal))

# Convert the palette to vectors that will be drawn on the side of the matrix
networks = df.columns.get_level_values("network")
network_colors = pd.Series(networks, index=df.columns).map(network_lut)

# Draw the full plot: clustered correlation matrix with network colours on both axes
g = sns.clustermap(df.corr(), center=0, cmap="vlag",
                   row_colors=network_colors, col_colors=network_colors,
                   dendrogram_ratio=(.1, .2),
                   cbar_pos=(.02, .32, .03, .2),
                   linewidths=.75, figsize=(12, 13))

# Hide the row dendrogram; the column dendrogram is kept.
g.ax_row_dendrogram.remove()

In [10]:
# Load seaborn's "iris" example dataset and split the `species` labels off the
# measurement columns (`pop` removes the column from `iris` in place).
# NOTE(review): seaborn is imported here rather than in the import cell at the top.
import seaborn as sns; sns.set_theme(color_codes=True)
iris = sns.load_dataset("iris")
species = iris.pop("species")

In [11]:
# Build a species -> one-letter colour code lookup ("r", "b", "g" assigned in order
# of first appearance) and display it.
lut = dict(zip(species.unique(), "rbg"))
lut
# Disabled follow-up: colour the clustermap rows by species.
# row_colors = species.map(lut)
# g = sns.clustermap(iris, row_colors=row_colors)

{'setosa': 'r', 'versicolor': 'b', 'virginica': 'g'}

In [12]:
# id = online.pop("id")

In [13]:
# import seaborn as sns
# import random
# col = {i : "#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)]) for i in id.unique()}
# row_colors = id.map(col)
# sns.clustermap(online[["binom_p_val","hyper_p_val"]], row_colors=row_colors)

As we can see, the correlation between greatpy and the local rGREAT analysis is poor.

In [14]:
# NOTE(review): `preprocess` is not defined in any visible cell of this notebook, so
# this cell raises NameError (see the output below). Define it before this point or
# remove the cell so the notebook survives "Restart Kernel -> Run All".
preprocess

NameError: name 'preprocess' is not defined

As we can see, the shape reduction is very large, which means that the GO terms selected by the two functions are different.