# rGreat online vs local 

This notebook compares the GO enrichment results of the local rGREAT analysis with those of the online rGREAT (GREAT web service) analysis.

r-base == 3.6.1

python version == 3.8 

In [1]:
# Enable the autoreload extension so that edits to imported modules are picked
# up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import rpy2
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects import r as r
pandas2ri.activate()
import pandas as pd

import os 
import matplotlib.pyplot as plt
import greatpy as great 
from scipy.stats import pearsonr
import re
import time 

In [3]:
# Display the installed rpy2 version so the environment is recorded with the notebook.
rpy2.__version__

'3.5.2'

In [4]:
# Load the R packages used by the comparison through rpy2.
# The rGREAT handle is not kept: its functions are looked up by name through
# `rpy2.robjects.r[...]` further down.
importr('rGREAT')
# GenomicRanges is kept as `ranges` because `makeGRangesFromDataFrame` is
# called on this handle when the input regions are prepared.
ranges = importr('GenomicRanges')

In [12]:
def _detect_assembly(file_name):
    """Return ``"hg19"`` when the file name mentions it, ``"hg38"`` otherwise."""
    return "hg19" if "hg19" in file_name else "hg38"


def _online_tables_to_frame(tables):
    """Stack the three online GO tables into one DataFrame with lower-cased term names."""
    ontologies = ("GO Molecular Function", "GO Biological Process", "GO Cellular Component")

    def stacked(field):
        # Same ordering as concatenating the three per-ontology columns by hand.
        return [value for ontology in ontologies for value in tables.rx2(ontology).rx2(field)]

    return pd.DataFrame({
        "id": stacked("ID"),
        "name": [term.lower() for term in stacked("name")],
        "binom_p_val": stacked("Binom_Raw_PValue"),
        "hyper_p_val": stacked("Hyper_Raw_PValue"),
    })


def _local_table_to_frame(table):
    """Convert the local enrichment table into a DataFrame keyed by a comparable term name.

    Each id is lower-cased, its first ``_``-separated token is dropped and the
    remaining tokens are joined with spaces so that it can be matched against
    the lower-cased names of the online tables.
    """
    return pd.DataFrame({
        "name": [" ".join(term.lower().split("_")[1:]) for term in table.rx2("id")],
        "binom_p_val": list(table.rx2("p_value")),
        "hyper_p_val": list(table.rx2("p_value_hyper")),
    })


def _pearson_or_nan(x, y):
    """Return Pearson's r for two equally long sequences, or NaN with fewer than two points."""
    if len(x) < 2:
        return float("nan")
    return pearsonr(x, y)[0]


def local_vs_online(input_dir="../../data/tests/test_data/input", online_pause=30, local_pause=90, plot=False):
    """Compare online and local rGREAT enrichment p-values for every file of ``input_dir``.

    For each input file the regions are submitted to the online GREAT service
    and analysed locally with ``great`` (gene sets ``msigdb:C5``). Both results
    are matched on the lower-cased GO term name and the Pearson correlation of
    the binomial and hypergeometric p-values is computed on the shared terms.

    Parameters
    ----------
    input_dir : str
        Directory holding the tab-separated region files. The genome assembly
        is ``hg19`` when the file name contains ``hg19`` and ``hg38`` otherwise.
    online_pause : float
        Seconds to wait after each online request.
    local_pause : float
        Seconds to wait between two files (not applied after the last one).
    plot : bool
        When true, draw the online-vs-local scatter plots for each file.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``stat``: one row per file with ``name``, ``pearson_binom`` and
        ``pearson_hypergeom`` (NaN when fewer than two terms are shared).
        ``preprocess``: one row per file with the table sizes before matching,
        the number of shared terms and the percentage of online GO terms that
        have no local counterpart.
    """
    stat_df = {
        "name": [],
        "pearson_binom": [],
        "pearson_hypergeom": [],
    }
    pp = {
        "name": [],
        "before_pp_online_size": [],
        "before_pp_local_size": [],
        "final_size": [],
        "%_of_GO_from_online_lost": [],
    }

    # Sorted for a reproducible row order; sub-directories are ignored.
    file_names = sorted(
        entry for entry in os.listdir(input_dir)
        if os.path.isfile(os.path.join(input_dir, entry))
    )

    for position, file_name in enumerate(file_names):
        path = f"{input_dir}/{file_name}"
        assembly = _detect_assembly(file_name)

        # Online analysis.
        online_job = rpy2.robjects.r['submitGreatJob'](path, species=assembly, help=False)
        online_tables = rpy2.robjects.r['getEnrichmentTables'](online_job)

        time.sleep(online_pause)

        # Local analysis: the regions must first be turned into a GRanges object.
        regions = r["read.csv"](path, sep='\t')
        seqname_fields = rpy2.robjects.StrVector(
            ["seqnames", "seqname", "chromosome", "X.Chr", "chr", "chromosome_name", "seqid"]
        )
        granges = ranges.makeGRangesFromDataFrame(regions, seqnames_field=seqname_fields)
        local_job = rpy2.robjects.r['great'](granges, "msigdb:C5", f"txdb:{assembly}", verbose=False)
        local_table = rpy2.robjects.r['getEnrichmentTables'](local_job)

        online = _online_tables_to_frame(online_tables)
        local = _local_table_to_frame(local_table)

        # One row per term name present in both tables. Duplicated names keep
        # their first occurrence so the join cannot multiply rows.
        shared = online.drop_duplicates(subset="name").merge(
            local.drop_duplicates(subset="name"),
            on="name",
            suffixes=("_online", "_local"),
        )

        n_online = online.shape[0]
        n_online_matched = int(online["name"].isin(local["name"]).sum())
        pp["name"].append(file_name)
        pp["before_pp_local_size"].append(local.shape[0])
        pp["before_pp_online_size"].append(n_online)
        pp["final_size"].append(shared.shape[0])
        # The loss is measured against the unfiltered online table; NaN when it is empty.
        pp["%_of_GO_from_online_lost"].append(
            round(100 * (n_online - n_online_matched) / n_online, 2) if n_online else float("nan")
        )

        binom_online = shared["binom_p_val_online"].astype(float)
        binom_local = shared["binom_p_val_local"].astype(float)
        hyper_online = shared["hyper_p_val_online"].astype(float)
        hyper_local = shared["hyper_p_val_local"].astype(float)

        if plot:
            # The title uses the file name. An earlier version interpolated the
            # whole list of GO term names here, which is the likely cause of the
            # "Image size ... is too large" matplotlib error.
            binom = pd.DataFrame({"binom_local": binom_local, "binom_online": binom_online})
            hyper = pd.DataFrame({"hyper_local": hyper_local, "hyper_online": hyper_online})
            great.pl.scatterplot(binom, colname_x="binom_online", colname_y="binom_local", title=f"binom comparison {file_name}")
            plt.show()
            great.pl.scatterplot(hyper, colname_x="hyper_online", colname_y="hyper_local", title=f"hyper comparison {file_name}")
            plt.show()

        stat_df["name"].append(file_name)
        stat_df["pearson_binom"].append(_pearson_or_nan(binom_local, binom_online))
        stat_df["pearson_hypergeom"].append(_pearson_or_nan(hyper_local, hyper_online))

        # Let the machine cool down between two files; pointless after the last one.
        if position < len(file_names) - 1:
            time.sleep(local_pause)

    return pd.DataFrame(stat_df), pd.DataFrame(pp)

In [11]:
# Run the comparison over every test input file and unpack the two summary
# tables: correlation statistics and preprocessing sizes.
# NOTE: slow; each file involves an online GREAT request plus fixed pauses.
stat,preprocess = local_vs_online()

R[write to console]: The default enrichment tables contain no associated genes for the input
regions.You can set `download_by = 'tsv'` to download the complete
table,but note only the top 500 regions can be retreived. See the
following link:

https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport

R[write to console]: * extended_tss is already cached, directly use it.

  result = getattr(ufunc, method)(*inputs, **kwargs)


ValueError: Image size of 896295x279 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 432x288 with 1 Axes>

  result = getattr(ufunc, method)(*inputs, **kwargs)


ValueError: Image size of 896292x279 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 432x288 with 1 Axes>

R[write to console]: The default enrichment tables contain no associated genes for the input
regions.You can set `download_by = 'tsv'` to download the complete
table,but note only the top 500 regions can be retreived. See the
following link:

https://great-help.atlassian.net/wiki/spaces/GREAT/pages/655401/Export#Export-GlobalExport

R[write to console]: * extended_tss is already cached, directly use it.



ValueError: Image size of 377201x279 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 432x288 with 1 Axes>

ValueError: Image size of 377198x279 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 432x288 with 1 Axes>

KeyboardInterrupt: 

In [None]:
# Pearson correlations of the binomial and hypergeometric p-values.
stat 

As we can see, the results of the local implementation are not well correlated with those of the online implementation.

In [None]:
# Table sizes before and after matching, and the share of online GO terms lost.
preprocess

As we can see, the online implementation returns some GO terms which are not present in the local implementation.