Skip to content

Commit

Permalink
GSVA in now shipped, #226
Browse files Browse the repository at this point in the history
  • Loading branch information
zqfang committed Oct 20, 2023
1 parent 0f44c2a commit cbd959c
Show file tree
Hide file tree
Showing 7 changed files with 447 additions and 206 deletions.
67 changes: 66 additions & 1 deletion gseapy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
import warnings
from typing import AnyStr, Dict, Iterable, List, Optional, Tuple, Union

Expand All @@ -8,6 +7,7 @@
from .biomart import Biomart
from .enrichr import Enrichr
from .gsea import GSEA, Prerank, Replot, SingleSampleGSEA
from .gsva import GSVA
from .msigdb import Msigdb
from .parser import get_library, get_library_name, read_gmt
from .plot import barplot, dotplot, enrichment_map, gseaplot, gseaplot2, heatmap
Expand Down Expand Up @@ -629,6 +629,69 @@ def enrich(
return enr


def gsva(
data: Union[pd.DataFrame, pd.Series, str],
gene_sets: Union[List[str], str, Dict[str, str]],
outdir: Optional[str] = None,
kcdf: Optional[str] = "Gaussian",
weighted_score_type: float = 0.25,
mx_diff: bool = True,
abs_rnk: bool = False,
min_size: int = 15,
max_size: int = 1000,
threads: int = 1,
seed: int = 123,
verbose: bool = False,
**kwargs,
):
"""Run Gene Set Enrichment Analysis with single sample GSEA tool
:param data: Expression table, pd.Series, pd.DataFrame, GCT file
:param gene_sets: Enrichr Library name or .gmt gene sets file or dict of gene sets. Same input with GSEA.
:param outdir: Results output directory. If None, nothing will write to disk.
:param str kcdf: "Gaussian", "Possion" or None.
:param str weighted_score_type: tau of gsva. Default: 1.
:param bool mx_diff: Offers two approaches to calculate the enrichment statistic (ES) from the KS random walk statistic. see GSVA doc
:param bool abs_rnk: Flag used only when mx_diff=True. see GSVA doc: https://rdrr.io/bioc/GSVA/man/gsva.html
:param int min_size: Minimum allowed number of genes from gene set also the data set. Default: 15.
:param int max_size: Maximum allowed number of genes from gene set also the data set. Default: 1000.
:param int threads: Number of threads you are going to use. Default: 4.
:param seed: Random seed. expect an integer. Default:None.
:param bool verbose: Bool, increase output verbosity, print out progress of your job, Default: False.
:return: Return a GSVA obj.
All results can be accessed by obj.results or obj.res2d.
"""
gv = GSVA(
data=data,
gene_sets=gene_sets,
outdir=outdir,
kcdf=kcdf,
weight=weighted_score_type,
mx_diff=mx_diff,
abs_rnk=abs_rnk,
min_size=min_size,
max_size=max_size,
threads=threads,
seed=seed,
verbose=verbose,
)
gv.run()
return gv


__all__ = [
"dotplot",
"barplot",
Expand All @@ -639,12 +702,14 @@ def enrich(
"replot",
"prerank",
"gsea",
"gsva",
"ssgsea",
"enrichr",
"enrich",
"Replot",
"Prerank",
"GSEA",
"GSVA",
"SingleSampleGSEA",
"Enrichr",
"Biomart",
Expand Down
178 changes: 165 additions & 13 deletions gseapy/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# or args = argparser.parse_args() will throw bugs!!!


__version__ = "1.0.6"
__version__ = "1.1.0"


def main():
Expand Down Expand Up @@ -108,6 +108,24 @@ def main():
)
ss.run()

elif subcommand == "gsva":
from .gsva import GSVA

gv = GSVA(
data=args.data,
gene_sets=args.gmt,
outdir=args.outdir,
kcdf=args.kcdf,
weight=args.weight,
mx_diff=args.mx_diff,
abs_rnk=args.abs_rnk,
min_size=args.mins,
max_size=args.maxs,
threads=args.threads,
seed=args.seed,
verbose=args.verbose,
)
gv.run()
elif subcommand == "enrichr":
# calling enrichr API
from .enrichr import Enrichr
Expand Down Expand Up @@ -170,6 +188,8 @@ def prepare_argparser():
add_prerank_parser(subparsers)
# command for 'ssgsea'
add_singlesample_parser(subparsers)
# command for 'gsva'
add_gsva_parser(subparsers)
# command for 'plot'
add_plot_parser(subparsers)
# command for 'enrichr'
Expand Down Expand Up @@ -246,21 +266,39 @@ def add_output_option(parser):
def add_output_group(parser, required=True):
"""output group"""

output_group = parser.add_mutually_exclusive_group(required=required)
output_group.add_argument(
# output_group = parser.add_mutually_exclusive_group(required=required)
# output_group.add_argument(
# "-o",
# "--ofile",
# dest="ofile",
# type=str,
# default="GSEApy_reports",
# help="Output file name. Mutually exclusive with --o-prefix.",
# )
# output_group.add_argument(
# "--o-prefix",
# dest="ofile",
# type=str,
# default="GSEApy_reports",
# help="Output file prefix. Mutually exclusive with -o/--ofile.",
# )
parser.add_argument(
"-o",
"--ofile",
dest="ofile",
"--outdir",
dest="outdir",
type=str,
default="GSEApy_reports",
help="Output file name. Mutually exclusive with --o-prefix.",
metavar="",
action="store",
help="The GSEApy output directory. Default: the current working directory",
)
output_group.add_argument(
"--o-prefix",
dest="ofile",
type=str,
default="GSEApy_reports",
help="Output file prefix. Mutually exclusive with -o/--ofile.",
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Increase output verbosity, print out progress of your job",
)


Expand Down Expand Up @@ -626,7 +664,7 @@ def add_singlesample_parser(subparsers):
dest="weight",
default=0.25,
type=float,
metavar="weight",
metavar="float",
help="Weighted_score of rank_metrics. For weighting input genes. Default: 0.25",
)
group_opt.add_argument(
Expand Down Expand Up @@ -661,6 +699,120 @@ def add_singlesample_parser(subparsers):
return


def add_gsva_parser(subparsers):
"""Add function 'GSVA' argument parsers."""

argparser_gsva = subparsers.add_parser("gsva", help="Run GSVA.")

# group for input files
group_input = argparser_gsva.add_argument_group("Input files arguments")
group_input.add_argument(
"-d",
"--data",
dest="data",
action="store",
type=str,
required=True,
help="Input gene expression dataset file in txt format. Same with GSEA.",
)
group_input.add_argument(
"-g",
"--gmt",
dest="gmt",
action="store",
type=str,
required=True,
help="Gene set database in GMT format. Same with GSEA.",
)
# group for output files
group_output = argparser_gsva.add_argument_group("Output arguments")
# add_output_option(group_output)
add_output_group(group_output)
# group for General options.
group_opt = argparser_gsva.add_argument_group("GSVA advanced arguments")
group_opt.add_argument(
"-m",
"--mx-diff",
dest="mx_diff",
action="store_false",
default=True,
help="When set, ES is calculated as the maximum distance of the random walk from 0. Default: False"
"Default: False",
)

group_opt.add_argument(
"-k",
"--kernel-cdf",
dest="kcdf",
action="store",
type=str,
default="Gaussian",
metavar="",
choices=("Gaussian", "Poisson", "None"),
help="Gaussian is suitable when input expression values are continuous. " +\
"If input integer counts, then this argument should be set to 'Poisson'",
)

group_opt.add_argument(
"-a",
"--abs-ranking",
action="store_true",
dest="abs_rnk",
default=False,
help="Flag used only when --mx-diff is not set. When set, the original Kuiper statistic is used",
)
group_opt.add_argument(
"--min-size",
dest="mins",
action="store",
type=int,
default=15,
metavar="int",
help="Min size of input genes presented in Gene Sets. Default: 15",
)
group_opt.add_argument(
"--max-size",
dest="maxs",
action="store",
type=int,
default=2000,
metavar="int",
help="Max size of input genes presented in Gene Sets. Default: 2000",
)
group_opt.add_argument(
"-w",
"--weight",
action="store",
dest="weight",
default=1,
type=float,
metavar="float",
help="tau in the random walk performed by the gsva. Default: 1",
)
group_opt.add_argument(
"-s",
"--seed",
dest="seed",
action="store",
type=int,
default=123,
metavar="",
help="Number of random seed. Default: 123",
)
group_opt.add_argument(
"-p",
"--threads",
dest="threads",
action="store",
type=int,
default=4,
metavar="int",
help="Number of Processes you are going to use. Default: 4",
)

return


def add_plot_parser(subparsers):
"""Add function 'plot' argument parsers."""

Expand Down
6 changes: 4 additions & 2 deletions gseapy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,13 +601,15 @@ def to_df(
"Gene %",
"Lead_genes",
]
res_df.drop(dc, axis=1, inplace=True)
if self.module == "gsva":
dc += ["NES"]
# re-order by NES
# for pandas > 1.1, use df.sort_values(by='B', key=abs) will sort by abs value
self.res2d = res_df.reindex(
res_df["NES"].abs().sort_values(ascending=False).index
).reset_index(drop=True)

res_df.drop(dc, axis=1, inplace=True)

if self._outdir is not None:
out = os.path.join(
self.outdir,
Expand Down

0 comments on commit cbd959c

Please sign in to comment.