Added new grouping code.

tanghaibao · Jun 26, 2018 · b3fe1cb · b3fe1cb
1 parent 3489301
commit b3fe1cb
Show file tree

Hide file tree

Showing 24 changed files with 2,887 additions and 67 deletions.
diff --git a/goatools/cli/gosubdag_plot.py b/goatools/cli/gosubdag_plot.py
@@ -74,13 +74,13 @@
 from goatools.gosubdag.go_tasks import get_go2obj_unique
 from goatools.gosubdag.go_tasks import get_leaf_children
 from goatools.gosubdag.rpt.wr_xlsx import read_d1_letter
-# COMING SOON: Plotting using GOATOOLS grouping:
-#   from goatools.gosubdag.rpt.read_goids import read_sections
-#   from goatools.grouper.grprdflts import GrouperDflts
-#   from goatools.grouper.hdrgos import HdrgosSections
-#   from goatools.grouper.grprobj import Grouper
-#   from goatools.grouper.colors import GrouperColors
-#   from goatools.grouper.grprplt import GrouperPlot
+
+from goatools.grouper.read_goids import read_sections
+from goatools.grouper.grprdflts import GrouperDflts
+from goatools.grouper.hdrgos import HdrgosSections
+from goatools.grouper.grprobj import Grouper
+from goatools.grouper.colors import GrouperColors
+from goatools.grouper.grprplt import GrouperPlot
 
 
 # pylint: disable=too-few-public-methods
@@ -94,24 +94,25 @@ class GetGOs(object):
 
     def __init__(self, go2obj):
         self.go2obj = go2obj
-        self.re_goids = re.compile(r"(GO:\d{7})+?")
-        self.re_color = re.compile(r"(#[0-9a-fA-F]{6})+?")
+        self.re_goids = re.compile(ur"(GO:\d{7})+?")
+        self.re_color = re.compile(ur"(#[0-9a-fA-F]{6})+?")
 
     def get_go_color(self, **kws):
         """Return source GO IDs ."""
-        # kws: GO go_file draw-children
         ret = {'GOs':set(), 'go2color':{}}
         if 'GO' in kws:
             self._goargs(ret, kws['GO'])
         if 'go_file' in kws:
             self._rdtxt_gos(ret, kws['go_file'])
         if 'draw-children' in kws:
-            self._add_gochildleaf(ret)
+            ret['GOs'].update(get_leaf_children(ret['GOs'], self.go2obj))
         # If there have been no GO IDs explicitly specified by the user
         if not ret['GOs']:
             # If the GO-DAG is sufficiently small, print all GO IDs
             if len(self.go2obj) < self.max_gos:
-                self._add_all_leafs(ret)
+                main_gos = set(o.id for go, o in self.go2obj.items() if go != o.id)
+                go_leafs = set(go for go, o in self.go2obj.items() if not o.children)
+                ret['GOs'] = go_leafs.difference(main_gos)
             else:
                 raise RuntimeError("GO IDs NEEDED")
         go2obj = {go:self.go2obj[go] for go in ret['GOs']}
@@ -155,23 +156,6 @@ def _rdtxt_gos(self, ret, go_file):
                             print("IGNORING: {L}".format(L=line),)
         self._update_ret(ret, goids, go2color)
 
-    def _add_gochildleaf(self, ret):
-        """Add leaf-level GO children to GO list colored uniquely."""
-        leaf_gos = get_leaf_children(ret['GOs'], self.go2obj)
-        if leaf_gos:
-            ret['GOs'].update(leaf_gos)
-            leaf_go_color = Go2Color.key2col['go_leafchild']
-            go2color = ret['go2color']
-            for goid in leaf_gos:
-                if goid not in go2color:
-                    go2color[goid] = leaf_go_color
-
-    def _add_all_leafs(self, ret):
-        """Print all GO IDs."""
-        main_gos = set(o.id for go, o in self.go2obj.items() if go != o.id)
-        go_leafs = set(go for go, o in self.go2obj.items() if not o.children)
-        ret['GOs'] = go_leafs.difference(main_gos)
-
     @staticmethod
     def _update_ret(ret, goids, go2color):
         """Update 'GOs' and 'go2color' in dict with goids and go2color."""
@@ -209,6 +193,7 @@ def cli(self):
         # GO kws_all: GO go_file draw-children
         goids, go2color = GetGOs(go2obj).get_go_color(**kws_all)
         relationships = 'relationship' in optional_attrs
+        #### self.gosubdag = GoSubDag(goids, go2obj, relationships, tcntobj=tcntobj)
         kws_dag = self._get_kwsdag(goids, go2obj, **kws_all)
         self.gosubdag = GoSubDag(goids, go2obj, relationships, **kws_dag)
 
@@ -217,41 +202,39 @@ def cli(self):
         else:
             return self._plt_gosubdag(goids, go2color, **kws_all)
 
-    # pylint: disable=unused-argument,no-self-use
     def _plt_gogrouped(self, goids, go2color_usr, **kws):
         """Plot grouped GO IDs."""
-        print("Plotting with GOATOOLS grouping coming soon...")
-   #      fout_img = self.get_outfile(kws['outfile'], goids, 'relationship' in kws)
-   #      sections = read_sections(kws['sections'], exclude_ungrouped=True)
-   #      # kws_plt = {k:v for k, v in kws.items if k in self.kws_plt}
-   #      grprobj_cur = self._get_grprobj(goids, sections)
-   #      # GO: purple=hdr-only, green=hdr&usr, yellow=usr-only
-   #      # BORDER: Black=hdr Blu=hdr&usr
-   #      grpcolor = GrouperColors(grprobj_cur)  # get_bordercolor get_go2color_users
-   #      grp_go2color = grpcolor.get_go2color_users()
-   #      grp_go2bordercolor = grpcolor.get_bordercolor()
-   #      for goid, color in go2color_usr.items():
-   #          grp_go2color[goid] = color
-   #      objcolor = Go2Color(self.gosubdag, objgoea=None,
-   #                          go2color=grp_go2color, go2bordercolor=grp_go2bordercolor)
-   #      go2txt = GrouperPlot.get_go2txt(grprobj_cur, grp_go2color, grp_go2bordercolor)
-   #      objplt = GoSubDagPlot(self.gosubdag, Go2Color=objcolor, go2txt=go2txt, **kws)
-   #      objplt.prt_goids(sys.stdout)
-   #      objplt.plt_dag(fout_img)
-   #      sys.stdout.write("{N:>6} sections read\n".format(
-   #          N="NO" if sections is None else len(sections)))
-   #      return fout_img
-
-   #  def _get_grprobj(self, goids, sections):
-   #      """Get Grouper, given GO IDs and sections."""
-   #      grprdflt = GrouperDflts(self.gosubdag, "goslim_generic.obo")
-   #      hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections)
-   #      return Grouper("sections", goids, hdrobj, self.gosubdag)
+        fout_img = self.get_outfile(kws['outfile'], goids)
+        sections = read_sections(kws['sections'], exclude_ungrouped=True)
+        print ("KWWSSSSSSSS", kws)
+        # kws_plt = {k:v for k, v in kws.items if k in self.kws_plt}
+        grprobj_cur = self._get_grprobj(goids, sections)
+        # GO: purple=hdr-only, green=hdr&usr, yellow=usr-only
+        # BORDER: Black=hdr Blu=hdr&usr
+        grpcolor = GrouperColors(grprobj_cur)  # get_bordercolor get_go2color_users
+        grp_go2color = grpcolor.get_go2color_users()
+        grp_go2bordercolor = grpcolor.get_bordercolor()
+        for goid, color in go2color_usr.items():
+            grp_go2color[goid] = color
+        objcolor = Go2Color(self.gosubdag, objgoea=None,
+                            go2color=grp_go2color, go2bordercolor=grp_go2bordercolor)
+        go2txt = GrouperPlot.get_go2txt(grprobj_cur, grp_go2color, grp_go2bordercolor)
+        objplt = GoSubDagPlot(self.gosubdag, Go2Color=objcolor, go2txt=go2txt, **kws)
+        objplt.prt_goids(sys.stdout)
+        objplt.plt_dag(fout_img)
+        sys.stdout.write("{N:>6} sections read\n".format(
+            N="NO" if sections is None else len(sections)))
+        return fout_img
+
+    def _get_grprobj(self, goids, sections):
+        """Get Grouper, given GO IDs and sections."""
+        grprdflt = GrouperDflts(self.gosubdag, "goslim_generic.obo")
+        hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections)
+        return Grouper("sections", goids, hdrobj, self.gosubdag)
 
     def _plt_gosubdag(self, goids, go2color, **kws):
         """Plot GO IDs."""
-        print("PLOTTING KWS", kws)
-        fout_img = self.get_outfile(kws['outfile'], goids, 'relationship' in kws)
+        fout_img = self.get_outfile(kws['outfile'], goids)
         objcolor = Go2Color(self.gosubdag, objgoea=None, go2color=go2color)
         objplt = GoSubDagPlot(self.gosubdag, Go2Color=objcolor, **kws)
         objplt.prt_goids(sys.stdout)
@@ -261,6 +244,10 @@ def _plt_gosubdag(self, goids, go2color, **kws):
     def _get_kwsdag(self, goids, go2obj, **kws_all):
         """Get keyword args for a GoSubDag."""
         kws_dag = {}
+        # Term Counts for GO Term information score
+        tcntobj = self._get_tcntobj(goids, go2obj, **kws_all)  # TermCounts or None
+        if tcntobj is not None:
+            kws_dag['tcntobj'] = tcntobj
         # GO letters specified by the user
         if 'go_aliases' in kws_all:
             fin_go_aliases = kws_all['go_aliases']
@@ -275,6 +262,9 @@ def _get_tcntobj(goids, go2obj, **kws):
         """Get a TermCounts object if the user provides an annotation file, otherwise None."""
         # kws: gaf (gene2go taxid)
         if 'gaf' in kws or 'gene2go' in kws:
+            # Get a reduced go2obj set for TermCounts
+            _gosubdag = GoSubDag(goids, go2obj, rcntobj=False)
+            #return get_tcntobj(_gosubdag.go2obj, **kws)  # TermCounts
             return get_tcntobj(go2obj, **kws)  # TermCounts
 
     def get_docargs(self, args=None, prt=None):
@@ -311,23 +301,19 @@ def _err(self, msg, err=True):
         sys.stdout.write(txt)
         sys.exit(0)
 
-    def get_outfile(self, outfile, goids=None, b_rel=False):
+    def get_outfile(self, outfile, goids=None):
         """Return output file for GO Term plot."""
         # 1. Use the user-specfied output filename for the GO Term plot
         if outfile != self.dflt_outfile:
             return outfile
-        rstr = "_r1" if b_rel else ""
         # 2. If only plotting 1 GO term, use GO is in plot name
         if goids is not None and len(goids) == 1:
             goid = next(iter(goids))
             goobj = self.gosubdag.go2obj[goid]
             fout = "GO_{NN}_{NM}".format(NN=goid.replace("GO:", ""), NM=goobj.name)
-            return "".join([re.sub(r"[\s#'()+,-./:<=>\[\]_}]", '_', fout), rstr, '.png'])
+            return ".".join([re.sub(r"[\s#'()+,-./:<=>\[\]_}]", '_', fout), 'png'])
         # 3. Return default name
-        if not b_rel:
-            return self.dflt_outfile
-        else:
-            return self.dflt_outfile.replace('.png', '_r1.png')
+        return self.dflt_outfile
 
     @staticmethod
     def _get_optional_attrs(kws):

diff --git a/goatools/cli/wr_sections.py b/goatools/cli/wr_sections.py
@@ -0,0 +1,112 @@
+"""Command-line interface to create an initial Python sections file
+
+Usage:
+  wr_sections.py [GO_FILE]
+  wr_sections.py [GO_FILE] [options]
+
+Options:
+  -h --help                                 show this help message and exit
+
+  -i <file.txt>, --ifile=<sections_in.txt>  Read or Write file name [default: sections_in.txt]
+  -o <file.txt>, --ofile=<sections.txt>     write file name [default: sections.txt]
+  --txt=<file.txt>                          Write file name [default: grouped_gos.txt]
+
+  --py=<file.py>       Write the sections list into a Python file
+  --xlsx=<file.xlsx>   Group user GO IDs and write the results into an xlsx file
+
+  --obo=<file.obo>     Ontologies in obo file [default: go-basic.obo].
+  --slims=<file.obo>   GO slims in obo file [default: goslim_generic.obo].
+
+  --gaf=<file.gaf>     Annotations from a gaf file
+  --gene2go=<gene2go>  Annotations from a gene2go file downloaded from NCBI
+
+"""
+
+from __future__ import print_function
+
+__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang. All rights reserved."
+__author__ = "DV Klopfenstein"
+
+
+import os
+import sys
+
+from goatools.base import get_godag
+from goatools.associations import get_tcntobj
+
+from goatools.cli.docopt_parse import DocOptParse
+from goatools.cli.gos_get import GetGOs
+
+from goatools.gosubdag.gosubdag import GoSubDag
+
+from goatools.grouper.read_goids import read_sections
+from goatools.grouper.grprdflts import GrouperDflts
+from goatools.grouper.hdrgos import HdrgosSections
+from goatools.grouper.grprobj import Grouper
+from goatools.grouper.wr_sections import WrSections
+from goatools.grouper.wr_sections import WrPySections
+from goatools.grouper.sorter import Sorter
+from goatools.grouper.wrxlsx import WrXlsxSortedGos
+
+
+# pylint: disable=too-few-public-methods
+class SectionsWr(object):
+    """Class for command-line interface for creating GO term diagrams"""
+
+    kws_dict = set(['GO_FILE', 'obo', 'slims',
+                    'ifile', 'ofile', 'txt',
+                    'py', 'xlsx',
+                    'gaf', 'gene2go', 'taxid'])
+    kws_set = set()
+
+    def __init__(self, gosubdag=None):
+        self.objdoc = DocOptParse(__doc__, self.kws_dict, self.kws_set)
+        self.gosubdag = None if gosubdag is None else gosubdag
+
+    def cli(self, prt=sys.stdout):
+        """Command-line interface for go_draw script."""
+        kws = self.objdoc.get_docargs(prt=None)
+        godag = get_godag(kws['obo'], prt=None, loading_bar=False, optional_attrs=['relationship'])
+        usrgos = GetGOs(godag, max_gos=200).get_usrgos(kws.get('GO_FILE'), prt)
+        tcntobj = self._get_tcntobj(usrgos, godag, **kws)  # Gets TermCounts or None
+        self.gosubdag = GoSubDag(usrgos, godag, relationships=True, tcntobj=tcntobj, prt=None)
+        grprdflt = GrouperDflts(self.gosubdag, kws['slims'])
+        ver_list = [godag.version, grprdflt.ver_goslims]
+        prt.write("{VER}\n".format(VER="\n".join(ver_list)))
+        sections = read_sections(kws['ifile'], exclude_ungrouped=True, prt=None)
+        # print("SECSECSEC", sections)
+        hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections)
+        grprobj = Grouper("init", usrgos, hdrobj, self.gosubdag)
+        # Write sections
+        objsecwr = WrSections(grprobj, ver_list)
+        if not os.path.exists(kws['ifile']):
+            objsecwr.wr_txt_section_hdrgos(kws['ifile'])
+        objsecwr.wr_txt_section_hdrgos(kws['ofile'])
+        objsecpy = WrPySections(grprobj, ver_list)
+        if 'py' in kws:
+            objsecpy.wr_py_sections(kws['py'], sections, doc=godag.version)
+        # Write user GO IDs in sections
+        sortobj = Sorter(grprobj)
+        objgowr = WrXlsxSortedGos("init", sortobj, ver_list)
+        objgowr.wr_txt_gos(kws['txt'], sortby=objsecpy.fncsortnt)
+        #objwr.wr_txt_section_hdrgos(kws['ofile'], sortby=objwr.fncsortnt)
+        self._prt_cnt_usrgos(usrgos, sys.stdout)
+
+    def _prt_cnt_usrgos(self, usrgos_read, prt):
+        num_usrgos = len(self.gosubdag.go_sources)
+        prt.write("{GOs:6} user GO IDs".format(GOs=num_usrgos))
+        if len(usrgos_read) != num_usrgos:
+            prt.write(" of {M} GO IDs read".format(M=len(usrgos_read)))
+        prt.write("\n")
+
+    @staticmethod
+    def _get_tcntobj(goids, go2obj, **kws):
+        """Get a TermCounts object if the user provides an annotation file, otherwise None."""
+        # kws: gaf (gene2go taxid)
+        if 'gaf' in kws or 'gene2go' in kws:
+            # Get a reduced go2obj set for TermCounts
+            _gosubdag = GoSubDag(goids, go2obj, rcntobj=False, prt=None)
+            return get_tcntobj(_gosubdag.go2obj, **kws)  # TermCounts
+
+
+# Copyright (C) 2016-2018, DV Klopfenstein, H Tang. All rights reserved.
diff --git a/goatools/gosubdag/go_most_specific.py b/goatools/gosubdag/go_most_specific.py
@@ -0,0 +1,38 @@
+"""Various methods to estimating if a GO term is more specific than another GO term."""
+
+__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved."
+__author__ = "DV Klopfenstein"
+
+
+def get_most_specific_dcnt(goids, go2nt):
+    """Get the GO ID with the lowest descendants count."""
+    # go2nt_usr = {go:go2nt[go] for go in goids}
+    # return min(go2nt_usr.items(), key=lambda t: t[1].dcnt)[0]
+    return min(_get_go2nt(goids, go2nt), key=lambda t: t[1].dcnt)[0]
+
+def get_most_specific_tinfo(goids, go2nt):
+    """Get the GO ID with the highest GO term annotation information value."""
+    # go2nt_usr = {go:go2nt[go] for go in goids}
+    # return max(go2nt_usr.items(), key=lambda t: t[1].tinfo)[0]
+    return max(_get_go2nt(goids, go2nt), key=lambda t: t[1].tinfo)[0]
+
+def get_most_specific_tinfo_dcnt(goids, go2nt):
+    """Get the GO ID with the highest GO term annotation information value."""
+    # go2nt_usr = {go:go2nt[go] for go in goids}
+    # return max(go2nt_usr.items(), key=lambda t: [t[1].tinfo, t[1].dcnt])[0]
+    return max(_get_go2nt(goids, go2nt), key=lambda t: [t[1].tinfo, t[1].dcnt])[0]
+
+def _get_go2nt(goids, go2nt_all):
+    """Get user go2nt using main GO IDs, not alt IDs."""
+    go_nt_list = []
+    goids_seen = set()
+    for goid_usr in goids:
+        ntgo = go2nt_all[goid_usr]
+        goid_main = ntgo.id
+        if goid_main not in goids_seen:
+            goids_seen.add(goid_main)
+            go_nt_list.append((goid_main, ntgo))
+    return go_nt_list
+
+
+# Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved.
diff --git a/goatools/grouper/__init__.py b/goatools/grouper/__init__.py
@@ -1 +1 @@
-# Group GO Terms under researcher-defined sections
+# Directory for grouper functions and classes