Skip to content

Commit

Permalink
Added new grouping code.
Browse files Browse the repository at this point in the history
  • Loading branch information
dvklopfenstein committed Jun 26, 2018
1 parent 3489301 commit b3fe1cb
Show file tree
Hide file tree
Showing 24 changed files with 2,887 additions and 67 deletions.
118 changes: 52 additions & 66 deletions goatools/cli/gosubdag_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,13 @@
from goatools.gosubdag.go_tasks import get_go2obj_unique
from goatools.gosubdag.go_tasks import get_leaf_children
from goatools.gosubdag.rpt.wr_xlsx import read_d1_letter
# COMING SOON: Plotting using GOATOOLS grouping:
# from goatools.gosubdag.rpt.read_goids import read_sections
# from goatools.grouper.grprdflts import GrouperDflts
# from goatools.grouper.hdrgos import HdrgosSections
# from goatools.grouper.grprobj import Grouper
# from goatools.grouper.colors import GrouperColors
# from goatools.grouper.grprplt import GrouperPlot

from goatools.grouper.read_goids import read_sections
from goatools.grouper.grprdflts import GrouperDflts
from goatools.grouper.hdrgos import HdrgosSections
from goatools.grouper.grprobj import Grouper
from goatools.grouper.colors import GrouperColors
from goatools.grouper.grprplt import GrouperPlot


# pylint: disable=too-few-public-methods
Expand All @@ -94,24 +94,25 @@ class GetGOs(object):

def __init__(self, go2obj):
self.go2obj = go2obj
self.re_goids = re.compile(r"(GO:\d{7})+?")
self.re_color = re.compile(r"(#[0-9a-fA-F]{6})+?")
self.re_goids = re.compile(ur"(GO:\d{7})+?")
self.re_color = re.compile(ur"(#[0-9a-fA-F]{6})+?")

def get_go_color(self, **kws):
"""Return source GO IDs ."""
# kws: GO go_file draw-children
ret = {'GOs':set(), 'go2color':{}}
if 'GO' in kws:
self._goargs(ret, kws['GO'])
if 'go_file' in kws:
self._rdtxt_gos(ret, kws['go_file'])
if 'draw-children' in kws:
self._add_gochildleaf(ret)
ret['GOs'].update(get_leaf_children(ret['GOs'], self.go2obj))
# If there have been no GO IDs explicitly specified by the user
if not ret['GOs']:
# If the GO-DAG is sufficiently small, print all GO IDs
if len(self.go2obj) < self.max_gos:
self._add_all_leafs(ret)
main_gos = set(o.id for go, o in self.go2obj.items() if go != o.id)
go_leafs = set(go for go, o in self.go2obj.items() if not o.children)
ret['GOs'] = go_leafs.difference(main_gos)
else:
raise RuntimeError("GO IDs NEEDED")
go2obj = {go:self.go2obj[go] for go in ret['GOs']}
Expand Down Expand Up @@ -155,23 +156,6 @@ def _rdtxt_gos(self, ret, go_file):
print("IGNORING: {L}".format(L=line),)
self._update_ret(ret, goids, go2color)

def _add_gochildleaf(self, ret):
"""Add leaf-level GO children to GO list colored uniquely."""
leaf_gos = get_leaf_children(ret['GOs'], self.go2obj)
if leaf_gos:
ret['GOs'].update(leaf_gos)
leaf_go_color = Go2Color.key2col['go_leafchild']
go2color = ret['go2color']
for goid in leaf_gos:
if goid not in go2color:
go2color[goid] = leaf_go_color

def _add_all_leafs(self, ret):
"""Print all GO IDs."""
main_gos = set(o.id for go, o in self.go2obj.items() if go != o.id)
go_leafs = set(go for go, o in self.go2obj.items() if not o.children)
ret['GOs'] = go_leafs.difference(main_gos)

@staticmethod
def _update_ret(ret, goids, go2color):
"""Update 'GOs' and 'go2color' in dict with goids and go2color."""
Expand Down Expand Up @@ -209,6 +193,7 @@ def cli(self):
# GO kws_all: GO go_file draw-children
goids, go2color = GetGOs(go2obj).get_go_color(**kws_all)
relationships = 'relationship' in optional_attrs
#### self.gosubdag = GoSubDag(goids, go2obj, relationships, tcntobj=tcntobj)
kws_dag = self._get_kwsdag(goids, go2obj, **kws_all)
self.gosubdag = GoSubDag(goids, go2obj, relationships, **kws_dag)

Expand All @@ -217,41 +202,39 @@ def cli(self):
else:
return self._plt_gosubdag(goids, go2color, **kws_all)

# pylint: disable=unused-argument,no-self-use
def _plt_gogrouped(self, goids, go2color_usr, **kws):
"""Plot grouped GO IDs."""
print("Plotting with GOATOOLS grouping coming soon...")
# fout_img = self.get_outfile(kws['outfile'], goids, 'relationship' in kws)
# sections = read_sections(kws['sections'], exclude_ungrouped=True)
# # kws_plt = {k:v for k, v in kws.items if k in self.kws_plt}
# grprobj_cur = self._get_grprobj(goids, sections)
# # GO: purple=hdr-only, green=hdr&usr, yellow=usr-only
# # BORDER: Black=hdr Blu=hdr&usr
# grpcolor = GrouperColors(grprobj_cur) # get_bordercolor get_go2color_users
# grp_go2color = grpcolor.get_go2color_users()
# grp_go2bordercolor = grpcolor.get_bordercolor()
# for goid, color in go2color_usr.items():
# grp_go2color[goid] = color
# objcolor = Go2Color(self.gosubdag, objgoea=None,
# go2color=grp_go2color, go2bordercolor=grp_go2bordercolor)
# go2txt = GrouperPlot.get_go2txt(grprobj_cur, grp_go2color, grp_go2bordercolor)
# objplt = GoSubDagPlot(self.gosubdag, Go2Color=objcolor, go2txt=go2txt, **kws)
# objplt.prt_goids(sys.stdout)
# objplt.plt_dag(fout_img)
# sys.stdout.write("{N:>6} sections read\n".format(
# N="NO" if sections is None else len(sections)))
# return fout_img

# def _get_grprobj(self, goids, sections):
# """Get Grouper, given GO IDs and sections."""
# grprdflt = GrouperDflts(self.gosubdag, "goslim_generic.obo")
# hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections)
# return Grouper("sections", goids, hdrobj, self.gosubdag)
fout_img = self.get_outfile(kws['outfile'], goids)
sections = read_sections(kws['sections'], exclude_ungrouped=True)
print ("KWWSSSSSSSS", kws)
# kws_plt = {k:v for k, v in kws.items if k in self.kws_plt}
grprobj_cur = self._get_grprobj(goids, sections)
# GO: purple=hdr-only, green=hdr&usr, yellow=usr-only
# BORDER: Black=hdr Blu=hdr&usr
grpcolor = GrouperColors(grprobj_cur) # get_bordercolor get_go2color_users
grp_go2color = grpcolor.get_go2color_users()
grp_go2bordercolor = grpcolor.get_bordercolor()
for goid, color in go2color_usr.items():
grp_go2color[goid] = color
objcolor = Go2Color(self.gosubdag, objgoea=None,
go2color=grp_go2color, go2bordercolor=grp_go2bordercolor)
go2txt = GrouperPlot.get_go2txt(grprobj_cur, grp_go2color, grp_go2bordercolor)
objplt = GoSubDagPlot(self.gosubdag, Go2Color=objcolor, go2txt=go2txt, **kws)
objplt.prt_goids(sys.stdout)
objplt.plt_dag(fout_img)
sys.stdout.write("{N:>6} sections read\n".format(
N="NO" if sections is None else len(sections)))
return fout_img

def _get_grprobj(self, goids, sections):
"""Get Grouper, given GO IDs and sections."""
grprdflt = GrouperDflts(self.gosubdag, "goslim_generic.obo")
hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections)
return Grouper("sections", goids, hdrobj, self.gosubdag)

def _plt_gosubdag(self, goids, go2color, **kws):
"""Plot GO IDs."""
print("PLOTTING KWS", kws)
fout_img = self.get_outfile(kws['outfile'], goids, 'relationship' in kws)
fout_img = self.get_outfile(kws['outfile'], goids)
objcolor = Go2Color(self.gosubdag, objgoea=None, go2color=go2color)
objplt = GoSubDagPlot(self.gosubdag, Go2Color=objcolor, **kws)
objplt.prt_goids(sys.stdout)
Expand All @@ -261,6 +244,10 @@ def _plt_gosubdag(self, goids, go2color, **kws):
def _get_kwsdag(self, goids, go2obj, **kws_all):
"""Get keyword args for a GoSubDag."""
kws_dag = {}
# Term Counts for GO Term information score
tcntobj = self._get_tcntobj(goids, go2obj, **kws_all) # TermCounts or None
if tcntobj is not None:
kws_dag['tcntobj'] = tcntobj
# GO letters specified by the user
if 'go_aliases' in kws_all:
fin_go_aliases = kws_all['go_aliases']
Expand All @@ -275,6 +262,9 @@ def _get_tcntobj(goids, go2obj, **kws):
"""Get a TermCounts object if the user provides an annotation file, otherwise None."""
# kws: gaf (gene2go taxid)
if 'gaf' in kws or 'gene2go' in kws:
# Get a reduced go2obj set for TermCounts
_gosubdag = GoSubDag(goids, go2obj, rcntobj=False)
#return get_tcntobj(_gosubdag.go2obj, **kws) # TermCounts
return get_tcntobj(go2obj, **kws) # TermCounts

def get_docargs(self, args=None, prt=None):
Expand Down Expand Up @@ -311,23 +301,19 @@ def _err(self, msg, err=True):
sys.stdout.write(txt)
sys.exit(0)

def get_outfile(self, outfile, goids=None, b_rel=False):
def get_outfile(self, outfile, goids=None):
"""Return output file for GO Term plot."""
# 1. Use the user-specfied output filename for the GO Term plot
if outfile != self.dflt_outfile:
return outfile
rstr = "_r1" if b_rel else ""
# 2. If only plotting 1 GO term, use GO is in plot name
if goids is not None and len(goids) == 1:
goid = next(iter(goids))
goobj = self.gosubdag.go2obj[goid]
fout = "GO_{NN}_{NM}".format(NN=goid.replace("GO:", ""), NM=goobj.name)
return "".join([re.sub(r"[\s#'()+,-./:<=>\[\]_}]", '_', fout), rstr, '.png'])
return ".".join([re.sub(r"[\s#'()+,-./:<=>\[\]_}]", '_', fout), 'png'])
# 3. Return default name
if not b_rel:
return self.dflt_outfile
else:
return self.dflt_outfile.replace('.png', '_r1.png')
return self.dflt_outfile

@staticmethod
def _get_optional_attrs(kws):
Expand Down
112 changes: 112 additions & 0 deletions goatools/cli/wr_sections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Command-line interface to create an initial Python sections file
Usage:
wr_sections.py [GO_FILE]
wr_sections.py [GO_FILE] [options]
Options:
-h --help show this help message and exit
-i <file.txt>, --ifile=<sections_in.txt> Read or Write file name [default: sections_in.txt]
-o <file.txt>, --ofile=<sections.txt> write file name [default: sections.txt]
--txt=<file.txt> Write file name [default: grouped_gos.txt]
--py=<file.py> Write the sections list into a Python file
--xlsx=<file.xlsx> Group user GO IDs and write the results into an xlsx file
--obo=<file.obo> Ontologies in obo file [default: go-basic.obo].
--slims=<file.obo> GO slims in obo file [default: goslim_generic.obo].
--gaf=<file.gaf> Annotations from a gaf file
--gene2go=<gene2go> Annotations from a gene2go file downloaded from NCBI
"""

from __future__ import print_function

__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang. All rights reserved."
__author__ = "DV Klopfenstein"


import os
import sys

from goatools.base import get_godag
from goatools.associations import get_tcntobj

from goatools.cli.docopt_parse import DocOptParse
from goatools.cli.gos_get import GetGOs

from goatools.gosubdag.gosubdag import GoSubDag

from goatools.grouper.read_goids import read_sections
from goatools.grouper.grprdflts import GrouperDflts
from goatools.grouper.hdrgos import HdrgosSections
from goatools.grouper.grprobj import Grouper
from goatools.grouper.wr_sections import WrSections
from goatools.grouper.wr_sections import WrPySections
from goatools.grouper.sorter import Sorter
from goatools.grouper.wrxlsx import WrXlsxSortedGos


# pylint: disable=too-few-public-methods
class SectionsWr(object):
"""Class for command-line interface for creating GO term diagrams"""

kws_dict = set(['GO_FILE', 'obo', 'slims',
'ifile', 'ofile', 'txt',
'py', 'xlsx',
'gaf', 'gene2go', 'taxid'])
kws_set = set()

def __init__(self, gosubdag=None):
self.objdoc = DocOptParse(__doc__, self.kws_dict, self.kws_set)
self.gosubdag = None if gosubdag is None else gosubdag

def cli(self, prt=sys.stdout):
"""Command-line interface for go_draw script."""
kws = self.objdoc.get_docargs(prt=None)
godag = get_godag(kws['obo'], prt=None, loading_bar=False, optional_attrs=['relationship'])
usrgos = GetGOs(godag, max_gos=200).get_usrgos(kws.get('GO_FILE'), prt)
tcntobj = self._get_tcntobj(usrgos, godag, **kws) # Gets TermCounts or None
self.gosubdag = GoSubDag(usrgos, godag, relationships=True, tcntobj=tcntobj, prt=None)
grprdflt = GrouperDflts(self.gosubdag, kws['slims'])
ver_list = [godag.version, grprdflt.ver_goslims]
prt.write("{VER}\n".format(VER="\n".join(ver_list)))
sections = read_sections(kws['ifile'], exclude_ungrouped=True, prt=None)
# print("SECSECSEC", sections)
hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections)
grprobj = Grouper("init", usrgos, hdrobj, self.gosubdag)
# Write sections
objsecwr = WrSections(grprobj, ver_list)
if not os.path.exists(kws['ifile']):
objsecwr.wr_txt_section_hdrgos(kws['ifile'])
objsecwr.wr_txt_section_hdrgos(kws['ofile'])
objsecpy = WrPySections(grprobj, ver_list)
if 'py' in kws:
objsecpy.wr_py_sections(kws['py'], sections, doc=godag.version)
# Write user GO IDs in sections
sortobj = Sorter(grprobj)
objgowr = WrXlsxSortedGos("init", sortobj, ver_list)
objgowr.wr_txt_gos(kws['txt'], sortby=objsecpy.fncsortnt)
#objwr.wr_txt_section_hdrgos(kws['ofile'], sortby=objwr.fncsortnt)
self._prt_cnt_usrgos(usrgos, sys.stdout)

def _prt_cnt_usrgos(self, usrgos_read, prt):
num_usrgos = len(self.gosubdag.go_sources)
prt.write("{GOs:6} user GO IDs".format(GOs=num_usrgos))
if len(usrgos_read) != num_usrgos:
prt.write(" of {M} GO IDs read".format(M=len(usrgos_read)))
prt.write("\n")

@staticmethod
def _get_tcntobj(goids, go2obj, **kws):
"""Get a TermCounts object if the user provides an annotation file, otherwise None."""
# kws: gaf (gene2go taxid)
if 'gaf' in kws or 'gene2go' in kws:
# Get a reduced go2obj set for TermCounts
_gosubdag = GoSubDag(goids, go2obj, rcntobj=False, prt=None)
return get_tcntobj(_gosubdag.go2obj, **kws) # TermCounts


# Copyright (C) 2016-2018, DV Klopfenstein, H Tang. All rights reserved.
38 changes: 38 additions & 0 deletions goatools/gosubdag/go_most_specific.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Various methods to estimating if a GO term is more specific than another GO term."""

__copyright__ = "Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved."
__author__ = "DV Klopfenstein"


def get_most_specific_dcnt(goids, go2nt):
"""Get the GO ID with the lowest descendants count."""
# go2nt_usr = {go:go2nt[go] for go in goids}
# return min(go2nt_usr.items(), key=lambda t: t[1].dcnt)[0]
return min(_get_go2nt(goids, go2nt), key=lambda t: t[1].dcnt)[0]

def get_most_specific_tinfo(goids, go2nt):
"""Get the GO ID with the highest GO term annotation information value."""
# go2nt_usr = {go:go2nt[go] for go in goids}
# return max(go2nt_usr.items(), key=lambda t: t[1].tinfo)[0]
return max(_get_go2nt(goids, go2nt), key=lambda t: t[1].tinfo)[0]

def get_most_specific_tinfo_dcnt(goids, go2nt):
"""Get the GO ID with the highest GO term annotation information value."""
# go2nt_usr = {go:go2nt[go] for go in goids}
# return max(go2nt_usr.items(), key=lambda t: [t[1].tinfo, t[1].dcnt])[0]
return max(_get_go2nt(goids, go2nt), key=lambda t: [t[1].tinfo, t[1].dcnt])[0]

def _get_go2nt(goids, go2nt_all):
"""Get user go2nt using main GO IDs, not alt IDs."""
go_nt_list = []
goids_seen = set()
for goid_usr in goids:
ntgo = go2nt_all[goid_usr]
goid_main = ntgo.id
if goid_main not in goids_seen:
goids_seen.add(goid_main)
go_nt_list.append((goid_main, ntgo))
return go_nt_list


# Copyright (C) 2016-2018, DV Klopfenstein, H Tang, All rights reserved.
2 changes: 1 addition & 1 deletion goatools/grouper/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# Group GO Terms under researcher-defined sections
# Directory for grouper functions and classes

0 comments on commit b3fe1cb

Please sign in to comment.