Skip to content

Commit

Permalink
Added the ability to compress/shorten the hierarchy reports by not
Browse files Browse the repository at this point in the history
printing a path to a GO leaf-level term more than once.
  • Loading branch information
dvklopfenstein committed Apr 14, 2015
1 parent c6a6e09 commit 6fcff34
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 14 deletions.
33 changes: 22 additions & 11 deletions goatools/obo_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,21 +162,31 @@ def get_all_child_edges(self):
all_child_edges |= p.get_all_child_edges()
return all_child_edges

def write_hier_rec(self, out=sys.stdout,
len_dash=1, max_depth=None, num_child=None,
def write_hier_rec(self, gos_printed, out=sys.stdout,
len_dash=1, max_depth=None, num_child=None, short_prt=False,
depth=1, dp="-"):
"""Write hierarchy for a GO Term record."""
GO_id = self.id
nrp = short_prt and GO_id in gos_printed
if len_dash is not None:
dp = ''.join(['-']*depth) if len_dash is not None else ''
out.write('{DASHES:{N}} '.format(DASHES=dp, N=len_dash))
letter = '-' if not nrp or not self.children else '='
dp = ''.join([letter]*depth)
out.write('{DASHES:{N}} '.format(DASHES=dp, N=len_dash))
if num_child is not None:
out.write('{N:>5} '.format(N=len(self.get_all_children())))
out.write('{N:>5} '.format(N=len(self.get_all_children())))
out.write('{GO}\n'.format(GO=self))
# Track GOs previously printed only if needed
if short_prt:
gos_printed.add(GO_id)
# Do not print the hierarchy below this term if it has already been printed
if nrp:
return
depth += 1
if max_depth is not None and depth > max_depth:
return
return
for p in self.children:
p.write_hier_rec(out, len_dash, max_depth, num_child, depth, dp)
p.write_hier_rec(gos_printed, out, len_dash, max_depth, num_child, short_prt,
depth, dp)


class GODag(dict):
Expand Down Expand Up @@ -237,16 +247,17 @@ def write_dag(self, out=sys.stdout):
print(rec, file=out)

def write_hier_all(self, out=sys.stdout,
len_dash=1, max_depth=None, num_child=None):
len_dash=1, max_depth=None, num_child=None, short_prt=False):
"""Write hierarchy for all GO Terms in obo file."""
# Print: [biological_process, molecular_function, and cellular_component]
for go_id in ['GO:0008150', 'GO:0003674', 'GO:0005575']:
self.write_hier(go_id, out, len_dash, max_depth, num_child)
self.write_hier(go_id, out, len_dash, max_depth, num_child, short_prt)

def write_hier(self, GO_id, out=sys.stdout,
len_dash=1, max_depth=None, num_child=None):
len_dash=1, max_depth=None, num_child=None, short_prt=False):
"""Write hierarchy for a GO Term."""
self[GO_id].write_hier_rec(out, len_dash, max_depth, num_child)
gos_printed = set()
self[GO_id].write_hier_rec(gos_printed, out, len_dash, max_depth, num_child, short_prt)

def write_summary_cnts(self, GO_ids, out=sys.stdout):
"""Write summary of level and depth counts for specific GO ids."""
Expand Down
24 changes: 21 additions & 3 deletions scripts/write_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@
>>> python {SCR} --max_depth=2
>>> python {SCR} --max_depth=2 --dash_len=2 --num_child
Print a shortened version of the hierarchy for BP, MF, and CC.
This will only print a path to a leaf GO Term once.
If the path appears a second time, the term is printed again, but its path is not.
The presence of compressed (unprinted) paths is marked by using '=' instead of '-'.
$ wc -l hier_BP_MF_CC*.rpt
789583 hier_BP_MF_CC.rpt
70152 hier_BP_MF_CC_short.rpt
>>> python {SCR} --o=hier_BP_MF_CC_short.rpt --short
Print hierarchy
- 26894 GO:0008150 level-00 depth-00 biological_process [biological_process]
-- 30 GO:0001906 level-01 depth-01 cell killing [biological_process]
Expand Down Expand Up @@ -62,7 +75,10 @@
p.add_argument('--max_depth', default=None, type=int,
help="max depth for printing relative to GO Term")
p.add_argument('--num_child', default=None, action='store_true',
help="Print total number of children for each GO")
help="Print count of total number of children for each GO")
# --short: boolean flag (default False) requesting the compressed report.
# The help text is built from adjacent string literals, which Python joins
# with no separator, so the first literal must end with a space.
p.add_argument('--short', default=False, action='store_true',
               help="If a branch has already been printed, do not re-print. "
                    "Print '===' instead of dashes to note the point of compression")

args = p.parse_args()

Expand All @@ -78,13 +94,15 @@
file_out,
len_dash=lenprt,
max_depth=args.max_depth,
num_child=args.num_child)
num_child=args.num_child,
short_prt=args.short)
else:
obo_dag.write_hier_all(
file_out,
len_dash=lenprt,
max_depth=args.max_depth,
num_child=args.num_child)
num_child=args.num_child,
short_prt=args.short)

if args.o is not None:
file_out.close()
Expand Down

0 comments on commit 6fcff34

Please sign in to comment.