Skip to content

Commit

Permalink
Use new OBOReader class for a 35x speed improvement when reading larg…
Browse files Browse the repository at this point in the history
…e obo files.
  • Loading branch information
dvklopfenstein committed Jul 20, 2015
1 parent a83201c commit e06f151
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 108 deletions.
122 changes: 15 additions & 107 deletions goatools/obo_parser.py
Original file line number Diff line number Diff line change
@@ -1,104 +1,18 @@
#!/usr/bin/env python
"""Read and store Gene Ontology's obo file."""
# -*- coding: UTF-8 -*-
from __future__ import print_function
import sys
import os
import re
try:
from exceptions import EOFError
except ImportError:
pass

import collections as cx

# Header lines that open a [Typedef] and a [Term] stanza in an obo file.
typedef_tag, term_tag = "[Typedef]", "[Term]"
# Graph backend names; presumably the engines accepted by the plotting code
# elsewhere in this module -- TODO confirm against the callers.
GraphEngines = ("pygraphviz", "pydot")


def after_colon(line):
    """Return the text following the first ':' in *line*, stripped.

    Raises IndexError when *line* contains no colon.
    """
    pieces = line.split(":", 1)
    return pieces[1].strip()


def read_until(handle, start):
    """Advance *handle* to the first line that begins with *start*.

    Lines are consumed one at a time with ``readline()``. When a line
    beginning with *start* is found, the handle is moved back to the start
    of that line, so the next read returns the tag line itself.

    Raises EOFError if the end of the input is reached first.
    """
    while True:
        line_start = handle.tell()  # remembered so the tag line can be re-read
        text = handle.readline()
        if not text:
            raise EOFError("%s tag cannot be found" % start)
        if text.startswith(start):
            handle.seek(line_start)
            return


class OBOReader(object):
    """Iterate over the [Term] stanzas of an obo file as GOTerm records.

    The most recent Gene Ontology file can be downloaded from
    http://purl.obolibrary.org/obo/go/go-basic.obo

    Example::

        reader = OBOReader()
        for rec in reader:
            print(rec)
    """

    def __init__(self, obo_file="go-basic.obo"):
        """Open *obo_file* for reading.

        Prints a download hint to stderr and exits with status 1 when the
        file cannot be opened.
        """
        try:
            # Default buffering: Python 3 rejects unbuffered text I/O
            # (buffering=0) with ValueError, and this class only ever uses
            # readline()/tell()/seek() on the handle, never next().
            self._handle = open(obo_file)
        except (IOError, OSError):
            # Only I/O failures mean "file is missing"; anything else
            # (KeyboardInterrupt, programming errors) must propagate.
            print(("download obo file first\n "
                   "[http://purl.obolibrary.org/obo/"
                   "go/go-basic.obo]"), file=sys.stderr)
            sys.exit(1)

    def __iter__(self):
        """Yield one GOTerm per [Term] stanza.

        Iteration ends at the first [Typedef] tag or at end of file.
        Raises EOFError (from read_until) when the file has no [Term] tag.
        """
        first_pos = self._handle.tell()
        if self._handle.readline().startswith(term_tag):
            # The file opens directly with [Term]: put the tag back, because
            # __next__ consumes one tag line before collecting the stanza.
            self._handle.seek(first_pos)
        else:
            read_until(self._handle, term_tag)
        while True:
            try:
                rec = self.__next__()
            except StopIteration:
                # PEP 479: a StopIteration escaping a generator body is
                # turned into RuntimeError, so end the generator explicitly.
                return
            yield rec

    def __next__(self):
        """Read one stanza from the current position and return its GOTerm.

        Raises StopIteration at end of file or when the next tag line is
        [Typedef].
        """
        # The first line is the tag that opens this stanza.
        tag_line = self._handle.readline()
        if not tag_line or tag_line.startswith(typedef_tag):
            raise StopIteration

        # Collect everything up to (but not including) the next tag.
        lines = []
        while True:
            pos = self._handle.tell()  # save current position for roll-back
            line = self._handle.readline()
            if not line or line.startswith((typedef_tag, term_tag)):
                self._handle.seek(pos)  # roll-back so the tag is read again
                break
            lines.append(line)

        rec = GOTerm()
        for line in lines:
            if line.startswith("id:"):
                rec.id = after_colon(line)
            elif line.startswith("alt_id:"):
                rec.alt_ids.append(after_colon(line))
            elif line.startswith("name:"):
                rec.name = after_colon(line)
            elif line.startswith("namespace:"):
                rec.namespace = after_colon(line)
            elif line.startswith("is_a:"):
                # Keep only the parent id; the rest is a "! name" comment.
                rec._parents.append(after_colon(line).split()[0])
            elif (line.startswith("is_obsolete:") and
                  after_colon(line) == "true"):
                rec.is_obsolete = True

        return rec

class OBOReader_alt(object):
class OBOReader(object):
"""Read goatools.org's obo file. Load into this iterable class.
Download obo from: http://purl.obolibrary.org/obo/go/go-basic.obo
Expand Down Expand Up @@ -126,10 +40,9 @@ def __iter__(self):
for lnum, line in enumerate(fstream):
# obo lines start with any of: [Term], [Typedef], /^\S+:/, or /^\s*/
if line[0:6] == "[Term]":
rec_curr = self._init_GOTerm_ref(rec_curr, "Term", lnum)
rec_curr = self._init_goterm_ref(rec_curr, "Term", lnum)
elif line[0:9] == "[Typedef]":
pass # Original OBOReader did not store these
#rec_curr = self._init_GOTerm_ref(rec_curr, "Typedef", lnum) # TBD remove
elif rec_curr is not None:
line = line.rstrip() # chomp
if ":" in line:
Expand All @@ -144,13 +57,13 @@ def __iter__(self):
if rec_curr is not None:
yield rec_curr

def _init_goterm_ref(self, rec_curr, name, lnum):
    """Start a new record, checking that no earlier record is still open.

    Returns a fresh GOTerm when *rec_curr* is None. Otherwise reports a
    fatal error for line *lnum* through ``self._die``.
    """
    if rec_curr is not None:
        msg = "PREVIOUS {REC} WAS NOT TERMINATED AS EXPECTED".format(REC=name)
        self._die(msg, lnum)
        return None
    return GOTerm()

def _add_to_ref(self, rec_curr, line, lnum):
"""Add new fields to the current reference."""
# Examples of record lines containing ':' include:
Expand All @@ -164,15 +77,15 @@ def _add_to_ref(self, rec_curr, line, lnum):
field_name = mtch.group(1)
field_value = mtch.group(2)
if field_name == "id":
self._chk_None(rec_curr.id, lnum)
self._chk_none(rec_curr.id, lnum)
rec_curr.id = field_value
if field_name == "alt_id":
rec_curr.alt_ids.append(field_value)
elif field_name == "name":
self._chk_None(rec_curr.name, lnum)
self._chk_none(rec_curr.name, lnum)
rec_curr.name = field_value
elif field_name == "namespace":
self._chk_None(rec_curr.namespace, lnum)
self._chk_none(rec_curr.namespace, lnum)
rec_curr.namespace = field_value
elif field_name == "is_a":
rec_curr._parents.append(field_value.split()[0])
Expand All @@ -186,12 +99,12 @@ def _die(self, msg, lnum):
raise Exception("**FATAL {FILE}({LNUM}): {MSG}\n".format(
FILE=self.obo_file, LNUM=lnum, MSG=msg))

def _chk_None(self, init_val, lnum):
def _chk_none(self, init_val, lnum):
"""Expect these lines to be uninitialized."""
if init_val is None or init_val is "":
return
self.die("FIELD IS ALREADY INITIALIZED", lnum)
self._die("FIELD IS ALREADY INITIALIZED", lnum)




Expand Down Expand Up @@ -300,19 +213,14 @@ def write_hier_rec(self, gos_printed, out=sys.stdout,

class GODag(dict):

def __init__(self, obo_file="go-basic.obo", OBOReader_test=False):
def __init__(self, obo_file="go-basic.obo"):

self.load_obo_file(obo_file, OBOReader_test)
self.load_obo_file(obo_file)

def load_obo_file(self, obo_file, OBOReader_test):
def load_obo_file(self, obo_file):

print("load obo file %s" % obo_file, file=sys.stderr)
obo_reader = None
if OBOReader_test:
obo_reader = OBOReader_alt(obo_file)
else:
obo_reader = OBOReader(obo_file)
for rec in obo_reader:
for rec in OBOReader(obo_file):
self[rec.id] = rec
for alt in rec.alt_ids:
self[alt] = rec
Expand Down
2 changes: 1 addition & 1 deletion goatools/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.6"
__version__ = "0.5.7"

0 comments on commit e06f151

Please sign in to comment.