Skip to content

Commit

Permalink
If unexpected qualifier: Load GAF line and report warning instead of fatal error
Browse files Browse the repository at this point in the history
  • Loading branch information
dvklopfenstein committed Jul 7, 2018
1 parent 717dece commit ddfe095
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 25 deletions.
58 changes: 34 additions & 24 deletions goatools/anno/gaf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,18 @@ def _get_not(keep_not):
return lambda nt: 'NOT' not in nt.Qualifier

def _init_assn(self, fin_gaf, hdr_only, prt):
"""Read a GAF file, report on it, and return the sorted annotations.

Reading is delegated to self._read_gaf_nts(fin_gaf, hdr_only).

fin_gaf:  GAF file name; passed to the reader and used in messages.
hdr_only: passed through unchanged to self._read_gaf_nts.
prt:      stream for the "READ N associations" line; nothing is
written to it when it is falsy (e.g. None).

Returns the result of self.evobj.sort_nts(nts, 'Evidence_Code').
"""
nts = self._read_gaf_nts(fin_gaf, hdr_only)
# GAF file has been read
if prt:
prt.write(" READ {N:9,} associations: {FIN}\n".format(N=len(nts), FIN=fin_gaf))
# If there are illegal GAF lines ...
# NOTE(review): self.datobj is not assigned in this method; presumably it is
# set while reading in _read_gaf_nts -- confirm.
if self.datobj:
# The error summary is requested whether or not prt was given.
if self.datobj.ignored or self.datobj.illegal_lines:
self.datobj.prt_error_summary(fin_gaf)
return self.evobj.sort_nts(nts, 'Evidence_Code')

def _read_gaf_nts(self, fin_gaf, hdr_only):
"""Read GAF file. Store annotation data in a list of namedtuples."""
nts = []
ver = None
Expand Down Expand Up @@ -112,13 +124,10 @@ def _init_assn(self, fin_gaf, hdr_only, prt):
sys.stderr.write("\n **FATAL: {MSG}\n\n".format(MSG=str(inst)))
sys.stderr.write("**FATAL: {FIN}[{LNUM}]:\n{L}".format(FIN=fin_gaf, L=line, LNUM=lnum))
if datobj is not None:
datobj.prt_line_detail(prt, line)
datobj.prt_line_detail(sys.stdout, line)
sys.exit(1)
# GAF file has been read
if prt:
datobj.prt_read_summary(prt, fin_gaf, nts)
self.datobj = datobj
return self.evobj.sort_nts(nts, 'Evidence_Code')
return nts

def prt_summary_anno2ev(self, prt=sys.stdout):
"""Print annotation/evidence code summary."""
Expand Down Expand Up @@ -179,7 +188,9 @@ class GafData(object):
"1.0" : 15}

# Expected values for a Qualifier
# NOTE: the qualifier 'enables' was seen in Jul 2018 in goa_chicken_complex.gaf, but it is not in the expected set.
exp_qualifiers = set(['not', 'contributes_to', 'colocalizes_with'])
# exp_qualifiers = set(['not', 'contributes_to', 'colocalizes_with', 'enables'])

def __init__(self, ver, allow_missing_symbol=False):
self.ver = ver
Expand Down Expand Up @@ -219,7 +230,7 @@ def _get_ntgaf(self, flds, num_flds, lnum):
return None
# Additional Formatting
taxons = self._do_taxons(taxons, flds, lnum)
self._chk_qualifier(qualifiers)
self._chk_qualifier(qualifiers, flds, lnum)
# Create list of values
gafvals = [
flds[0], # 0 DB
Expand Down Expand Up @@ -266,11 +277,13 @@ def _rd_fld_vals(name, val, set_list_ft=True, qty_min=0, qty_max=None):
"FIELD({F}): MAX QUANTITY({Q}) EXCEEDED: {V}".format(F=name, Q=qty_max, V=vals)
return vals if set_list_ft else set(vals)

def _chk_qualifier(self, qualifiers):
def _chk_qualifier(self, qualifiers, flds, lnum):
"""Check that qualifiers are expected values."""
# http://geneontology.org/page/go-annotation-conventions#qual
for qual in qualifiers:
assert qual in self.exp_qualifiers, "UNEXPECTED QUALIFIER({Q})".format(Q=qual)
if qual not in self.exp_qualifiers:
errname = 'UNEXPECTED QUALIFIER({QUAL})'.format(QUAL=qual)
self.illegal_lines[errname].append((lnum, "\t".join(flds)))

def prt_line_detail(self, prt, line):
"""Print line header and values in a readable format."""
Expand Down Expand Up @@ -306,23 +319,20 @@ def _do_taxons(self, taxons, flds, lnum):
self.illegal_lines['ILLEGAL TAXON'].append((lnum, "\t".join(flds)))
return taxons_int

def prt_read_summary(self, prt, fin_gaf, nts):
def prt_error_summary(self, fin_gaf):
"""Print a summary about the GAF file that was read."""
prt.write(" READ {N:9,} associations: {FIN}\n".format(N=len(nts), FIN=fin_gaf))
# If there are illegal GAF lines ...
if self.ignored or self.illegal_lines:
# Get summary of error types and their counts
errcnts = []
if self.ignored:
errcnts.append(" {N:9,} IGNORED associations\n".format(N=len(self.ignored)))
if self.illegal_lines:
for err_name, errors in self.illegal_lines.items():
errcnts.append(" {N:9,} {ERROR}\n".format(N=len(errors), ERROR=err_name))
# Save error details into a log file
fout_log = self._wrlog_details_illegal_gaf(fin_gaf, errcnts)
prt.write(" WROTE GAF ERROR LOG: {LOG}:\n".format(LOG=fout_log))
for err_cnt in errcnts:
sys.stdout.write(err_cnt)
# Get summary of error types and their counts
errcnts = []
if self.ignored:
errcnts.append(" {N:9,} IGNORED associations\n".format(N=len(self.ignored)))
if self.illegal_lines:
for err_name, errors in self.illegal_lines.items():
errcnts.append(" {N:9,} {ERROR}\n".format(N=len(errors), ERROR=err_name))
# Save error details into a log file
fout_log = self._wrlog_details_illegal_gaf(fin_gaf, errcnts)
sys.stdout.write(" WROTE GAF ERROR LOG: {LOG}:\n".format(LOG=fout_log))
for err_cnt in errcnts:
sys.stdout.write(err_cnt)

def _wrlog_details_illegal_gaf(self, fin_gaf, err_cnts):
"""Print details regarding illegal GAF lines seen to a log file."""
Expand Down
3 changes: 2 additions & 1 deletion tests/test_termcounts_asscs.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
"""Test TermCounts object used in Resnik and Lin similarity calculations."""

from __future__ import print_function
Expand Down Expand Up @@ -69,7 +70,7 @@ def test_semantic_similarity(usr_assc=None):
cwd = os.getcwd()
for assc_name in associations: # Limit test numbers for speed
# Get all the annotations from arabidopsis.
assc_gene2gos = dnld_assc(os.path.join(cwd, assc_name), go2obj, prt=None)
assc_gene2gos = dnld_assc(os.path.join(cwd, assc_name), go2obj, prt=sys.stdout)

# Calculate the information content of the single term, GO:0048364
# "Information content (GO:0048364) = 7.75481392334
Expand Down

0 comments on commit ddfe095

Please sign in to comment.