# Tables OmniParser
This notebook is for testing and developing an "OmniParser" that parses all components of an HTML document, such as the title, captions, sentences, and tables.

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to library code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [23]:
# Assemble the parsing pipeline: an HTMLParser pointed at the diseases HTML
# file and an OmniParser, both handed to a CorpusParser.
from snorkel.parser import CorpusParser, HTMLParser, OmniParser

doc_parser = HTMLParser(
    path='data/diseases/diseases_in_hardware.html'
)
context_parser = OmniParser()

cp = CorpusParser(doc_parser, context_parser)

In [24]:
# Run the corpus parser (`cp`) and keep the result as `corpus`;
# %time reports the CPU and wall time of this single statement.
%time corpus = cp.parse_corpus(name='Diseases Corpus')

CPU times: user 3.37 s, sys: 186 ms, total: 3.56 s
Wall time: 7.19 s


In [25]:
# Quick size check of the parsed corpus, printed one count per line:
#   documents, tables in the first document, phrases in that document's
#   first table, and phrases in the first document overall.
print(len(corpus.documents))
print(len(corpus.documents[0].tables))
print(len(corpus.documents[0].tables[0].phrases))
print(len(corpus.documents[0].phrases))
# To dump every phrase instead:
# for phrase in corpus.documents[0].phrases: print phrase.text

1
9
86
1321


In [15]:
from load_dictionaries import load_disease_dictionary

# Load the disease phrase dictionary and report how many entries it has
diseases = load_disease_dictionary()
print("Loaded %s disease phrases!" % len(diseases))

Loaded 507899 disease phrases!


In [26]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import DictionaryMatch

# Candidate space: table n-grams, capped at n_max=3
table_ngrams = TableNgrams(n_max=3)

# Matcher: dictionary lookup against the loaded disease phrases,
# with longest-match-only filtering turned off
disease_matcher = DictionaryMatch(
    d=diseases,
    longest_match_only=False,
)

In [33]:
# With old Candidates object:
from snorkel.candidates import EntityExtractor
ce = EntityExtractor(table_ngrams, disease_matcher)
%time candidates = ce.extract(corpus.get_phrases(), name='all')

for cand in candidates:
#     if cand.get_span() in ["Cancer","Pneumonia","Hernia", "Cholera", "Cough", "Cold", "Flu"]:
    if cand.get_span() in ["Cold"]:
        print list(cand.aligned_ngrams(n_max=1))
print len(candidates)

CPU times: user 189 ms, sys: 4.44 ms, total: 193 ms
Wall time: 194 ms
[u'collector', u'-', u'emitter', u'voltage', u'v', u'ceo', u'vdc', u'value', u'80', u'6.0', u'100', u'625', u'mw', u'/', u'c', u'1.5', u'mw', u'/', u'c', u'\u2212', u'55', u'to', u'+150']
222


In [18]:
# Inspect the second-to-last candidate: print the candidate itself, then the
# n-grams (n_max=1) yielded by each of its n-gram accessors, one list per line.
c = candidates[-2]
print(c)
print(list(c.pre_ngrams(n_max=1)))           # pre
print(list(c.post_ngrams(n_max=1)))          # post
print(list(c.phrase_ngrams(n_max=1)))        # phrase
print(list(c.cell_ngrams(n_max=1)))          # cell
print(list(c.neighborhood_ngrams(n_max=1)))  # neighborhood
print(list(c.neighbor_ngrams(n_max=1)))      # neighbor
print(list(c.row_ngrams(n_max=1)))           # row
print(list(c.col_ngrams(n_max=1)))           # column

Span("Hypochondria", context=None, chars=[0,11], words=[0,0])
[]
[]
[]
[u'hypochondria']
[u'yellow', u'fever', u'fear']
[(u'yellow', 'DOWN'), (u'fever', 'DOWN'), (u'fear', 'RIGHT')]
[u'fear', u'$', u'100']
[u'problem', u'arthritis', u'yellow', u'fever']


In [19]:
# Print the HTML ancestor tags stored on the context of the candidate at index 1
c = candidates[1]
print(c.context.html_anc_tags)

['html', 'body', 'h1']


In [20]:
# Build a TableNgramFeaturizer and fit/transform it on the extracted candidates.
# The call is left as the cell's final bare expression so its return value is
# displayed as the cell output.
# NOTE(review): the saved output below shows a 22-row matrix while the
# extraction cell printed 222 candidates -- the outputs appear to come from
# different runs; re-run the notebook top to bottom to confirm.
from snorkel.features import TableNgramFeaturizer
featurizer = TableNgramFeaturizer()
featurizer.fit_transform(candidates)

Building feature index...
Extracting features...
0/522


<22x522 sparse matrix of type '<type 'numpy.float64'>'
	with 915 stored elements in LInked List format>

In [22]:
# Display the features the fitted featurizer reports for the candidate at
# index 4; the trailing [:] takes a full slice of the returned sequence.
featurizer.get_features_by_candidate(candidates[4])[:]

[u'DDLIB_WORD_SEQ_[Cancer]',
 u'DDLIB_LEMMA_SEQ_[cancer]',
 u'DDLIB_POS_SEQ_[NN]',
 u'DDLIB_DEP_SEQ_[ROOT]',
 u'DDLIB_W_LEFT_1_[brain]',
 u'DDLIB_W_LEFT_POS_1_[NN]',
 u'DDLIB_W_LEFT_2_[cancer brain]',
 u'DDLIB_W_LEFT_POS_2_[NN NN]',
 u'DDLIB_W_LEFT_3_[brain cancer brain]',
 u'DDLIB_W_LEFT_POS_3_[NN NN NN]',
 u'DDLIB_NUM_WORDS_1',
 u'TABLE_HTML_TAG_span',
 u'TABLE_HTML_ANC_TAG_html',
 u'TABLE_HTML_ANC_TAG_body',
 u'TABLE_HTML_ANC_TAG_p',
 u'TABLE_HTML_ANC_TAG_span',
 u'TABLE_HTML_ANC_ATTR_lang',
 u'TABLE_HTML_ANC_ATTR_xml:lang',
 u'TABLE_HTML_ANC_ATTR_xmlns']

The end.