# Tables OmniParser
This notebook is meant for testing and development of an "OmniParser" that can parse all components of an HTML document, including the title, captions, sentences, tables, etc.

In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before
# each cell executes, so edits to library code under development take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [56]:
# All three parser classes live in snorkel.parser, so pull them in together.
from snorkel.parser import CorpusParser, HTMLParser, OmniParser

# Document side: the HTML parser is pointed at the diseases XHTML file.
doc_parser = HTMLParser(path='data/diseases/diseases.xhtml')

# Context side: the OmniParser under test in this notebook.
context_parser = OmniParser()

# The corpus parser combines the document parser with the context parser.
cp = CorpusParser(doc_parser, context_parser)

In [57]:
# Parse the corpus named 'Diseases Corpus' and let %time report how long it took.
# NOTE(review): the saved output of this cell is a pdb session stopped inside
# snorkel/parser.py (parse_tag) and ended with BdbQuit, so `corpus` was not
# produced by that recorded run -- presumably a leftover breakpoint in the
# parser; re-run this cell once it is removed.
%time corpus = cp.parse_corpus(name='Diseases Corpus')

> /Users/bradenhancock/snorkel/snorkel/parser.py(268)parse_tag()
-> if isinstance(child, NavigableString):
(Pdb) tag
<th>I don't like Chicken <i>Pox</i> or pizza. The plague is also bad.</th>
(Pdb) tag.contents
[u"I don't like Chicken ", <i>Pox</i>, u' or pizza. The plague is also bad.']
(Pdb) type(tag.contents[0])
<class 'bs4.element.NavigableString'>
(Pdb) type(tag.contents[1])
<class 'bs4.element.Tag'>
(Pdb) tag.contents[1]
<i>Pox</i>
(Pdb) tag.contents[1].unwrap()
<i></i>
(Pdb) tag.contents[1]
u'Pox'
(Pdb) tag.contents
[u"I don't like Chicken ", u'Pox', u' or pizza. The plague is also bad.']
(Pdb) q


BdbQuit: 

In [51]:
# Quick sanity check of the parsed corpus. Printed in order:
#   1. number of documents in the corpus
#   2. number of tables in the first document
#   3. number of phrases in the first table of the first document
#   4. number of phrases in the first document overall
# Single-argument print(...) behaves identically on Python 2 and is also valid
# Python 3, unlike the bare print statement.
print(len(corpus.documents))
print(len(corpus.documents[0].tables))
print(len(corpus.documents[0].tables[0].phrases))
print(len(corpus.documents[0].phrases))
# Uncomment to dump the text of every phrase in the first document:
# for phrase in corpus.documents[0].phrases: print(phrase.text)

1
2
16
34


In [37]:
from load_dictionaries import load_disease_dictionary

# Load the disease phrase dictionary and report how many phrases it contains.
# The parenthesised print works unchanged on Python 2 (single argument) and on
# Python 3.
diseases = load_disease_dictionary()
print("Loaded %s disease phrases!" % len(diseases))

Loaded 507899 disease phrases!


In [52]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import DictionaryMatch

# Define the candidate space: n-grams with n_max=3 (presumably spans of up to
# three words -- confirm against TableNgrams).
table_ngrams = TableNgrams(n_max=3)

# Define the matcher: a dictionary match against the loaded disease phrases.
# longest_match_only=False keeps overlapping matches too; the saved output of
# the extraction cell lists "Brain Cancer" as well as "Brain" and "Cancer".
disease_matcher = DictionaryMatch(d=diseases, longest_match_only=False)

In [53]:
# With old Candidates object:
from snorkel.candidates import EntityExtractor
ce = EntityExtractor(table_ngrams, disease_matcher)
%time candidates = ce.extract(corpus.get_phrases(), name='all')

# for cand in candidates:
# #     if cand.get_span() in ["Cancer","Pneumonia","Hernia", "Cholera", "Cough", "Cold", "Flu"]:
#     if cand.get_span() in ["Cold"]:
#         print list(cand.aligned_ngrams(n_max=1))
for cand in candidates:
    print cand
print len(candidates)

CPU times: user 5.5 ms, sys: 1.4 ms, total: 6.91 ms
Wall time: 5.99 ms
Span("coughs", context=None, chars=[18,23], words=[4,4])
Span("colds", context=None, chars=[30,34], words=[7,7])
Span("Brain Cancer", context=None, chars=[0,11], words=[0,1])
Span("Brain", context=None, chars=[0,4], words=[0,0])
Span("Cancer", context=None, chars=[6,11], words=[1,1])
Span("Common", context=None, chars=[0,5], words=[0,0])
Span("Ailments", context=None, chars=[7,14], words=[1,1])
Span("Disease", context=None, chars=[0,6], words=[0,0])
Span("Location", context=None, chars=[0,7], words=[0,0])
Span("Polio", context=None, chars=[0,4], words=[0,0])
Span("plague", context=None, chars=[4,9], words=[1,1])
Span("Scurvy", context=None, chars=[0,5], words=[0,0])
Span("Infectious diseases", context=None, chars=[9,27], words=[3,4])
Span("Infectious", context=None, chars=[9,18], words=[3,3])
Span("diseases", context=None, chars=[20,27], words=[4,4])
Span("Problem", context=None, chars=[0,6], words=[0,0])
Span("Arth

In [18]:
# Inspect the second-to-last candidate: print it, then the unigrams (n_max=1)
# returned by each of its ngram accessor methods, one list per line.
c = candidates[-2]
print(c)

# The accessors are called in this fixed order so the output lines up with the
# method names; each result is materialised with list() before printing.
for ngram_method in ('pre_ngrams', 'post_ngrams', 'phrase_ngrams',
                     'cell_ngrams', 'neighborhood_ngrams', 'neighbor_ngrams',
                     'row_ngrams', 'col_ngrams'):
    print(list(getattr(c, ngram_method)(n_max=1)))

Span("Hypochondria", context=None, chars=[0,11], words=[0,0])
[]
[]
[]
[u'hypochondria']
[u'yellow', u'fever', u'fear']
[(u'yellow', 'DOWN'), (u'fever', 'DOWN'), (u'fear', 'RIGHT')]
[u'fear', u'$', u'100']
[u'problem', u'arthritis', u'yellow', u'fever']


In [19]:
# Show the HTML ancestor tags stored on the context of the candidate at index 1.
# Note that this rebinds `c`, which an earlier cell also assigns.
c = candidates[1]
print(c.context.html_anc_tags)

['html', 'body', 'h1']


In [20]:
from snorkel.features import TableNgramFeaturizer
# Fit the featurizer on the extracted candidates. fit_transform is the cell's
# last expression, so the notebook displays its return value (a sparse matrix
# in the saved output).
featurizer = TableNgramFeaturizer()
featurizer.fit_transform(candidates)

Building feature index...
Extracting features...
0/522


<22x522 sparse matrix of type '<type 'numpy.float64'>'
	with 915 stored elements in LInked List format>

In [22]:
# List the feature names generated for the candidate at index 4.
# `[:]` is a full slice, so everything is displayed; narrow it (e.g. `[:10]`)
# for a shorter listing.
featurizer.get_features_by_candidate(candidates[4])[:]

[u'DDLIB_WORD_SEQ_[Cancer]',
 u'DDLIB_LEMMA_SEQ_[cancer]',
 u'DDLIB_POS_SEQ_[NN]',
 u'DDLIB_DEP_SEQ_[ROOT]',
 u'DDLIB_W_LEFT_1_[brain]',
 u'DDLIB_W_LEFT_POS_1_[NN]',
 u'DDLIB_W_LEFT_2_[cancer brain]',
 u'DDLIB_W_LEFT_POS_2_[NN NN]',
 u'DDLIB_W_LEFT_3_[brain cancer brain]',
 u'DDLIB_W_LEFT_POS_3_[NN NN NN]',
 u'DDLIB_NUM_WORDS_1',
 u'TABLE_HTML_TAG_span',
 u'TABLE_HTML_ANC_TAG_html',
 u'TABLE_HTML_ANC_TAG_body',
 u'TABLE_HTML_ANC_TAG_p',
 u'TABLE_HTML_ANC_TAG_span',
 u'TABLE_HTML_ANC_ATTR_lang',
 u'TABLE_HTML_ANC_ATTR_xml:lang',
 u'TABLE_HTML_ANC_ATTR_xmlns']

The end.