# Tables OmniParser
This notebook is meant for testing and development of an "OmniParser" that can parse all components of an HTML document, including the title, captions, sentences, tables, etc.

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [10]:
from snorkel.parser import CorpusParser, HTMLParser, OmniParser

# Document-level parser pointed at the diseases XHTML file, plus the
# context-level OmniParser; CorpusParser is built from the pair.
doc_parser = HTMLParser(path='data/diseases/diseases.xhtml')
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser)

In [11]:
# NOTE: scratch/debug code, intentionally disabled. When uncommented it loads
# data/diseases/diseases.xhtml with BeautifulSoup (lxml) and recursively prints
# the name of every tag in the tree, each followed by a "---" separator.
# def parse_tag(tag):
#     if tag.name is not None:
#         print tag.name
#         print "---"
#         for child in tag.children:
#             parse_tag(child)

# from bs4 import BeautifulSoup
# with open('data/diseases/diseases.xhtml','r') as f:
#     soup = BeautifulSoup(f, 'lxml')
# for tag in soup.children:
#     parse_tag(tag)


In [12]:
# Parse the corpus with the CorpusParser built above; %time reports the CPU
# and wall-clock time of this single statement.
%time corpus = cp.parse_corpus(name='Diseases Corpus')

CPU times: user 103 ms, sys: 12.4 ms, total: 115 ms
Wall time: 174 ms


In [13]:
# Sanity-check the parsed corpus: the number of documents, then the number of
# tables, phrases and sentences in the first document, then each sentence.
# print(...) with a single argument prints the same thing under Python 2 and
# is also valid Python 3, unlike the bare `print x` statement.
print(len(corpus.documents))
first_doc = corpus.documents[0]
print(len(first_doc.tables))
print(len(first_doc.phrases))
print(len(first_doc.sentences))
for sent in first_doc.sentences:
    print(sent)

1
2
29
0


In [14]:
from load_dictionaries import load_disease_dictionary

# Load the disease phrase dictionary and report how many phrases it holds.
# The single-argument print(...) form works on both Python 2 and Python 3.
diseases = load_disease_dictionary()
print("Loaded %s disease phrases!" % len(diseases))

Loaded 507899 disease phrases!


In [15]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import DictionaryMatch

# Candidate space: table n-grams with n_max set to 3.
table_ngrams = TableNgrams(n_max=3)

# Matcher: dictionary lookup against the loaded disease phrases.
# longest_match_only is disabled -- presumably so that shorter overlapping
# matches are kept as well (TODO confirm against DictionaryMatch).
disease_matcher = DictionaryMatch(
    d=diseases,
    longest_match_only=False,
)

In [16]:
# With old Candidates object:
from snorkel.candidates import EntityExtractor
ce = EntityExtractor(table_ngrams, disease_matcher)
%time candidates = ce.extract(corpus.get_phrases(), name='all')

for cand in candidates: print cand

CPU times: user 45.5 ms, sys: 1.06 ms, total: 46.6 ms
Wall time: 48.9 ms
Span("coughs", context=None, chars=[18,23], words=[4,4])
Span("colds", context=None, chars=[30,34], words=[7,7])
Span("Brain Cancer", context=None, chars=[0,11], words=[0,1])
Span("Brain", context=None, chars=[0,4], words=[0,0])
Span("Cancer", context=None, chars=[6,11], words=[1,1])
Span("Common", context=None, chars=[0,5], words=[0,0])
Span("Ailments", context=None, chars=[7,14], words=[1,1])
Span("Disease", context=None, chars=[0,6], words=[0,0])
Span("Location", context=None, chars=[0,7], words=[0,0])
Span("Polio", context=None, chars=[0,4], words=[0,0])
Span("Chicken Pox", context=None, chars=[13,23], words=[4,5])
Span("plague", context=None, chars=[4,9], words=[1,1])
Span("Scurvy", context=None, chars=[0,5], words=[0,0])
Span("Problem", context=None, chars=[0,6], words=[0,0])
Span("Arthritis", context=None, chars=[0,8], words=[0,0])
Span("Yellow Fever", context=None, chars=[0,11], words=[0,1])
Span("Fever", 

In [17]:
from snorkel.features import TableNgramFeaturizer
# Featurize the extracted candidates; `featurizer` is reused by the next cell.
featurizer = TableNgramFeaturizer()
# Kept as the cell's last expression so the notebook displays the returned
# feature matrix.
featurizer.fit_transform(candidates)

Building feature index...
Extracting features...
0/371


<18x371 sparse matrix of type '<type 'numpy.float64'>'
	with 645 stored elements in LInked List format>

In [18]:
# Display the feature names generated for a single candidate (candidates[1]).
# NOTE(review): the trailing [:] looks like a redundant full-slice copy if the
# call already returns a list -- confirm before removing.
featurizer.get_features_by_candidate(candidates[1])[:]

[u'DDLIB_WORD_SEQ_[colds]',
 u'DDLIB_LEMMA_SEQ_[cold]',
 u'DDLIB_POS_SEQ_[NNS]',
 u'DDLIB_DEP_SEQ_[conj]',
 u'DDLIB_W_LEFT_1_[and]',
 u'DDLIB_W_LEFT_POS_1_[CC]',
 u'DDLIB_W_LEFT_2_[, and]',
 u'DDLIB_W_LEFT_POS_2_[, CC]',
 u'DDLIB_W_LEFT_3_[cough , and]',
 u'DDLIB_W_LEFT_POS_3_[NNS , CC]',
 u'DDLIB_NUM_WORDS_1',
 u'TABLE_HTML_TAG_h1',
 u'TABLE_HTML_ANC_TAG_[document]',
 u'TABLE_HTML_ANC_TAG_html',
 u'TABLE_HTML_ANC_TAG_html',
 u'TABLE_HTML_ANC_TAG_body',
 u'TABLE_HTML_ANC_TAG_body',
 u'TABLE_HTML_ANC_TAG_p',
 u'TABLE_HTML_ANC_TAG_body',
 u'TABLE_HTML_ANC_TAG_p',
 u'TABLE_HTML_ANC_TAG_body',
 u'TABLE_HTML_ANC_TAG_body',
 u'TABLE_HTML_ANC_ATTR_lang',
 u'TABLE_HTML_ANC_ATTR_xml:lang',
 u'TABLE_HTML_ANC_ATTR_xmlns',
 u'TABLE_HTML_ANC_ATTR_lang',
 u'TABLE_HTML_ANC_ATTR_xml:lang',
 u'TABLE_HTML_ANC_ATTR_xmlns']

The end.