# Tables OmniParser
This notebook is meant for testing and development of an "OmniParser" that can parse all components of an HTML document, including the title, captions, sentences, tables, etc.

In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import OmniParser
# note: [Cancer, Pneumonia, Hernia, Cholera, Cough, Cold, Flu]
#   have been hidden inside of 'data/diseases/diseases_in_hardware.html'
# NOTE(review): the note above names 'diseases_in_hardware.html', but the
#   parser below is pointed at 'data/diseases/diseases.xhtml' -- confirm which
#   file is intended.
# Wire the document parser and the context parser into one CorpusParser.
doc_parser = HTMLParser(path='data/diseases/diseases.xhtml')
context_parser = OmniParser()
cp = CorpusParser(doc_parser, context_parser)

In [5]:
# Run the corpus parser and keep the result in `corpus`; %time reports the
# CPU and wall-clock time of this single statement.
%time corpus = cp.parse_corpus(name='Diseases Corpus')

CPU times: user 165 ms, sys: 16.4 ms, total: 181 ms
Wall time: 399 ms


In [8]:
# Sanity-check the parsed corpus: print a few counts and one sample string.
# Single-argument parenthesised prints give the same output on Python 2 and
# are also valid Python 3 syntax.
print(len(corpus.documents))                    # documents in the corpus
print(len(corpus.documents[0].tables))          # tables in the first document
print(len(corpus.documents[0].cells))           # cells in the first document
print(len(corpus.documents[0].phrases))         # phrases in the first document
print(len(corpus.documents[0].tables[0].phrases))  # phrases in its first table
# Text of the first phrase of the first cell of the first table.
print(corpus.documents[0].tables[0].cells[0].phrases[0].text)

1
2
24
34
16
Disease


In [10]:
from load_dictionaries import load_disease_dictionary

# Load the disease phrase dictionary
diseases = load_disease_dictionary()
# Report how many phrases were loaded. The parenthesised form prints the same
# text on Python 2 and is also valid on Python 3.
print("Loaded %s disease phrases!" % len(diseases))

Loaded 507899 disease phrases!


In [11]:
from snorkel.candidates import TableNgrams
from snorkel.matchers import DictionaryMatch

# Define a candidate space
# (n_max=3 presumably caps the n-gram length at three tokens -- TODO confirm
# against TableNgrams)
table_ngrams = TableNgrams(n_max=3)

# Define a matcher
# (backed by the `diseases` dictionary loaded in an earlier cell)
disease_matcher = DictionaryMatch(d=diseases)

In [15]:
# With old Candidates object:
from snorkel.candidates import EntityExtractor
ce = EntityExtractor(table_ngrams, disease_matcher)
%time candidates = ce.extract(corpus.get_phrases(), name='all')

for cand in candidates:
    print cand
print len(candidates)

CPU times: user 10.7 ms, sys: 1.84 ms, total: 12.5 ms
Wall time: 13.7 ms
Span("coughs", context=None, chars=[18,23], words=[4,4])
Span("colds", context=None, chars=[30,34], words=[7,7])
Span("Brain Cancer", context=None, chars=[0,11], words=[0,1])
Span("Brain", context=None, chars=[0,4], words=[0,0])
Span("Cancer", context=None, chars=[6,11], words=[1,1])
Span("Common", context=None, chars=[0,5], words=[0,0])
Span("Ailments", context=None, chars=[7,14], words=[1,1])
Span("Disease", context=None, chars=[0,6], words=[0,0])
Span("Location", context=None, chars=[0,7], words=[0,0])
Span("Polio", context=None, chars=[0,4], words=[0,0])
Span("plague", context=None, chars=[4,9], words=[1,1])
Span("Scurvy", context=None, chars=[0,5], words=[0,0])
Span("Infectious diseases", context=None, chars=[9,27], words=[3,4])
Span("Infectious", context=None, chars=[9,18], words=[3,3])
Span("diseases", context=None, chars=[20,27], words=[4,4])
Span("Problem", context=None, chars=[0,6], words=[0,0])
Span("Ar

In [16]:
# Inspect the second-to-last candidate: print it, then the unigrams (n_max=1)
# yielded by each of its *_ngrams helpers. Each result is wrapped in list() so
# its contents are printed. The parenthesised prints give the same output on
# Python 2 and are also valid on Python 3.
c = candidates[-2]
print(c)
print(list(c.pre_ngrams(n_max=1)))
print(list(c.post_ngrams(n_max=1)))
print(list(c.phrase_ngrams(n_max=1)))
print(list(c.cell_ngrams(n_max=1)))
print(list(c.neighborhood_ngrams(n_max=1)))
print(list(c.neighbor_ngrams(n_max=1)))
print(list(c.row_ngrams(n_max=1)))
print(list(c.col_ngrams(n_max=1)))

Span("Hypochondria", context=None, chars=[0,11], words=[0,0])
[]
[]
[]
[u'hypochondria']
[u'yellow', u'fever', u'fear']
[(u'yellow', 'DOWN'), (u'fever', 'DOWN'), (u'fear', 'RIGHT')]
[u'fear', u'$', u'100']
[u'problem', u'arthritis', u'yellow', u'fever']


In [17]:
# Print the `html_anc_tags` attribute of the second candidate's context
# (presumably the chain of enclosing HTML tags -- TODO confirm).
# The parenthesised print gives the same output on Python 2 and is also valid
# on Python 3.
c = candidates[1]
print(c.context.html_anc_tags)

['html', 'body', 'h1']


In [18]:
from snorkel.features import TableNgramFeaturizer
# Build the featurizer and fit/transform it on the extracted candidates.
# As the cell's last expression, the value returned by fit_transform is
# displayed as the cell output.
featurizer = TableNgramFeaturizer()
featurizer.fit_transform(candidates)

Building feature index...
Extracting features...
0/464


<20x464 sparse matrix of type '<type 'numpy.float64'>'
	with 795 stored elements in LInked List format>

In [19]:
# Display the features recorded for the first candidate. No slice is applied:
# a full-slice copy ([:]) would display exactly the same list.
featurizer.get_features_by_candidate(candidates[0])

[u'DDLIB_WORD_SEQ_[coughs]',
 u'DDLIB_LEMMA_SEQ_[cough]',
 u'DDLIB_POS_SEQ_[NNS]',
 u'DDLIB_DEP_SEQ_[conj]',
 u'DDLIB_W_LEFT_1_[,]',
 u'DDLIB_W_LEFT_POS_1_[,]',
 u'DDLIB_W_LEFT_2_[virus ,]',
 u'DDLIB_W_LEFT_POS_2_[NNS ,]',
 u'DDLIB_W_LEFT_3_[of virus ,]',
 u'DDLIB_W_LEFT_POS_3_[IN NNS ,]',
 u'DDLIB_W_RIGHT_1_[,]',
 u'DDLIB_W_RIGHT_POS_1_[,]',
 u'DDLIB_W_RIGHT_2_[, and]',
 u'DDLIB_W_RIGHT_POS_2_[, CC]',
 u'DDLIB_W_RIGHT_3_[, and cold]',
 u'DDLIB_W_RIGHT_POS_3_[, CC NNS]',
 u'DDLIB_W_LEMMA_L_1_R_1_[,]_[,]',
 u'DDLIB_W_POS_L_1_R_1_[,]_[,]',
 u'DDLIB_W_LEMMA_L_1_R_2_[,]_[, and]',
 u'DDLIB_W_POS_L_1_R_2_[,]_[, CC]',
 u'DDLIB_W_LEMMA_L_1_R_3_[,]_[, and cold]',
 u'DDLIB_W_POS_L_1_R_3_[,]_[, CC NNS]',
 u'DDLIB_W_LEMMA_L_2_R_1_[virus ,]_[,]',
 u'DDLIB_W_POS_L_2_R_1_[NNS ,]_[,]',
 u'DDLIB_W_LEMMA_L_2_R_2_[virus ,]_[, and]',
 u'DDLIB_W_POS_L_2_R_2_[NNS ,]_[, CC]',
 u'DDLIB_W_LEMMA_L_2_R_3_[virus ,]_[, and cold]',
 u'DDLIB_W_POS_L_2_R_3_[NNS ,]_[, CC NNS]',
 u'DDLIB_W_LEMMA_L_3_R_1_[of virus ,]_[,

The end.