# Tables Total Recall

The goal of this notebook is to reach 100% recall: every part name that appears in the transistor hardware sheets should be extracted.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [39]:
load_pickle = True # This takes 12sec
save_pickle = False # Saved all the documents last time. Took 30min.

corpus_loaded = False
if load_pickle:
    try:
        import cPickle
        with open("data/hardware/hardware_corpus.pkl","r") as pkl:
            %time corpus = cPickle.load(pkl)
        corpus_loaded = True
        print "Corpus has been loaded."
    except:
        print "Corpus could not be loaded."
        print "Corpus will be parsed instead..."
if not corpus_loaded:
    from snorkel.parser import CorpusParser
    from snorkel.parser import HTMLParser
    from snorkel.parser import TableParser, OmniParser

    doc_parser = HTMLParser(path='data/hardware/hardware_html/')
    context_parser = TableParser()

    cp = CorpusParser(doc_parser, context_parser, max_docs=101)
    %time corpus = cp.parse_corpus(name='Hardware Corpus')
    print "Corpus has been parsed."
    
    if save_pickle:
        with open("data/hardware/hardware_corpus.pkl","w") as pkl:
            %time cPickle.dump(corpus, pkl)
            print "Corpus has been pickled."

CPU times: user 12.7 s, sys: 588 ms, total: 13.3 s
Wall time: 13.5 s
Corpus has been loaded.


In [40]:
from utils import collect_hardware_doc_part_pairs
filename='data/hardware/gold_all.csv'
gold_pairs = collect_hardware_doc_part_pairs(filename)
(gold_docs, gold_parts) = zip(*gold_pairs)
print "Loaded %s gold (doc, part) pairs." % len(gold_pairs)

Loaded 542 gold (doc, part) pairs.


In [41]:
from snorkel.candidates import Ngrams, EntityExtractor
from snorkel.matchers import RegexMatchEach, DictionaryMatch, RangeMatcher

ngrams = Ngrams(n_max=2)
part_matcher = DictionaryMatch(d=gold_parts)
part_extractor = EntityExtractor(ngrams, part_matcher)

%time parts = part_extractor.extract(corpus.get_phrases(), name='all')
for p in parts[:3]: 
    print p
print "Extracted %s candidate part numbers." % len(parts)

CPU times: user 1.68 s, sys: 13.6 ms, total: 1.69 s
Wall time: 1.7 s
Span("BC547", context=None, chars=[0,4], words=[0,0])
Span("BC548", context=None, chars=[0,4], words=[0,0])
Span("BC547", context=None, chars=[0,4], words=[0,0])
Extracted 1125 candidate part numbers.


In [42]:
def print_stats(g, x):
    """Print precision and recall of the extracted items `x` against the gold items `g`.

    Args:
        g: iterable of gold (expected) hashable items; duplicates are ignored.
        x: iterable of extracted hashable items; duplicates are ignored.

    An item is a true positive when it occurs in both collections. If a
    denominator is zero (nothing was extracted, or there are no gold items)
    the corresponding score is reported as 0.000 (0/0) rather than raising
    ZeroDivisionError.

    Returns:
        None; the two statistics are written to stdout.
    """
    # Accept any iterable (list, tuple, set, ...), not only sets.
    g, x = set(g), set(x)

    tp = len(g & x)   # in gold and extracted
    fp = len(x - g)   # extracted but not in gold
    fn = len(g - x)   # in gold but not extracted

    n_extracted = tp + fp
    n_gold = tp + fn
    precision = float(tp) / n_extracted if n_extracted else 0.0
    recall = float(tp) / n_gold if n_gold else 0.0

    # A parenthesised single-argument print produces identical output whether
    # print is a statement (Python 2) or a function (Python 3).
    print("Precision: %0.3f (%s/%s)" % (precision, tp, n_extracted))
    print("Recall: %0.3f (%s/%s)" % (recall, tp, n_gold))

In [43]:
g = set(gold_parts)
x = set([p.get_span() for p in parts])
print "Part Stats:"
print_stats(g, x)

Part Stats:
Precision: 1.000 (144/144)
Recall: 0.804 (144/179)


In [44]:
g = set(gold_pairs)
x = set([(p.context.document.name, p.get_span()) for p in parts])
print "Pair Stats:"
print_stats(g, x)

Pair Stats:
Precision: 0.691 (251/363)
Recall: 0.463 (251/542)
