# Tables Total Recall

The goal of this notebook is to reach 100% recall on part names: every part name that appears in the transistor hardware sheets should be extracted.

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [37]:
import cPickle
from snorkel.parser import CorpusParser
from snorkel.parser import HTMLParser
from snorkel.parser import TableParser, OmniParser
from snorkel.candidates import Ngrams, NgramsWithRanges, EntityExtractor
from snorkel.matchers import RegexMatchEach, DictionaryMatch, RangeMatcher
from utils import collect_hardware_doc_part_pairs

def parse_corpus(parser, max_docs=101):
    """Parse the hardware HTML documents into a corpus.

    Args:
        parser: context parser handed to CorpusParser (the notebook uses
            TableParser or OmniParser instances). Previously this argument was
            ignored and an OmniParser was always used; passing None keeps that
            old default.
        max_docs: forwarded to CorpusParser as ``max_docs``.

    Returns:
        The object returned by ``CorpusParser.parse_corpus`` for the corpus
        named 'Hardware Corpus'.
    """
    doc_parser = HTMLParser(path='data/hardware/hardware_html/')
    # Honour the caller's parser so the corpus matches the parser type that
    # save_corpus/load_corpus later use to pick the pickle filename.
    context_parser = parser if parser is not None else OmniParser()
    cp = CorpusParser(doc_parser, context_parser, max_docs=max_docs)
    corpus = cp.parse_corpus(name='Hardware Corpus')
    print("Corpus has been parsed.")
    return corpus

def load_corpus(parser):
    """Load the pickled corpus that corresponds to the given parser type.

    Args:
        parser: a TableParser or OmniParser instance; its type selects which
            pickle file under data/hardware/ is read.

    Returns:
        The unpickled corpus, or None if the file could not be read or
        unpickled (a message with the reason is printed in that case).

    Raises:
        ValueError: if ``parser`` is neither a TableParser nor an OmniParser.
    """
    if isinstance(parser, TableParser):
        filename = "data/hardware/hardware_corpus_table.pkl"
    elif isinstance(parser, OmniParser):
        filename = "data/hardware/hardware_corpus_omni.pkl"
    else:
        raise ValueError("Input must be of type TableParser or OmniParser.")
    try:
        # NOTE: unpickling can execute arbitrary code; only load pickles
        # produced by this notebook's save_corpus.
        # Text mode is kept to stay consistent with how save_corpus writes
        # the file (default ASCII pickle protocol).
        with open(filename,"r") as pkl:
            corpus = cPickle.load(pkl)
    except Exception as e:
        # Still best-effort, but no longer a bare except: KeyboardInterrupt
        # and SystemExit now propagate, and the failure reason is shown.
        print("Corpus could not be loaded.")
        print("Reason: %s: %s" % (type(e).__name__, e))
        return None
    # Moved out of the with-block: it used to sit after `return` and never ran.
    print("Corpus has been loaded.")
    return corpus

def save_corpus(corpus, parser):
    if isinstance(parser, TableParser):
        filename = "data/hardware/hardware_corpus_table.pkl"
    elif isinstance(parser, OmniParser):
        filename = "data/hardware/hardware_corpus_omni.pkl"
    else:
        raise ValueError("Input must be of type TableParser or OmniParser.")
    with open(filename,"w") as pkl:
        %time cPickle.dump(corpus, pkl)
        print "Corpus has been pickled."

def load_gold():
    """Return the gold data collected from data/hardware/gold_all.csv.

    The notebook treats the result as a sequence of (doc, part) pairs.
    """
    return collect_hardware_doc_part_pairs('data/hardware/gold_all.csv')
        
def extract_part_numbers(corpus, cand_space, gold_parts):
    """Extract candidates from the corpus phrases that match a part dictionary.

    Args:
        corpus: object exposing ``get_phrases()``.
        cand_space: candidate space handed to EntityExtractor.
        gold_parts: collection of strings used as the DictionaryMatch dictionary.

    Returns:
        Whatever ``EntityExtractor.extract`` returns for the set named 'all'.
    """
    extractor = EntityExtractor(cand_space, DictionaryMatch(d=gold_parts))
    return extractor.extract(corpus.get_phrases(), name='all')

def print_stats(g, x):
    """Print precision and recall of an extracted set against a gold set.

    Args:
        g: set of gold items.
        x: set of extracted items.

    A ratio whose denominator is zero (empty ``x`` for precision, empty ``g``
    for recall) is reported as 0.000 instead of raising ZeroDivisionError.
    """
    tp = len(g.intersection(x))
    fp = len(x.difference(g))
    fn = len(g.difference(x))
    n_extracted = tp + fp
    n_gold = tp + fn
    # Guard the divisions: an empty extraction or an empty gold set is a
    # legitimate input while experimenting, not a reason to crash the cell.
    precision = float(tp) / n_extracted if n_extracted else 0.0
    recall = float(tp) / n_gold if n_gold else 0.0
    print("Precision: %0.3f (%s/%s)" % (precision, tp, n_extracted))
    print("Recall: %0.3f (%s/%s)" % (recall, tp, n_gold))

In [27]:
# Set to False to force the gold data to be re-read from disk.
gold_loaded = True

if gold_loaded:
    try:
        # Touch every name later cells rely on, so a partially populated
        # kernel is rebuilt instead of being reported as loaded.
        gold_parts, gold_parts_extended
        print("Using %s gold (doc, part) pairs." % len(gold_pairs))
    except NameError:
        # Only "not defined yet" means "not in memory"; other errors surface.
        print("Gold data not in memory yet.")
        gold_loaded = False
if not gold_loaded:
    gold_pairs = load_gold()
    (gold_docs, gold_parts) = zip(*gold_pairs)
    # Build the matcher dictionary: each gold part with every known suffix
    # appended, with a trailing known suffix stripped, and in spaced form.
    gold_parts_extended = []
    for part in gold_parts:
        for suffix in ['', 'A','B','C','-16','-25','-40']:
            gold_parts_extended.append(''.join([part,suffix]))
            # Skip the empty suffix: part[:-0] is part[:0] == '', which used
            # to put the empty string into the dictionary.
            if suffix and part.endswith(suffix):
                gold_parts_extended.append(part[:-len(suffix)])
        # Spaced variant, e.g. 'BC547A' -> 'BC 547 A'. It does not depend on
        # the suffix, so it is added once per part rather than inside the
        # suffix loop (where it was appended repeatedly).
        if part[:2].isalpha() and part[2:-1].isdigit() and part[-1].isalpha():
            gold_parts_extended.append(' '.join([part[:2], part[2:-1], part[-1]]))
    print("Loaded %s gold (doc, part) pairs." % len(gold_pairs))
    print("Dictionary contains %d potential part numbers." % len(gold_parts_extended))

Using 542 gold (doc, part) pairs.


### Table Parser

In [28]:
corpus_loaded = True

if corpus_loaded:
    try:
        if isinstance(parser, TableParser):
            print "Corpus from %s in memory with %d documents" \
            % (type(parser).__name__, len(corpus.documents))
        else:
            print "Corpus in memory was parsed with %s"% type(parser).__name__
            corpus_loaded = False
    except:
        print "Corpus not in memory yet."
        corpus_loaded = False
if not corpus_loaded:
    parser = TableParser()
    print "Loading corpus for %s..." % type(parser).__name__
    %time corpus = load_corpus(parser)
    print "Corpus loaded with %d documents" % len(corpus.documents)

Corpus in memory was parsed with OmniParser
Loading corpus for TableParser...
CPU times: user 15.5 s, sys: 2.21 s, total: 17.7 s
Wall time: 18.3 s
Corpus loaded with 101 documents


In [32]:
cand_space = Ngrams(n_max=3)
%time parts = extract_part_numbers(corpus, cand_space, gold_parts_extended)
print "Extracted %s candidate part numbers." % len(parts)

g_table_parts = set(gold_parts)
x_table_parts = set([p.get_span() for p in parts])
print "\nPart Stats:"
print_stats(g_table_parts, x_table_parts)

g_table_pairs = set(gold_pairs)
x_table_pairs = set([(p.context.document.name, p.get_span()) for p in parts])
print "\nPair Stats:"
print_stats(g_table_pairs, x_table_pairs)

CPU times: user 1.62 s, sys: 48.9 ms, total: 1.67 s
Wall time: 1.66 s
Extracted 1312 candidate part numbers.

Part Stats:
Precision: 0.852 (144/169)
Recall: 0.804 (144/179)

Pair Stats:
Precision: 0.592 (251/424)
Recall: 0.463 (251/542)


### OmniParser

In [14]:
reparse_corpus = True
save_new_corpus = True

if reparse_corpus:
    parser = OmniParser()
    %time corpus = parse_corpus(parser, max_docs=101)
    if save_new_corpus:
        save_corpus(corpus, parser)



Corpus has been parsed.
CPU times: user 3min 45s, sys: 13.1 s, total: 3min 58s
Wall time: 6min 29s
CPU times: user 25.1 s, sys: 2.4 s, total: 27.5 s
Wall time: 28.9 s
Corpus has been pickled.


In [33]:
corpus_loaded = True

if corpus_loaded:
    try:
        if isinstance(parser, OmniParser):
            print "Corpus from %s in memory with %d documents" \
            % (type(parser).__name__, len(corpus.documents))
        else:
            print "Corpus in memory was parsed with %s"% type(parser).__name__
            corpus_loaded = False
    except:
        print "Corpus not in memory yet."
        corpus_loaded = False
if not corpus_loaded:
    parser = OmniParser()
    print "Loading corpus for %s..." % type(parser).__name__
    %time corpus = load_corpus(parser)
    print "Corpus loaded with %d documents" % len(corpus.documents)

Corpus in memory was parsed with TableParser
Loading corpus for OmniParser...
CPU times: user 29 s, sys: 2.62 s, total: 31.7 s
Wall time: 33.7 s
Corpus loaded with 101 documents


In [40]:
cand_space = Ngrams(n_max=3)
%time parts = extract_part_numbers(corpus, cand_space, gold_parts)
print "Extracted %s candidate part numbers." % len(parts)

g_omni_parts = set(gold_parts)
x_omni_parts = set([p.get_span().upper() for p in parts])
print "\nPart Stats:"
print_stats(g_omni_parts, x_omni_parts)

g_omni_pairs = set(gold_pairs)
x_omni_pairs = set([(p.context.document.name, p.get_span().upper()) for p in parts])
print "\nPair Stats:"
print_stats(g_omni_pairs, x_omni_pairs)

CPU times: user 8.97 s, sys: 80 ms, total: 9.05 s
Wall time: 9.13 s
Extracted 1912 candidate part numbers.

Part Stats:
Precision: 1.000 (167/167)
Recall: 0.933 (167/179)

Pair Stats:
Precision: 0.738 (385/522)
Recall: 0.710 (385/542)


### Implicit Mentions

In [42]:
# Demo: expand a slash-separated shorthand into the individual strings it
# stands for. list(...) materialises the result so the notebook displays it;
# the displayed order is not guaranteed to be alphabetical.
from snorkel.utils import expand_implicit_text
list(expand_implicit_text("BC547A/B/C"))

['BC547A', 'BC547C', 'BC547B']

In [50]:
from snorkel.candidates import NgramsWithRanges
cand_space = NgramsWithRanges(n_max=3)
%time parts = extract_part_numbers(corpus, cand_space, gold_parts_extended)
print "Extracted %s candidate part numbers." % len(parts)

g_implicit_parts = set(gold_parts)
x_implicit_parts = set([p.get_span().upper() for p in parts])
print "\nPart Stats:"
print_stats(g_implicit_parts, x_implicit_parts)

g_implicit_pairs = set(gold_pairs)
x_implicit_pairs = set([(p.context.document.name, p.get_span().upper()) for p in parts])
print "\nPair Stats:"
print_stats(g_implicit_pairs, x_implicit_pairs)

CPU times: user 42.9 s, sys: 1.11 s, total: 44 s
Wall time: 55.9 s
Extracted 6458 candidate part numbers.

Part Stats:
Precision: 0.869 (172/198)
Recall: 0.961 (172/179)

Pair Stats:
Precision: 0.627 (416/664)
Recall: 0.768 (416/542)


### Suffix Groups

In [51]:
# Widen the extracted set: every extracted part is also tried with each known
# suffix appended, then scored against the same gold sets as the previous cell.
g_group_parts = g_implicit_parts
x_group_parts = set()
for part in x_implicit_parts:
    for suffix in ('', 'A', 'B', 'C', '-16', '-25', '-40'):
        x_group_parts.add(part + suffix)
print("Considering %d candidate part numbers" % len(x_group_parts))

print("\nPart Stats:")
print_stats(g_group_parts, x_group_parts)

# Pair-level version of the same expansion.
# NOTE(review): spaces are stripped from the part here but not in the
# part-level loop above -- confirm the difference is intended.
g_group_pairs = g_implicit_pairs
x_group_pairs = set()
for (doc, part) in x_implicit_pairs:
    for suffix in ('', 'A', 'B', 'C', '-16', '-25', '-40'):
        x_group_pairs.add((doc, part.replace(' ', '') + suffix))
print("\nPair Stats:")
print_stats(g_group_pairs, x_group_pairs)

Considering 1313 candidate part numbers

Part Stats:
Precision: 0.136 (179/1313)
Recall: 1.000 (179/179)

Pair Stats:
Precision: 0.125 (541/4323)
Recall: 0.998 (541/542)


In [52]:
# Gold (doc, part) pairs that the suffix-expanded extraction still misses.
g_group_pairs - x_group_pairs

{('PJECS00521-1', 'MMBT3904')}