# STG_TEMP_MAX_2Pass

## Setup

In [1]:
# If necessary:
import os
os.remove('snorkel.db')

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

## Parsing

In [3]:
from snorkel import SnorkelSession
session = SnorkelSession()

### Define Parser

In [4]:
import os
from snorkel.parser import CorpusParser, HTMLParser, OmniParser
from snorkel.utils import get_ORM_instance
from snorkel.queries import split_corpus

html_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_html/'
pdf_path  = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware100_pdf/'
doc_parser = HTMLParser(path=html_path)
context_parser = OmniParser(pdf_path=pdf_path, session=session, visual=True)
cp = CorpusParser(doc_parser, context_parser, max_docs=100) 

### Run Parser

In [5]:
%time corpus = cp.parse_corpus(name='Hardware', session=session)

session.add(corpus)
session.commit()

CPU times: user 3min 57s, sys: 9.07 s, total: 4min 6s
Wall time: 5min 54s


### Split Corpus

In [6]:
from snorkel.models import Corpus

corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=3)

78 Documents added to corpus Hardware Training
19 Documents added to corpus Hardware Development


In [7]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ corpus');

## Extraction

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ corpus snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

In [2]:
from snorkel.models import candidate_subclass

Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Define Matchers

In [3]:
from snorkel.matchers import RegexMatchSpan, Union

eeca_rgx = ur'([b]{1}[abcdefklnpqruyz]{1}[\swxyz]?[0-9]{3,5}[\s]?[A-Z]{0,2}[\/]?[A-Z]{0,2}[0-9]?[A-Z]?([(\-|\u2010|\u2011|\u2012|\u2013|\u2014|\u2212)][A-Z0-9]{1,7})?([(\-|\u2010|\u2011|\u2012|\u2013|\u2014|\u2212)][A-Z0-9]{1,2})?)'
eeca_matcher = RegexMatchSpan(rgx=eeca_rgx, longest_match_only=True)
jedec_rgx = '([123]N\d{3,4}[A-Z]{0,5}[0-9]?[A-Z]?)'
jedec_matcher = RegexMatchSpan(rgx=jedec_rgx, longest_match_only=True)
jis_rgx = '(2S[abcdefghjkmqrstvz]{1}[\d]{2,4})'
jis_matcher = RegexMatchSpan(rgx=jis_rgx, longest_match_only=True)
others_rgx = '((NSVBC|SMBT|MJ|MJE|MPS|MRF|RCA|TIP|ZTX|ZT|TIS|TIPL|DTC|MMBT|PZT){1}[\d]{2,4}[A-Z]{0,3}([-][A-Z0-9]{0,6})?([-][A-Z0-9]{0,1})?)'
others_matcher = RegexMatchSpan(rgx=others_rgx, longest_match_only=True)
# parts_rgx = '|'.join([eeca_rgx, jedec_rgx, jis_rgx, others_rgx])
parts_matcher = Union(eeca_matcher, jedec_matcher, jis_matcher, others_matcher)

temp_matcher = RegexMatchSpan(rgx=r'1[4-6]0', longest_match_only=False)

In [4]:
# import re
# part_finder = re.compile(parts_rgx, re.I)
# print any([part_finder.match(x) for x in ['blue', 'red', 'black', 'green']])
# print any([part_finder.match(x) for x in ['blue', 'red', 'BC546A', 'green']])

### Define ContextSpaces

In [3]:
from hardware_utils import get_gold_dict, get_first_pass_dict, OmniNgramsPart, OmniNgramsTemp, merge_two_dicts
from collections import defaultdict
from snorkel.candidates import OmniNgrams
from snorkel.utils import get_ORM_instance
from snorkel.models import Corpus

eeca_suffix = '^(A|B|C|-16|-25|-40|16|25|40)$'
suffix_matcher = RegexMatchSpan(rgx=eeca_suffix, ignore_case=False)
suffix_ngrams = OmniNgrams(n_max=1)
part_ngrams = OmniNgramsPart(n_max=5)


dev_corpus = get_ORM_instance(Corpus, session, "Hardware Development")

train_corpus = get_ORM_instance(Corpus, session, "Hardware Training")


parts_dev, s_dev, p_dev = get_first_pass_dict(dev_corpus.documents, parts_matcher=parts_matcher, part_ngrams=part_ngrams, suffix_matcher=suffix_matcher, suffix_ngrams=suffix_ngrams)      

parts_train, s_train, p_train = get_first_pass_dict(train_corpus.documents, parts_matcher=parts_matcher, part_ngrams=part_ngrams, suffix_matcher=suffix_matcher, suffix_ngrams=suffix_ngrams)  

parts_by_doc = merge_two_dicts(parts_dev, parts_train)

ImportError: No module named hardware_utils

In [4]:
# import pickle
# filename = '1stpass.pkl'
# with open(filename, 'w') as f:
#     pickle.dump(parts_by_doc, f)
    
import pickle
filename = '1stpass.pkl'
with open(filename, 'r') as f:
    parts_by_doc = pickle.load(f)


In [5]:
from hardware_utils import OmniNgramsPart, OmniNgramsTemp

part_ngrams = OmniNgramsPart(parts_by_doc=parts_by_doc, n_max=5)

# TODO: This is missing the current represented as an Amp rather than a milliamp
temp_ngrams = OmniNgramsTemp(n_max=2)

### Define Throttler

In [6]:
from snorkel.lf_helpers import *

def part_throttler((part_span, attr_span)):
    """
    Keep only those candidates where both spans are on the same page and
    either aligned in the same table (visually or structurally) or the part is global.
    """
    # TODO: Write this in an easier to tweak way. No reason to try and
    # cram it all into a single return statement.
#     return(
#         same_page((part_span, attr_span)) or
#         (part_span.parent.table is None))
    
    # Filter out things like BC546-BC550 that pass through otherwise
    text = part_span.get_span()
    if text.count('-') == 1:
        split = text.split('-')
        if len(split[0]) == len(split[1]):
            return False

    return True

#     if (not part_span.has_visual_features() or not attr_span.has_visual_features()):
#         import pdb; pdb.set_trace()

#     return(part_span.parent.table is None or
#            same_table((part_span, attr_span)))


### Run CandidateExtractor

In [7]:
from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance


ce = CandidateExtractor(Part_Temp, 
                        [part_ngrams, temp_ngrams], 
                        [parts_matcher, temp_matcher],
                        throttler=part_throttler)

for corpus_name in ['Hardware Training', 'Hardware Development']:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = ce.extract(\
        corpus.documents, corpus_name + ' Candidates', session)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Extracting Candidates from Corpus (Hardware Training)
CPU times: user 2min 27s, sys: 1.16 s, total: 2min 28s
Wall time: 2min 28s
Candidate Set (Hardware Training Candidates) contains 115717 Candidates
Extracting Candidates from Corpus (Hardware Development)
CPU times: user 26.1 s, sys: 192 ms, total: 26.3 s
Wall time: 26.4 s
Candidate Set (Hardware Development Candidates) contains 17935 Candidates


### Assess Recall

In [8]:
from hardware_utils import entity_level_total_recall, most_common_document, get_gold_dict
from snorkel.utils import get_ORM_instance
from snorkel.models import Candidate, Corpus

all_candidates = session.query(Candidate).all()
gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'

corpus = get_ORM_instance(Corpus, session, 'Hardware')
(tp, fp, fn) = entity_level_total_recall(
    all_candidates, gold_file, 'stg_temp_max', corpus=corpus, relation=True, integerize=True)
print len(tp)
print len(fp)
print len(fn)

Preparing candidates...
Scoring on Entity-Level Total Recall
Entity-level Candidates extracted: 1838 
Entity-level Gold: 823
Intersection Candidates: 812
----------------------------------------
Overlap with Gold:  0.9866

812
1026
11


In [12]:
from pprint import pprint
fns = list(fn)
pprint(sorted(fns)[:40])
pprint(sorted(parts_by_doc["PHGLS18216-1"]))

[(u'112823', u'BC546/D', 150),
 (u'112823', u'BC548A', 150),
 (u'2N3906-D', u'1N916', 140),
 (u'2N3906-D', u'1N916', 150),
 (u'2N3906-D', u'1N916', 160),
 (u'2N3906-D', u'2N3906', 140),
 (u'2N3906-D', u'2N3906', 160),
 (u'2N3906-D', u'2N3906G', 140),
 (u'2N3906-D', u'2N3906G', 160),
 (u'2N3906-D', u'2N3906RL1', 140),
 (u'2N3906-D', u'2N3906RL1', 160),
 (u'2N3906-D', u'2N3906RL1G', 140),
 (u'2N3906-D', u'2N3906RL1G', 160),
 (u'2N3906-D', u'2N3906RLRA', 140),
 (u'2N3906-D', u'2N3906RLRA', 160),
 (u'2N3906-D', u'2N3906RLRAG', 140),
 (u'2N3906-D', u'2N3906RLRAG', 160),
 (u'2N3906-D', u'2N3906RLRM', 140),
 (u'2N3906-D', u'2N3906RLRM', 160),
 (u'2N3906-D', u'2N3906RLRMG', 140),
 (u'2N3906-D', u'2N3906RLRMG', 160),
 (u'2N3906-D', u'2N3906RLRP', 140),
 (u'2N3906-D', u'2N3906RLRP', 160),
 (u'2N3906-D', u'2N3906RLRPG', 140),
 (u'2N3906-D', u'2N3906RLRPG', 160),
 (u'2N3906-D', u'2N390D', 140),
 (u'2N3906-D', u'2N390D', 150),
 (u'2N3906-D', u'2N390D', 160),
 (u'2N4123-D', u'2N4123', 140),
 (u'2N41

In [None]:
# corpus = get_ORM_instance(Corpus, session, 'Hardware Training')
# for document in corpus.documents:
# #     print document.name
#     if document.name == 'MOTOS03160-1':
#         doc = document
# print doc

In [None]:
# for c in all_candidates:
#     if c.part.get_span()=='BC183' and c.part.parent.document.name=='MOTOS03160-1':
#         print c

In [None]:
# for phrase in doc.phrases:
#     if 'BC183' in phrase.words:
#         print phrase

In [13]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ candidates');

## Gold Labels

In [1]:
# If necessary
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ candidates snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

In [2]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
for set_name in ['Training', 'Development']:
    candidate_set_name = 'Hardware %s Candidates' % set_name
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % set_name
    annotation_key_name = 'Hardware %s Labels -- Gold' % set_name
    %time gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file, \
                           attrib='stg_temp_max')
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

Loading 115717 candidate labels
CPU times: user 3min 40s, sys: 716 ms, total: 3min 40s
Wall time: 3min 40s
80250/115717 Candidates in Candidate Set (Hardware Training Candidates) have positive Labels
Loading 17935 candidate labels
CPU times: user 40.9 s, sys: 156 ms, total: 41.1 s
Wall time: 41.1 s
12926/17935 Candidates in Candidate Set (Hardware Development Candidates) have positive Labels


In [3]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ labels');

## Features

In [1]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ labels snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

### Extract Features

In [2]:
from snorkel.models import CandidateSet
from snorkel.fast_annotations import FeatureManager
from snorkel.utils import get_ORM_instance

train = get_ORM_instance(CandidateSet, session, 'Hardware Training Candidates')
dev   = get_ORM_instance(CandidateSet, session, 'Hardware Development Candidates')

feature_manager = FeatureManager()
%time F_train = feature_manager.create(session, train, 'Train Features')
%time F_dev = feature_manager.update(session, dev, 'Train Features', expand_key_set=False)

Bulk upserting 8645941 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 18min 43s, sys: 1min 59s, total: 20min 43s
Wall time: 20min 43s
Bulk upserting 1492869 annotations...
Done.
Loading sparse Feature matrix...
CPU times: user 3min 56s, sys: 35.4 s, total: 4min 31s
Wall time: 4min 31s


In [3]:
# If necessary:
import os
os.system('cp snorkel.db snorkel.db\ featurized');

## LFs

In [1]:
# If necessary
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ featurized snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

### Define LFs

In [2]:
from snorkel.fast_annotations import LabelManager
from snorkel.lf_helpers import *
label_manager = LabelManager()

LFs = []


###################################################################
# POSITIVE
###################################################################
def LF_storage_row(c):
    return 1 if 'storage' in get_row_ngrams(c.temp) else 0
LFs.append(LF_storage_row)

def LF_junction_row(c):
    return 1 if 'junction' in get_row_ngrams(c.temp) else 0
LFs.append(LF_junction_row)

def LF_operating_row(c):
    return 1 if 'operating' in get_row_ngrams(c.temp) else 0
LFs.append(LF_operating_row)

def LF_temperature_row(c):
    return 1 if 'temperature' in get_row_ngrams(c.temp) else 0
LFs.append(LF_temperature_row)

def LF_to_left(c):
    return 1 if 'to' in get_left_ngrams(c.temp, window=2) else 0
LFs.append(LF_to_left)

def LF_negative_left(c):
    return 1 if any([re.match(r'-\s*\d+', ngram) for ngram in get_left_ngrams(c.temp, window=4)]) else 0
LFs.append(LF_negative_left)

def LF_part_type_aligned(c):
    return 1 if 'type' in get_col_ngrams(c.part) else 0
LFs.append(LF_part_type_aligned)


def LF_tstg_row(c):
    return 1 if overlap(
        ['tstg','stg','ts'], 
        list(get_row_ngrams(c.temp))) else 0
LFs.append(LF_tstg_row)
pos_keys = set(['tstg', 'stg']) # 'value', 'rating'


###################################################################
# NEGATIVE
###################################################################

def LF_not_temp_relevant(c):
    return -1 if not overlap(
        ['storage','temperature','tstg','ts','stg'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_not_temp_relevant)

# def LF_temp_outside_table(c):
#     return -1 if c.temp.parent.table is None else 0
# LFs.append(LF_temp_outside_table)

def LF_too_many_numbers_row(c):
    num_numbers = list(get_row_ngrams(c.temp, attrib="ner_tags")).count('number')
    return -1 if num_numbers >= 3 else 0
LFs.append(LF_too_many_numbers_row)

def LF_high_table_num(c):
    return -1 if (c.temp.parent.table and
        c.temp.parent.table.position > 4) else 0
LFs.append(LF_high_table_num)

def LF_equals_in_row(c):
    return -1 if overlap('=', get_row_ngrams(c.temp)) else 0
LFs.append(LF_equals_in_row)

complementary_keys = set(['empfohlene', 'recommended','complement', 'complementary', 'full', 'line', 'production'])
def LF_complementary_keywords_part(c):
    return -1 if overlap(complementary_keys, get_aligned_ngrams(c.part, infer=True)) else 0
LFs.append(LF_complementary_keywords_part)

# For PHGLS25601-1
def LF_part_too_far_right(c):
    return -1 if (c.part.parent.col_start > 3) else 0
LFs.append(LF_part_too_far_right)

def LF_part_neighbor_phrase_complementary(c):
    neighbor_ngrams = set(get_neighbor_phrase_ngrams(c.part))
    if (overlap(complementary_keys, neighbor_ngrams)):
        return -1
    
    return 0
LFs.append(LF_part_neighbor_phrase_complementary)

def LF_i_in_row(c):
    return -1 if overlap('i', get_row_ngrams(c.temp)) else 0
LFs.append(LF_i_in_row)

def LF_collector_aligned(c):
    return -1 if overlap(
        ['collector', 'collector-current', 'collector-base', 'collector-emitter'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_collector_aligned)

def LF_current_aligned(c):
    ngrams = get_aligned_ngrams(c.temp)
    return -1 if overlap(
        ['current', 'dc', 'ic'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_current_aligned)

def LF_voltage_row_temp(c):
    ngrams = get_aligned_ngrams(c.temp)
    return -1 if overlap(
        ['voltage', 'cbo', 'ceo', 'ebo', 'v'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_voltage_row_temp)

def LF_voltage_row_part(c):
    ngrams = get_aligned_ngrams(c.part)
    return -1 if overlap(
        ['voltage', 'cbo', 'ceo', 'ebo', 'v'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_voltage_row_part)

def LF_test_condition_aligned(c):
    return -1 if overlap(
        ['test', 'condition'],
        list(get_aligned_ngrams(c.temp))) else 0
LFs.append(LF_test_condition_aligned)

def LF_typ_row(c):
    return -1 if overlap(
        ['typ', 'typ.'],
        list(get_row_ngrams(c.temp))) else 0
LFs.append(LF_typ_row)

def LF_complement_left_row(c):
    return -1 if 'complement' in get_row_ngrams(c.part) else 0
LFs.append(LF_complement_left_row)

### Apply LFs

In [3]:
%time L_train = label_manager.create(session, train, 'LF Labels', f=LFs)
L_train

Bulk upserting 2661491 annotations...
Done.
Loading sparse Label matrix...
CPU times: user 37min 10s, sys: 2min, total: 39min 11s
Wall time: 39min 7s


<115717x23 sparse matrix of type '<type 'numpy.float64'>'
	with 471843 stored elements in Compressed Sparse Row format>

### Assess LF accuracy

In [4]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()
%time L_train.lf_stats(train_gold)

CPU times: user 2.08 s, sys: 148 ms, total: 2.22 s
Wall time: 2.16 s


Unnamed: 0,accuracy,conflicts,coverage,j,overlaps
LF_storage_row,0.749782,0.148371,0.148371,0,0.148371
LF_junction_row,0.756123,0.146772,0.146772,1,0.146772
LF_operating_row,0.824608,0.045231,0.045231,2,0.045231
LF_temperature_row,0.756282,0.249695,0.249695,3,0.249695
LF_to_left,0.772809,0.111107,0.111107,4,0.111107
LF_negative_left,0.747863,0.101109,0.101109,5,0.101109
LF_part_type_aligned,0.704022,0.034161,0.034161,6,0.034161
LF_tstg_row,0.806258,0.125383,0.125383,7,0.125383
LF_not_temp_relevant,0.329164,0.04301,0.74626,8,0.685751
LF_too_many_numbers_row,0.321162,0.021985,0.209693,9,0.209693


In [5]:
# If necessary
import os
os.system('cp snorkel.db snorkel.db\ features');

## Learn and Evaluate

In [None]:
# If necessary:
import os
os.remove('snorkel.db');
os.system('cp snorkel.db\ features snorkel.db');

%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass
Part_Temp = candidate_subclass('Part_Temp', ['part','temp'])

In [None]:
from snorkel.fast_annotations import FeatureManager, LabelManager
from snorkel.models import CandidateSet
train = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates').one()
dev = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates').one()

feature_manager = FeatureManager()
%time F_train = feature_manager.load(session, train, 'Train Features')
%time F_dev = feature_manager.load(session, dev, 'Train Features')

label_manager = LabelManager()
%time L_train = label_manager.load(session, train, 'LF Labels')

In [None]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
gen_model.train(L_train, n_iter=100000, rate=1e-6)
%time gen_model.save(session, 'Generative Params')
train_marginals = gen_model.marginals(L_train)

In [None]:
from snorkel.learning import LogReg

disc_model = LogReg()
disc_model.train(F_train, train_marginals, n_iter=2000, rate=1e-4)
%time disc_model.save(session, "Discriminative Params")

In [None]:
train_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Training Candidates -- Gold').one()

dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Development Candidates -- Gold').one()

from snorkel.models import CandidateSet
from snorkel.annotations import LabelManager
label_manager = LabelManager()
L_dev = label_manager.load(session, dev, 'Hardware Development Labels -- Gold')

tp, fp, tn, fn = disc_model.score(F_dev, L_dev, dev_gold)

In [None]:
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance
from hardware_utils import entity_level_f1
import os

dev_corpus = get_ORM_instance(Corpus, session, 'Hardware Development')

gold_file = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/hardware_gold.csv'
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Development').one()
(TP, FP, FN) = entity_level_f1(tp, fp, tn, fn, gold_file, dev_corpus, 'stg_temp_max')

In [None]:
from pprint import pprint
FN_list = sorted(list(FN))
FP_list = sorted(list(FP))
TP_list = sorted(list(TP))
# pprint(FN_list[:])
pprint(FP_list[:100])
# pprint(TP_list[:10])

In [None]:
# docs = set()
# for doc in corpus.documents:
#     docs.add(doc.name.upper())
# pprint(docs)

In [None]:
# import itertools
# docs = set()
# for f in itertools.chain.from_iterable([tp, tn, fp, fn]):
#     docs.add(f.part.parent.document.name.upper())
# #     if f.part.parent.document.name.upper() == 'AUKCS04635-1':
# #         print f
# pprint(docs)

In [44]:
from hardware_utils import entity_to_candidates, part_error_analysis

# disc_model.get_feature_weights(F_dev)

entity = FP_list[10]
matches = entity_to_candidates(entity, dev)
print "Entity: (%d matches)" % len(matches)
print entity

Entity: (2 matches)
(u'DISES00645-1', u'BC808K-16', u'150')


In [45]:
match = matches[0]
print match.part

ImplicitSpan("BC808K-16", parent=50567, words=[1,1], position=[3])


In [62]:
from snorkel.lf_helpers import *

neighbor_ngrams = set(get_neighbor_phrase_ngrams(match.part))
from pprint import pprint
pprint(neighbor_ngrams)

set([u'%',
     '(',
     ')',
     u'+150',
     u',',
     u'-',
     u'--',
     u'-16',
     u'-25',
     u'-40',
     u'-55',
     u'.',
     u'...',
     u'/',
     u'0',
     u'0.1',
     u'0.4',
     u'0.7',
     u'1',
     u'1.1',
     u'1.2',
     u'1.3',
     u'1.9',
     u'10',
     u'100',
     u'115',
     u'160',
     u'170',
     u'2',
     u'2.5',
     u'2.9',
     u'200',
     u'2011-10-26',
     u'25',
     u'250',
     u'3',
     u'30',
     u'300',
     u'4',
     u'40',
     u'400',
     u'45',
     u'5',
     u'50',
     u'500',
     u'630',
     u'6a',
     u'6b',
     u'6c',
     u'6cr',
     u'6cs',
     u'6ct',
     u'6e',
     u'6f',
     u'6g',
     u'70',
     u'<<',
     u'=',
     u'>',
     '[',
     ']',
     u'a',
     u'ag',
     u'all',
     u'and',
     u'approx',
     u'auf',
     u'available',
     u'b',
     u'base',
     u'base-emitter',
     u'basis-emitter-s\xe4ttigungsspannung',
     u'basis-spitzenstrom',
     u'basisstrom',
     u'bc807k',

In [64]:
def LF_part_neighbor_phrase_complementary(c):
    neighbor_ngrams = set(get_neighbor_phrase_ngrams(match.part))
    if (overlap(complementary_keys, neighbor_ngrams)):
        return -1
    
    return 0

LF_part_neighbor_phrase_complementary(match)

-1

In [None]:
results = []
for lf in LFs:
    results.append(lf.__name__, lf(candidate))

In [None]:
print F_train.shape
print F_dev.shape

In [None]:
import os
os.system('cp snorkel.db snorkel.db\ final');