# HARDWARE

## Setup

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

"""
To change attributes:
1) Change ATTRIBUTE and you're good to go
"""
ATTRIBUTE = 'stg_temp_min'
COUNTER = 'scaling'
PARALLEL = 80
PARALLEL_EXTRACTION = 8
SCALE_SIZE = 10

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = os.environ['SNORKELDB'].startswith('postgres')
print snorkel_postgres

In [None]:
import os
if snorkel_postgres:
    os.environ['SNORKELDBNAME'] = ATTRIBUTE + str(COUNTER)
    print os.system("dropdb " + os.environ['SNORKELDBNAME'])
    print os.system("createdb " + os.environ['SNORKELDBNAME'])
    print "SNORKELDBNAME = %s" % os.environ['SNORKELDBNAME']
else:
    try:
        os.remove('snorkel.db')
    except:
        pass

from snorkel import SnorkelSession
session = SnorkelSession()

## Parsing

In [None]:
import os
# Asynchronous parsing only runs when the backend is Postgres; for any other
# backend this cell does nothing and does not define `corpus`.
if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
    print "Starting async parse..."
    
    # PARSE TRAIN
    # Input directories: the HTML documents to parse, and the PDFs handed to
    # the context parser through pdf_path.
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/symlinked_html/'
    pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/symlinked_pdf/'
    doc_parser = HTMLParser()
    # tabular, lingual and visual are all switched on here; the timing notes
    # in this notebook also record runs with lingual or visual set to False.
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    # %time reports the wall/CPU time of the parse. The corpus is stored under
    # the name 'Hardware Scale'; SCALE_SIZE is passed as max_docs and PARALLEL
    # as parallel (presumably the document cap and worker count -- confirm
    # against snorkel.async_parser).
    %time corpus = parse_corpus(session, 'Hardware Scale', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=SCALE_SIZE, parallel=PARALLEL)

    print "%s contains %d documents" % (corpus, len(corpus))

### Timing Results

All parsing features set to True.

Run 0: PARALLEL = 80, SCALE_SIZE = 1e2. Runtime = 35.4s

Run 1: PARALLEL = 80, SCALE_SIZE = 1e3. Runtime = 2min 50sec

Run 2: PARALLEL = 80, SCALE_SIZE = 1e4. Runtime = 22min 21sec

Run 3: PARALLEL = 80, SCALE_SIZE = 1e5. Runtime = 3h 41min 1s

Turning Lingual to False

Run 0: Parallel = 80, SCALE_SIZE = 1e4. Runtime = 11min 34s

Turning visual to False

Run 0: Parallel = 80, SCALE_SIZE = 1e4. Runtime = 21min 28s


## Candidate Extraction

In [None]:
from snorkel.models import candidate_subclass

Part_Attr = candidate_subclass('Part_Attr', ['part','attr'])

from hardware_matchers import get_matcher

dict_path = os.environ['SNORKELHOME'] +\
    '/tutorials/tables/data/hardware/gold_raw/digikey_part_dictionary.csv'
part_matcher = get_matcher('part', dict_path)
attr_matcher = get_matcher(ATTRIBUTE)

from hardware_spaces import get_space
    
part_ngrams = get_space('part')
attr_ngrams = get_space(ATTRIBUTE)

from hardware_throttlers import get_throttler

throttler = get_throttler(ATTRIBUTE)
# throttler = None

from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from snorkel.async_candidates import parallel_extract

ce = CandidateExtractor(Part_Attr, 
                        [part_ngrams, attr_ngrams], 
                        [part_matcher, attr_matcher], 
                        throttler=throttler)

corpus_names = ['Hardware Scale']

for corpus_name in corpus_names:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = parallel_extract(session, ce, corpus, \
                                        corpus_name + ' Candidates', \
                                        parallel=PARALLEL_EXTRACTION)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

### Timing Results

## Featurization

In [None]:
from snorkel.models import CandidateSet
from snorkel.utils import get_ORM_instance

scale = get_ORM_instance(CandidateSet, session, 'Hardware Scale Candidates')

from snorkel.async_annotations import annotate
print "Starting async featurization..."
%time F_scale = annotate(scale, parallel=PARALLEL)

### Timing Results

## Apply LFs

In [None]:
from hardware_lfs import get_lfs

LFs = get_lfs(ATTRIBUTE)

from snorkel.async_annotations import annotate
%time L_scale = annotate(scale, parallel=PARALLEL, lfs=LFs)

### Timing Results

## Learning

In [None]:
from snorkel.learning import NaiveBayes

# Fit a NaiveBayes model on the labeling-function output L_scale and time the
# training run.
gen_model = NaiveBayes()
# NOTE(review): n_iter / rate / mu are presumably the iteration count, step
# size and regularization strength -- confirm against snorkel.learning.
%time gen_model.train(L_scale, n_iter=100000, rate=1e-3, mu=1e-6)
# Marginals computed by the trained model on the same L_scale; the LogReg
# cell consumes them as its training targets.
scale_marginals = gen_model.marginals(L_scale)

In [None]:
from snorkel.learning import LogReg

# Train a LogReg model on the feature annotations F_scale against the
# marginals from the NaiveBayes cell, timing the run with %time.
disc_model = LogReg()
# NOTE(review): n_iter / rate are presumably iteration count and step size --
# confirm against snorkel.learning.
%time disc_model.train(F_scale, scale_marginals, n_iter=2000, rate=1e-4)

### Timing Results

