In [55]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Setup

In [56]:
config = {
    'domain': 'spouse',
    'postgres': False,
    'parallelism': 1,
    'db_name': 'babble_spouse_demo',
    'debug': False,
    'babbler_candidate_split': 1,
    'babbler_label_split': 1,
    'disc_model_search_space': 1,
    'gen_model_search_space': 1,
    'supervision': 'majority_vote',
}

In [57]:
# Get DB connection string and add to globals
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if 'postgres' in config and config['postgres']:
    DB_TYPE = 'postgres'
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [58]:
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

In [59]:
from snorkel.models import candidate_subclass
from tutorials.babble.spouse import SpousePipeline

Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
candidate_class = Spouse
pipe = SpousePipeline(session, Spouse, config)

## Parse, Extract, Load

In [60]:
# %time pipe.parse()

In [61]:
# %time pipe.extract()

In [62]:
# %time pipe.load_gold()

## Now the real work begins...

In [63]:
from snorkel.contrib.babble import BabbleStream
bs = BabbleStream(session, candidate_class=Spouse, balanced=True, seed=123)

In [64]:
# from tutorials.babble.spouse.spouse_examples import get_explanations, get_user_lists

# candidates = session.query(Spouse).filter(Spouse.split == 0).all()
# spouse_explanations = get_explanations(candidates)
# spouse_user_lists = get_user_lists()

In [65]:
# bs.preload(explanations=spouse_explanations, user_lists=spouse_user_lists)

In [66]:
c = bs.next()

In [67]:
from snorkel.lf_helpers import *
from IPython.core.display import display, HTML

In [68]:
chunks = get_text_splits(c)
arg_tmpl = u'<b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">{0}<small style="color:#4B86A8; font-size:9.5pt;">{1}</small></b>'
sent_tmpl = u'<p style="font-size:12pt;">{}</p>'
div_tmpl = u'''<div style="border: 1px dotted #858585; border-radius:8px;
    background-color:#FDFDFD; padding:5pt 10pt 5pt 10pt">{}</div>'''
text = u""
for s in chunks[0:]:
    if s in [u"{{A}}", u"{{B}}"]:
        span = (c[0].get_span(), " arg1 ") if s == u"{{A}}" else (c[1].get_span(), " arg2 ")
        text += arg_tmpl.format(span[0], span[1])
    else:
        text += s.replace(u"\n", u"<BR/>")
html = div_tmpl.format(sent_tmpl.format(text.strip()))
HTML(html)

In [69]:
print c[0].get_span()
print c[1].get_span()

Sherilyn
Stephen


In [70]:
from snorkel.viewer import SentenceNgramViewer
sv = SentenceNgramViewer([c], session, n_per_page=1, height=200)
sv

<IPython.core.display.Javascript object>

A Jupyter Widget

In [71]:
from snorkel.contrib.babble import Explanation
label = True
condition = "husband is between arg1 and arg2"
explanation = Explanation(condition, label, candidate=c)
explanation

Explanation("True, husband is between arg1 and arg2")

In [72]:
%time parse_list, conf_matrix_list, stats_list = bs.apply(explanation)

Created grammar with 494 rules
1 explanation(s) out of 1 were parseable.
1 parse(s) generated from 1 explanation(s).
1 parse(s) remain (0 parse(s) removed by DuplicateSemanticsFilter).
1 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
Applying labeling functions to split 1

1 parse(s) remain (0 parse(s) removed by UniformSignatureFilter: (0 None, 0 All)).
1 parse(s) remain (0 parse(s) removed by DuplicateSignatureFilter).
CPU times: user 10.3 s, sys: 2.17 s, total: 12.4 s
Wall time: 15.7 s


In [73]:
len(stats_list)

1

In [74]:
parse_list[0].semantics

('.root',
 ('.label',
  ('.bool', True),
  ('.call',
   ('.in', ('.extract_text', ('.sentence',))),
   ('.string', 'husband'))))

In [75]:
conf_matrix_list[0]

ConfusionMatrix(correct=set([Spouse(Span("Danny Moder", sentence=24849, chars=[24,34], words=[4,5]), Span("Julia", sentence=24849, chars=[316,320], words=[64,64])), Spouse(Span("Neil Patrick Harris", sentence=47685, chars=[27,45], words=[7,9]), Span("David Burtka", sentence=47685, chars=[79,90], words=[16,17])), Spouse(Span("Davis", sentence=37356, chars=[13,17], words=[2,2]), Span("Joe", sentence=37356, chars=[62,64], words=[12,12])), Spouse(Span("Jennifer Aniston", sentence=8115, chars=[72,87], words=[13,14]), Span("Justin Theroux", sentence=8115, chars=[101,114], words=[17,18])), Spouse(Span("Kate", sentence=57681, chars=[16,19], words=[4,4]), Span("Jon", sentence=57681, chars=[34,36], words=[7,7])), Spouse(Span("Avril", sentence=52126, chars=[68,72], words=[13,13]), Span("Chad Kroeger", sentence=52126, chars=[95,106], words=[18,19])), Spouse(Span("Simpson", sentence=46386, chars=[0,6], words=[0,0]), Span("O.J. Simpson", sentence=46386, chars=[68,79], words=[15,16])), Spouse(Span("A

In [76]:
print(stats_list[0].accuracy)
print(stats_list[0].class_coverage)

Accuracy: 25.56% (57/223)
ClassCoverage: 121.86% (223/183)


In [77]:
from snorkel.viewer import SentenceNgramViewer
error_set = conf_matrix_list[0].correct
sv = SentenceNgramViewer(list(error_set)[:10], session, n_per_page=3, height=300)
sv

<IPython.core.display.Javascript object>

A Jupyter Widget

In [78]:
bs.get_label_matrix()

You must commit before retrieving the label matrix.


In [24]:
global_coverage = bs.get_global_coverage()
print(global_coverage)

AxisError: axis 1 is out of bounds for array of dimension 0

In [79]:
bs.commit([0]) # Permanently adds the parses corresponding to these idxs

Added 1 parse(s) to set. (Total # parses = 1)
Added 1 explanation(s) to set. (Total # explanations = 1)


Confirm that after committing, global coverage goes up.

In [27]:
global_coverage = bs.get_global_coverage()
print(global_coverage)

AxisError: axis 1 is out of bounds for array of dimension 0

In [28]:
L_train = bs.get_label_matrix()
L_train

### Add another explanation

In [None]:
from snorkel.contrib.babble import Explanation
label = False
condition = "'where' is within two words to the right of arg 1"
explanation = Explanation(condition, label, candidate=c, name='')

In [None]:
%time parse_list, conf_matrix_list, stats_list = bs.apply(explanation)

In [None]:
print(stats_list[0].accuracy)
print(stats_list[0].class_coverage)

In [None]:
bs.commit()

In [None]:
parse = parse_list[0]
parse.semantics

In [None]:
bs.semparser.grammar.translate(parse.semantics)

In [None]:
pipe.lfs = [parse.function for parse in bs.parses]
pipe.label()

In [None]:
# %time pipe.supervise()

In [None]:
# %time pipe.classify()