In [1]:
# Enable IPython's autoreload extension in mode 2, so edited project modules
# are re-imported before each cell executes (no kernel restart needed).
%load_ext autoreload
%autoreload 2

## Setup

In [4]:
# Notebook-level settings. A later cell passes this dict through
# merge_configs(), so these values are only the notebook's own overrides.
config = dict(
    domain='spouse',                  # also used to build the default DB name
    postgres=False,                   # falsy -> the DB setup cell picks sqlite
    db_name='babble_spouse_demo',     # overrides the default DB name
    debug=False,                      # truthy -> '_debug' DB suffix and reduced parameters
    babbler_candidate_split=1,        # split whose candidates are queried for the stream
    babbler_label_split=0,            # not read in this notebook's visible cells
)

In [5]:
# Build the database connection string from `config`, publish it through the
# SNORKELDB environment variable, and keep its parts as notebook globals.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default name is 'babble_<domain>' plus a '_debug' suffix in debug mode;
# an explicit 'db_name' entry in config takes precedence over it.
debug_suffix = '_debug' if config.get('debug', False) else ''
default_db_name = 'babble_' + config['domain'] + debug_suffix
DB_NAME = config.get('db_name', default_db_name)

# Postgres only when the 'postgres' entry is present and truthy; otherwise
# sqlite, whose database name also gets a '.db' extension.
use_postgres = bool(config.get('postgres'))
DB_TYPE = 'postgres' if use_postgres else 'sqlite'
if not use_postgres:
    DB_NAME += '.db'

# The host part stays empty unless a port was configured.
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [6]:
# Open the Snorkel session. This import must run after the cell that sets
# $SNORKELDB (see the note in that cell).
from snorkel import SnorkelSession
session = SnorkelSession()

# Merge the notebook config with the other config layers; precedence is
# nb_config > local_config > global_config.
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

# Debug mode shrinks the run: fewer documents, smaller search spaces and
# fewer training epochs.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    debug_overrides = (
        ('max_docs', 100),
        ('gen_model_search_space', 2),
        ('disc_model_search_space', 2),
    )
    for key, value in debug_overrides:
        config[key] = value
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

Overwriting domain=None to domain=spouse
Overwriting babbler_candidate_split=0 to babbler_candidate_split=1
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting decay=0.95 to decay=0.99


In [7]:
from snorkel.models import candidate_subclass
from snorkel.contrib.babble import ExplanationIO
from tutorials.babble.spouse import SpousePipeline

# Declare the two-argument 'Spouse' candidate type (person1, person2) and
# expose it under both names used by the rest of the notebook.
candidate_class = Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

# Pipeline object bound to the session, the candidate type and the merged config.
pipe = SpousePipeline(session, candidate_class, config)

## Parse, Extract, Load

In [8]:
# Disabled stage. NOTE(review): presumably the database was already populated by
# an earlier run; on a fresh database this likely has to be re-enabled - confirm.
# %time pipe.parse()

In [9]:
# Disabled stage. NOTE(review): presumably candidates were already extracted by
# an earlier run; on a fresh database this likely has to be re-enabled - confirm.
# %time pipe.extract()

In [10]:
# Disabled stage. NOTE(review): presumably gold labels were already loaded by
# an earlier run; on a fresh database this likely has to be re-enabled - confirm.
# %time pipe.load_gold()

## Now the real work begins...

In [11]:
# Load every candidate of the configured type whose `split` column equals the
# split chosen via config['babbler_candidate_split'].
target_split = config['babbler_candidate_split']
split_query = session.query(candidate_class).filter(
    candidate_class.split == target_split)
candidates = split_query.all()

In [12]:
from snorkel.contrib.babble import BabbleStream

# Only the first ten queried candidates are handed to the stream.
first_candidates = candidates[:10]
bs = BabbleStream(session, first_candidates, strategy='linear', preload=False)

In [13]:
# Advance the stream and keep what it returns as the current candidate `c`;
# later cells read c[0].get_span() / c[1].get_span() and pass it as candidate=c.
c = bs.next()

In [14]:
from snorkel.lf_helpers import *
from IPython.core.display import display, HTML

In [88]:
# Render the current candidate `c` as an HTML card: the text chunks returned by
# get_text_splits(c) are concatenated, and each argument placeholder chunk is
# replaced by a highlighted badge showing the argument's span text.
#
# All text taken from the candidate is HTML-escaped before being inserted into
# the markup, so characters such as '<' or '&' in a sentence are shown
# literally instead of corrupting (or injecting into) the rendered HTML.
from xml.sax.saxutils import escape  # stdlib; escapes &, < and >

arg_tmpl = u'<b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">{0}<small style="color:#4B86A8; font-size:9.5pt;">{1}</small></b>'
sent_tmpl = u'<p style="font-size:12pt;">{}</p>'
div_tmpl = u'''<div style="border: 1px dotted #858585; border-radius:8px;
    background-color:#FDFDFD; padding:5pt 10pt 5pt 10pt">{}</div>'''

# Placeholder chunk -> (position of the argument inside `c`, badge caption).
arg_slots = {
    u"{{A}}": (0, u" arg1 "),
    u"{{B}}": (1, u" arg2 "),
}

chunks = get_text_splits(c)
pieces = []
for s in chunks:
    if s in arg_slots:
        arg_index, caption = arg_slots[s]
        pieces.append(arg_tmpl.format(escape(c[arg_index].get_span()), caption))
    else:
        # Escape first, then convert newlines, so the <BR/> tags themselves
        # are not escaped.
        pieces.append(escape(s).replace(u"\n", u"<BR/>"))

# Join once instead of growing a string inside the loop.
text = u"".join(pieces)
html = div_tmpl.format(sent_tmpl.format(text.strip()))
HTML(html)

In [16]:
# Show the span text of both arguments of the current candidate.
# Single-argument parenthesized print behaves the same on Python 2 and 3 and
# matches the print(...) style used in the setup cells.
print(c[0].get_span())
print(c[1].get_span())

Mr Simms
Penny Wright


In [48]:
from snorkel.viewer import SentenceNgramViewer

# Viewer over just the current candidate, one per page.
sv = SentenceNgramViewer([c], session, n_per_page=1, height=150)

# Show the first element of the viewer's current selection.
# Alternative: selected[0].sentence.text shows the sentence text instead.
selected = sv.get_selected()
selected[0]

<IPython.core.display.Javascript object>

Span("Mr Simms", sentence=65556, chars=[0,7], words=[0,1])

In [25]:
from snorkel.contrib.babble import Explanation
# Build one explanation for the current candidate `c`: a natural-language
# condition paired with the label it should assign (False here).
label = False
condition = "'replace' between arg1 and arg2"
# Alternative condition kept for experimentation:
# condition = "'consultant' is to the left of arg 1"
# name is left empty - NOTE(review): confirm whether later steps need a unique name.
explanation = Explanation(condition, label, candidate=c, name='')

In [26]:
# Display the repr of the explanation built in the previous cell.
explanation

Explanation("False, 'replace' between arg1 and arg2")

In [27]:
# Apply the explanation through the stream and print the raw result. The saved
# output shows a 2-tuple: a list of ConfusionMatrix objects and a list of
# statistics (currently [None]).
# Single-argument parenthesized print behaves the same on Python 2 and 3.
parse_results = bs.apply(explanation)
print(parse_results)

1 explanation(s) out of 1 were parseable.
1 parse(s) generated from 1 explanation(s).
1 parse(s) remain (0 parse(s) removed by DuplicateSemanticsFilter).
1 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
([ConfusionMatrix(tp=set([]), fp=set([]), tn=set([Spouse(Span("Mr Bennett", sentence=49945, chars=[0,9], words=[0,1]), Span("     '", sentence=49945, chars=[125,130], words=[29,30])), Spouse(Span("Labour", sentence=57408, chars=[43,48], words=[8,8]), Span("Mr Blumenthal's", sentence=57408, chars=[114,128], words=[20,22])), Spouse(Span("Mr Simms", sentence=65556, chars=[0,7], words=[0,1]), Span("Penny Wright", sentence=65556, chars=[22,33], words=[4,5])), Spouse(Span("Penny Wright", sentence=65549, chars=[24,35], words=[3,4]), Span("Robert Simms", sentence=65549, chars=[87,98], words=[13,14]))]), fn=set([]))], [None])


In [28]:
# Split the apply() result into its two parts; raises ValueError if the result
# does not contain exactly two elements.
conf_matrix_list, stats_list = parse_results

In [51]:
# For every confusion matrix, print the sentence text of the first argument of
# each candidate in its `tn` set. `tn` is a set, so the print order is not
# guaranteed to be stable between runs.
# NOTE(review): conf_matrix_items is never filled in this cell; the name is kept
# only in case another cell reads it - confirm and remove if unused.
conf_matrix_items = []
for matrix in conf_matrix_list:
    for item in matrix.tn:
        # Parenthesized form works on both Python 2 and Python 3.
        print(item[0].sentence.text)

Mr Bennett, who walked with two sticks due to a hip replacement, had his back to the van, which was about a metre behind him.     '
The alleged plan never came to fruition as Labour was removed from power and replaced by the Coalition soon after Mr Blumenthal's email.     
Mr Simms will replace Penny Wright in the Senate after she announced her decision to quit politics because of a family illness.   
Outgoing Greens senator Penny Wright is expected to be replaced by Adelaide councillor Robert Simms.   


In [30]:
# Number of candidates in the `tp` set of the first confusion matrix.
# NOTE(review): the saved output (57) disagrees with the empty tp set printed by
# the apply() cell, and execution counts are out of order - the output is
# probably stale from a different explanation; re-run top to bottom to confirm.
len(conf_matrix_list[0].tp)

57

In [36]:
# Number of candidates in the `fp` set of the first confusion matrix.
# NOTE(review): the saved output (166) disagrees with the empty fp set printed by
# the apply() cell, and execution counts are out of order - the output is
# probably stale from a different explanation; re-run top to bottom to confirm.
len(conf_matrix_list[0].fp)

166

In [29]:
# TBD: (will return a pandas DataFrame)
# The statistics entry can still be None (the apply() output above shows
# stats_list == [None]); guard the attribute access so the cell does not raise
# AttributeError on a full re-run. The cell shows nothing in that case.
first_stats = stats_list[0]
first_stats.precision if first_stats is not None else None

In [37]:
# Commit the parse(s) produced so far; the saved output reports one parse added
# and a "TODO: add to label_matrix..." message from the library.
bs.commit_lfs()

Added 1 parse(s) to set. (Total # parses = 1)
TODO: add to label_matrix...


In [31]:
# TBD: (will return a csr_AnnotationMatrix)
# NOTE(review): commit_lfs() printed "TODO: add to label_matrix...", so this call
# may not return a usable matrix yet - confirm before relying on L_train.
L_train = bs.get_label_matrix()