# Environment Setup

In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
config = {
    'domain': 'spouse',
    'postgres': False,
    'parallelism': 1,
    'db_name': 'babble_spouse_demo',
    'babbler_candidate_split': 1,
    'babbler_label_split': 1,
    'supervision': 'majority',
    'disc_model_class': 'logreg',
    'gen_model_search_space': 1,
    'disc_model_search_space': 1,
}

In [25]:
# Get DB connection string and add to globals
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if 'postgres' in config and config['postgres']:
    DB_TYPE = 'postgres'
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_demo.db


In [26]:
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.babble.pipelines import merge_configs
config = merge_configs(config)

Overwriting disc_model_search_space=10 to disc_model_search_space=1


In [27]:
from snorkel.models import candidate_subclass
from tutorials.babble.spouse import SpousePipeline

Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
candidate_class = Spouse
pipe = SpousePipeline(session, Spouse, config)

### Initialization

In [28]:
# %time pipe.parse()

In [29]:
# %time pipe.extract()

In [30]:
# %time pipe.load_gold()

In [31]:
# %time pipe.featurize()

### Pre-load Explanations (10) + User Lists (4)

In [32]:
from snorkel.contrib.babble import BabbleStream
bs = BabbleStream(session, candidate_class=Spouse, balanced=True, shuffled=True, seed=1234)

In [33]:
# from tutorials.babble.spouse.spouse_examples import get_explanations, get_user_lists

# spouse_explanations = get_explanations()
# spouse_user_lists = get_user_lists()
# spouse_explanations = []
# spouse_user_lists = {}

In [34]:
# bs.preload(explanations=spouse_explanations, user_lists=spouse_user_lists)

Created grammar with 595 rules
Flushing all parses from previous explanation set.
9 explanation(s) out of 9 were parseable.
23 parse(s) generated from 9 explanation(s).
11 parse(s) remain (12 parse(s) removed by DuplicateSemanticsFilter).
Note: 11 LFs did not have candidates and therefore could not be filtered.
11 parse(s) remain (0 parse(s) removed by ConsistencyFilter).
### Applying labeling functions to split 1

### Done in 40.2s.

10 parse(s) remain (1 parse(s) removed by UniformSignatureFilter: (1 None, 0 All)).
8 parse(s) remain (2 parse(s) removed by DuplicateSignatureFilter).
Added 8 parse(s) from 8 explanations to set. (Total # parses = 8)
### Applying labeling functions to split 0

Stored 68168 triples for split 0. Now shape is (23425, 8).
### Done in 242.3s.

### Applying labeling functions to split 2

Stored 5200 triples for split 2. Now shape is (1815, 8).
### Done in 18.7s.



# Start Demo:

### View user_lists

In [35]:
user_lists = bs.user_lists
for alias, values in user_lists.items():
    if len(values) <= 10:
        print("{}:\n {}\n".format(alias, values))
    else:
        print("{}:\n {}...\n".format(alias, list(values)[:10]))

Optionally add another user_list.

In [36]:
ALIAS = None    # e.g., 'marriage_words'
VALUES = []    # e.g., ['engaged', 'betrothed', 'proposed']

if ALIAS:
    bs.add_user_lists({ALIAS: VALUES})

### View a candidate

In [37]:
c = bs.next()
from snorkel.viewer import SentenceNgramViewer
sv = SentenceNgramViewer([c], session, n_per_page=1, height=max(len(c.get_parent().words)*2, 80))
sv

<IPython.core.display.Javascript object>

A Jupyter Widget

In [38]:
from snorkel.lf_helpers import *
from IPython.core.display import HTML

In [39]:
def candidate_html(c):
    chunks = get_text_splits(c)
    div_tmpl = u'''<div style="border: 1px #858585; box-shadow:0 4px 8px 0 rgba(0, 0, 0, 0.2), 0 6px 20px 0 rgba(0, 0, 0, 0.19);
    background-color:#FDFDFD; padding:5pt 10pt 5pt 10pt; width: 80%; margin: auto; margin-top: 2%">{}</div>'''
    arg_tmpl = u'<b style="background-color:#ffd77c;padding:1pt 5pt 1pt 5pt;">{}</b>'
    sent_tmpl = u'<p style="font-size:12pt;">{}</p>'
    text = u""
    for s in chunks[0:]:
        if s in [u"{{A}}", u"{{B}}"]:
            span = c[0].get_span() if s == u"{{A}}" else c[1].get_span()
            text += arg_tmpl.format(span)
        else:
            text += s.replace(u"\n", u"<BR/>")
    html = div_tmpl.format(sent_tmpl.format(text.strip()))
    return html

In [40]:
c

Spouse(Span("Kim", sentence=23467, chars=[42,44], words=[8,8]), Span("Andy", sentence=23467, chars=[158,161], words=[31,31]))

In [41]:
HTML(candidate_html(c))

### Give an explanation

(See MTurk instructions for examples)

In [42]:
LABEL = True
#CONDITION = ("there are no people between X and Y and 'husband' is immediately to the left of Y")
#CONDITION = ("'husband' is immediately to the left of X") # it's faster
CONDITION = ("")
# LABEL = True
# CONDITION = "X and Y are less than 10 words apart and 'wife' is between them"

# LABEL = False
# CONDITION = "'syndrome' occurs within three words to the right of arg 2"

# LABEL = False
# CONDITION = "')' is between X and Y"

# LABEL = True
# CONDITION = "'wife' is immediately before Y and X and Y are within 7 words of each other"


With the user input and the current candidate, we make an Explanation object.

In [43]:
from snorkel.contrib.babble import Explanation
explanation = Explanation(CONDITION, LABEL, candidate=c)

In [44]:
explanation

Explanation("True, ")

### Parse and view labeling stats

In [45]:
%time parse_list, filtered_parses, conf_matrix_list, stats_list = bs.apply(explanation)

Flushing all parses from previous explanation set.
0 explanation(s) out of 1 were parseable.
0 parse(s) generated from 1 explanation(s).
CPU times: user 3.32 ms, sys: 5.35 ms, total: 8.67 ms
Wall time: 6.29 ms


In [46]:
filtered_parses

{'DuplicateSemanticsFilter': [],
 'UnparseableExplanations': [FilteredExplanation(parse=Explanation("Explanation0: True, "), reason='Unparseable')]}

In [47]:
PARSE_IDX = 0
if parse_list:
    parse = parse_list[PARSE_IDX]
    conf_matrix = conf_matrix_list[PARSE_IDX]
    stats = stats_list[PARSE_IDX]

print("Parse {}:\n{}\n".format(PARSE_IDX, bs.semparser.grammar.translate(parse.semantics)))
print(stats.accuracy.numer)
print(stats.accuracy.denom)
print(stats.accuracy)
print(stats.class_coverage.numer)
print(stats.class_coverage.denom)
print(stats.class_coverage)
print(stats.coverage)

IndexError: list index out of range

In [None]:
# some parses were filtered. 
if sum([len(val) for key, val in filtered_parses.iteritems()]) > 0: 
    filtered_analysis = bs.filtered_analysis(filtered_parses)
else:
    filtered_analysis = None

In [None]:
tup_list = zip(parse_list, conf_matrix_list, stats_list)

In [None]:
for item in tup_list[0][1].correct:
    print item

In [None]:
# bs.semparser.grammar.print_chart()

### View labeled candidates

Select the subset of labeled candidates you would like to view.

In [None]:
SUBSET = 'correct' # Must be one of ['correct', 'incorrect', 'abstained']

subset = getattr(conf_matrix_list[0], SUBSET)
def candidate_generator(subset):
    for c in subset: 
        yield c
subset_generator = candidate_generator(subset)

In [None]:
from snorkel.viewer import SentenceNgramViewer
c = subset_generator.next()
sv = SentenceNgramViewer([c], session, n_per_page=3, height=max(len(c.get_parent().words)*2, 80))
sv

In [None]:
for idx in range(len(conf_matrix_list)):
    tf_sentence_dict = {}
    tf_sentence_dict["True"] = [candidate_html(sentence) for sentence in conf_matrix_list[idx].correct]

In [None]:
tf_sentence_dict

### Commit parses

If you are satisfied with the given parses, commit them.

In [48]:
bs.commit()

In [49]:
(f1, pr, re) = bs.get_majority_quality(split=1)
print(f1, pr, re)

### Applying labeling functions to split 0

Stored 68168 triples for split 0. Now shape is (23425, 16).
### Done in 248.7s.

CPU times: user 3min 51s, sys: 10.7 s, total: 4min 2s
Wall time: 4min 9s


In [50]:
num_labels_equiv = bs.get_labeled_equivalent(f1)
num_labels_equiv

### Applying labeling functions to split 2

Stored 5200 triples for split 2. Now shape is (1815, 16).
### Done in 19.0s.

CPU times: user 17.9 s, sys: 1.6 s, total: 19.5 s
Wall time: 19.1 s


In [51]:
# This is here for illustration purposes.
# No need to call this every time; it will be called by set_babbler_matrices.
%time bs.get_label_matrix(1)

CPU times: user 1.24 s, sys: 622 ms, total: 1.86 s
Wall time: 1.95 s


<23425x16 sparse matrix of type '<type 'numpy.int64'>'
	with 136336 stored elements in Compressed Sparse Row format>

### View global stats

In [52]:
bs.get_global_coverage().numer

2381

In [53]:
HTML(bs.get_lf_stats().to_html(columns=['Coverage', 'Overlaps', 'TP', 'FP', 'FN', 'TN', 'Empirical Acc.']))

Unnamed: 0,Coverage,Overlaps,TP,FP,FN,TN,Empirical Acc.
LF_spouse_to_left_0,0.076797,0.07598,88,100,0,0,0.468085
LF_no_spouse_in_sentence_0,0.822304,0.795752,0,0,63,1950,0.968703
LF_family_between_0,0.965278,0.946895,0,0,179,2184,0.924249
LF_family_to_left_0,0.112745,0.112337,0,0,13,263,0.952899
LF_friend_between_0,0.026961,0.026961,0,0,3,63,0.954545
LF_too_far_apart_0,0.555964,0.555964,0,0,76,1285,0.944159
LF_third_wheel_0,0.365196,0.365196,0,0,36,858,0.959732
LF_identical_args_0,0.04902,0.048611,0,0,0,120,1.0


In [54]:
bs.get_parses()

["return 1 if any([s.(any([in(text(no more than 2 word(s) to the left of X)),in(text(no more than 2 word(s) to the left of Y))])) for s in user_list('spouse')]) else 0",
 "return -1 if sum([s.in(text(the sentence)) for s in user_list('spouse')]).(= 0) else 0",
 'return -1 if count([w for w in the word(s) between([X,Y])]).(>= 1) else 0',
 "return -1 if any([s.(any([in(text(no more than 3 word(s) to the left of X)),in(text(no more than 3 word(s) to the left of Y))])) for s in user_list('family')]) else 0",
 "return -1 if any([s.in(text(between([X,Y]))) for s in user_list('friend')]) else 0",
 'return -1 if count(between([X,Y])).(> 10) else 0',
 'return -1 if count([w for w in the word(s) between([X,Y]) if w.ner_tags == PERSON]).(>= 1) else 0',
 'return -1 if text(X).(= text(Y)) else 0']

In [55]:
bs.get_lfs()

[<function snorkel.contrib.babble.grammar.grammar.LF_spouse_to_left_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_no_spouse_in_sentence_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_family_between_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_family_to_left_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_friend_between_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_too_far_apart_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_third_wheel_0>,
 <function snorkel.contrib.babble.grammar.grammar.LF_identical_args_0>]

In [56]:
bs.get_explanations()

[Explanation("LF_spouse_to_left: True, there is a spouse word within two words to the left of arg 1 or arg 2"),
 Explanation("LF_no_spouse_in_sentence: False, there are no spouse words in the sentence"),
 Explanation("LF_family_between: False, there is a family word between arg 1 and arg 2"),
 Explanation("LF_family_to_left: False, there is a family word within three words to the left of arg 1 or arg 2"),
 Explanation("LF_friend_between: False, there is an friend word between arg 1 and arg 2"),
 Explanation("LF_too_far_apart: False, the number of words between arg 1 and arg 2 is larger than 10"),
 Explanation("LF_third_wheel: False, there is a person between arg 1 and arg 2"),
 Explanation("LF_identical_args: False, arg 1 is identical to arg 2")]

In [57]:
bs.get_majority_quality()

(0.0, 0.0, 0.0)

## REPEAT (go back to "START DEMO")

When you have entered all of the explanations that you would like to, run these final cells.

In [None]:
# NOTE: Don't use this. Instead, incrementally label using bs.label_split() after each commit.
pipe.lfs = bs.get_lfs()
%time pipe.label(split=0)
%time pipe.label(split=2)

Clearing existing...
Running UDF...

Labeled split 0: (22195,1) sparse (nnz = 279)

CPU times: user 2min 15s, sys: 2.78 s, total: 2min 17s
Wall time: 2min 29s
Clearing existing...
Running UDF...

Labeled split 2: (2697,1) sparse (nnz = 49)

CPU times: user 15.3 s, sys: 218 ms, total: 15.5 s
Wall time: 15.7 s


In [None]:
%time pipe.set_babbler_matrices(bs, split=1) # Pulls out and saves label matrices from babbler.

In [None]:
%time pipe.supervise()

In [None]:
%time pipe.classify()

Note: in general, we expect Disc to do better than Gen. However, with small sample sizes, major class imbalance, or lack of grid search, those may flip.

## Scratch

This portion of the notebook can be used to find candidates that match a certain explanation.

In [None]:
candidates = session.query(bs.candidate_class).filter(bs.candidate_class.split == 1).all()

In [None]:
from snorkel.contrib.babble import Explanation
explanation = Explanation(
    label=False,
    condition="The last word of X is different than the last word of Y",
    candidate=None)

In [None]:
%time parse_list, filtered_parses, conf_matrix_list, stats_list = bs.apply(explanation)

In [None]:
# bs.filtered_analysis(filtered_parses)

In [None]:
# bs.semparser.grammar.print_chart()

In [None]:
lf = parse_list[0].function

In [None]:
matches = []
for c in candidates:
    if lf(c):
        matches.append(c)

In [None]:
from snorkel.viewer import SentenceNgramViewer
sv = SentenceNgramViewer(matches, session, n_per_page=3, height=300)
sv

In [None]:
c = sv.get_selected()
print(c.get_stable_id())