## Part VI: Natural Language Supervision

We will compare the performance of:

(a) traditional supervision - positive and negative labels on examples

(b) natural language supervision - explanations converted into labeling functions (LFs), which are then denoised and applied to unlabeled data to create a much larger but noisy training set

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os

# Point Snorkel at a PostgreSQL database instead of SQLite.
# Note that this is necessary for parallel execution amongst other things...
# NOTE(review): the variable is set before `snorkel` is imported below, presumably
# because the connection string is read at import time -- keep this ordering; confirm.
os.environ['SNORKELDB'] = 'postgres:///semparse'

import numpy as np
from snorkel import SnorkelSession
# Session object passed to the loaders, scorers and search in the cells that follow.
session = SnorkelSession()

We repeat our definition of the `Spouse` `Candidate` subclass from Parts II and III.

In [2]:
from snorkel.models import candidate_subclass
# Candidate subclass named 'Spouse' with two arguments, person1 and person2.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])

In [3]:
from snorkel.annotations import FeatureAnnotator
# In this notebook the annotator is only used to load feature matrices per split.
featurizer = FeatureAnnotator()

In [4]:
# Load one feature matrix per split used here: 0 (train), 1 (dev) and 3 (unlabeled).
# A generator expression keeps the loop variable out of the notebook namespace.
F_train, F_dev, F_unlabeled = (
    featurizer.load_matrix(session, split=split_id) for split_id in (0, 1, 3)
)

In [5]:
from snorkel.annotations import load_gold_labels

# Gold labels stored under the 'gold' annotator for split 0 (train) and split 1 (dev).
L_gold_train, L_gold_dev = (
    load_gold_labels(session, annotator_name='gold', split=split_id)
    for split_id in (0, 1)
)

## (b) Natural Language Supervision

In [8]:
# Python labeling functions: fetched from the local python_lfs module and
# pretty-printed so they can be compared with the explanations defined below.
from pprint import pprint
from python_lfs import get_python_lfs

python_lfs = get_python_lfs()
pprint(python_lfs)
# Presumably: uncomment to run the rest of the notebook on the Python LFs instead
# of the LFs parsed from the explanations -- confirm before relying on it.
# LFs = python_lfs

[<function LF_distant_supervision at 0x7f60812e3398>,
 <function LF_distant_supervision_last_names at 0x7f60812e3410>,
 <function LF_husband_wife at 0x7f608130cc80>,
 <function LF_husband_wife_left_window at 0x7f608130cf50>,
 <function LF_same_last_name at 0x7f608130cb90>,
 <function LF_no_spouse_in_sentence at 0x7f60812e3050>,
 <function LF_and_married at 0x7f60812e30c8>,
 <function LF_familial_relationship at 0x7f60812e3140>,
 <function LF_family_left_window at 0x7f60812e31b8>,
 <function LF_other_relationship at 0x7f60812e3230>]


In [9]:
# Natural-language labeling functions.
#
# Named word lists that the explanations refer to ("spouse word", "family word",
# "coworker word").
spouse = ['wife', 'husband', 'ex-wife', 'ex-husband']

family = [
    'father', 'mother', 'sister', 'brother', 'son', 'daughter',
    'grandfather', 'grandmother', 'uncle', 'aunt', 'cousin',
]
# Extend the base terms with their "-in-law" variants (base terms come first).
family = family + [f + '-in-law' for f in family]

coworker = ['boss', 'employee', 'secretary', 'co-worker']

# Lookup table handed to the semantic parser: list name -> list of words.
user_lists = dict(spouse=spouse, family=family, coworker=coworker)

# One explanation per labeling function; each starts with the label it assigns.
explanations = [
    "Label false because the number of words between arg 1 and arg 2 is larger than 10",
    "Label false because there is a person between arg 1 and arg 2",
    "Label true because there is at least one spouse word in the words between arg 1 and arg 2",
    "Label true because there is at least one spouse word within two words to the left of arg 1 or arg 2",
    "Label false because there are no spouse words in the sentence",
    "Label true because the word 'and' is between arg 1 and arg 2 and 'married' is to the right of arg 2",
    "Label false because there is at least one family word between arg 1 and arg 2",
    "Label false because there is at least one family word within two words to the left of arg 1 or arg 2",
    "Label false because there is at least one coworker word between arg 1 and arg 2",
    "Label false because arg 1 is identical to arg 2",
]

In [10]:
import os
import sys

# Location of the sippycup checkout that is put at the front of the import path.
# Set the SIPPYCUP_HOME environment variable to point at your own checkout; when it
# is unset the original author's local path is used, so existing setups are unchanged.
SIPPYCUP_PATH = os.environ.get('SIPPYCUP_HOME', '/Users/bradenhancock/sippycup')
sys.path.insert(0, SIPPYCUP_PATH)

In [11]:
from sippy_snorkel_interface import SemanticParser

# NOTE(review): the recorded run of this cell failed with
# "ImportError: No module named sippy_snorkel_interface". The module is presumably
# expected in the directory inserted into sys.path in the previous cell -- confirm
# that path before running the remaining cells, which all depend on LFs.
sp = SemanticParser()
# Parse the explanations, passing the named word lists, into LFs.
LFs = sp.parse(explanations, user_lists=user_lists, verbose=True)

ImportError: No module named sippy_snorkel_interface

In [None]:
# Print every labeling function in LFs. The parenthesised form behaves the same
# on Python 2 (print statement with a parenthesised expression) and on Python 3
# (print function), so the cell no longer depends on the interpreter version.
for lf in LFs:
    print(lf)

### Investigate one LF

In [None]:
# lf = LFs[4]
# print lf

In [None]:
# labeled = []
# for c in session.query(Spouse).filter(Spouse.split == 3).all():
#     try:
#         if lf(c) != 0:
#             labeled.append(c)
#     except:
#         pass
# print "Number labeled:", len(labeled)

In [None]:
# from snorkel.viewer import SentenceNgramViewer

# sv = SentenceNgramViewer(labeled[:300], session)
# sv

In [None]:
# from snorkel.lf_helpers import test_LF
# tp, fp, tn, fn = test_LF(session, lf, split=0, annotator_name='gold')

### Apply all LFs

In [None]:
from snorkel.annotations import LabelAnnotator
# Annotator built from the labeling functions currently bound to LFs.
labeler = LabelAnnotator(f=LFs)

In [None]:
# Seed NumPy's global RNG before labeling.
# NOTE(review): whether labeler.apply draws on this RNG is not visible here.
np.random.seed(1701)
# Apply the labeling functions to split 3 and report how long the call takes.
%time L_unlabeled = labeler.apply(split=3)
# Bare expression: show the resulting label matrix as the cell output.
L_unlabeled

In [None]:
# Per-LF statistics on the unlabeled split, shown as the cell output.
L_unlabeled.lf_stats(session)

### Generative Model

In [None]:
from snorkel.learning import GenerativeModel

# Fit the generative model on the label matrix of the unlabeled split.
# The step size is scaled inversely with the number of rows in that matrix.
gen_model = GenerativeModel()
gen_model.train(
    L_unlabeled,
    epochs=500,
    decay=0.95,
    step_size=0.1 / L_unlabeled.shape[0],
    reg_param=1e-6
)

In [None]:
# Marginals of the trained generative model on the unlabeled split; these are used
# below as the training targets of the discriminative model.
train_marginals = gen_model.marginals(L_unlabeled)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the generative model's marginals, labelled so the figure can be
# read on its own when the notebook is skimmed.
plt.hist(train_marginals, bins=20)
plt.xlabel('Training marginal')
plt.ylabel('Count')
plt.title('Distribution of generative-model training marginals')
plt.show()

In [None]:
# Label matrix for the dev split (split 1).
# NOTE(review): apply_existing presumably reuses the LF columns created by the
# earlier labeler.apply call rather than creating new ones -- confirm.
L_dev = labeler.apply_existing(split=1)

In [None]:
# Score the generative model on the dev split against the gold labels; the four
# returned values are unpacked as tp, fp, tn, fn.
tp, fp, tn, fn = gen_model.score(session, L_dev, L_gold_dev)

In [None]:
# Per-LF statistics on the dev split, given the gold labels and the LF accuracies
# taken from the generative model's weights.
L_dev.lf_stats(session, L_gold_dev, gen_model.weights.lf_accuracy())

### Discriminative Model

In [None]:
from snorkel.learning import SparseLogisticRegression
# Discriminative model; the same object is passed to the random search and then
# trained and scored directly in the cells below.
disc_model = SparseLogisticRegression()

In [None]:
from snorkel.learning.utils import MentionScorer
from snorkel.learning import RandomSearch, ListParameter, RangeParameter

# Search ranges for the learning rate and for the L1 and L2 penalties.
# All three use the same bounds (1e-6 to 1e-2) with log_base=10.
rate_param = RangeParameter('lr', 1e-6, 1e-2, step=1, log_base=10)
l1_param = RangeParameter('l1_penalty', 1e-6, 1e-2, step=1, log_base=10)
l2_param = RangeParameter('l2_penalty', 1e-6, 1e-2, step=1, log_base=10)

# Random search over those three parameters, training disc_model on the
# unlabeled-split features with the generative model's marginals as targets.
searcher = RandomSearch(
    session,
    disc_model,
    F_unlabeled,
    train_marginals,
    [rate_param, l1_param, l2_param],
    n=20
)

In [None]:
# Re-seed NumPy's global RNG immediately before the random search.
np.random.seed(1701)
# Run the search, passing the dev-split features and gold labels.
searcher.fit(F_dev, L_gold_dev, n_epochs=50, rebalance=True, print_freq=25)

In [None]:
# Train disc_model with fixed settings (20 epochs, lr=0.001).
# NOTE(review): this is the same model object the random search above was given, so
# this call presumably replaces whatever the search left in it -- confirm that
# discarding the searched hyperparameters is intended.
disc_model.train(F_unlabeled, train_marginals, n_epochs=20, lr=0.001)

In [None]:
# Score the discriminative model on the dev split; the four returned values are
# discarded, so the call is made only for whatever score() itself reports.
_, _, _, _ = disc_model.score(session, F_dev, L_gold_dev)