In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level configuration. When configs are merged further down, these
# values take precedence (nb_config > local_config > global_config).
# Commented-out entries are optional toggles kept for convenience.
config = {
    'project': 'babble',
    'domain': 'spouse',
    'gold_explanations': True,
#     'debug': True,
    'db_name': 'babble_spouse_goldlabeled_tocopy',
#     'lf_source': 'gradturk',
#     'max_explanations': 30,
    'seed': 1,
    'parallelism': 1,
    'splits': [0, 1],
    'disc_model_class': 'logreg',
}

# Sanity check: gold explanations are expected to be paired with a database
# whose name mentions 'gold'. Say what is wrong instead of a bare "WARNING!!!".
if config['gold_explanations'] and 'gold' not in config.get('db_name', ''):
    print("WARNING: 'gold_explanations' is True but db_name={!r} does not "
          "contain 'gold'; check that the right database is selected.".format(
              config.get('db_name', '')))

In [3]:
# Assemble the database URL from `config` and publish it via $SNORKELDB.
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

debug_suffix = '_debug' if config.get('debug', False) else ''
default_db_name = 'babble_' + config['domain'] + debug_suffix
DB_NAME = config.get('db_name', default_db_name)

# sqlite is the default backend; its database name gets a '.db' suffix
DB_TYPE = 'postgres' if config.get('postgres') else 'sqlite'
if DB_TYPE == 'sqlite':
    DB_NAME += '.db'

# The host part is only filled in when a port was configured
if 'db_port' in config:
    DB_ADDR = "localhost:{0}".format(config['db_port'])
else:
    DB_ADDR = ""

snorkel_db_url = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
os.environ['SNORKELDB'] = snorkel_db_url
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_spouse_goldlabeled_tocopy.db


In [4]:
# Open a Snorkel session. This import comes after $SNORKELDB has been set,
# as required by the note in the previous cell.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.pipelines import merge_configs, get_local_pipeline
config = merge_configs(config)

# Debug mode overrides document count, search-space sizes and epoch counts.
# NOTE(review): 'debug' is indexed directly although it is commented out of the
# notebook config; presumably merge_configs supplies a default -- confirm.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Build the candidate class from the name and entity list given in the merged config
from snorkel.models import candidate_subclass
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

# Look up the pipeline class for this domain and instantiate it
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting domain=None to domain=spouse
Overwriting lr=0.01 to lr=0.001
Overwriting print_freq=1 to print_freq=5
Overwriting l1_penalty=1.0 to l1_penalty=0
Overwriting n_epochs=25 to n_epochs=20
Overwriting disc_model_search_space=1 to disc_model_search_space=10
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting init_class_prior=0 to init_class_prior=-1.15
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting gold_explanations=False to gold_explanations=True
Overwriting seed=0 to seed=1
Overwriting splits=[0, 1, 2] to splits=[0, 1]
Overwriting babbler_candidate_split=1 to babbler_candidate_split=[0, 1, 2]
Overwriting disc_model_class=lstm to disc_model_class=logreg
Using SpousePipeline object.


In [5]:
# Print how many candidates the database holds for each split id (0, 1, 2).
candidate_cls = pipe.candidate_class
for split in (0, 1, 2):
    split_query = session.query(candidate_cls).filter(candidate_cls.split == split)
    print(split_query.count())

22195
2796
2697


In [6]:
# %time pipe.parse()

In [7]:
# %time pipe.extract()

In [8]:
# %time pipe.load_gold()

In [9]:
# %time pipe.featurize()

In [10]:
# Time the collect step; per its log output it links explanations to their
# candidates and then calls the babbler.
%time pipe.collect()

Linking candidates...
# CANDIDATES: 27688
Building list of target candidate ids...
Collected 30 unique target candidate ids from 30 explanations.
Gathering desired candidates...
Found 30/30 desired candidates
Linking explanations to candidates...
Linked 30/30 explanations
Calling babbler...
Created grammar with 597 rules
CPU times: user 24.4 s, sys: 1.1 s, total: 25.5 s
Wall time: 25.6 s


In [11]:
# %time pipe.label(clear=True)

## Make X

In [12]:
# Integer ids of the candidate splits
TRAIN, DEV, TEST = 0, 1, 2

# Training candidates are the ones attached to the explanations;
# dev and test candidates are read from the database by split id.
train = [explanation.candidate for explanation in pipe.explanations]
dev, test = [
    session.query(pipe.candidate_class)
    .filter(pipe.candidate_class.split == split_id)
    .all()
    for split_id in (DEV, TEST)
]

In [13]:
from snorkel.annotations import load_label_matrix

# One label matrix per split, positioned so that Ls[split_id] is that split's matrix
Ls = [
    load_label_matrix(pipe.session, split=split_id)
    for split_id in (TRAIN, DEV, TEST)
]

In [14]:
from scipy.sparse import vstack

def candidates_to_features(candidates, Ls):
    """Build a dense feature matrix from the label-matrix rows of `candidates`.

    Args:
        candidates: iterable of candidates. Each must expose a `split`
            attribute that is a valid index into `Ls`.
        Ls: sequence of label matrices indexed by split id. Each must support
            `get_row_index(candidate)` and 2-D indexing `L[row, :]` returning
            a sparse row.

    Returns:
        The dense matrix produced by `.todense()`, with one row per candidate
        in input order. Values are kept as stored (-1 is NOT mapped to 1).

    Raises:
        ValueError: if `candidates` is empty.
    """
    rows = []
    for c in candidates:
        L = Ls[c.split]
        rows.append(L[L.get_row_index(c), :])

    if not rows:
        raise ValueError("candidates_to_features() needs at least one candidate")

    # Stack once at the end; re-stacking inside the loop copied the whole
    # matrix on every iteration.
    X = vstack(rows)

    # Features are the raw label values; the former abs() conversion that
    # turned them into {0,1} indicators is intentionally disabled.
    print("WARNING: This has been modified so it no longer converts -1 -> 1")
    return X.todense()

In [15]:
# Build the feature matrix of each split and print its shape right after it is built
feature_matrices = []
for split_candidates in (train, dev, test):
    X_split = candidates_to_features(split_candidates, Ls)
    print(X_split.shape)
    feature_matrices.append(X_split)

X_train, X_dev, X_test = feature_matrices

(30, 30)
(2796, 30)
(2697, 30)


In [16]:
# X_train2 = np.zeros(X_train.shape)
# for i, c in enumerate(train):
#     for j, lf in enumerate(pipe.lfs):
#         X_train2[i, j] = lf(c)
#         if X_train2[i,j] != X_train[i,j]:
#             import pdb; pdb.set_trace()

In [37]:
# numpy is imported further down in the notebook; import it here as well so
# this cell does not depend on out-of-order execution.
import numpy as np

# Per-row vote counts: how many label columns voted -1 and how many voted +1
negatives = np.sum(np.where(X_train == -1, 1, 0), axis=1)
positives = np.sum(np.where(X_train == 1, 1, 0), axis=1)
total_votes = positives + negatives

# Marginal = fraction of non-abstaining votes that are positive. A row with no
# votes at all would be 0/0 (NaN); it gets the uninformative value 0.5 instead.
has_votes = total_votes > 0
marginals = np.full(total_votes.shape, 0.5)
marginals[has_votes] = positives[has_votes].astype(float) / total_votes[has_votes]
marginals

array([ 0.        ,  0.        ,  0.5       ,  0.        ,  0.75      ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.5       ,  0.5       ,  0.66666667,
        0.66666667,  0.66666667,  0.        ,  0.        ,  0.        ,
        0.        ,  0.5       ,  0.33333333,  1.        ,  0.66666667,
        0.66666667,  0.75      ,  0.5       ,  0.5       ,  0.75      ])

In [31]:
X_train

matrix([[ 0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, -1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, -1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  1],
        [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0, -1,  0,  0,  0

In [40]:
# Same marginals as a fraction of positive votes, derived from net votes:
# (net + total) / (2 * total) == positives / (positives + negatives)
X = X_train
net_votes = np.sum(X, axis=1)
num_votes = np.sum(abs(X), axis=1)

# Rows without any vote evaluate 0/0; silence the warning and replace the
# resulting NaN with the uninformative value 0.5.
with np.errstate(divide='ignore', invalid='ignore'):
    marginals = np.divide(net_votes + num_votes, 2.0 * num_votes)
marginals[np.isnan(marginals)] = 0.5
marginals

matrix([[ 0.        ],
        [ 0.        ],
        [ 0.5       ],
        [ 0.        ],
        [ 0.75      ],
        [ 1.        ],
        [ 0.        ],
        [ 0.        ],
        [ 1.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.5       ],
        [ 0.5       ],
        [ 0.66666667],
        [ 0.66666667],
        [ 0.66666667],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.5       ],
        [ 0.33333333],
        [ 1.        ],
        [ 0.66666667],
        [ 0.66666667],
        [ 0.75      ],
        [ 0.5       ],
        [ 0.5       ],
        [ 0.75      ]])

In [44]:
# Scratch check of NaN handling: flatten the marginals and plant a NaN at index 5.
# NOTE(review): np.ravel may return a view, in which case this assignment also
# modifies `marginals` -- confirm before relying on `marginals` afterwards.
a = np.ravel(marginals)
a[5] = np.nan
a

array([ 0.        ,  0.        ,  0.5       ,  0.        ,  0.75      ,
               nan,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.5       ,  0.5       ,  0.66666667,
        0.66666667,  0.66666667,  0.        ,  0.        ,  0.        ,
        0.        ,  0.5       ,  0.33333333,  1.        ,  0.66666667,
        0.66666667,  0.75      ,  0.5       ,  0.5       ,  0.75      ])

In [45]:
# Overwrite every NaN entry with 0.5
nan_mask = np.isnan(a)
a[nan_mask] = 0.5
a

array([ 0.        ,  0.        ,  0.5       ,  0.        ,  0.75      ,
        0.5       ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.5       ,  0.5       ,  0.66666667,
        0.66666667,  0.66666667,  0.        ,  0.        ,  0.        ,
        0.        ,  0.5       ,  0.33333333,  1.        ,  0.66666667,
        0.66666667,  0.75      ,  0.5       ,  0.5       ,  0.75      ])

## Make y

In [17]:
from snorkel.annotations import load_gold_labels

# One gold-label matrix per split, positioned so that L_golds[split_id] is that split's matrix
L_golds = [
    load_gold_labels(pipe.session, annotator_name='gold', split=split_id)
    for split_id in (TRAIN, DEV, TEST)
]

In [18]:
import numpy as np

print("WARNING: This has been modified so it no longer converts -1 -> 0")

def candidates_to_labels(candidates, L_golds):
    """Return the gold labels of `candidates` as a 1-D numpy array.

    Each candidate's label is read from column 0 of the gold matrix of its
    own split (`L_golds[c.split]`), at the row given by `get_row_index(c)`.
    Labels are returned as stored; -1 is not remapped to 0.
    """
    def gold_label(c):
        gold_matrix = L_golds[c.split]
        return gold_matrix[gold_matrix.get_row_index(c), 0]

    return np.array([gold_label(c) for c in candidates])



In [19]:
# Training labels come from the explanations: int(label) in {0, 1} is mapped to {-1, +1}
y_train = np.array([2 * int(explanation.label) - 1 for explanation in pipe.explanations])

# Dev and test labels are looked up in the gold label matrices
y_dev, y_test = [
    candidates_to_labels(split_candidates, L_golds)
    for split_candidates in (dev, test)
]

for labels in (y_train, y_dev, y_test):
    print(labels.shape)

(30,)
(2796,)
(2697,)


In [20]:
# Mean label per split. With labels in {-1, +1} this lies in [-1, 1]:
# negative means mostly negative examples.
for labels in (y_train, y_dev, y_test):
    print(sum(labels) / float(len(labels)))

0.133333333333
-0.859799713877
-0.833889506859


In [21]:
y_train

array([-1, -1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1,  1,  1,
       -1, -1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1])

In [28]:
# Empirical accuracy and coverage of every label column on the training examples
accs = []
covs = []
num_examples = len(y_train)

for col in range(X_train.shape[1]):
    votes = np.squeeze(np.asarray(X_train[:, col]))
    voted = votes != 0  # zeros are abstains and are ignored
    num_voted = int(np.count_nonzero(voted))
    num_correct = int(np.count_nonzero(votes[voted] == y_train[voted]))
    # Accuracy is undefined for a column that never votes
    acc = float(num_correct) / num_voted if num_voted > 0 else np.nan
    covs.append(float(num_voted) / num_examples)
    accs.append(acc)

# One "accuracy, coverage" line (in percent) per column
for acc, cov in zip(accs, covs):
    print("{:.1f}, {:.1f}".format(acc * 100, cov * 100))

# For every column that never voted, show the explanation at the same index
# together with the translation of its parsed semantics.
for lf_idx, acc in enumerate(accs):
    if not np.isnan(acc):
        continue
    print(pipe.explanations[lf_idx])
    print(pipe.babbler.semparser.grammar.translate(pipe.explanations[lf_idx].semantics))

nan, 0.0
46.2, 86.7
100.0, 3.3
100.0, 3.3
nan, 0.0
100.0, 3.3
100.0, 3.3
100.0, 3.3
100.0, 16.7
100.0, 3.3
nan, 0.0
100.0, 3.3
100.0, 3.3
100.0, 3.3
100.0, 13.3
100.0, 13.3
nan, 0.0
nan, 0.0
100.0, 3.3
100.0, 3.3
100.0, 3.3
100.0, 3.3
100.0, 3.3
100.0, 3.3
83.3, 20.0
nan, 0.0
100.0, 3.3
100.0, 3.3
100.0, 3.3
100.0, 6.7
Explanation("LF1: False, person2 occurs in a phrase surrounded by quotes")
return -1 if ('"'.in(text(greater than 0 word(s) to the left of Y)) and '"'.in(text(greater than 0 word(s) to the right of Y))) else 0
Explanation("LF5: True, person2 is the subject of the sentence and person1 is immediately preceded by 'husband'")
return -1 if False.(= True) else 0
Explanation("LF11: False, person2 is an empty string.")
return -1 if text(Y).(= '  ') else 0
Explanation("LF17: True, 'and' is between person1 and person2, and 'their son' comes less than 10 words after person1 and person2.")
return 1 if ('and'.in(text(between([X,Y]))) and 'their son'.(all([in(text(less than 10 word(s)

## Train and Test

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Pick the inverse regularization strength C with the best F1 on the dev split.
best = -1
for C in [1, 5, 10, 50, 100, 500, 1000, 10000]:
    logreg = LogisticRegression(C=C)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_dev)
    # Labels are in {-1, +1}, so "no positive prediction" has to be tested
    # explicitly: sum(y_pred) == 0 would instead fire on perfectly balanced
    # predictions and miss the all-negative case.
    if not (y_pred == 1).any():
        f1 = 0
    else:
        f1 = f1_score(y_dev, y_pred)
    print("[C = {}]: f1 = {}".format(C, f1))
    if f1 > best:
        best = f1
        best_C = C

print("\nUsing C = {}".format(best_C))

In [None]:
# for i, j in zip(y_dev, y_pred):
#     print((i,j))

In [None]:
# set_params() only changes the hyperparameter on the estimator; the fitted
# coefficients still belong to the last C tried in the search. Refit so the
# test predictions really come from the model with best_C.
logreg.set_params(C=best_C)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

f1 = f1_score(y_test, y_pred)
print("Test F1 = {:.5f}".format(f1))

In [None]:
def homemade_f1(y_test, y_pred):
    """Compute the F1 score of the positive class (label 1) by hand.

    Any label other than 1 counts as negative, so both {-1, +1} and {0, 1}
    encodings are handled.

    Args:
        y_test: iterable of gold labels.
        y_pred: iterable of predicted labels, aligned with `y_test`.

    Returns:
        The F1 score as a float. Returns 0.0 when there are no true
        positives, where precision and/or recall would be 0 or undefined.
    """
    tp = fp = fn = 0
    for gold, pred in zip(y_test, y_pred):
        gold_pos = (gold == 1)
        pred_pos = (pred == 1)
        if gold_pos and pred_pos:
            tp += 1
        elif pred_pos:
            fp += 1
        elif gold_pos:
            fn += 1
        # true negatives do not enter F1

    # Without true positives every ratio below is 0 or 0/0
    if tp == 0:
        return 0.0

    p = float(tp) / (tp + fp)
    r = float(tp) / (tp + fn)
    return 2 * p * r / (p + r)

In [None]:
print(homemade_f1(y_test, y_pred))

## Try Snorkel LogReg model

In [None]:
from snorkel.learning import LogisticRegression

In [None]:
# NOTE: LogisticRegression here is snorkel.learning's class (imported in the
# previous cell); it shadows the sklearn class of the same name used earlier.
slogreg = LogisticRegression()

# Sweep the L2 penalty. The loop variable was previously never handed to
# train(), so every iteration trained the same configuration.
# NOTE(review): assumes train() accepts/forwards an `l2_penalty` keyword --
# confirm against the installed snorkel.learning version.
# NOTE(review): the sweep is scored on the test split; consider the dev split.
for l2_penalty in [1e-3, 1e-2, 1e-1, 1, 10, 0]:
    slogreg.train(X_train, y_train, lr=0.001, batch_size=32, n_epochs=100,
                  l2_penalty=l2_penalty)
    p, r, f1 = slogreg.score(X_test, y_test)
    print("[l2_penalty = {}]: F1 = {}".format(l2_penalty, f1))

In [None]:
# Score the most recently trained Snorkel model on the test split and report F1
test_scores = slogreg.score(X_test, y_test)
p, r, f1 = test_scores
print("F1 = {}".format(f1))