In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
config = {
    'project': 'babble',
    'domain': 'protein',
    'gold_explanations': True,
#     'debug': True,
#     'db_name': 'babble_cdr_featurized_temp',
#     'lf_source': 'gradturk',
#     'max_explanations': 30,
    'seed': 1,
    'parallelism': 1,
    'splits': [0,1,2],
    'disc_model_class': 'lstm',
    'supervision': 'majority',
}

In [3]:
# Get DB connection string and add to globals
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if 'postgres' in config and config['postgres']:
    DB_TYPE = 'postgres'
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_protein.db


In [4]:
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
from snorkel.contrib.pipelines import merge_configs, get_local_pipeline
config = merge_configs(config)

if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 2
    config['disc_model_search_space'] = 2
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

from snorkel.models import candidate_subclass
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting domain=None to domain=protein
Overwriting babbler_candidate_split=1 to babbler_candidate_split=[0, 1, 2]
Overwriting LF_acc_prior_weight_default=1.0 to LF_acc_prior_weight_default=0.5
Overwriting init_class_prior=0 to init_class_prior=-1.39
Overwriting reg_param=0.1 to reg_param=0.5
Overwriting supervision=generative to supervision=majority
Overwriting seed=0 to seed=1
Overwriting gold_explanations=False to gold_explanations=True
Overwriting traditional_split=0 to traditional_split=1
Using ProteinPipeline object.


In [5]:
for split in [0,1,2]:
    print(session.query(pipe.candidate_class).filter(
        pipe.candidate_class.split == split).count())

5546
1011
1058


In [6]:
# %time pipe.parse()

In [7]:
# %time pipe.extract()

In [8]:
# %time pipe.load_gold()

In [9]:
# %time pipe.featurize()

In [10]:
%time pipe.collect()

Linking candidates...
# CANDIDATES: 7615
Building list of target candidate ids...
Collected 29 unique target candidate ids from 30 explanations.
Gathering desired candidates...
Found 29/29 desired candidates
Linking explanations to candidates...
Linked 30/30 explanations
Calling babbler...
Created grammar with 599 rules
CPU times: user 10.5 s, sys: 671 ms, total: 11.2 s
Wall time: 11.3 s


In [11]:
for lf in pipe.lfs:
    print(lf.__name__)

LF_by_with_gold
LF_NucAc_in_sentence_gold
LF_activate_B_gold
LF_activates_gold
LF_associat_with_gold
LF_between_before_gold
LF_bind_B_I_gold
LF_close_I_gold
LF_comma_gold
LF_complex_L_gold
LF_complex_R_gold
LF_dist_sup_gold
LF_distant_gold
LF_induc_gold
LF_influence_B_gold
LF_interact_in_sentence_gold
LF_interaction_gold
LF_levels_gold
LF_mutation_list_I_gold
LF_no_B_gold
LF_phosphory_gold
LF_prepositions_I_gold
LF_regulate_Betw_gold
LF_residue_gold
LF_same_gold
LF_signaling_gold
LF_substrate_B_I_gold
LF_transfect_gold
LF_uncertain_gold
LF_sequenc_in_sentence_gold


In [12]:
%time pipe.label()

Clearing existing...
Running UDF...

Labeled split 0: (5546,30) sparse (nnz = 13055)

Clearing existing...
Running UDF...

Labeled split 1: (1011,30) sparse (nnz = 2442)

                               j  Coverage  Overlaps  Conflicts   TP   FP  FN  \
LF_by_with_gold                0  0.077151  0.068249   0.042532   44   33   0   
LF_NucAc_in_sentence_gold      1  0.090999  0.088032   0.014837    0    0   4   
LF_activate_B_gold             2  0.041543  0.032641   0.029674   18   23   0   
LF_activates_gold              3  0.006924  0.006924   0.003956    6    1   0   
LF_associat_with_gold          4  0.072206  0.058358   0.026706    0    0  13   
LF_between_before_gold         5  0.025717  0.024728   0.017804    8   16   0   
LF_bind_B_I_gold               6  0.070227  0.060336   0.040554   34   37   0   
LF_close_I_gold                7  0.070227  0.067260   0.031652   38   31   0   
LF_comma_gold                  8  0.146390  0.123640   0.057369    0    0  14   
LF_complex_L_gold  

  ac = (tp+tn).astype(float) / (tp+tn+fp+fn)



Labeled split 2: (1058,30) sparse (nnz = 2546)

CPU times: user 1min 24s, sys: 911 ms, total: 1min 25s
Wall time: 1min 26s


## Make X

In [13]:
TRAIN = 0
DEV = 1
TEST = 2

train = [exp.candidate for exp in pipe.explanations]
dev = session.query(pipe.candidate_class).filter(pipe.candidate_class.split == DEV).all()
test = session.query(pipe.candidate_class).filter(pipe.candidate_class.split == TEST).all()

In [14]:
from snorkel.annotations import load_label_matrix

Ls = []
Ls.append(load_label_matrix(pipe.session, split=TRAIN))
Ls.append(load_label_matrix(pipe.session, split=DEV))
Ls.append(load_label_matrix(pipe.session, split=TEST))

In [15]:
from scipy.sparse import vstack

def candidates_to_features(candidates, Ls):
    for i, c in enumerate(candidates):
        L = Ls[c.split]
        row_idx = L.get_row_index(c)
        features = L[row_idx,:]
        if i == 0:
            X = features
        else:
            X = vstack((X, features))
    
    # All features are indicators ({0,1})
    X = abs(X.todense())
    return X

In [16]:
X_train = candidates_to_features(train, Ls)
print(X_train.shape)

X_dev   = candidates_to_features(dev, Ls)
print(X_dev.shape)

X_test  = candidates_to_features(test, Ls)
print(X_test.shape)

(30, 30)
(1011, 30)
(1058, 30)


## Make y

In [17]:
from snorkel.annotations import load_gold_labels

L_golds = []
L_golds.append(load_gold_labels(pipe.session, annotator_name='gold', split=TRAIN))
L_golds.append(load_gold_labels(pipe.session, annotator_name='gold', split=DEV))
L_golds.append(load_gold_labels(pipe.session, annotator_name='gold', split=TEST))

In [18]:
import numpy as np

def candidates_to_labels(candidates, L_golds):
    labels = []
    for i, c in enumerate(candidates):
        L_gold = L_golds[c.split]
        row_idx = L_gold.get_row_index(c)
        label = L_gold[row_idx,0]
        labels.append(label)
    y = np.array(labels)
    return y

In [19]:
y_train = candidates_to_labels(train, Ls)
print(y_train.shape)

y_dev   = candidates_to_labels(dev, Ls)
print(y_dev.shape)

y_test  = candidates_to_labels(test, Ls)
print(y_test.shape)

(30,)
(1011,)
(1058,)


## Train and Test

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

best = -1
for C in [1, 5, 10, 50, 100, 500, 1000, 10000]:
    logreg = LogisticRegression(C=C)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_dev)
    if sum(y_pred) == 0:
        f1 = 0
    else:
        f1 = f1_score(y_dev, y_pred)
    print("[C = {}]: f1 = {}".format(C, f1))
    if f1 > best:
        best = f1
        best_C = C
        
print("\nUsing C = {}".format(best_C))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[C = 1]: f1 = 0.0740740740741


LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[C = 5]: f1 = 0.689075630252


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[C = 10]: f1 = 0.809160305344


LogisticRegression(C=50, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[C = 50]: f1 = 0.909090909091


LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[C = 100]: f1 = 0.924137931034


LogisticRegression(C=500, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[C = 500]: f1 = 0.96


LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[C = 1000]: f1 = 0.966887417219


LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

[C = 10000]: f1 = 0.980392156863

Using C = 10000


In [21]:
# for i, j in zip(y_dev, y_pred):
#     print((i,j))

In [28]:
logreg.set_params(C=best_C)
y_pred = logreg.predict(X_test)

f1 = f1_score(y_test, y_pred)
print("Test F1 = {:.4f}".format(f1))

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Test F1 = 0.9802


In [43]:
tp = fp = tn = fn = 0
for i, j in zip(y_test, y_pred):
    if i == j:
        if i == 1:
            tp += 1
        else:
            tn += 1
    else:
        if i == 0:
            fp += 1
        else:
            fn += 1
p = float(tp)/(tp + fp)
r = float(tp)/(tp + fn)
f1 = float(2*p*r)/(p + r)
print(f1)

0.980198019802
