In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to library source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-level settings. These are passed to merge_configs() in a later
# cell, where notebook values take precedence over local and global configs.
config = dict(
    domain='protein',
    supervision='generative',
    traditional_split=1,
    disc_model_class='logreg',
    display_learned_accuracies=True,
)

In [3]:
# Get DB connection string and add to globals
# NOTE: $SNORKELDB must be set before any snorkel imports
import os

# Default database name is 'babble_<domain>', with a '_debug' suffix when the
# notebook config enables debug mode; an explicit 'db_name' overrides it.
default_db_name = 'babble_' + config['domain'] + ('_debug' if config.get('debug', False) else '')
DB_NAME = config.get('db_name', default_db_name)
if config.get('postgres'):
    DB_TYPE = 'postgres'
    # NOTE(review): newer SQLAlchemy releases only accept 'postgresql' as the
    # scheme name -- confirm against the SQLAlchemy version snorkel is pinned to.
    # A host:port authority is only meaningful for a server database; without
    # 'db_port' the authority stays empty.
    DB_ADDR = "localhost:{0}".format(config['db_port']) if 'db_port' in config else ""
else:
    DB_TYPE = 'sqlite'
    DB_NAME += '.db'
    # SQLite URLs address a file and have no host component, so 'db_port' is
    # ignored here; including it would produce an invalid SQLite URL.
    DB_ADDR = ""
# Resulting form: '<type>://<addr>/<name>', e.g. 'sqlite:///babble_protein.db'.
os.environ['SNORKELDB'] = '{0}://{1}/{2}'.format(DB_TYPE, DB_ADDR, DB_NAME)
print("$SNORKELDB = {0}".format(os.environ['SNORKELDB']))

$SNORKELDB = sqlite:///babble_protein.db


In [4]:
# Create the session object that the cells below use for database queries.
from snorkel import SnorkelSession
session = SnorkelSession()

# Resolve config conflicts (nb_config > local_config > global_config)
# merge_configs() prints one "Overwriting ..." line per value it replaces.
from snorkel.contrib.babble.pipelines import merge_configs, get_local_pipeline
config = merge_configs(config)

# In debug mode, replace the document limit, both model search-space sizes and
# the generative/discriminative epoch counts with fixed values.
if config['debug']:
    print("NOTE: --debug=True: modifying parameters...")
    config['max_docs'] = 100
    config['gen_model_search_space'] = 1
    config['disc_model_search_space'] = 1
    config['gen_params_default']['epochs'] = 25
    config['disc_params_default']['n_epochs'] = 5

# Build the candidate class from the name and entity list in the merged config.
from snorkel.models import candidate_subclass
candidate_class = candidate_subclass(config['candidate_name'], config['candidate_entities'])

# Look up the pipeline for the configured domain, then instantiate it with the
# session, candidate class and merged config.
pipeline = get_local_pipeline(config['domain'])
pipe = pipeline(session, candidate_class, config)

Overwriting gen_f_beta=0.5 to gen_f_beta=1.0
Overwriting domain=None to domain=protein
Overwriting babbler_candidate_split=1 to babbler_candidate_split=[0, 1, 2]
Overwriting disc_model_class=lstm to disc_model_class=logreg
Overwriting display_learned_accuracies=False to display_learned_accuracies=True
Overwriting traditional_split=0 to traditional_split=1


In [5]:
from snorkel.models import Document

# Count the Document rows visible through the current session.
session.query(Document).count()

4735

In [6]:
# Print how many candidates fall in each split (0, 1 and 2), one count per line.
cand_cls = pipe.candidate_class
for split in range(3):
    in_split = session.query(cand_cls).filter(cand_cls.split == split)
    print(in_split.count())

5546
1011
1058


In [7]:
# %time pipe.parse()

In [8]:
# %time pipe.extract()

In [9]:
# %time pipe.load_gold()

In [10]:
# %time pipe.featurize()

In [11]:
# Run and time the pipeline's collect stage. Per its log output, this stage
# links explanations to candidates and calls the babbler to parse and filter them.
%time pipe.collect()

Linking candidates...
# CANDIDATES: 7615
Building list of target candidate ids...
Collected 29 unique target candidate ids from 30 explanations.
Gathering desired candidates...
Found 29/29 desired candidates
Linking explanations to candidates...
Linked 30/30 explanations
Calling babbler...
Created grammar with 598 rules
Flushing all parses from previous explanation set.

22 explanation(s) out of 30 were parseable.
104 parse(s) generated from 30 explanation(s).
70 parse(s) remain (34 parse(s) removed by DuplicateSemanticsFilter).
32 parse(s) remain (38 parse(s) removed by ConsistencyFilter).
### Applying labeling functions to split 1

### Done in 6.0s.

32 parse(s) remain (0 parse(s) removed by UniformSignatureFilter: (0 None, 0 All)).
19 parse(s) remain (13 parse(s) removed by DuplicateSignatureFilter).
Added 19 parse(s) from 15 explanations to set. (Total # parses = 19)
CPU times: user 2min 31s, sys: 2.52 s, total: 2min 34s
Wall time: 2min 33s


In [12]:
# Display the Explanation objects currently held by the babbler.
pipe.babbler.get_explanations()

[Explanation("LF_activate_B: True, 'activ' is within 40 characters between Kinase and Protein "),
 Explanation("LF_activates: True, sentence contains 'activates' and Kinase and Protein are less than 6 words apart."),
 Explanation("LF_between_before: True, the word 'between' is within 50 characters before the Protein or the Kinase and the word 'and' is within 40 characters between Protein and Kinase"),
 Explanation("LF_close_I: True, the number of words between Kinase and Protein is less than 6 and the sentence contains at least one of the int_ind words is in the sentence and none of the negative words are in the sentence and the order of appearance in the sentence is Kinase, Protein and Kinase and Protein are not separated by 'and', 'or', ',' "),
 Explanation("LF_dist_sup: True, The Kinase and Protein pair correspond to pairs in the list known_targets and the order of appearance in the sentence is Kinase, Protein and Kinase and Protein are separated by less than 8 words"),
 Explanation

In [13]:
# Display the babbler's parses; they print as "return 1 if ... else 0" strings.
pipe.babbler.get_parses()

["return 1 if 'activ'.(all([in(text(within 40 chars of X)),in(text(within 40 chars of Y))])) else 0",
 "return 1 if (text(the sentence).contains('activates') and text(X).in(text(less than 6 word(s) to the right of Y))) else 0",
 "return 1 if ('between'.in(text(exactly 50 chars to the left of X)) or all([s.(all([in(text(within 40 chars of X)),in(text(within 40 chars of Y))])) for s in [text(Y),'and']])) else 0",
 "return 1 if ('between'.in(text(exactly 50 chars to the left of X)) or 'and'[0].(all([in(text(within 40 chars of X)),in(text(within 40 chars of Y))]))) else 0",
 "return 1 if ('between'.in(text(no more than 50 chars to the left of X)) or all([s.(all([in(text(within 40 chars of X)),in(text(within 40 chars of Y))])) for s in [text(Y),'and']])) else 0",
 "return 1 if ('between'.in(text(no more than 50 chars to the left of X)) or 'and'[0].(all([in(text(within 40 chars of X)),in(text(within 40 chars of Y))]))) else 0",
 "return 1 if (count(between([X,Y])).(< 6) and (sum([s.in(text(t

In [14]:
# Display per-labeling-function statistics: coverage, overlaps, conflicts,
# TP/FP/FN/TN counts and empirical accuracy.
pipe.babbler.get_lf_stats()

Unnamed: 0,j,Coverage,Overlaps,Conflicts,TP,FP,FN,TN,Empirical Acc.
LF_activate_B_0,0,0.064293,0.059347,0.059347,23,41,0,0,0.359375
LF_activates_1,1,0.006924,0.006924,0.006924,6,1,0,0,0.857143
LF_between_before_0,2,0.261128,0.261128,0.25816,46,214,0,0,0.176923
LF_between_before_2,3,0.318497,0.318497,0.305638,50,268,0,0,0.157233
LF_between_before_4,4,0.271019,0.271019,0.263106,47,223,0,0,0.174074
LF_between_before_6,5,0.327399,0.327399,0.309594,51,276,0,0,0.155963
LF_close_I_0,6,0.117705,0.117705,0.117705,55,61,0,0,0.474138
LF_dist_sup_4,7,0.058358,0.055391,0.053412,21,38,0,0,0.355932
LF_induc_0,8,0.18002,0.160237,0.062315,0,0,21,161,0.884615
LF_influence_B_0,9,0.005935,0.005935,0.005935,2,4,0,0,0.333333


In [15]:
# pipe.babbler.filtered_analysis()

In [16]:
# %time pipe.label()

In [17]:
# %time pipe.supervise()

In [18]:
# %time pipe.classify()