In [1]:
# Dataset this notebook runs against; one of: test, spouse, semparse_cdr.
DOMAIN = "semparse_cdr"

In [2]:
import sys
import os

# Select the Postgres database named after the chosen domain. The variable is
# set before the Snorkel session is created, in the same order as before.
os.environ['SNORKELDB'] = 'postgres:///{}'.format(DOMAIN)

from snorkel import SnorkelSession
session = SnorkelSession()

from snorkel.models import candidate_subclass

# Declare the candidate type for the domain and expose it under the common
# name `candidate_class`, which the rest of the notebook uses.
if DOMAIN == 'semparse_cdr':
    candidate_class = ChemicalDisease = candidate_subclass(
        'ChemicalDisease', ['chemical', 'disease'])
elif DOMAIN in ('test', 'spouse'):
    candidate_class = Spouse = candidate_subclass(
        'Spouse', ['person1', 'person2'])
else:
    raise Exception("Invalid DOMAIN.")

In [3]:
# Fetch every stored candidate of the selected class, then report how many
# were loaded.
candidate_query = session.query(candidate_class)
candidates = candidate_query.all()
num_candidates = len(candidates)
print("Candidates: {}".format(num_candidates))

Candidates: 13780


In [4]:
from snorkel.semantic import (get_left_phrases, get_right_phrases, 
                              get_between_phrases, get_sentence_phrases)
# Inspect the first candidate: print the text of the parent of its first
# argument, then the candidate itself.
# print(...) with a single argument behaves the same as the Python 2 print
# statement and also runs under Python 3; it matches the call form already
# used when printing the candidate count.
c = candidates[0]
print(c[0].get_parent()._asdict()['text'])
print(c)

TDP is a side-effect that has led to withdrawal of several drugs from the market (e.g. terfenadine and terodiline). 
ChemicalDisease(Span("terfenadine", sentence=6413, chars=[87,97], words=[17,17]), Span("TDP", sentence=6413, chars=[0,2], words=[0,0]))


In [5]:
# Print the text of each phrase returned by get_left_phrases for the
# candidate's first argument (cmp='.leq', num=3, n_max=3 passed unchanged).
for p in get_left_phrases(c[0], cmp='.leq', num=3, n_max=3):
    # Plain attribute access replaces getattr() with a constant name;
    # print(...) works as both Python 2 statement and Python 3 function.
    print(p.text)

market
(
e.g.
market (
(e.g.
market (e.g.


In [6]:
# c[0].get_parent()._asdict()

In [7]:
# from snorkel.viewer import SentenceNgramViewer
# sv = SentenceNgramViewer(candidates[:300], session)
# sv

In [8]:
# c = sv.get_selected()
# print c
# print c[0].get_parent()._asdict()['pos_tags']
# print c[0].get_parent()._asdict()['ner_tags']

In [9]:
# hash(c)

In [20]:
from semparse_examples import get_examples

# Fetch the examples for the selected domain; they are evaluated by the
# semantic parser further down.
examples = get_examples(DOMAIN, candidates)

# Named word/phrase lists, keyed by list name, that are passed to the
# SemanticParser constructor together with the candidate class.
if DOMAIN == 'test':
    user_lists = {
        'colors':['red','green','blue'],
        'bluebird':['blue','bird','fly'],
        'greek':['alpha','beta','gamma'],
        'letters':['a','B','C'],
        'smalls':['a','b','c','d'],
        'spouse':['wife','husband','spouse']}
elif DOMAIN == 'spouse':
    user_lists = {
        'spouse':['wife','husband','spouse'],
        'family':['father', 'mother', 'brother', 'sister']}
elif DOMAIN == 'semparse_cdr':
    import bz2
    import cPickle
    # NOTE(review): unpickling can execute arbitrary code embedded in the
    # file; only load data/ctd.pkl.bz2 when it comes from a trusted source.
    with bz2.BZ2File('data/ctd.pkl.bz2', 'rb') as ctd_f:
        # The pickle is unpacked into exactly three objects, in this order.
        ctd_unspecified, ctd_therapy, ctd_marker = cPickle.load(ctd_f)
    user_lists = {
        'uncertain': ['combin', 'possible', 'unlikely'],
        'causal': ['causes', 'caused', 'induce', 'induces', 'induced', 'associated with'],
        'treat': ['treat', 'effective', 'prevent', 'resistant', 'slow', 'promise', 'therap'],
        'procedure': ['inject', 'administrat'],
        'patient': ['in a patient with', 'in patients with'],
        'weak': ['none', 'although', 'was carried out', 'was conducted', 'seems', 
                 'suggests', 'risk', 'implicated', 'the aim', 'to investigate',
                 'to assess', 'to study'],
        'ctd_unspecified': ctd_unspecified,
        'ctd_therapy': ctd_therapy,
        'ctd_marker': ctd_marker,
    }
else:
    # Without this branch an unrecognised DOMAIN leaves user_lists unbound and
    # the failure only shows up later as a NameError when the parser is built.
    # Same error as the one raised by the candidate-class setup cell.
    raise Exception("Invalid DOMAIN.")

In [11]:
# examples[0].display()

In [12]:
# examples[28].candidate[0].get_parent()

In [13]:
# Import SemanticParser and report how long the import takes.
# NOTE(review): an earlier cell already imports from snorkel.semantic, so this
# presumably times a cached module lookup rather than a cold import — confirm.
%time from snorkel.semantic import SemanticParser

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 14.1 µs


In [14]:
# Build the semantic parser from the selected candidate class and the
# domain's user-defined word lists; %time reports the construction time.
# NOTE(review): the effect of absorb=False is not visible in this notebook —
# confirm against the SemanticParser implementation.
%time sp = SemanticParser(candidate_class, user_lists, absorb=False)

Created grammar with 260 rules
CPU times: user 4.97 ms, sys: 9.07 ms, total: 14 ms
Wall time: 23.3 ms


In [15]:
# Evaluate the parser on the examples and time the run. Only
# show_explanation is switched on; every other show_* flag is off.
# NOTE(review): the trailing backslashes presumably keep the whole call on one
# logical line so the %time line magic covers it — confirm before reformatting.
# NOTE(review): the meaning of only=[] is not visible here; presumably it
# applies no restriction on which examples are evaluated — confirm.
%time results = sp.evaluate(examples,\
                            show_everything=False,\
                            show_explanation=True,\
                            show_candidate=False,\
                            show_sentence=False,\
                            show_parse=False,\
                            show_semantics=False,\
                            only=[])
# Bare last expression so the notebook displays the results object.
results

Example 0: Label true because any causal phrase is between the 
            chemical and the disease and the word 'not' is not between the 
            chemical and the disease

Example 1: Label true because the disease is immediately after the chemical

Example 2: Label true because the disease is immediately after the 
            chemical and 'induc' or 'assoc' is in the chemical

Example 3: Label false because any word between the chemical and 
            the disease contains a treat word and the chemical is within 100 
            characters to the left of the disease

Example 4: Label false because any word between the chemical and 
            the disease contains a treat word and the chemical is left of the 
            disease

Example 5: Label true because 'following' is between the disease 
            and the chemical and any word after the chemical contains a 
            procedure word

Example 6: Label True because 'induced by', 'caused by', or 'due to' 
            is 

Unnamed: 0,Correct,Incorrect,Redundant,Failed,Unknown
Example0,1,0,0,0,0
Example1,1,0,0,0,0
Example2,1,0,0,0,0
Example3,1,0,0,0,0
Example4,1,0,0,0,0
Example5,1,0,0,0,0
Example6,1,0,0,0,0
Example7,1,0,0,0,0
Example8,1,0,0,0,0
Example9,1,0,0,0,0


In [16]:
import numpy as np
# Totals per outcome column, summed along axis 0 of `results`.
# print(...) with a single argument behaves the same as the Python 2 print
# statement and also runs under Python 3.
print(np.sum(results, axis=0))

Correct      24
Incorrect     0
Redundant     0
Failed        0
Unknown       0
dtype: int64


In [17]:
# sp.grammar.print_chart(nested=True)

In [18]:
# sp.grammar.print_grammar()