In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to project .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Selects which task the notebook runs against. The cells below branch on
# this value to choose the database (SNORKELDB), the candidate class and the
# user word lists. Leave exactly one assignment active.
DOMAIN = 'semparse_cdr'
# DOMAIN = 'semparse_spouse'
# DOMAIN = 'semparse_test'

In [3]:
import os

# Point Snorkel at the Postgres database for the selected domain. The
# 'semparse_test' domain shares the spouse database; any other DOMAIN value
# falls through to the CDR database.
os.environ['SNORKELDB'] = (
    'postgres://localhost:5432/semparse_spouse'
    if DOMAIN in ('semparse_test', 'semparse_spouse')
    else 'postgres://localhost:5432/semparse_cdr'
)

In [4]:
# NOTE(review): SnorkelSession presumably connects to the database named by
# the SNORKELDB environment variable, which is why that variable is set in
# the cell above before snorkel is imported -- confirm.
from snorkel import SnorkelSession
session = SnorkelSession()

In [5]:
from snorkel.models import candidate_subclass

# Reject unknown domains up front, then declare the candidate type for the
# selected one. `candidate_class` is the alias the later cells use.
if DOMAIN not in ('semparse_test', 'semparse_spouse', 'semparse_cdr'):
    raise Exception("Invalid DOMAIN.")

if DOMAIN == 'semparse_cdr':
    # Chemical-disease relation: arguments are named 'chemical' and 'disease'.
    candidate_class = ChemicalDisease = candidate_subclass(
        'ChemicalDisease', ['chemical', 'disease'])
else:
    # Both spouse-style domains share the person1/person2 relation.
    candidate_class = Spouse = candidate_subclass(
        'Spouse', ['person1', 'person2'])

In [6]:
# Load every stored candidate of the selected class into a list and report
# how many there are.
candidates = session.query(candidate_class).all()
print("Candidates: {}".format(len(candidates)))

Candidates: 13780


In [7]:
%%time

# Named word lists for the selected domain. `user_lists` maps a list name to
# its words/phrases and is passed to SemanticParser further down.
if DOMAIN == 'semparse_test':
    user_lists = {
        'colors': ['red', 'green', 'blue'],
        'bluebird': ['blue', 'bird', 'fly'],
        'greek': ['alpha', 'beta', 'gamma'],
        'letters': ['a', 'B', 'C'],
        'smalls': ['a', 'b', 'c', 'd'],
        'spouse': ['wife', 'husband', 'spouse'],
    }
elif DOMAIN == 'semparse_spouse':
    # Fixed: this branch used to compare against 'spouse', a value DOMAIN
    # never takes, so `user_lists` stayed undefined for 'semparse_spouse'
    # and the SemanticParser cell failed with a NameError.
    user_lists = {
        'spouse': ['wife', 'husband', 'spouse'],
        'family': ['father', 'mother', 'brother', 'sister'],
    }
elif DOMAIN == 'semparse_cdr':
    import bz2
    import cPickle
    # Only read the archive once per kernel: if the lists are already in the
    # notebook namespace from an earlier run of this cell, reuse them.
    if 'ctd_unspecified' not in globals():
        # NOTE: unpickling can execute arbitrary code. Only load a
        # data/ctd.pkl.bz2 that comes from a trusted source.
        with bz2.BZ2File('data/ctd.pkl.bz2', 'rb') as ctd_f:
            ctd_unspecified, ctd_therapy, ctd_marker = cPickle.load(ctd_f)
    user_lists = {
        'uncertain': ['combin', 'possible', 'unlikely'],
        'causal': ['causes', 'caused', 'induce', 'induces', 'induced', 'associated with'],
        'treat': ['treat', 'effective', 'prevent', 'resistant', 'slow', 'promise', 'therap'],
        'procedure': ['inject', 'administrat'],
        'patient': ['in a patient with', 'in patients with'],
        'weak': ['none', 'although', 'was carried out', 'was conducted', 'seems',
                 'suggests', 'risk', 'implicated', 'the aim', 'to investigate',
                 'to assess', 'to study'],
        'ctd_unspecified': ctd_unspecified,
        'ctd_therapy': ctd_therapy,
        'ctd_marker': ctd_marker,
    }
else:
    # Same failure mode as the candidate-class cell: never continue with an
    # unknown domain and no `user_lists`.
    raise Exception("Invalid DOMAIN.")

CPU times: user 18.2 s, sys: 347 ms, total: 18.6 s
Wall time: 18.6 s


In [8]:
from semparse_examples import get_examples

# Fetch the examples for the selected domain (timed). Judging by the
# evaluation output further down, each example carries a natural-language
# labeling explanation.
# NOTE(review): how `candidates` is used by get_examples is not visible
# here -- confirm in semparse_examples.
%time examples = get_examples(DOMAIN, candidates)

CPU times: user 11.4 s, sys: 469 ms, total: 11.8 s
Wall time: 14.4 s


In [9]:
# Optional (disabled): uncomment to browse the first 300 candidates in
# SentenceNgramViewer.
# from snorkel.viewer import SentenceNgramViewer
# sv = SentenceNgramViewer(candidates[:300], session)
# sv

In [27]:
from snorkel.semantic import SemanticParser

# Build the parser for the selected candidate class, handing it the named
# word lists in `user_lists` (timed).
# NOTE(review): beam_width and top_k are defined by snorkel.semantic;
# top_k=-1 presumably means "no limit" -- confirm.
%time sp = SemanticParser(candidate_class, user_lists, beam_width=10, top_k=-1)

Created grammar with 321 rules
CPU times: user 7.54 ms, sys: 15.1 ms, total: 22.6 ms
Wall time: 24.5 ms


In [33]:
# Run the parser over all examples (timed) and keep the summary table in
# `results`; the bare `results` on the last line displays it.
# The show_* flags select what is printed per example; with this setting the
# output lists each explanation followed by its parsed semantics.
# Keep the trailing backslashes: `%time` is a line magic and only receives a
# single logical line, so the call must be joined with explicit
# continuations even though it is inside parentheses.
%time results = sp.evaluate(examples,\
                            show_everything=False,\
                            show_nothing=False,\
                            show_explanation=True,\
                            show_candidate=False,\
                            show_sentence=False,\
                            show_parse=False,\
                            show_semantics=True,\
                            show_correct=False,\
                            show_passing=False,\
                            show_failing=False,\
                            pseudo_python=True,\
                            remove_paren=True,\
                            paraphrases=True,\
                            only=[])
results

Example 0: Label true because between the chemical and the disease, 
            there is a causal word and the word 'not' is not between them.

C: return 1 if (any(map(in text(between([arg1,arg2])), 'CAUSAL')) and not (call(in text(between([arg1,arg2])), 'not'))) else 0

Example 1: Label true because the disease is immediately preceded by the chemical.

C: return 1 if call(in text(right(arg1,'.eq',1,'words')), text(arg2)) else 0

P: return 1 if call(in text(left(arg2,'.eq',1,'words')), text(arg1)) else 0

F: return 1 if call(('.eq', ('.int', 1)), count(tokens(right(arg1)))) else 0

Example 2: Label true because the disease is immediately preceded by the chemical, 
            and the chemical name contains an "induc" or "assoc" root.

P: return 1 if (call(in text(right(arg1,'.eq',1,'words')), text(arg2)) and call(('.composite_or', ('.contains',), ('.list', ('.string', u'induc'), ('.string', u'assoc'))), text(arg1))) else 0

P: return 1 if (call(in text(left(arg2,'.eq',1,'words')), tex

Unnamed: 0,Correct,Passing,Failing,Redundant,Erroring,Unknown,Index
LF_c_cause_d,1,0,0,0,0,0,0
LF_c_d,1,1,1,0,0,0,1
LF_c_induced_d,0,2,1,0,0,0,2
LF_c_treat_d,0,2,1,0,0,0,3
LF_c_treat_d_wide,0,1,0,0,0,0,4
LF_ctd_marker_c_d,1,2,5,0,0,0,5
LF_ctd_therapy_treat,0,2,2,0,0,0,6
LF_ctd_unspecified_treat,0,2,2,0,0,0,7
LF_d_following_c,0,1,0,0,0,0,8
LF_d_induced_by_c,0,1,0,0,0,0,9


In [29]:
import numpy as np
# Column totals over all rows of `results`. The 'Index' total is only the sum
# of the row positions and carries no meaning.
# Uses the call form of print, matching the candidate-count cell; with a
# single argument it prints the same text under Python 2.
print(np.sum(results, axis=0))

Correct       30
Passing        6
Failing       14
Redundant      2
Erroring       0
Unknown        0
Index        435
dtype: int64


In [13]:
# Dump the grammar's parse chart: for each token span (i, j) it lists the
# categories recognised over that span.
# NOTE(review): the chart presumably belongs to the last explanation parsed
# by `sp`, so the output depends on which cell ran before this one -- confirm.
sp.grammar.print_chart(nested=True)

(0, 1)      
             ($QueryToken <START>)
             ($Start <START>)
(1, 2)      
             ($QueryToken label)
             ($Label label)
(2, 3)      
             ($QueryToken false)
             ($False false)
             ($BoolLit ($False false))
             ($Bool ($BoolLit ($False false)))
(3, 4)      
             ($QueryToken because)
             ($Because because)
(4, 5)      
             ($QueryToken at)
(5, 6)      
             ($QueryToken least)
(6, 7)      
             ($QueryToken one)
             ($Int one)
(7, 8)      
             ($QueryToken weak)
             ($UserList weak)
             ($StringList ($UserList weak))
             ($List ($StringList ($UserList weak)))
(8, 9)      
             ($QueryToken phrase)
             ($Word phrase)
             ($Unit ($Word phrase))
(9, 10)     
             ($QueryToken is)
             ($Is is)
             ($Exists ($Is is))
(10, 11)    
             ($QueryToken in)
             ($In in)
       

In [14]:
# sp.grammar.print_grammar()