In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Task selector for the whole notebook: later cells branch on this value to
# choose the database, the candidate class and the user lists.
# Uncomment exactly one of the alternatives below to switch task.
DOMAIN = 'semparse_cdr'
# DOMAIN = 'semparse_spouse'
# DOMAIN = 'semparse_test'

In [3]:
import os

# The test and spouse domains share the spouse database; any other DOMAIN value
# uses the CDR database.
# NOTE(review): presumably snorkel reads SNORKELDB when it is imported / when
# the session is created, so this cell has to run first -- confirm.
uses_spouse_db = DOMAIN in ('semparse_test', 'semparse_spouse')
os.environ['SNORKELDB'] = (
    'postgres://localhost:5432/semparse_spouse'
    if uses_spouse_db
    else 'postgres://localhost:5432/semparse_cdr'
)

In [4]:
from snorkel import SnorkelSession
# Database session used by the candidate query further down.
session = SnorkelSession()

In [5]:
from snorkel.models import candidate_subclass

# Declare the candidate relation type for the selected domain and expose it
# under the domain-independent name `candidate_class`. The domain-specific name
# (ChemicalDisease / Spouse) is bound as well.
if DOMAIN == 'semparse_cdr':
    candidate_class = ChemicalDisease = candidate_subclass(
        'ChemicalDisease', ['chemical', 'disease'])
elif DOMAIN in ('semparse_test', 'semparse_spouse'):
    candidate_class = Spouse = candidate_subclass(
        'Spouse', ['person1', 'person2'])
else:
    # Fail early on a mistyped DOMAIN rather than continuing without a class.
    raise Exception("Invalid DOMAIN.")

In [6]:
# Load every stored candidate of the selected class and report the total.
candidates = session.query(candidate_class).all()
candidate_count = len(candidates)
print("Candidates: {}".format(candidate_count))

Candidates: 13780


In [7]:
%%time

# Named word lists for the selected domain; they are handed to SemanticParser
# in a later cell.
if DOMAIN == 'semparse_test':
    user_lists = {
        'colors': ['red', 'green', 'blue'],
        'bluebird': ['blue', 'bird', 'fly'],
        'greek': ['alpha', 'beta', 'gamma'],
        'letters': ['a', 'B', 'C'],
        'smalls': ['a', 'b', 'c', 'd'],
        'spouse': ['wife', 'husband', 'spouse']}
elif DOMAIN == 'semparse_spouse':
    # Was `DOMAIN == 'spouse'`, which no cell ever sets: with
    # DOMAIN = 'semparse_spouse' no branch matched and user_lists was left
    # undefined (or stale from an earlier run).
    user_lists = {
        'spouse': ['wife', 'husband', 'spouse'],
        'family': ['father', 'mother', 'brother', 'sister']}
elif DOMAIN == 'semparse_cdr':
    import bz2
    import cPickle
    # Skip the load when an earlier run of this cell already bound the CTD
    # objects in the kernel; the recorded run of this cell took ~17 s.
    if 'ctd_unspecified' not in globals():
        # NOTE(review): unpickling executes arbitrary code -- only load a
        # data/ctd.pkl.bz2 that comes from a trusted source.
        with bz2.BZ2File('data/ctd.pkl.bz2', 'rb') as ctd_f:
            ctd_unspecified, ctd_therapy, ctd_marker = cPickle.load(ctd_f)
    user_lists = {
        'uncertain': ['combin', 'possible', 'unlikely'],
        'causal': ['causes', 'caused', 'induce', 'induces', 'induced', 'associated with'],
        'treat': ['treat', 'effective', 'prevent', 'resistant', 'slow', 'promise', 'therap'],
        'procedure': ['inject', 'administrat'],
        'patient': ['in a patient with', 'in patients with'],
        'weak': ['none', 'although', 'was carried out', 'was conducted', 'seems',
                 'suggests', 'risk', 'implicated', 'the aim', 'to investigate',
                 'to assess', 'to study'],
        'ctd_unspecified': ctd_unspecified,
        'ctd_therapy': ctd_therapy,
        'ctd_marker': ctd_marker,
    }
else:
    # Same failure mode as the candidate-class cell: reject unknown domains
    # instead of leaving user_lists unset.
    raise Exception("Invalid DOMAIN.")

CPU times: user 17.1 s, sys: 331 ms, total: 17.4 s
Wall time: 17.5 s


In [8]:
from semparse_examples import get_examples

# Fetch the examples for the selected domain, given the loaded candidates.
# %time reports how long the call takes.
%time examples = get_examples(DOMAIN, candidates)

In [9]:
from snorkel.semantic import (get_left_phrases, get_right_phrases, 
                              get_between_phrases, get_sentence_phrases)
# Spot-check one candidate: print the 'text' field of its first argument's
# parent (the containing sentence, judging by the recorded output), then the
# candidate itself.
# print(...) call form matches the candidate-count cell and prints the same
# thing under Python 2 for a single argument.
c = candidates[1]
print(c[0].get_parent()._asdict()['text'])
print(c)

The objective of this study was to report our experience concerning the effectiveness of the prophylactic administration of lamivudine in hepatitis B virus surface antigen (HBs Ag) positive patients with rheumatologic disease. 
ChemicalDisease(Span("HBs Ag)", sentence=93015, chars=[173,179], words=[26,28]), Span("rheumatologic disease", sentence=93015, chars=[204,224], words=[32,33]))


In [10]:
# print c[0].get_attrib_tokens(a='words')
# c[0].get_attrib_tokens(a='entity_cids')

In [11]:
# for p in get_left_phrases(c[0], cmp='.leq', num=3, n_max=3):
#     print getattr(p, 'text')

In [12]:
# c[0].get_parent()._asdict()

In [13]:
# from snorkel.viewer import SentenceNgramViewer
# sv = SentenceNgramViewer(candidates[:300], session)
# sv

In [14]:
# c = sv.get_selected()
# print c
# print c[0].get_parent()._asdict()['pos_tags']
# print c[0].get_parent()._asdict()['ner_tags']

In [15]:
# hash(c)

In [16]:
# examples[0].display()

In [17]:
# examples[28].candidate[0].get_parent()

In [18]:
from snorkel.semantic import SemanticParser

# Build the parser for the selected candidate class, passing it the domain's
# user lists. %time reports the construction time.
%time sp = SemanticParser(candidate_class, user_lists)

Created grammar with 297 rules
CPU times: user 3.29 ms, sys: 13.7 ms, total: 17 ms
Wall time: 19.4 ms


In [19]:
# Run the parser over the examples with every show_* option off, so only the
# summary table (the bare `results` expression at the end) is displayed.
# The trailing backslashes keep the whole call on one logical line for the
# %time line magic -- do not remove them.
%time results = sp.evaluate(examples,\
                            show_everything=False,\
                            show_explanation=False,\
                            show_candidate=False,\
                            show_sentence=False,\
                            show_parse=False,\
                            show_passing=False,\
                            show_correct=False,\
                            pseudo_python=False,\
                            absorb=False,\
                            remove_paren=False,\
                            only=[])
results

CPU times: user 361 ms, sys: 16.5 ms, total: 377 ms
Wall time: 1.32 s


Unnamed: 0,Correct,Passing,Failing,Redundant,Erroring,Unknown,Index
LF_c_cause_d,1,0,0,0,0,0,0
LF_c_d,1,0,0,0,0,0,1
LF_c_induced_d,1,0,0,0,0,0,2
LF_c_treat_d,1,0,0,0,0,0,3
LF_c_treat_d_wide,1,0,0,0,0,0,4
LF_ctd_marker_c_d,1,0,1,0,0,0,5
LF_ctd_marker_induce,1,0,1,0,0,0,6
LF_ctd_therapy_treat,1,0,1,0,0,0,7
LF_ctd_unspecified_treat,1,0,1,0,0,0,8
LF_ctd_unspecified_induce,1,0,1,0,0,0,9


In [20]:
import numpy as np
# Column totals of the evaluation table. The 'Index' column is summed too, so
# its total is not a count and should be ignored.
# print(...) call form matches the candidate-count cell and prints the same
# thing under Python 2 for a single argument.
print(np.sum(results, axis=0))

Correct       32
Passing        0
Failing        8
Redundant      0
Erroring       0
Unknown        0
Index        496
dtype: int64


In [21]:
# sp.LFs unpacks into six groups, named here after the columns of the
# evaluation table; keep the correct and passing groups and report the total.
correct, passing, failing, redundant, erroring, unknown = sp.LFs
LFs = correct + passing
# print(...) call form matches the candidate-count cell and prints the same
# thing under Python 2 for a single argument.
print(len(LFs))

32


In [22]:
from snorkel.semantic import sem_to_str

# Hand-written semantic tree, assembled from named sub-trees so the nesting is
# readable. The shared sub-tree is rendered by sem_to_str (see the recorded
# output) as `in text(between([arg1,arg2]))`.
in_between_text = ('.in',
                   ('.extract_text',
                    ('.between',
                     ('.list', ('.arg', ('.int', 1)), ('.arg', ('.int', 2))))))

# any(map(<in between-text>, user list 'causal'))
causal_clause = ('.any',
                 ('.map', in_between_text, ('.user_list', ('.string', u'causal'))))

# not call(<in between-text>, 'not')
negated_clause = ('.not',
                  ('.call', in_between_text, ('.string', u'not')))

sem = ('.root',
       ('.label',
        ('.bool', True),
        ('.and', causal_clause, negated_clause)))
# Show the raw tree, then its rendering by sem_to_str.
# print(...) call form matches the candidate-count cell and prints the same
# thing under Python 2 for a single argument.
print(sem)
print(sem_to_str(sem))

('.root', ('.label', ('.bool', True), ('.and', ('.any', ('.map', ('.in', ('.extract_text', ('.between', ('.list', ('.arg', ('.int', 1)), ('.arg', ('.int', 2)))))), ('.user_list', ('.string', u'causal')))), ('.not', ('.call', ('.in', ('.extract_text', ('.between', ('.list', ('.arg', ('.int', 1)), ('.arg', ('.int', 2)))))), ('.string', u'not'))))))
return -1 if (any(map(in text(between([arg1,arg2])), 'CAUSAL')) and not (call(in text(between([arg1,arg2])), 'not'))) else 0


In [23]:
# sp.grammar.print_chart(nested=True)

In [24]:
# sp.grammar.print_grammar()