In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import matplotlib.pyplot as plt
from six.moves.cPickle import load
import cPickle
import numpy as np

from snorkel import SnorkelSession
from snorkel.parser import XMLMultiDocPreprocessor, CorpusParser
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser.corenlp import StanfordCoreNLPServer
from snorkel.models import Document, Sentence, Candidate, candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.viewer import SentenceNgramViewer
from snorkel.annotations import LabelAnnotator, load_gold_labels, FeatureAnnotator, save_marginals, load_marginals
from snorkel.learning import SparseLogisticRegression, GenerativeModel, RandomSearch
from snorkel.learning.structure import DependencySelector
from snorkel.learning.utils import MentionScorer
# from snorkel.contrib.rnn import reRNN

import matchers
import LF
from candidate_adjective_fixer import *
from load_external_annotations_new import load_external_labels



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [2]:
#------------------
# Helper Functions
#------------------

def grabCandidates(extractor, schema):
    # Candidate Counts
    for k, sents in enumerate([train_sents, dev_sents, test_sents]):
        extractor.apply(sents, split=k, clear=False)
        print "Number of candidates: ", session.query(schema).filter(schema.split == k).count()
        session.commit()
        
    train_cands = session.query(schema).filter(
        schema.split == 0).all()
    dev_cands = session.query(schema).filter(
        schema.split == 1).all()
    test_cands = session.query(schema).filter(
        schema.split == 2).all()

    return [train_cands, dev_cands, test_cands]

In [3]:
#-----------------------
# Setup & Preprocessing
#-----------------------
# Opens the Snorkel session, builds one XML preprocessor per corpus file
# (training / development / test) and parses all three corpora with spaCy.

# Instantiate the Session
session = SnorkelSession()

# Doc Preprocessing
# The training file uses a different layout from the other two: documents are
# <article> elements, and both text and id live under front/article-meta.
file_path = 'articles/training.xml'
train_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//article',
    text='.//front/article-meta/abstract/p/text()',
    id='.//front/article-meta/article-id/text()'
)

# Development and test files share one layout: <document> elements whose text
# is read from passage/text and whose id is read from <id>.
file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()'
)

# NOTE: `file_path` is rebound for every corpus, so after this cell it holds
# the test-corpus path.
file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()'
)

# Parsing
# The commented-out lines are alternative parser configurations that were tried;
# the active configuration parses with spaCy.
# corenlp_server = StanfordCoreNLPServer(version="3.6.0", num_threads=4, port=12348)
# corpus_parser = CorpusParser(corenlp_server, parser=Spacy())
corpus_parser = CorpusParser(parser=Spacy())
# corpus_parser = CorpusParser()

# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
# Order matters: the first call leaves `clear` at its default, while the next
# two pass clear=False so the dev and test documents are added alongside the
# training documents instead of replacing them.
# Each preprocessor is materialised with list() before being handed to the parser.
corpus_parser.apply(list(train_preprocessor))
corpus_parser.apply(list(dev_preprocessor), clear=False)
corpus_parser.apply(list(test_preprocessor), clear=False)


Clearing existing...
Running UDF...

Running UDF...

Running UDF...



In [4]:
# Retrieving Stable IDs for each of the candidate sentences
# NOTE(review): unpickling can execute arbitrary code -- only load
# doc_ids.pkl from a trusted source.
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)

# Convert to sets so the per-document membership tests below are cheap.
train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()


# Assigning each sentence to {train,dev,test}-set based on Stable ID
# The split is a property of the document, so it is decided once per document
# and all of its sentences are added in bulk (train is checked first, then
# dev, then test).
for doc in docs:
    doc_sentences = doc.sentences
    if not doc_sentences:
        # A document without sentences contributes nothing, so its ID is not
        # validated either.
        continue
    if doc.name in train_ids:
        train_sents.update(doc_sentences)
    elif doc.name in dev_ids:
        dev_sents.update(doc_sentences)
    elif doc.name in test_ids:
        test_sents.update(doc_sentences)
    else:
        # Fail loudly: a parsed document that belongs to no split means the ID
        # lists and the corpus files are out of sync.
        raise Exception(
            'ID <{0}> not found in any id set'.format(doc.name))


In [5]:
#----------------------
# Candidate Extraction
#----------------------
# Declares one binary candidate class per relation (biomarker paired with a
# condition, drug, medium or type), builds an extractor for each, and runs
# every extractor over the train/dev/test sentence sets via grabCandidates.

# Defining the Candidate Schemas
BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition'])
BiomarkerDrug = candidate_subclass('BiomarkerDrug', ['biomarker', 'drug'])
BiomarkerMedium = candidate_subclass('BiomarkerMedium', ['biomarker', 'medium'])
# NOTE(review): the second field is spelled 'typ3', presumably to avoid clashing
# with an existing `type` name on the candidate class -- TODO confirm.
BiomarkerType = candidate_subclass('BiomarkerType', ['biomarker', 'typ3'])
# BiomarkerLevelUnit = candidate_subclass('BiomarkerLevelUnit', ['biomarker', 'level', 'unit'])
#can eventually add MEASUREMENT and COHORT SIZE among other entities

# N-grams: the probabilistic search space of our entities
# NOTE(review): n_max presumably caps the span length in tokens, which would
# make biomarkers single-token and conditions up to seven tokens -- confirm
# against the Snorkel Ngrams documentation.
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)
drug_ngrams = Ngrams(n_max=5)
medium_ngrams = Ngrams(n_max=5)
type_ngrams = Ngrams(n_max=5)  # <--- Q: should we cut these down?
# level_ngrams = Ngrams(n_max=1)
# unit_ngrams = Ngrams(n_max=1)

# Construct our Matchers
# Matchers come from the project-local `matchers` module, one per entity kind.
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getConditionMatcher()
dMatcher = matchers.getDrugMatcher()
mMatcher = matchers.getMediumMatcher()
tMatcher = matchers.getTypeMatcher()
# lMatcher = matchers.getLevelMatcher()
# uMatcher = matchers.getUnitMatcher()

# Building the CandidateExtractors
# Each extractor receives the candidate class, then the n-gram spaces and the
# matchers listed in the same order as the class's fields (biomarker first).
candidate_extractor_BC = CandidateExtractor(BiomarkerCondition, [biomarker_ngrams, condition_ngrams], [bMatcher, cMatcher])
candidate_extractor_BD = CandidateExtractor(BiomarkerDrug, [biomarker_ngrams, drug_ngrams], [bMatcher, dMatcher])
candidate_extractor_BM = CandidateExtractor(BiomarkerMedium, [biomarker_ngrams, medium_ngrams], [bMatcher, mMatcher])
candidate_extractor_BT = CandidateExtractor(BiomarkerType, [biomarker_ngrams, type_ngrams], [bMatcher, tMatcher])
# candidate_extractor_BLU = CandidateExtractor(BiomarkerLevelUnit, [biomarker_ngrams, level_ngrams, unit_ngrams], [bMatcher, lMatcher, uMatcher])

# List of Candidate Sets for each relation type: [train, dev, test]
# grabCandidates prints one "Number of candidates" line per split, so this cell
# emits twelve counts: three for each of BC, BD, BM and BT, in that order.
cands_BC = grabCandidates(candidate_extractor_BC, BiomarkerCondition)
cands_BD = grabCandidates(candidate_extractor_BD, BiomarkerDrug)
cands_BM = grabCandidates(candidate_extractor_BM, BiomarkerMedium)
cands_BT = grabCandidates(candidate_extractor_BT, BiomarkerType)
# cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)


ASDFASDF
Running UDF...

Number of candidates:  2347
Running UDF...

Number of candidates:  622
Running UDF...

Number of candidates:  119
Running UDF...

Number of candidates:  1057
Running UDF...

Number of candidates:  270
Running UDF...

Number of candidates:  29
Running UDF...

Number of candidates:  781
Running UDF...

Number of candidates:  396
Running UDF...

Number of candidates:  48
Running UDF...

Number of candidates:  2377
Running UDF...

Number of candidates:  431
Running UDF...

Number of candidates:  45


In [None]:
# Adjective boosting: for the BC, BD and BM relations, run the project-local
# add_adj_candidate_* helper over the dev candidates and report the candidate
# count before and after.
# NOTE(review): the rollback presumably discards a transaction left pending by
# an earlier failed run of this cell -- confirm it is still needed.
session.rollback()

# {train,dev,test} is split order
# so index [1] of each cands_* list is the dev split.

# Adjective Boosting - BC
# The "without" count is the length of the in-memory dev list, while the "with"
# count is re-queried from the database for split == 1.
print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
# NOTE(review): the meaning of the trailing 0 argument is not visible in this
# notebook -- check candidate_adjective_fixer.
add_adj_candidate_BC(session, BiomarkerCondition, cands_BC[1], 0)
# fix_specificity(session, BiomarkerCondition, cands_BC[1])
print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 1).count()
session.commit()

# Adjective Boosting - BD
print "Number of dev BD candidates without adj. boosting: ", len(cands_BD[1])
add_adj_candidate_BD(session, BiomarkerDrug, cands_BD[1], 0)
print "Number of dev BD candidates with adj. boosting: ", session.query(BiomarkerDrug).filter(BiomarkerDrug.split == 1).count()
session.commit()

# Adjective Boosting - BM
print "Number of dev BM candidates without adj. boosting: ", len(cands_BM[1])
add_adj_candidate_BM(session, BiomarkerMedium, cands_BM[1], 0)
print "Number of dev BM candidates with adj. boosting: ", session.query(BiomarkerMedium).filter(BiomarkerMedium.split == 1).count()
session.commit()

# Adjective Boosting - BT (none as of now)
# Adjective Boosting - BLU (none as of now)


Number of dev BC candidates without adj. boosting:  622
Checking Biomarker: 28261322::span:154:156
Checking Condition: 28261322::span:82:100
0
Couldn't find: 28261322::span:82:100
ADDING NEW CANDIDATE
Checking Biomarker: 28261322::span:67:69
Checking Condition: 28261322::span:82:100
1
Checking Biomarker: 28261204::span:1301:1304
Checking Condition: 28261204::span:110:124
0
Couldn't find: 28261204::span:110:124
ADDING NEW CANDIDATE
Checking Biomarker: 28261204::span:1223:1226
Checking Condition: 28261204::span:110:124
1
Checking Biomarker: 28261204::span:1451:1454
Checking Condition: 28261204::span:45:61
0
Couldn't find: 28261204::span:45:61
ADDING NEW CANDIDATE
Checking Biomarker: 28263981::span:17:20
Checking Condition: 28263981::span:70:103
0
Couldn't find: 28263981::span:70:103
ADDING NEW CANDIDATE
Checking Biomarker: 28263981::span:272:275
Checking Condition: 28263981::span:100:133
0
Couldn't find: 28263981::span:100:133
ADDING NEW CANDIDATE
Checking Biomarker: 28263981::span:161:1

Checking Condition: 28262306::span:134:147
0
Couldn't find: 28262306::span:134:147
ADDING NEW CANDIDATE
Checking Biomarker: 28262306::span:358:361
Checking Condition: 28262306::span:134:147
1
Checking Biomarker: 28262306::span:507:509
Checking Condition: 28262306::span:134:147
1
Checking Biomarker: 28262306::span:500:501
Checking Condition: 28262306::span:134:147
1
Checking Biomarker: 28262306::span:1871:1874
Checking Condition: 28262306::span:87:100
0
Couldn't find: 28262306::span:87:100
ADDING NEW CANDIDATE
Checking Biomarker: 28262306::span:1880:1883
Checking Condition: 28262306::span:87:100
1
Checking Biomarker: 28260718::span:164:166
Checking Condition: 28260718::span:132:171
0
Couldn't find: 28260718::span:132:171
ADDING NEW CANDIDATE
Checking Biomarker: 28260718::span:217:221
Checking Condition: 28260718::span:132:171
1
Checking Biomarker: 28262306::span:32:35
Checking Condition: 28262306::span:85:98
0
Couldn't find: 28262306::span:85:98
ADDING NEW CANDIDATE
Checking Biomarker: 

ADDING NEW CANDIDATE
Checking Biomarker: 28260391::span:1635:1638
Checking Condition: 28260391::span:63:84
0
Couldn't find: 28260391::span:63:84
ADDING NEW CANDIDATE
Checking Biomarker: 28260391::span:1604:1613
Checking Condition: 28260391::span:63:84
1
Checking Biomarker: 28260391::span:1616:1618
Checking Condition: 28260391::span:63:84
1
Checking Biomarker: 28262211::span:79:87
Checking Condition: 28262211::span:185:205
0
Couldn't find: 28262211::span:185:205
ADDING NEW CANDIDATE
Checking Biomarker: 28262211::span:319:325
Checking Condition: 28262211::span:283:317
0
Couldn't find: 28262211::span:283:317
ADDING NEW CANDIDATE
Checking Biomarker: 28262211::span:387:389
Checking Condition: 28262211::span:283:317
1
Checking Biomarker: 28262211::span:305:313
Checking Condition: 28262211::span:283:317
1
Checking Biomarker: 28262211::span:392:394
Checking Condition: 28262211::span:283:317
1
Checking Biomarker: 28262211::span:315:317
Checking Condition: 28262211::span:283:317
1
Checking Bioma

Checking Condition: 28261899::span:183:198
0
Couldn't find: 28261899::span:183:198
ADDING NEW CANDIDATE
 added to imiquimod
 added to fluorouracil
 added to oxaliplatin
 added to irinotecan
 added to gemcitabine
 added to fluorouracil
 added to oxaliplatin
 added to irinotecan
 added to gemcitabine
 added to fluorouracil
 added to oxaliplatin
 added to irinotecan
 added to gemcitabine
 added to fluorouracil
 added to oxaliplatin
 added to irinotecan
 added to gemcitabine
 added to sorafenib
 added to sorafenib
 added to sorafenib
 added to sorafenib
 added to estradiol
 added to diclofenac
 added to estradiol
 added to diclofenac
 added to estradiol
 added to diclofenac
 added to bleomycin
 added to bleomycin
 added to bleomycin
 added to bleomycin
 added to bleomycin
 added to bleomycin
 added to estradiol
 added to estradiol
 added to pentoxifylline
 added to pentoxifylline
 added to pentoxifylline
 added to pentoxifylline
 added to pentoxifylline
 added to pentoxifylline
 added to p

 added to Blood
maternal  added to serum
Checking Biomarker: 28262321::span:1708:1710
Checking Condition: 28262321::span:129:142
0
Couldn't find: 28262321::span:129:142
ADDING NEW CANDIDATE
cord  added to serum
Checking Biomarker: 28262321::span:1708:1710
Checking Condition: 28262321::span:233:242
0
Couldn't find: 28262321::span:233:242
ADDING NEW CANDIDATE
maternal  added to serum
Checking Biomarker: 28262321::span:1767:1769
Checking Condition: 28262321::span:129:142
1
cord  added to serum
Checking Biomarker: 28262321::span:1767:1769
Checking Condition: 28262321::span:233:242
1
 added to Serum
 added to Serum
 added to Serum
baseline  added to serum
Checking Biomarker: 28260649::span:901:906
Checking Condition: 28260649::span:93:106
0
Couldn't find: 28260649::span:93:106
ADDING NEW CANDIDATE
baseline  added to serum
Checking Biomarker: 28260649::span:1280:1285
Checking Condition: 28260649::span:153:166
0
Couldn't find: 28260649::span:153:166
ADDING NEW CANDIDATE
baseline  added to ser

1
 added to CSF
white  added to blood
Checking Biomarker: 28264059::span:885:887
Checking Condition: 28264059::span:33:43
1
 added to CSF
white  added to blood
Checking Biomarker: 28264059::span:838:840
Checking Condition: 28264059::span:33:43
1
 added to CSF
white  added to blood
Checking Biomarker: 28264059::span:967:969
Checking Condition: 28264059::span:33:43
1
fetal  added to serum
Checking Biomarker: 28262321::span:1005:1007
Checking Condition: 28262321::span:27:37
0
Couldn't find: 28262321::span:27:37
ADDING NEW CANDIDATE
fetal  added to serum
Checking Biomarker: 28262321::span:918:921
Checking Condition: 28262321::span:27:37
1
fetal  added to serum
Checking Biomarker: 28262321::span:823:825
Checking Condition: 28262321::span:27:37
1
 added to plasma
 added to plasma
 added to plasma
 added to urine
 added to Urine
 added to urine
 added to Urine
 added to urine
 added to Urine
 added to urine
 added to Urine
 added to urine
 added to serum
 added to serum
 added to serum
 added

In [None]:
#-------------------------------------------
# External Gold Labels & Labeling Functions
#-------------------------------------------
# Builds one LabelAnnotator per relation from the project-local labeling
# functions, applies each to the training split (split 0), and reports the
# resulting label matrices and per-LF statistics.

# NOTE(review): the wildcard import is kept because every LF_* name below (and
# possibly later cells) relies on it. `LF` is already imported as a module at
# the top of the notebook, so explicit `LF.<name>` references would make the
# origin of each labeling function visible.
from LF import *

# Labeling Functions
# NOTE(review): BC and BD use relation-specific LF_same_thing_BC /
# LF_same_thing_BD, while BM and BT share the generic LF_same_thing -- confirm
# that this is intended.
LFs_BC = [LF_markerDatabase, LF_keyword, LF_distance, LF_abstract_titleWord, LF_single_letter,
          LF_auxpass, LF_known_abs, LF_same_thing_BC, LF_common_1000, LF_common_2000]
LFs_BD = [LF_colon, LF_known_abs, LF_single_letter,
          LF_roman_numeral, LF_common_2000, LF_same_thing_BD]
LFs_BM = [LF_distance_far, LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000, LF_same_thing]
LFs_BT = [LF_colon, LF_known_abs, LF_single_letter, LF_roman_numeral, LF_common_2000, LF_same_thing]

labeler_BC = LabelAnnotator(lfs=LFs_BC)
labeler_BD = LabelAnnotator(lfs=LFs_BD)
labeler_BM = LabelAnnotator(lfs=LFs_BM)
labeler_BT = LabelAnnotator(lfs=LFs_BT)

# Training
# NOTE(review): each apply() is given only the split. In Snorkel versions where
# apply() accepts `cids_query`, omitting it presumably selects every candidate
# in the split regardless of candidate class -- confirm that each labeler only
# sees (and only clears labels for) its own relation's candidates.
L_train_BC = labeler_BC.apply(split=0)
L_train_BD = labeler_BD.apply(split=0)
L_train_BM = labeler_BM.apply(split=0)
L_train_BT = labeler_BT.apply(split=0)

# A notebook cell only auto-displays its final expression, so bare names in the
# middle of a cell show nothing. Print each matrix summary explicitly.
# (Single-argument print(...) behaves the same under Python 2 and Python 3.)
print(repr(L_train_BC))
print(repr(L_train_BD))
print(repr(L_train_BM))
print(repr(L_train_BT))

# Labeling Function Performance - Coverage, Overlaps, Conflicts
# Same reason as above: print the first three tables; the last one stays the
# cell's final expression so it is still displayed as the cell result.
print("BC labeling function stats:")
print(L_train_BC.lf_stats(session))
print("BD labeling function stats:")
print(L_train_BD.lf_stats(session))
print("BM labeling function stats:")
print(L_train_BM.lf_stats(session))
print("BT labeling function stats:")
L_train_BT.lf_stats(session)



Clearing existing...
Running UDF...

In [None]:
# Runs error analysis against the dev labels and unpacks the result into four
# values named tp, fp, tn and fn.
# NOTE(review): neither `L_dev` nor `L_gold_dev` is assigned in any visible
# cell (load_gold_labels is imported but never called), so on a fresh kernel
# this line raises NameError unless they are defined elsewhere.
# NOTE(review): `L_train_BC` is the value returned by labeler_BC.apply(split=0);
# confirm that object actually provides `error_analysis` -- presumably this was
# meant to be called on a trained model rather than on the label matrix.
tp, fp, tn, fn = L_train_BC.error_analysis(session, L_dev, L_gold_dev)
