Misc imports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os

# Point Snorkel at the project database BEFORE anything from snorkel is
# imported. Snorkel resolves its connection string from the SNORKELDB
# environment variable when its models are first imported, so assigning the
# variable after the imports / after SnorkelSession() has no effect on the
# database that is actually opened.
os.environ['SNORKELDB'] = 'sqlite:///' + os.getcwd() + os.sep + 'cameron.db'

import matplotlib.pyplot as plt
from six.moves.cPickle import load
import cPickle
import numpy as np

from snorkel import SnorkelSession
from snorkel.parser import XMLMultiDocPreprocessor, CorpusParser
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser.corenlp import StanfordCoreNLPServer
from snorkel.models import Document, Sentence, Candidate, candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.viewer import SentenceNgramViewer
from snorkel.annotations import LabelAnnotator, load_gold_labels, FeatureAnnotator, save_marginals, load_marginals
from snorkel.learning import SparseLogisticRegression, GenerativeModel, RandomSearch
from snorkel.learning.structure import DependencySelector
from snorkel.learning.utils import MentionScorer
# from snorkel.contrib.rnn import reRNN

import matchers
import LF
from candidate_adjective_fixer import *
from load_external_annotations_new import load_external_labels

# Database session used by the cells below.
session = SnorkelSession()

# Candidate schema for (biomarker, level, unit) triples.
BiomarkerLevelUnit = candidate_subclass('BiomarkerLevelUnit', ['biomarker', 'level', 'unit'])




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Helper functions

In [2]:
#------------------
# Helper Functions
#------------------

def grabCandidates(extractor, schema):
    """Extract candidates for every split and return them grouped by split.

    Runs ``extractor`` over the notebook-level sentence sets ``train_sents``,
    ``dev_sents`` and ``test_sents`` (stored as splits 0, 1 and 2), prints the
    number of ``schema`` rows found for each split and commits the
    notebook-level ``session`` after each one.

    Args:
        extractor: object whose ``apply(sents, split=..., clear=False)`` call
            extracts the candidates of one split.
        schema: candidate class to count and query; it must expose ``split``
            and ``id`` columns.

    Returns:
        ``[train_cands, dev_cands, test_cands]`` -- one list per split, each
        ordered by candidate id.
    """
    sentence_sets = [train_sents, dev_sents, test_sents]

    for split, sents in enumerate(sentence_sets):
        extractor.apply(sents, split=split, clear=False)
        n_cands = session.query(schema).filter(schema.split == split).count()
        # Single pre-formatted argument: prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("Number of candidates:  {0}".format(n_cands))
        session.commit()

    # Order by id so the returned lists are deterministic and line up with the
    # id-ordered queries used elsewhere in the notebook.
    return [
        session.query(schema)
        .filter(schema.split == split)
        .order_by(schema.id)
        .all()
        for split in range(len(sentence_sets))
    ]

In [3]:
#-----------------------
# Setup & Preprocessing
#-----------------------

# Database session shared by the cells below.
session = SnorkelSession()

# --- Document preprocessors (one per corpus file) ---

# Training corpus: one <article> element per document, with the abstract text
# and the article id located under front/article-meta.
file_path = 'articles/training.xml'
train_preprocessor = XMLMultiDocPreprocessor(
    path=file_path,
    doc='.//article',
    text='.//front/article-meta/abstract/p/text()',
    id='.//front/article-meta/article-id/text()',
)

# The development and test corpora use the same selectors, so they are
# written once and reused for both files.
passage_xpaths = dict(
    doc='.//document',
    text='.//passage/text/text()',
    id='.//id/text()',
)

file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(path=file_path, **passage_xpaths)

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(path=file_path, **passage_xpaths)

# --- Parsing ---
# spaCy is the active parser; the alternatives below are kept for reference.
# corenlp_server = StanfordCoreNLPServer(version="3.6.0", num_threads=4, port=12348)
# corpus_parser = CorpusParser(corenlp_server, parser=Spacy())
# corpus_parser = CorpusParser()
corpus_parser = CorpusParser(parser=Spacy())

# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
# The training corpus is parsed with apply()'s default `clear` setting; the
# two later corpora are parsed with clear=False.
corpus_parser.apply(list(train_preprocessor))
for later_preprocessor in (dev_preprocessor, test_preprocessor):
    corpus_parser.apply(list(later_preprocessor), clear=False)



Clearing existing...
Running UDF...

Running UDF...

Running UDF...



In [4]:
# Load the three pickled collections of document names that define the
# train / dev / test splits.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# doc_ids.pkl that was produced locally / comes from a trusted source.
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = map(set, load(f))

train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Each id set is paired with the sentence set it feeds; the pairs are checked
# in train -> dev -> test order, so the first matching split wins.
split_buckets = (
    (train_ids, train_sents),
    (dev_ids, dev_sents),
    (test_ids, test_sents),
)

# Assign every sentence to a split according to its document's name.
for doc in docs:
    for sentence in doc.sentences:
        for id_set, bucket in split_buckets:
            if doc.name in id_set:
                bucket.add(sentence)
                break
        else:
            # The document's name is in none of the three id sets.
            raise Exception(
                'ID <{0}> not found in any id set'.format(doc.name))

In [6]:
#----------------------
# Candidate Extraction
#----------------------

# Only the BiomarkerLevelUnit relation is extracted in this cell.
#
# Disabled relation types, kept for reference (schema fields, n-gram width,
# matcher factory):
#   BiomarkerCondition  ['biomarker', 'condition']  n_max=7  matchers.getConditionMatcher()
#   BiomarkerDrug       ['biomarker', 'drug']       n_max=5  matchers.getDrugMatcher()
#   BiomarkerMedium     ['biomarker', 'medium']     n_max=5  matchers.getMediumMatcher()
#   BiomarkerType       ['biomarker', 'typ3']       n_max=5  matchers.getTypeMatcher()
# Open question from the original notes: should those n-gram widths be cut down?
# MEASUREMENT and COHORT SIZE (among other entities) can eventually be added.

# N-grams: the probabilistic search space of our entities.
# Every active entity type is restricted to single tokens.
biomarker_ngrams = Ngrams(n_max=1)
level_ngrams = Ngrams(n_max=1)
unit_ngrams = Ngrams(n_max=1)

# Matchers, one per entity type, built by the project's `matchers` module.
bMatcher = matchers.getBiomarkerMatcher()
lMatcher = matchers.getLevelMatcher()
uMatcher = matchers.getUnitMatcher()

# Candidate extractor: the n-gram spaces and matchers are listed in the same
# order as the schema's fields (biomarker, level, unit).
candidate_extractor_BLU = CandidateExtractor(
    BiomarkerLevelUnit,
    [biomarker_ngrams, level_ngrams, unit_ngrams],
    [bMatcher, lMatcher, uMatcher],
)

# Candidate sets for the relation, as [train, dev, test].
cands_BLU = grabCandidates(candidate_extractor_BLU, BiomarkerLevelUnit)



Running UDF...

Number of candidates:  762
Running UDF...

Number of candidates:  951
Running UDF...

Number of candidates:  6


In [9]:
# Reload the BiomarkerLevelUnit candidates of each split (0 = train, 1 = dev,
# 2 = test), ordered by candidate id.
train_cands, dev_cands, test_cands = (
    session.query(BiomarkerLevelUnit)
    .filter(BiomarkerLevelUnit.split == split_id)
    .order_by(BiomarkerLevelUnit.id)
    .all()
    for split_id in (0, 1, 2)
)

In [11]:
# Preview a handful of training candidates instead of dumping all of them:
# printing every candidate floods the notebook output.
preview_size = 5
for cand in train_cands[:preview_size]:
    print(cand)

BiomarkerLevelUnit(Span("UPT", sentence=12728, chars=[30,32], words=[6,6]), Span("116", sentence=12728, chars=[103,105], words=[25,25]), Span("mL", sentence=12728, chars=[171,172], words=[42,42]))
BiomarkerLevelUnit(Span("UPT", sentence=12728, chars=[30,32], words=[6,6]), Span("116", sentence=12728, chars=[103,105], words=[25,25]), Span("ng", sentence=12728, chars=[107,108], words=[26,26]))
BiomarkerLevelUnit(Span("UPT", sentence=12728, chars=[30,32], words=[6,6]), Span("116", sentence=12728, chars=[103,105], words=[25,25]), Span("ng", sentence=12728, chars=[168,169], words=[40,40]))
BiomarkerLevelUnit(Span("UPT", sentence=12728, chars=[30,32], words=[6,6]), Span("150", sentence=12728, chars=[164,166], words=[39,39]), Span("mL", sentence=12728, chars=[171,172], words=[42,42]))
BiomarkerLevelUnit(Span("UPT", sentence=12728, chars=[30,32], words=[6,6]), Span("150", sentence=12728, chars=[164,166], words=[39,39]), Span("ng", sentence=12728, chars=[107,108], words=[26,26]))
BiomarkerLevelU

In [7]:
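# NOTE: this cell is disabled. When it was last run it failed with
# "NameError: name 'add_adj_candidate_BLU' is not defined". It also refers to
# BiomarkerLevel, which is not defined in the cells shown here (the schema in
# use is BiomarkerLevelUnit), and its messages still say "BD".
# session.rollback()
# print "Number of dev BD candidates without adj. boosting: ", len(cands_BLU[1])
# add_adj_candidate_BLU(session, BiomarkerLevel, cands_BLU[1], 0)
# # fix_specificity(session, BiomarkerCondition, cands_BC[1])
# print "Number of dev BD candidates with adj. boosting: ", session.query(BiomarkerLevel).filter(BiomarkerLevel.split == 1).count()
# session.commit()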

Number of dev BD candidates without adj. boosting:  951


NameError: name 'add_adj_candidate_BLU' is not defined

In [None]:
# Star-import the names exported by the project's LF module so the labeling
# functions can be referenced unqualified below.
# NOTE(review): LF is also imported as a module in the first cell; explicit
# names would make it clearer where each LF_* function comes from.
from LF import *
# Labeling functions that are handed to the LabelAnnotator in the next cell.
LFs_BD = [LF_colon, LF_known_abs, LF_single_letter,
          LF_roman_numeral, LF_common_2000, LF_same_thing_BD]

In [None]:
# LabelAnnotator is already imported in the first cell; this repeats the import.
from snorkel.annotations import LabelAnnotator
# Annotator configured with the labeling functions listed in LFs_BD.
BD_labeler = LabelAnnotator(lfs=LFs_BD)

In [None]:
# Seed NumPy's global random number generator before labeling.
np.random.seed(1701)
# Apply the labeler to split 0 (the training split); %time reports how long it took.
%time L_train_BD = BD_labeler.apply(split=0)
# Bare expression: displays the result as the cell's output.
L_train_BD

In [None]:
# Rebind L_train_BD to what load_matrix returns for split 0 (timed with %time).
# NOTE(review): presumably this reloads the stored labels instead of
# recomputing them -- confirm against the LabelAnnotator API.
%time L_train_BD = BD_labeler.load_matrix(session, split=0)
# Bare expression: displays the result as the cell's output.
L_train_BD

In [None]:
# Display what get_candidate returns for index 0 of L_train_BD
# (presumably the candidate behind the first row -- TODO confirm).
L_train_BD.get_candidate(session, 0)

In [None]:
# Display what get_key returns for index 0 of L_train_BD
# (presumably the key of the first column -- TODO confirm).
L_train_BD.get_key(session, 0)

In [None]:
from snorkel.learning import GenerativeModel

# Fit the generative model on the training label matrix.
gen_model = GenerativeModel()

# The step size is scaled down by the number of rows in the label matrix.
n_train_rows = L_train_BD.shape[0]
gen_model.train(
    L_train_BD,
    epochs=100,
    decay=0.95,
    step_size=0.1 / n_train_rows,
    reg_param=1e-6,
)

In [None]:
# Display the lf_accuracy attribute of the trained model's weights
# (presumably one learned accuracy per labeling function -- TODO confirm).
gen_model.weights.lf_accuracy

In [None]:
# Marginals produced by the generative model for the training label matrix.
train_marginals = gen_model.marginals(L_train_BD)


In [None]:
import matplotlib.pyplot as plt

# Histogram of the training marginals. The figure is labelled so it can be
# read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.hist(train_marginals, bins=20)
ax.set_xlabel("Training marginal")
ax.set_ylabel("Count")
ax.set_title("Distribution of training marginals")
plt.show()

In [None]:
# Run the labeler's apply_existing on split 1 (the dev split).
# NOTE(review): L_dev is not referenced again in the cells shown here.
L_dev = BD_labeler.apply_existing(split=1)

In [None]:
# save_marginals is already imported in the first cell; this repeats the import.
from snorkel.annotations import save_marginals
# Save the training marginals through the session (timed with %time).
%time save_marginals(session, L_train_BD, train_marginals)

In [None]:
# load_marginals is already imported in the first cell; this repeats the import.
from snorkel.annotations import load_marginals

# Rebind train_marginals to what load_marginals returns for split 0
# (the training split).
train_marginals = load_marginals(session, split=0)

In [None]:
# Rebind the per-split candidate lists (0 = train, 1 = dev, 2 = test) to the
# BiomarkerDrug candidates, ordered by candidate id.
# NOTE(review): in the cells shown here BiomarkerDrug is only defined in
# commented-out code, and only BiomarkerLevelUnit candidates are extracted --
# confirm which schema this cell is meant to use before running it.
train_cands, dev_cands, test_cands = (
    session.query(BiomarkerDrug)
    .filter(BiomarkerDrug.split == split_id)
    .order_by(BiomarkerDrug.id)
    .all()
    for split_id in (0, 1, 2)
)

In [None]:
from snorkel.annotations import load_gold_labels

# Import the external 'gold' annotations: first the dev file, then the test
# file, each paired with the candidates of its split.
gold_label_sources = (
    ('articles/drug_gold_labels.tsv', dev_cands),
    ('articles/drug_test_labels.tsv', test_cands),
)
for labels_path, split_cands in gold_label_sources:
    load_external_labels(session, BiomarkerDrug, 'Biomarker', 'Drug',
                         labels_path, split_cands, annotator_name='gold')

# Gold label matrices for the dev (split 1) and test (split 2) candidates.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)




In [None]:
# Sizes of the training and dev candidate lists, one per line.
for split_cands in (train_cands, dev_cands):
    print(len(split_cands))

In [None]:
from snorkel.learning.disc_models.rnn import reRNN

# Keyword arguments forwarded to lstm.train() below.
train_kwargs = dict(
    lr=0.01,
    dim=50,
    n_epochs=10,
    dropout=0.25,
    print_freq=1,
    max_sentence_length=100,
)

# Train the model on the training candidates and their marginals, passing the
# dev candidates and their gold labels as X_dev / Y_dev.
lstm = reRNN(seed=1701, n_threads=None)
lstm.train(
    train_cands,
    train_marginals,
    X_dev=dev_cands,
    Y_dev=L_gold_dev,
    **train_kwargs
)

In [None]:
# Score the LSTM on the test candidates against the gold test labels and
# report the three returned values to three decimal places.
p, r, f1 = lstm.score(test_cands, L_gold_test)
score_line = "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1)
print(score_line)

In [None]:
# Run error analysis on the test candidates against the gold test labels and
# unpack its four results (the names suggest true/false positives and
# true/false negatives -- TODO confirm the order against the Snorkel API).
tp, fp, tn, fn = lstm.error_analysis(session, test_cands, L_gold_test)

In [None]:
# Save the LSTM's marginals for the test candidates through the session.
lstm.save_marginals(session, test_cands)

In [None]:
# LSTM predictions for the training candidates; the following cells compare
# each entry with 1 and index it in step with train_cands.
predictions = lstm.predictions(train_cands)

In [None]:
# Count how many training candidates the LSTM predicted as 1.
n_positive = sum(1 for prediction in predictions if prediction == 1)
print(n_positive)

In [None]:
# Show every training candidate next to the LSTM's prediction for it.
# The message is formatted *inside* the print call: calling .format() on the
# result of print(...) only works while `print` is the Python 2 statement and
# raises AttributeError once print is a function.
for idx, cand in enumerate(train_cands):
    print("Candidate: {}. Prediction: {}".format(cand, predictions[idx]))

In [None]:
# Save the trained LSTM using save()'s default arguments.
lstm.save()