# Markerville Backend

## Imports and Candidate Subclass

This cell collects all the imports and defines the candidate subclass, i.e. the relation type to extract.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import matplotlib.pyplot as plt
from six.moves.cPickle import load
import cPickle
import numpy as np

from snorkel import SnorkelSession
from snorkel.parser import XMLMultiDocPreprocessor, CorpusParser
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser.corenlp import StanfordCoreNLPServer
from snorkel.models import Document, Sentence, Candidate, candidate_subclass
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.viewer import SentenceNgramViewer
from snorkel.annotations import LabelAnnotator, load_gold_labels, FeatureAnnotator, save_marginals, load_marginals
from snorkel.learning import SparseLogisticRegression, GenerativeModel, RandomSearch
from snorkel.learning.structure import DependencySelector
from snorkel.learning.utils import MentionScorer
# from snorkel.contrib.rnn import reRNN

import matchers
import LF
from candidate_adjective_fixer import *
from load_external_annotations_new import load_external_labels

# Session object used by the cells below for queries, commits and rollbacks.
session = SnorkelSession()

# Candidate schema for the relation to extract, with the two arguments 'biomarker' and 'condition'.
BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition'])



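The following cell defines helper functions. It must be run (the helpers are used during candidate extraction), but its details can be skipped on a first read.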

In [None]:
#------------------
# Helper Functions
#------------------

def grabCandidates(extractor, schema):
    # Candidate Counts
    for k, sents in enumerate([train_sents, dev_sents, test_sents]):
        extractor.apply(sents, split=k, clear=False)
        print "Number of candidates: ", session.query(schema).filter(schema.split == k).count()
        session.commit()
        
    train_cands = session.query(schema).filter(
        schema.split == 0).all()
    dev_cands = session.query(schema).filter(
        schema.split == 1).all()
    test_cands = session.query(schema).filter(
        schema.split == 2).all()

    return [train_cands, dev_cands, test_cands]

## Setup and Preprocessing

Load the XML files containing the corpora. In this case there are three: a training corpus, a development corpus, and a test corpus.

In [None]:
#-----------------------
# Setup & Preprocessing
#-----------------------

# Instantiate the Session
session = SnorkelSession()

# Doc Preprocessing
# Training corpus: documents are <article> elements; text comes from the abstract paragraphs.
train_xpaths = {
    'doc':  './/article',
    'text': './/front/article-meta/abstract/p/text()',
    'id':   './/front/article-meta/article-id/text()',
}
file_path = 'articles/training.xml'
train_preprocessor = XMLMultiDocPreprocessor(path=file_path, **train_xpaths)

# The development and test corpora are read with the same XPath expressions.
passage_xpaths = {
    'doc':  './/document',
    'text': './/passage/text/text()',
    'id':   './/id/text()',
}

file_path = 'articles/development.xml'
dev_preprocessor = XMLMultiDocPreprocessor(path=file_path, **passage_xpaths)

file_path = 'articles/testcorpus.xml'
test_preprocessor = XMLMultiDocPreprocessor(path=file_path, **passage_xpaths)

# Parsing
# corenlp_server = StanfordCoreNLPServer(version="3.6.0", num_threads=4, port=12348)
# corpus_parser = CorpusParser(corenlp_server, parser=Spacy())



The CorpusParser is applied to each of these corpora to break them into Documents and Sentences.

In [None]:
# Build the corpus parser with the spaCy-backed parser.
corpus_parser = CorpusParser(parser=Spacy())
# corpus_parser = CorpusParser()

# Note: Parallelism can be run with a Postgres DBMS, but not SQLite
# The training corpus is parsed first with the default `clear` setting;
# the dev and test corpora are then parsed with clear=False.
corpus_parser.apply(list(train_preprocessor))
for extra_preprocessor in (dev_preprocessor, test_preprocessor):
    corpus_parser.apply(list(extra_preprocessor), clear=False)



The sentences are then split into train, dev, and test according to the document IDs associated with them. 

In [None]:
# Retrieving Stable IDs for each of the candidate sentences
# NOTE(review): unpickling can execute arbitrary code; only load a doc_ids.pkl
# that comes from a trusted source.
with open('articles/doc_ids.pkl', 'rb') as f:
    train_ids, dev_ids, test_ids = load(f)

# Sets give constant-time membership checks in the loop below.
train_ids, dev_ids, test_ids = set(train_ids), set(dev_ids), set(test_ids)
train_sents, dev_sents, test_sents = set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()


# Assigning each sentence to {train,dev,test}-set based on Stable ID
for doc in docs:
    doc_sentences = doc.sentences
    if not doc_sentences:
        # No sentences to assign; as before, such documents are skipped
        # without checking their ID.
        continue

    # The target split depends only on the document, so it is decided once
    # per document instead of once per sentence.
    if doc.name in train_ids:
        train_sents.update(doc_sentences)
    elif doc.name in dev_ids:
        dev_sents.update(doc_sentences)
    elif doc.name in test_ids:
        test_sents.update(doc_sentences)
    else:
        raise Exception(
            'ID <{0}> not found in any id set'.format(doc.name))

## Candidate Extraction

First, the n-gram range for each entity (the maximum number of words a mention may span) is set. Then the matchers (collections of regular expressions and dictionaries) are initialized.

The matchers, ngrams, and candidate_subclass are passed into the CandidateExtractor to extract candidates. 

In [None]:
#----------------------
# Candidate Extraction
#----------------------

# Candidate schema: same name and arguments as the definition in the imports cell.
BiomarkerCondition = candidate_subclass('BiomarkerCondition', ['biomarker', 'condition'])

# Search space per entity: n_max=1 for biomarkers, n_max=7 for conditions.
biomarker_ngrams = Ngrams(n_max=1)
condition_ngrams = Ngrams(n_max=7)

# Matchers for each entity, provided by the local `matchers` module.
bMatcher = matchers.getBiomarkerMatcher()
cMatcher = matchers.getConditionMatcher()

# The extractor pairs each n-gram space with its matcher, in (biomarker, condition) order.
candidate_extractor_BC = CandidateExtractor(
    BiomarkerCondition,
    [biomarker_ngrams, condition_ngrams],
    [bMatcher, cMatcher]
)

# Candidate lists per split, in the order [train, dev, test].
cands_BC = grabCandidates(candidate_extractor_BC, BiomarkerCondition)

In case of specificity issues, the adjectives in front of medium, condition, and drug entities are captured as well. The goal is to have more specific entities, such as esophageal cancer rather than just cancer.

In [None]:
# Roll back the session before the candidates are modified.
session.rollback()
# cands_BC[1] is the dev-split list returned by grabCandidates.
print "Number of dev BC candidates without adj. boosting: ", len(cands_BC[1])
# add_adj_candidate_BC is presumably provided by the star import from candidate_adjective_fixer.
# NOTE(review): the meaning of the trailing 0 is defined in that module and is not visible here — confirm.
add_adj_candidate_BC(session, BiomarkerCondition, cands_BC[1], 0)
# fix_specificity(session, BiomarkerCondition, cands_BC[1])
# The second count is queried from the database (split 1) instead of the in-memory list,
# presumably so that candidates added by the call above are included — confirm.
print "Number of dev BC candidates with adj. boosting: ", session.query(BiomarkerCondition).filter(BiomarkerCondition.split == 1).count()
session.commit()

## Labelling Functions

This is the weak supervision portion of the pipeline: these labelling functions are used to label the training data. To change the accuracy of the pipeline, modify these functions or add new ones. More information about evaluating the accuracy of labelling functions can be found on the Snorkel website.

In [None]:
from LF import *

# Labelling functions collected for the BiomarkerCondition labeler, one per line.
LFs_BC = [
    LF_markerDatabase,
    LF_keyword,
    LF_distance,
    LF_abstract_titleWord,
    LF_single_letter,
    LF_auxpass,
    LF_known_abs,
    LF_same_thing,
    LF_common_1000,
    LF_common_2000,
]

In [None]:
from snorkel.annotations import LabelAnnotator
# Annotator built from the labelling functions listed in LFs_BC.
BC_labeler = LabelAnnotator(lfs=LFs_BC)

## Train the final model

In [None]:
# Fix NumPy's global random seed before applying the labelling functions.
np.random.seed(1701)
# Apply BC_labeler to split 0 (the training split) and time the call.
%time L_train_BC = BC_labeler.apply(split=0)
# Display the resulting label matrix.
L_train_BC

In [None]:
# Load the split-0 label matrix through the session and time the call
# (presumably the matrix stored by the apply step in the previous cell — confirm).
%time L_train_BC = BC_labeler.load_matrix(session, split=0)
# Display the loaded label matrix.
L_train_BC

In [None]:
# Inspect the candidate at index 0 of the label matrix (presumably row 0 — confirm).
L_train_BC.get_candidate(session, 0)

In [None]:
# Inspect the key at index 0 of the label matrix (presumably column 0, i.e. a labelling function — confirm).
L_train_BC.get_key(session, 0)

In [None]:
from snorkel.learning import GenerativeModel

gen_model = GenerativeModel()

# Training hyperparameters; the step size is 0.1 divided by the number of rows in the label matrix.
gen_train_kwargs = {
    'epochs':    100,
    'decay':     0.95,
    'step_size': 0.1 / L_train_BC.shape[0],
    'reg_param': 1e-6,
}
gen_model.train(L_train_BC, **gen_train_kwargs)

In [None]:
# Display the lf_accuracy attribute of the trained model's weights
# (presumably one learned accuracy weight per labelling function — confirm).
gen_model.weights.lf_accuracy

In [None]:
# Marginals produced by the generative model for the training label matrix.
train_marginals = gen_model.marginals(L_train_BC)


In [None]:
import matplotlib.pyplot as plt

# Histogram of the generative model's training marginals, with a title and
# axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(train_marginals, bins=20)
ax.set_xlabel('Training marginal')
ax.set_ylabel('Count')
ax.set_title('Distribution of generative-model training marginals')
plt.show()

In [None]:
# Apply the labeler to split 1 (the dev split) via apply_existing.
# NOTE(review): L_dev is not used by any later cell visible in this notebook.
L_dev = BC_labeler.apply_existing(split=1)

In [None]:
from snorkel.annotations import save_marginals
# Store the training marginals through the session and time the call.
%time save_marginals(session, L_train_BC, train_marginals)

In [None]:
from snorkel.annotations import load_marginals

# Load the marginals for split 0 (the training split) through the session.
train_marginals = load_marginals(session, split=0)

In [None]:
# BiomarkerCondition candidates for splits 0, 1 and 2 (train, dev, test), each ordered by id.
train_cands, dev_cands, test_cands = [
    session.query(BiomarkerCondition)
           .filter(BiomarkerCondition.split == split_id)
           .order_by(BiomarkerCondition.id)
           .all()
    for split_id in (0, 1, 2)
]

In [None]:
from snorkel.annotations import load_gold_labels

# Attach the external gold annotations (annotator 'gold') to the dev and test candidates.
load_external_labels(session, BiomarkerCondition, 'Biomarker', 'Condition', 'articles/disease_gold_labels.tsv', dev_cands, annotator_name='gold')
load_external_labels(session, BiomarkerCondition, 'Biomarker', 'Condition', 'articles/disease_gold_labels.tsv', test_cands, annotator_name='gold')

# Gold label matrices per split: dev candidates are split 1, test candidates are split 2
# (the same split numbers used by the queries that build dev_cands and test_cands).
# The test matrix was previously loaded with split=1, which made it a copy of the dev labels.
L_gold_dev  = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)




In [None]:
print len(train_cands)
print len(dev_cands)

In [None]:
from snorkel.learning.disc_models.rnn import reRNN

# Hyperparameters forwarded to lstm.train as keyword arguments.
train_kwargs = dict(
    lr=0.01,
    dim=50,
    n_epochs=10,
    dropout=0.25,
    print_freq=1,
    max_sentence_length=100
)

# Train on the training candidates and their marginals; the dev candidates
# and their gold labels are passed as X_dev / Y_dev.
lstm = reRNN(seed=1701, n_threads=None)
lstm.train(
    train_cands,
    train_marginals,
    X_dev=dev_cands,
    Y_dev=L_gold_dev,
    **train_kwargs
)

The below information is generated using the test set as an accuracy metric

In [None]:
# Score on the held-out test set: test candidates (split 2) against the gold labels of the same split.
# The labels are loaded in this cell so that candidates and labels always come from the same split;
# the previous version scored dev_cands, i.e. the set already passed to training as X_dev.
test_gold_labels = load_gold_labels(session, annotator_name='gold', split=2)
p, r, f1 = lstm.score(test_cands, test_gold_labels)
print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))

In [None]:
# Error analysis on the held-out test set: test candidates (split 2) against the gold labels
# of the same split, loaded inline so that the two are always paired correctly.
tp, fp, tn, fn = lstm.error_analysis(
    session,
    test_cands,
    load_gold_labels(session, annotator_name='gold', split=2)
)

In [None]:
# Store the model's marginals for the test candidates through the session.
lstm.save_marginals(session, test_cands)

In [None]:
# Model predictions for the training candidates (indexed in the same order as train_cands below).
predictions = lstm.predictions(train_cands)

In [None]:
i = 0
for prediction in predictions: 
    if(prediction == 1):
        i+=1
print i

In [None]:
# Print every training candidate next to the prediction at the same index.
# The message is formatted before it is handed to print; the earlier form called
# .format on the result of print(...), which only works as a Python 2 print statement.
for idx, candidate in enumerate(train_cands):
    print("Candidate: {}. Prediction: {}".format(candidate, predictions[idx]))

Export the Final Model

In [None]:
# Save the trained model under the name 'biomarker'.
lstm.save('biomarker')