# Extract hypothesis candidates from sentences
### We focus only on a subset of documents for which the full body is available

In [30]:
%load_ext autoreload
%autoreload 2

import cPickle, os, sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from ddlite import *
from spacy.en import English
nlp = English()

from NounPhraseExtractor import *
from HypothesesCandidates import *

# init database handler
# 'database.ini' is a relative path, so it is resolved against the kernel's working directory.
# NOTE(review): DatabaseHandler is not imported by name in this cell; presumably it comes from
# one of the star-imports above — confirm which module provides it.
dbHandler = DatabaseHandler('database.ini')

# grab sentences from DB
# Requests 1000 sentences from the 'keyword_matched_sentences' table.
# NOTE(review): the method name suggests a random sample, so downstream counts may differ
# between runs unless the handler seeds its sampling — confirm.
sents = dbHandler.get_random_sentences(table='keyword_matched_sentences',count=1000)
# `keywords` is used below as the dictionary for both extractors.
keywords = dbHandler.get_doc_keywords()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# init the two dictionary-based extractors (treatment side and outcome side)
# Both are built from the same `keywords` dictionary and differ only in their label.
# NOTE(review): ignore_case=False presumably makes dictionary matching case-sensitive — confirm
# against SentDictionaryMatch.
Treatment_Extr = SentDictionaryMatch(label='Keyword1', dictionary=keywords, ignore_case=False)
Outcome_Extr = SentDictionaryMatch(label='Keyword2', dictionary=keywords, ignore_case=False)

In [32]:
# extract cross-product of noun-phrases from valid sentences
# Inputs: the sampled sentences plus the treatment and outcome extractors defined above.
# The result is indexable (see `hypotheses[0]` in the next cell) and is passed to DDLiteModel below.
hypotheses = HypothesesCandidates(sents, Treatment_Extr, Outcome_Extr)

In [33]:
# Render the first extracted candidate (presumably as a quick visual check of the extraction;
# what `render` displays is defined by the candidate class, not visible here).
hypotheses[0].render()

In [34]:
# build a model for the current candidates and generate features
CM = DDLiteModel(hypotheses)
# Report the feature and candidate counts exposed by the model.
# Note: this is a Python 2 print statement; the notebook targets a Python 2 kernel.
print "Extracted {} features for each of {} mentions".format(CM.num_feats(), CM.num_candidates())

Extracted 3275 features for each of 103 mentions


In [35]:
# label a small amount of data with mindtagger to be used for evaluating the labeling functions
# num_sample=100 asks for 100 candidates to label (presumably sampled from the model's
# candidates — confirm); width/height size the embedded labeling view.
CM.open_mindtagger(num_sample=100, width='100%', height=1200)

Making sure MindTagger is installed. Hang on!


In [29]:
# Add Mindtagger tags to the model
# NOTE(review): this cell's recorded execution count (29) is lower than that of the cells
# above it (30-35), so its saved output comes from an earlier run; re-run it after labeling
# in the MindTagger view opened by the previous cell.
CM.add_mindtagger_tags()

  self.dev1 = idxs[ : np.floor(dev_split * len(idxs))]
  self.dev2 = idxs[np.floor(dev_split * len(idxs)) : ]


## Labeling Functions

In [None]:
def post_window(m, key, n=3):
    """Tell whether `key` is a lemma of the mention or of the up-to-`n` tokens after it.

    Args:
        m: candidate exposing `idxs` (token positions of the mention) and
            `lemmas` (a sequence indexable by those positions).
        key: lemma to search for.
        n: maximum number of tokens after the last mention token to inspect.

    Returns:
        True if `key` equals one of the inspected lemmas, otherwise False.
    """
    positions = list(m.idxs)
    last = np.max(positions)
    # Distance from the last mention token to the end of `lemmas`; it caps the window
    # so that no position past the final lemma is ever indexed.
    remaining = len(m.lemmas) - last
    for offset in range(1, min(remaining, n + 1)):
        positions.append(last + offset)
    inspected = [m.lemmas[pos] for pos in positions]
    return key in inspected
def pre_window(m, key, n=3):
    """Check whether `key` is a lemma of the mention or of the up-to-`n` tokens before it.

    Args:
        m: candidate exposing `idxs` (token positions of the mention) and
            `lemmas` (a sequence indexable by those positions).
        key: lemma to search for.
        n: maximum number of tokens before the first mention token to inspect.

    Returns:
        True if `key` equals one of the inspected lemmas, otherwise False.
    """
    s = list(m.idxs)
    b = np.min(s)
    # Exactly `b` tokens precede the mention (positions 0 .. b-1), so inspect min(b, n) of them.
    # The previous bound, range(1, min(b, n + 1)), stopped one short whenever the mention was
    # within `n` tokens of the start, so position 0 was never examined.
    s.extend([b - i for i in range(1, min(b, n) + 1)])
    return key in [m.lemmas[i] for i in s]
def stopper(m, stop):
    """Tell whether `stop` is the lemma of any token inside the mention itself.

    Args:
        m: candidate exposing `idxs` (token positions of the mention) and
            `lemmas` (a sequence indexable by those positions).
        stop: lemma to search for.

    Returns:
        True if one of the mention's lemmas equals `stop`, otherwise False.
    """
    mention_lemmas = []
    for position in m.idxs:
        mention_lemmas.append(m.lemmas[position])
    return stop in mention_lemmas

def LF_gene(m):
    """Labeling function: vote +1 when 'gene' is found by either window helper, else abstain (0).

    Both `post_window` and `pre_window` are called with their default window size.
    """
    if post_window(m, 'gene') or pre_window(m, 'gene'):
        return 1
    return 0
def LF_mutant(m):
    """Labeling function: vote +1 when 'mutant' is found by either window helper, else abstain (0).

    Both `post_window` and `pre_window` are called with their default window size.
    """
    if post_window(m, 'mutant') or pre_window(m, 'mutant'):
        return 1
    return 0
def LF_express(m):
    """Labeling function: vote +1 when 'express' is found by either window helper, else abstain (0).

    Both `post_window` and `pre_window` are called with their default window size.
    """
    if post_window(m, 'express') or pre_window(m, 'express'):
        return 1
    return 0
def LF_mutation(m):
    """Labeling function: vote +1 when a mention token's dependency parent has lemma 'mutation'.

    `m.dep_parents[i]` is read as a 1-based position into `m.lemmas` (hence the `- 1`).
    Parent values of 0 or less are skipped (0 presumably marks the parse root — confirm
    against the parser output). Without the guard such a value would index `m.lemmas[-1]`
    and silently compare against the last lemma instead of a real parent.

    Args:
        m: candidate exposing `idxs`, `lemmas` and `dep_parents`.

    Returns:
        1 if any mention token has a parent whose lemma is 'mutation', otherwise 0.
    """
    parent_lemmas = [
        m.lemmas[m.dep_parents[i] - 1]
        for i in m.idxs
        if m.dep_parents[i] > 0  # no real parent: avoid negative-index wraparound
    ]
    return 1 if 'mutation' in parent_lemmas else 0
def LF_dna(m):
    """Labeling function: vote -1 when `stopper` finds 'dna' among the mention's lemmas, else abstain (0)."""
    if stopper(m, 'dna'):
        return -1
    return 0
def LF_rna(m):
    """Labeling function: vote -1 when `stopper` finds 'rna' among the mention's lemmas, else abstain (0)."""
    if stopper(m, 'rna'):
        return -1
    return 0
def LF_snp(m):
    """Labeling function: vote -1 when `stopper` finds 'snp' among the mention's lemmas, else abstain (0)."""
    if stopper(m, 'snp'):
        return -1
    return 0