using scipy minimize function, results on test set

In [1]:
import bz2
from six.moves.cPickle import load

with bz2.BZ2File('data/ctd.pkl.bz2', 'rb') as ctd_f:
    ctd_unspecified, ctd_therapy, ctd_marker = load(ctd_f)
    
def cand_in_ctd_unspecified(c):
    return 1 if c.get_cids() in ctd_unspecified else 0

def cand_in_ctd_therapy(c):
    return 1 if c.get_cids() in ctd_therapy else 0

def cand_in_ctd_marker(c):
    return 1 if c.get_cids() in ctd_marker else 0

def LF_in_ctd_unspecified(c):
    return -1 * cand_in_ctd_unspecified(c)

def LF_in_ctd_therapy(c):
    return -1 * cand_in_ctd_therapy(c)

def LF_in_ctd_marker(c):
    return cand_in_ctd_marker(c)

import re
from snorkel.lf_helpers import (
    get_tagged_text,
    rule_regex_search_tagged_text,
    rule_regex_search_btw_AB,
    rule_regex_search_btw_BA,
    rule_regex_search_before_A,
    rule_regex_search_before_B,
)

# List to parenthetical
def ltp(x):
    return '(' + '|'.join(x) + ')'

def LF_induce(c):
    return 1 if re.search(r'{{A}}.{0,20}induc.{0,20}{{B}}', get_tagged_text(c), flags=re.I) else 0

causal_past = ['induced', 'caused', 'due']
def LF_d_induced_by_c(c):
    return rule_regex_search_btw_BA(c, '.{0,50}' + ltp(causal_past) + '.{0,9}(by|to).{0,50}', 1)
def LF_d_induced_by_c_tight(c):
    return rule_regex_search_btw_BA(c, '.{0,50}' + ltp(causal_past) + ' (by|to) ', 1)

def LF_induce_name(c):
    return 1 if 'induc' in c.chemical.get_span().lower() else 0     

causal = ['cause[sd]?', 'induce[sd]?', 'associated with']
def LF_c_cause_d(c):
    return 1 if (
        re.search(r'{{A}}.{0,50} ' + ltp(causal) + '.{0,50}{{B}}', get_tagged_text(c), re.I)
        and not re.search('{{A}}.{0,50}(not|no).{0,20}' + ltp(causal) + '.{0,50}{{B}}', get_tagged_text(c), re.I)
    ) else 0

treat = ['treat', 'effective', 'prevent', 'resistant', 'slow', 'promise', 'therap']
def LF_d_treat_c(c):
    return rule_regex_search_btw_BA(c, '.{0,50}' + ltp(treat) + '.{0,50}', -1)
def LF_c_treat_d(c):
    return rule_regex_search_btw_AB(c, '.{0,50}' + ltp(treat) + '.{0,50}', -1)
def LF_treat_d(c):
    return rule_regex_search_before_B(c, ltp(treat) + '.{0,50}', -1)
def LF_c_treat_d_wide(c):
    return rule_regex_search_btw_AB(c, '.{0,200}' + ltp(treat) + '.{0,200}', -1)

def LF_c_d(c):
    return 1 if ('{{A}} {{B}}' in get_tagged_text(c)) else 0

def LF_c_induced_d(c):
    return 1 if (
        ('{{A}} {{B}}' in get_tagged_text(c)) and 
        (('-induc' in c[0].get_span().lower()) or ('-assoc' in c[0].get_span().lower()))
        ) else 0

def LF_improve_before_disease(c):
    return rule_regex_search_before_B(c, 'improv.*', -1)

pat_terms = ['in a patient with ', 'in patients with']
def LF_in_patient_with(c):
    return -1 if re.search(ltp(pat_terms) + '{{B}}', get_tagged_text(c), flags=re.I) else 0

uncertain = ['combin', 'possible', 'unlikely']
def LF_uncertain(c):
    return rule_regex_search_before_A(c, ltp(uncertain) + '.*', -1)

def LF_induced_other(c):
    return rule_regex_search_tagged_text(c, '{{A}}.{20,1000}-induced {{B}}', -1)

def LF_far_c_d(c):
    return rule_regex_search_btw_AB(c, '.{100,5000}', -1)

def LF_far_d_c(c):
    return rule_regex_search_btw_BA(c, '.{100,5000}', -1)

def LF_risk_d(c):
    return rule_regex_search_before_B(c, 'risk of ', 1)

def LF_develop_d_following_c(c):
    return 1 if re.search(r'develop.{0,25}{{B}}.{0,25}following.{0,25}{{A}}', get_tagged_text(c), flags=re.I) else 0

procedure, following = ['inject', 'administrat'], ['following']
def LF_d_following_c(c):
    return 1 if re.search('{{B}}.{0,50}' + ltp(following) + '.{0,20}{{A}}.{0,50}' + ltp(procedure), get_tagged_text(c), flags=re.I) else 0

def LF_measure(c):
    return -1 if re.search('measur.{0,75}{{A}}', get_tagged_text(c), flags=re.I) else 0

def LF_level(c):
    return -1 if re.search('{{A}}.{0,25} level', get_tagged_text(c), flags=re.I) else 0

def LF_neg_d(c):
    return -1 if re.search('(none|not|no) .{0,25}{{B}}', get_tagged_text(c), flags=re.I) else 0

WEAK_PHRASES = ['none', 'although', 'was carried out', 'was conducted',
                'seems', 'suggests', 'risk', 'implicated',
               'the aim', 'to (investigate|assess|study)']

WEAK_RGX = r'|'.join(WEAK_PHRASES)

def LF_weak_assertions(c):
    return -1 if re.search(WEAK_RGX, get_tagged_text(c), flags=re.I) else 0


def LF_ctd_marker_c_d(c):
    return LF_c_d(c) * cand_in_ctd_marker(c)

def LF_ctd_marker_induce(c):
    return (LF_c_induced_d(c) or LF_d_induced_by_c_tight(c)) * cand_in_ctd_marker(c)

def LF_ctd_therapy_treat(c):
    return LF_c_treat_d_wide(c) * cand_in_ctd_therapy(c)

def LF_ctd_unspecified_treat(c):
    return LF_c_treat_d_wide(c) * cand_in_ctd_unspecified(c)

def LF_ctd_unspecified_induce(c):
    return (LF_c_induced_d(c) or LF_d_induced_by_c_tight(c)) * cand_in_ctd_unspecified(c)

def LF_closer_chem(c):
    # Get distance between chemical and disease
    chem_start, chem_end = c.chemical.get_word_start(), c.chemical.get_word_end()
    dis_start, dis_end = c.disease.get_word_start(), c.disease.get_word_end()
    if dis_start < chem_start:
        dist = chem_start - dis_end
    else:
        dist = dis_start - chem_end
    # Try to find chemical closer than @dist/2 in either direction
    sent = c.get_parent()
    closest_other_chem = float('inf')
    for i in range(dis_end, min(len(sent.words), dis_end + dist / 2)):
        et, cid = sent.entity_types[i], sent.entity_cids[i]
        if et == 'Chemical' and cid != sent.entity_cids[chem_start]:
            return -1
    for i in range(max(0, dis_start - dist / 2), dis_start):
        et, cid = sent.entity_types[i], sent.entity_cids[i]
        if et == 'Chemical' and cid != sent.entity_cids[chem_start]:
            return -1
    return 0

def LF_closer_dis(c):
    # Get distance between chemical and disease
    chem_start, chem_end = c.chemical.get_word_start(), c.chemical.get_word_end()
    dis_start, dis_end = c.disease.get_word_start(), c.disease.get_word_end()
    if dis_start < chem_start:
        dist = chem_start - dis_end
    else:
        dist = dis_start - chem_end
    # Try to find chemical disease than @dist/8 in either direction
    sent = c.get_parent()
    for i in range(chem_end, min(len(sent.words), chem_end + dist / 8)):
        et, cid = sent.entity_types[i], sent.entity_cids[i]
        if et == 'Disease' and cid != sent.entity_cids[dis_start]:
            return -1
    for i in range(max(0, chem_start - dist / 8), chem_start):
        et, cid = sent.entity_types[i], sent.entity_cids[i]
        if et == 'Disease' and cid != sent.entity_cids[dis_start]:
            return -1
    return 0

LFs = [
    LF_c_cause_d,
    LF_c_d,
    LF_c_induced_d,
    LF_c_treat_d,
    LF_c_treat_d_wide,
    LF_closer_chem,
    LF_closer_dis,
    LF_ctd_marker_c_d,
    LF_ctd_marker_induce,
    LF_ctd_therapy_treat,
    LF_ctd_unspecified_treat,
    LF_ctd_unspecified_induce,
    LF_d_following_c,
    LF_d_induced_by_c,
    LF_d_induced_by_c_tight,
    LF_d_treat_c,
    LF_develop_d_following_c,
    LF_far_c_d,
    LF_far_d_c,
    LF_improve_before_disease,
    LF_in_ctd_therapy,
    LF_in_ctd_marker,
    LF_in_patient_with,
    LF_induce,
    LF_induce_name,
    LF_induced_other,
    LF_level,
    LF_measure,
    LF_neg_d,
    LF_risk_d,
    LF_treat_d,
    LF_uncertain,
    LF_weak_assertions,
]
print("NO. of Labelling Functions: ", len(LFs))

('NO. of Labelling Functions: ', 33)


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession

session = SnorkelSession()

from snorkel.models import candidate_subclass

ChemicalDisease = candidate_subclass('ChemicalDisease', ['chemical', 'disease'])

train_cands = session.query(ChemicalDisease).filter(ChemicalDisease.split == 0).all()
dev_cands = session.query(ChemicalDisease).filter(ChemicalDisease.split == 1).all()
print(len(dev_cands))
print(len(train_cands))

888
8272


In [3]:
from load_external_annotations import load_external_labels
load_external_labels(session, ChemicalDisease, split=1, annotator='gold')

from snorkel.annotations import load_gold_labels
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

#print(L_gold_dev)
gold_labels = []
for i,L in enumerate(L_gold_dev):
    gold_labels.append(L[0,0])

AnnotatorLabels created: 0


In [4]:
import numpy as np
import math

def PHI(K,LAMDAi):
    return [K*j for j in LAMDAi]

def softmax(THETA,LAMDAi):
    x = []
    for k in [1,-1]:
        product = np.dot(PHI(k,LAMDAi),THETA)
        x.append(product)
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def function_conf(THETA,LAMDA,P_cap,Confidence):
    s = 0.0
    i = 0
    for LAMDAi in LAMDA:
        s = s + Confidence[i]*np.dot(np.log(softmax(THETA,LAMDAi)),P_cap[i])
        i = i+1
    return -s

def function(THETA,LAMDA,P_cap):
    s = 0.0
    i = 0
    for LAMDAi in LAMDA:
        s = s + np.dot(np.log(softmax(THETA,LAMDAi)),P_cap[i])
        i = i+1
    return -s

def P_K_Given_LAMDAi_THETA(K,THETA,LAMDAi):
    x = softmax(THETA,LAMDAi)
    if(K==1):
        return x[0]
    else:
        return x[1]
      

np.random.seed(78)
THETA = np.random.rand(len(LFs),1)

def PHIj(j,K,LAMDAi):
    return LAMDAi[j]*K

def RIGHT(j,LAMDAi,THETA):
    phi = []
    for k in [1,-1]:
        phi.append(PHIj(j,k,LAMDAi))
    x = softmax(THETA,LAMDAi)
    return np.dot(phi,x)
    

def function_conf_der(THETA,LAMDA,P_cap,Confidence):
    der = []
    for j in range(len(THETA)):
        i = 0
        s = 0.0
        for LAMDAi in LAMDA:
            p = 0
            for K in [1,-1]:
                s = s + Confidence[i]*(PHIj(j,K,LAMDAi)-RIGHT(j,LAMDAi,THETA))*P_cap[i][p]
                p = p+1
            i = i+1
        der.append(-s)
    return np.array(der)

def function_der(THETA,LAMDA,P_cap):
    der = []
    for j in range(len(THETA)):
        i = 0
        s = 0.0
        for LAMDAi in LAMDA:
            p = 0
            for K in [1,-1]:
                s = s + (PHIj(j,K,LAMDAi)-RIGHT(j,LAMDAi,THETA))*P_cap[i][p]
                p = p+1
            i = i+1
        der.append(-s)
    return np.array(der)


import numpy as np


def get_LAMDA(cands):
    LAMDA = []
    for ci in cands:
        L=[]
        P_ik = []
        for LF in LFs:
            L.append(LF(ci))
        LAMDA.append(L)
    return LAMDA

def get_Confidence(LAMDA):
    confidence = []
    for L in LAMDA:
        Total_L = float(len(L))
        No_zeros = L.count(0)
        No_Non_Zeros = Total_L - No_zeros
        confidence.append(No_Non_Zeros/Total_L)
    return confidence    
    
def get_Initial_P_cap(LAMDA):
    P_cap = []
    for L in LAMDA:
        P_ik = []
        denominator=float(L.count(1)+L.count(-1))
        if(denominator==0):
            denominator=1
        P_ik.append(L.count(1)/denominator)
        P_ik.append(L.count(-1)/denominator)
        P_cap.append(P_ik)
    return P_cap
    #print(np.array(LAMDA))
    #print(np.array(P_cap))append(L)
    #LAMDA=np.array(LAMDA).astype(int)
    #P_cap=np.array(P_cap)
    #print(np.array(LAMDA).shape)
    #print(np.array(P_cap).shape)
    #print(L)
    #print(ci.chemical.get_span(),ci.disease.get_span(),"No.Os",L.count(0),"No.1s",L.count(1),"No.-1s",L.count(-1))
    #print(ci.chemical.get_span(),ci.disease.get_span(),"P(0):",L.count(0)/len(L)," P(1)",L.count(1)/len(L),"P(-1)",L.count(-1)/len(L))

        
def get_P_cap(LAMDA,THETA):
    P_cap = []
    for LAMDAi in LAMDA:
        P_capi = softmax(THETA,LAMDAi)
        P_cap.append(P_capi)
    return P_cap


def score(predicted_labels,gold_labels):
    tp =0.0
    tn =0.0
    fp =0.0
    fn =0.0
    for i in range(len(gold_labels)):
        if(predicted_labels[i]==gold_labels[i]):
            if(predicted_labels[i]==1):
                tp=tp+1
            else:
                tn=tn+1
        else:
            if(predicted_labels[i]==1):
                fp=fp+1
            else:
                fn=fn+1
    print("tp",tp,"tn",tn,"fp",fp,"fn",fn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1score = (2*precision*recall)/(precision+recall)
    print("precision:",precision)
    print("recall:",recall)
    print("F1 score:",f1score)
                
           
    
from scipy.optimize import minimize

def get_marginals(P_cap):
    marginals = []
    for P_capi in P_cap:
        marginals.append(P_capi[0])
    return marginals

def predict_labels(marginals):
    predicted_labels=[]
    for i in marginals:
        if(i<0.4):
            predicted_labels.append(-1)
        else:
            predicted_labels.append(1)
    return predicted_labels
    
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
    
dev_LAMDA = get_LAMDA(dev_cands)
def print_details(label,THETA):
    print(label)
    global dev_LAMDA
    P_cap = get_P_cap(dev_LAMDA,THETA)
    marginals=get_marginals(P_cap)
    #plt.hist(marginals, bins=20)
    #plt.show()
    #plt.bar(range(0,888),train_marginals)
    #plt.show()
    predicted_labels=predict_labels(marginals)
    print(len(marginals),len(predicted_labels),len(gold_labels))
    score(predicted_labels,gold_labels)
    #print(precision_recall_fscore_support(np.array(gold_labels),np.array(predicted_labels),average='binary')) 
    
    
    
def train(No_Iter,Use_Confidence=True):
    global THETA
    LAMDA = get_LAMDA(train_cands)
    P_cap = get_Initial_P_cap(LAMDA)
    Confidence = get_Confidence(LAMDA)
    for iteration in range(No_Iter):
        if(Use_Confidence==True):
            res = minimize(function_conf,THETA,args=(LAMDA,P_cap,Confidence), method='BFGS',jac=function_conf_der,options={'disp': True, 'maxiter':20}) #nelder-mead
        else:
            res = minimize(function,THETA,args=(LAMDA,P_cap), method='BFGS',jac=function_der,options={'disp': True, 'maxiter':20}) #nelder-mead            
        THETA = res.x # new THETA
        print(THETA)
        P_cap = get_P_cap(LAMDA,THETA) #new p_cap 
        print_details("train iteration: "+str(iteration),THETA)
        #score(predicted_labels,gold_labels)
    NP_P_cap = np.array(P_cap)
    np.savetxt('Train_P_cap_conf.txt', NP_P_cap, fmt='%f')
    np.save('Train_P_cap_conf', NP_P_cap)
    NP_THETA = np.array(THETA)
    np.savetxt('FTHETA_conf.txt', NP_THETA, fmt='%f')
    np.save('FTHETANPY_conf', NP_THETA) # save the file as "outfile_name.npy" 

        
def test(THETA):
    global dev_LAMDA
    P_cap = get_P_cap(dev_LAMDA,THETA)
    print_details("test:",THETA)
    NP_P_cap = np.array(P_cap)
    np.savetxt('Dev_P_cap_conf.txt', NP_P_cap, fmt='%f')
    np.save('Dev_P_cap_conf', NP_P_cap)
                    
def load_marginals(s):
    marginals = []
    if(s=="train"):
        train_P_cap = np.load("Train_P_cap_conf.npy")
        marginals = train_P_cap[:,0]
    return marginals

In [5]:
#train(3,Use_Confidence=False)

#test(THETA)

In [6]:
train(3)

test(THETA)

         Current function value: 207.013624
         Iterations: 20
         Function evaluations: 24
         Gradient evaluations: 24
[ 0.47844004  1.29668512  0.61060496  0.53903927  0.73890189  0.60851199
  0.39646162 -0.64617727 -0.7060489  -0.50304711  0.18643678 -0.03844427
  0.81023496  0.4939991   1.02149272  0.84749134  0.2348705   0.66393911
  0.74300491  0.54768663  0.89075     1.01976587  0.75587943  0.49980697
  0.53499946  0.49288678  0.72534036  0.69471096  0.68106339  0.48964412
  0.6509262   0.88324222  0.75345992]
train iteration: 0
(888, 888, 888)
('tp', 224.0, 'tn', 311.0, 'fp', 281.0, 'fn', 72.0)
('precision:', 0.44356435643564357)
('recall:', 0.7567567567567568)
('F1 score:', 0.5593008739076155)
Optimization terminated successfully.
         Current function value: 206.649140
         Iterations: 0
         Function evaluations: 1
         Gradient evaluations: 1
[ 0.47844004  1.29668512  0.61060496  0.53903927  0.73890189  0.60851199
  0.39646162 -0.64617727 -0.

In [7]:
THETA = np.load('FTHETANPY_conf.npy');
test(THETA)

test:
(888, 888, 888)
('tp', 224.0, 'tn', 311.0, 'fp', 281.0, 'fn', 72.0)
('precision:', 0.44356435643564357)
('recall:', 0.7567567567567568)
('F1 score:', 0.5593008739076155)


# Training Sparse Logistic Regression

In [8]:
train_marginals=load_marginals("train")

In [9]:
from snorkel.annotations import FeatureAnnotator
featurizer = FeatureAnnotator()

F_train = featurizer.load_matrix(session, split=0)
F_dev = featurizer.load_matrix(session, split=1)
F_test = featurizer.load_matrix(session, split=2)

In [10]:
from snorkel.learning import SparseLogisticRegression
from snorkel.learning.utils import RandomSearch, ListParameter, RangeParameter

# Searching over learning rate
rate_param = RangeParameter('lr', 1e-6, 1e-2, step=1, log_base=10)
l1_param  = RangeParameter('l1_penalty', 1e-6, 1e-2, step=1, log_base=10)
l2_param  = RangeParameter('l2_penalty', 1e-6, 1e-2, step=1, log_base=10)

# NOTE: A larger search (n) would likely lead to a higher score!
searcher = RandomSearch(SparseLogisticRegression, [rate_param, l1_param, l2_param], F_train,
                        Y_train=train_marginals, n=5)


from snorkel.annotations import load_gold_labels
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_dev

Initialized RandomSearch search of size 5. Search space size = 125.


<888x1 sparse matrix of type '<type 'numpy.int64'>'
	with 888 stored elements in Compressed Sparse Row format>

In [11]:
%%time
import numpy as np
np.random.seed(1701)
disc_model, run_stats = searcher.fit(F_dev, L_gold_dev, n_epochs=85, rebalance=0.5, print_freq=25)

[1] Testing lr = 1.00e-02, l1_penalty = 1.00e-06, l2_penalty = 1.00e-04
[SparseLogisticRegression] Training model
[SparseLogisticRegression] n_train=6754  #epochs=85  batch size=256
[SparseLogisticRegression] Epoch 0 (0.82s)	Average loss=0.718038
[SparseLogisticRegression] Epoch 25 (22.38s)	Average loss=0.557215
[SparseLogisticRegression] Epoch 50 (44.60s)	Average loss=0.566103
[SparseLogisticRegression] Epoch 75 (66.65s)	Average loss=0.570378
[SparseLogisticRegression] Epoch 84 (74.32s)	Average loss=0.572765
[SparseLogisticRegression] Training done (74.32s)
[SparseLogisticRegression] F1 Score: 0.538043478261
[SparseLogisticRegression] Model saved as <SparseLogisticRegression_0>
[2] Testing lr = 1.00e-04, l1_penalty = 1.00e-06, l2_penalty = 1.00e-03
[SparseLogisticRegression] Training model
[SparseLogisticRegression] n_train=6754  #epochs=85  batch size=256
[SparseLogisticRegression] Epoch 0 (0.82s)	Average loss=1.365198
[SparseLogisticRegression] Epoch 25 (23.25s)	Average loss=1.21504

In [12]:
run_stats

Unnamed: 0,lr,l1_penalty,l2_penalty,Prec.,Rec.,F1
0,0.01,1e-06,0.0001,0.45,0.668919,0.538043
4,0.01,1e-06,0.0001,0.449541,0.662162,0.535519
2,0.001,0.01,0.0001,0.428571,0.709459,0.534351
3,0.001,0.0001,0.001,0.438178,0.682432,0.533686
1,0.0001,1e-06,0.001,0.41453,0.655405,0.507853


# Scoring on test set

In [13]:
from load_external_annotations import load_external_labels
load_external_labels(session, ChemicalDisease, split=2, annotator='gold')

from snorkel.annotations import load_gold_labels
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)
L_gold_test

tp, fp, tn, fn  = disc_model.error_analysis(session, F_test, L_gold_test)

AnnotatorLabels created: 0
Scores (Un-adjusted)
Pos. class accuracy: 0.688
Neg. class accuracy: 0.585
Precision            0.445
Recall               0.688
F1                   0.54
----------------------------------------
TP: 1036 | FP: 1293 | TN: 1822 | FN: 469



In [14]:
tp, fp, tn, fn  = disc_model.error_analysis(session, F_dev, L_gold_dev)

Scores (Un-adjusted)
Pos. class accuracy: 0.669
Neg. class accuracy: 0.591
Precision            0.45
Recall               0.669
F1                   0.538
----------------------------------------
TP: 198 | FP: 242 | TN: 350 | FN: 98

