In [3]:
%matplotlib notebook
from matplotlib import pyplot as plt
import pandas
import re
import nltk
import numpy as np
from numpy import *
import csv
import theano.tensor as T
import os.path
from nltk.collocations import *
from optparse import OptionParser
from collections import Counter
from copy import copy
import cPickle
import csv
import warnings

from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

from keras.layers.convolutional import MaxPooling1D, Convolution1D
from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Graph
from keras.engine.training import slice_X
from keras.layers.core import Layer, Dense, Dropout, Activation,\
    Reshape, Flatten, Lambda
from keras.regularizers import Regularizer
from keras.optimizers import SGD
from keras.constraints import maxnorm
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.optimizers import Adadelta
from keras.callbacks import Callback


from IPython.utils.io import CapturedIO
from gensim.models import Word2Vec
from pkg_resources import resource_filename
import utils
import datasets
from unidecode import unidecode

# Yoon Kim's tokenization
def my_process(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^\w(),|!?\'\`\:\-\.;\$%#]", " ", string)
    string = re.sub(r"\'s", " is", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", " would", string)
    string = re.sub(r"\'ll", " will", string)
    string = re.sub(r"(?<=\w)\.\.\.", " ... ", string)
    string = re.sub(r"(?<=\w)\.", " . ", string)
    string = re.sub(r"(?<=\w),", " , ", string)
    string = re.sub(r"(?<=\w);", " ; ", string)
    string = re.sub(r"(?<=\w)!", " ! ", string)
    string = re.sub(r"\((?=\w)", " ( ", string)
    string = re.sub(r"(?<=\w)\)", " ) ", string)
    string = re.sub(r"(?<=\w)\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def mixed_score(y_true, y_probs, th):
    y_probs = asarray(y_probs)
    return {
        "precision": y_true[y_probs >= th].sum()*1./(y_probs >= th).sum(),
        "recall": y_true[y_probs >= th].sum()*1./y_true.sum(),
        "f1": f1_score(y_true, (y_probs >= th)*1),
        "auc": roc_auc_score(y_true, y_probs)
    }

# This function chooses the best threshold based on f1 of validation.
def seq_score(model, X, y):
    val_split = model.last_fit_params.get('validation_split', 0.)
    split_at = int(model.last_fit_X.shape[0] * (1. - val_split))
    X_val, y_val = model.last_fit_X[split_at:], model.last_fit_y[split_at:]
    val_probs = model.predict(X_val).flatten()
    thresholds = sorted(unique(val_probs))
    max_f1, best_threshold = 0, 0
    for threshold in thresholds:
        f1 = f1_score(y_val, (val_probs >= threshold)*1)
        if f1 > max_f1:
            max_f1 = f1
            best_threshold = threshold
    return mixed_score(y, model.predict(X).flatten(), best_threshold)
    
# Same as seq_f1 but for graph model
def graph_score(model, data):
    val_split = model.last_fit_params.get('validation_split', 0.)
    split_at = int(model.last_fit_data['output'].shape[0] * (1. - val_split))
    data_val = {k: slice_X(v, split_at) for k, v in model.last_fit_data.items()}
    val_probs = model.predict(data_val)['output'].flatten()
    thresholds = sorted(unique(val_probs))
    max_f1, best_threshold = 0, 0
    for threshold in thresholds:
        f1 = f1_score(data_val['output'], (val_probs >= threshold)*1)
        if f1 > max_f1:
            max_f1 = f1
            best_threshold = threshold
    return mixed_score(data['output'], 
                       model.predict(data)['output'].flatten(), 
                       best_threshold)

def seq_auc(model, X, y):
    preds = model.predict(X).flatten()
    return roc_auc_score(y, preds)
    
def graph_auc(model, data):
    preds = model.predict(data)['output'].flatten()
    return roc_auc_score(data['output'], preds)

seq_eval_f = seq_score
graph_eval_f = graph_score
results = pandas.DataFrame()

Using gpu device 0: Quadro K2000 (CNMeM is disabled, cuDNN 5005)
Using Theano backend.


# Load ADE data set

In [6]:
data_path = "ADE-Corpus-V2/"
texts, labels = [], []
with open(os.path.join(data_path, 'DRUG-AE.rel')) as f:
    for line in f:
        pubmed_id, text = line.strip().split('|')[:2]
        texts.append(unidecode(text.decode('utf-8')))
        labels.append(1)

with open(os.path.join(data_path, 'ADE-NEG.txt')) as f:
    for line in f:
        pubmed_id, neg = line.strip().split(' ')[:2]
        text = ' '.join(line.strip().split(' ')[2:])
        texts.append(unidecode(text.decode('utf-8')))
        labels.append(0)
        
np.random.seed(0)
# Shuffle the data as Keras won't shuffle validation data.
# This can make the training ends early as we are using
# early stop for regularisation.
idx = np.random.permutation(len(labels))        
labels = asarray(labels)[idx]
texts = asarray(texts, dtype='str')[idx]
skf = list(StratifiedKFold(labels, n_folds=10))

# The baselines

In [7]:
import pandas
from sklearn.linear_model import LogisticRegression
from zhang_adr.concept_matching import run_cm
from zhang_adr.maxent_tfidf import run_tfidf
from zhang_adr.maxent_nblcr import run_nblcr
from zhang_adr.maxent_we import run_we
from zhang_adr.tweetnlp import tweet_tagger
from zhang_adr.preprocess import clean_tweet

tokens, tags = tweet_tagger.runtagger_parse(texts)
zhang_clean_texts = []
for token, tag in zip(tokens, tags):
    zhang_clean_texts.append(clean_tweet(token, tag))
zhang_clean_texts = asarray(zhang_clean_texts)

In [None]:
import warnings
warnings.filterwarnings("ignore")

clf = LogisticRegression(class_weight="auto")
bm_results = []
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for i, (train_idx, test_idx) in enumerate(skf, 1):
        print "### Fold {}:".format(i)
        train, test = [], []
        for train_id in train_idx:
            train.append({"id": None, "label": labels[train_id], "text": zhang_clean_texts[train_id]})
        train = pandas.DataFrame(train)
        for test_id in test_idx:
            test.append({"id": None, "label": labels[test_id], "text": zhang_clean_texts[test_id]})
        test = pandas.DataFrame(test)

        result = {}

        y_pred_cm = run_cm(train, test, resource_filename('zhang_adr', 'data/ADR-lexicon.txt'))
        result = mixed_score(test['label'].values, y_pred_cm, 0.5)
        result['model'] = 'CM'
        results = pandas.concat([results, pandas.DataFrame([result])])

        _, y_prob_tfidf = run_tfidf(train, test, grams='123', n_dim=40000, clf=clf)
        result = mixed_score(test['label'].values, asarray(y_prob_tfidf[:, 1]), 0.5)
        result['model'] = 'ME-TFIDF'
        results = pandas.concat([results, pandas.DataFrame([result])])

        _, y_prob_nblcr = run_nblcr(train, test, 'nblcr', grams='123', clf=clf)
        result = mixed_score(test['label'].values, y_prob_nblcr[:, 1], 0.5)
        result['model'] = 'ME-NBLCR'
        results = pandas.concat([results, pandas.DataFrame([result])])

        _, y_prob_we = run_we(train, test, resource_filename('zhang_adr', 'data/w2v_150.txt'), 150, clf=clf)
        result = mixed_score(test['label'].values,  y_prob_we[:, 1], 0.5)
        result['model'] = 'ME-WE'
        results = pandas.concat([results, pandas.DataFrame([result])])

### Fold 1:


In [None]:
results.groupby("model").mean()

# Our methods

## Prepare the embedding and features

In [None]:
w2v = Word2Vec.load_word2vec_format(
    '/home/trung/data/embeddings/glovec/tmp',
    binary=False
)

dim = w2v.layer1_size

In [None]:
from zhang_adr.TextUtility import TextUtility

MOST_FREQUENT_WORDS = 20000
USE_CACHE = False
INCLUDE_UNKNOWN_WORDS = False

docs = [[w for w in TextUtility.text_to_wordlist(text)\
         if INCLUDE_UNKNOWN_WORDS or w in w2v.index2word]\
         for text in zhang_clean_texts]
all_words = Counter([w for doc in docs for w in doc])
top_words = sorted(all_words.items(), key=lambda t: t[1], reverse=True)
top_words = top_words[:MOST_FREQUENT_WORDS]
V = {w:i for i, (w, freq) in enumerate(top_words)}
X = utils.vectorize(docs, V)

# initialize embedding matrix
my_embeddings = np.random.normal(-.25, .25, size=(X.max() + 1, dim))
for w in V:
    if w in w2v:
        my_embeddings[V[w]] = w2v[w]
        
# set embedding of padded character as 0s.
my_embeddings[len(V) + 1] = np.zeros((dim, ))

### CNN

In [None]:
# sequential model
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=0)
scores = utils.seq_cross_validate(
    utils.mk_yk_model_f(X.shape[1], my_embeddings, n_filters=300),
    X, labels, 
    skf, eval_f=seq_eval_f,
    fit_params={
        "callbacks": [early_stopper],
        "validation_split": .1,
        "batch_size": 50
    }, 
    verbose=1)
df = pandas.DataFrame(scores)
model_name = "CNN"
df["model"] = model_name
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()



In [13]:
results.groupby("model").mean()

KeyError: 'model'

### CNN + [Doc features: has_adrs]

In [None]:
# def mk_mixture_model(max_len, embedding, doc_feature_size):
    
#     def mixture_model():
#         graph = Graph()
#         graph.add_input(name='tokens', input_shape=(max_len, ), dtype='int')
#         utils.add_yk_node(graph, 'tokens', max_len, embedding)
#         graph.add_input(name='doc_features', input_shape=(doc_feature_size, ), dtype='int')
#         graph.add_node(Dense(1, activation='sigmoid', W_constraint=maxnorm(9)), 
#                        name='perceptron', inputs=['yk', 'doc_features'])
#         graph.add_output('output', input='perceptron')
#         graph.compile(optimizer='adadelta', loss={'output': 'binary_crossentropy'})
#         return graph
    
#     return mixture_model

# class MyReport(Callback):
    
#     def on_epoch_end(self, logs={}):
#         pass

In [None]:
# # Create a feature that tells whether a tweet contains a phrase in ADR lexicon
# adr_lexicon = datasets.load_ADR_lexicon("/home/trung/data/lexicons/ADR/ADR_lexicon.tsv")
# has_adrs = []
# for text in zhang_clean_texts:
#     has_adr = 0
#     for p in adr_lexicon['phrase']:
#         if p in text:
#             has_adr = 1
#     has_adrs.append(has_adr)                
# has_adrs = asarray(has_adrs).reshape(-1, 1)

# # graph model
# early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=1)
# scores = utils.graph_cross_validate(
#     mk_mixture_model(X.shape[1], my_embeddings, has_adrs.shape[1]),
#     {'tokens': X, 'doc_features': has_adrs, 'output': labels}, 
#     skf, 
#     eval_f=graph_eval_f,
#     fit_params={
#         "callbacks": [early_stopper],
#         "validation_split": .1,
#         "batch_size": 50
#     })
# results["my-cnn-dynamic-embedding-has_adr-inc_unkwn_{}".format(INCLUDE_UNKNOWN_WORDS)] = scores
# mean(scores)

### GRU

In [None]:
# sequential model
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=0)
scores = utils.seq_cross_validate(
    utils.mk_gru_model_f(X.shape[1], my_embeddings),
    X[:, ::-1], labels,
    skf, eval_f=seq_eval_f,
    verbose=0,
    fit_params={
        "callbacks": [early_stopper],
        "validation_split": .1,
        "batch_size": 50
    })
df = pandas.DataFrame(scores)
model_name = "GRU"
df["model"] = model_name
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()



In [None]:
results

### CRNN

In [None]:
# sequential model
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=0)
scores = utils.seq_cross_validate(
    utils.mk_cgru_model_f(X.shape[1], my_embeddings, nb_filter=300, rnn_output=300),
    X[:, ::-1], labels,
    skf, eval_f=seq_eval_f,
    verbose=0,
    fit_params={
        "callbacks": [early_stopper],
        "validation_split": .1,
        "batch_size": 50
    })
df = pandas.DataFrame(scores)
model_name = "CRNN"
df["model"] = model_name
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()

INFO (theano.gof.compilelock): Refreshing lock /home/trung/.theano/compiledir_Linux-2.6-el6.x86_64-x86_64-with-centos-6.6-Final-x86_64-2.7.12-64/lock_dir/lock
INFO (theano.gof.compilelock): Refreshing lock /home/trung/.theano/compiledir_Linux-2.6-el6.x86_64-x86_64-with-centos-6.6-Final-x86_64-2.7.12-64/lock_dir/lock


### RCNN

In [14]:
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=0)
scores = utils.seq_cross_validate(
    utils.mk_rcnn_model_f(X.shape[1], my_embeddings, rnn_output=300, nb_filter=300, filter_length=5),
    X, labels,
    skf, eval_f=seq_eval_f,
    verbose=0,
    fit_params={
        "callbacks": [early_stopper],
        "validation_split": .1,
        "batch_size": 50
    })
df = pandas.DataFrame(scores)
model_name = "RCNN"
df["model"] = model_name
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()

Train on 19046 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Train on 19047 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Train on 19047 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Train on 19047 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 19047 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Train on 19048 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 19048 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 

NameError: name 'result' is not defined

In [15]:
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()

auc          0.922562
f1           0.829314
precision    0.811351
recall       0.887119
dtype: float64

### CNNA

In [21]:
reload(utils)

<module 'utils' from 'utils.py'>

In [23]:
early_stopper = utils.MyEarlyStopping(monitor='val_loss', patience=5, verbose=0)
scores = utils.graph_cross_validate(
    utils.mk_attention_based_model_f(X.shape[1], my_embeddings, attention_l2=0.1),
    {"tokens": X, "output": labels},
    skf,
    eval_f=graph_eval_f,
    verbose=0,
    fit_params={
        "callbacks": [early_stopper],
        "validation_split": .1,
        "batch_size": 50
    })
df = pandas.DataFrame(scores)
model_name = "CNNA"
df["model"] = model_name
results = pandas.concat([results[results["model"] != model_name], df])
df.mean()

Train on 19046 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 19047 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 19047 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 19047 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 19047 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 19048 samples, validate on 2117 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 19048 samples, validate on 2117

auc          0.950869
f1           0.826228
precision    0.815253
recall       0.838439
dtype: float64

### Results

In [16]:
results.groupby("model").count()

Unnamed: 0_level_0,auc,f1,precision,recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CM,11,11,11,11
CNN,10,10,10,10
CNNA,10,10,10,10
CRNN,10,10,10,10
ME-NBLCR,10,10,10,10
ME-TFIDF,10,10,10,10
ME-WE,10,10,10,10
RCNN,10,10,10,10


In [25]:
results.groupby("model").mean()

Unnamed: 0_level_0,auc,f1,precision,recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CM,0.528312,0.463361,0.302339,0.991338
CNN,0.970146,0.865692,0.84542,0.887404
CNNA,0.950869,0.826228,0.815253,0.838439
CRNN,0.956478,0.837676,0.816188,0.861015
ME-NBLCR,0.953812,0.843834,0.905228,0.790352
ME-TFIDF,0.938561,0.796262,0.74277,0.85823
ME-WE,0.760576,0.572706,0.482441,0.704592
RCNN,0.922562,0.829314,0.811351,0.887119


In [None]:
results.to_csv("ADE-results.csv")