In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
### hack tf-keras to appear as top level keras
import sys
sys.modules['keras'] = keras
### end of hack
from keras import layers
from keras.models import Model
import keras.backend as K
import json
import gensim
import os

In [2]:
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import objectpath

In [3]:
home = os.getcwd()
dataDir = home + '/bioasq/data'
modelsDir = home + '/bioasq/models/dan.h5'
embeddingFile = home + '/bioasq/embeddings/pubmed2018_w2v_200D.bin'
trainFilePath = dataDir + '/BioASQ-trainingDataset6b.json'
trainJSON = {}
no_vectors = {}
yes_vectors = {}
missing_vectors = {}
embedding_dimension = 200
labels = {
    "summary": [1, 0, 0, 0],
    "list":    [0, 1, 0, 0],
    "yesno":   [0, 0, 1, 0],
    "factoid": [0, 0, 0, 0]
}
label_dim = 4

In [4]:
# Load data from files
pubmedW2V = gensim.models.KeyedVectors.load_word2vec_format(embeddingFile, binary=True)

In [5]:
with open(trainFilePath) as f:
    trainJSON = json.load(f)

In [6]:
def seperate_punctuation(str):
    pStr = str.lower()
    pStr = " ".join(re.findall(r"[\w']+|[.,!?;'s]", pStr))
    remove_chars = ["'s", "'t", "s'", "'", ",", ".","?", "!"]
    for ch in remove_chars:
        pStr = pStr.replace(ch, "")
    pStr = pStr.replace("  ", " ")
    return pStr

In [7]:
seperate_punctuation("RET, GDNF, EDNRB, EDN3!, ands' SOX10 lead's to't long-segment (L-HSCR)?")

'ret gdnf ednrb edn3  and sox10 lead to long segment l hscr '

In [8]:

def getQuestionembeddingsFromText(qText):
    global no_vectors
    global yes_vectors
    global missing_vectors
    qVect = seperate_punctuation(qText)
    qWords = qVect.split(" ")
    qEmbeddings = np.zeros((len(qWords), embedding_dimension))
    for i, qWord in enumerate(qWords):
        qEmbedding = np.zeros((1, embedding_dimension))
        try:
            qEmbeddings[i:] = pubmedW2V.get_vector(qWord)  
            if qWord not in yes_vectors:
                yes_vectors[qWord] = 0;
            yes_vectors[qWord] += 1
        except:
            if qWord not in no_vectors:
                no_vectors[qWord] = 0;
            if qWord not in missing_vectors:
                missing_vectors[qWord] = np.random.randn(1, embedding_dimension)
            no_vectors[qWord] += 1
            qEmbeddings[i:] = missing_vectors[qWord]        
    qEmbeddings = np.mean(qEmbeddings, axis=0, keepdims=True)
    return qEmbeddings

In [58]:
question_count = len(trainJSON["questions"])
question_embeddings = np.zeros((question_count, 1, embedding_dimension))
question_labels = np.zeros((question_count, embedding_dimension))
questions_data = []
for i, question in enumerate(trainJSON["questions"]):
    embedding = getQuestionembeddingsFromText(question["body"])
    question_embeddings[i:] = embedding
    question_labels[i:] = embedding
    questions_data.append({
        "id": question["id"],
        "embedding": embedding,
        "vector": None
    })
    
print("   ".join(no_vectors.keys()))
n_cnt = len(no_vectors.keys())
y_cnt = len(yes_vectors.keys())
percent = ((n_cnt/(n_cnt+y_cnt)) * 100)
print("\n\nNo vector count % = {}".format(percent))


   turbeculosis   path2ppi   kub5   mesaure   metazaon   sfpr3   archaelogy   secreatase   mirduplexsvm   phoshpatases   bouble   axagliptin   hydroxyisobutyrylation   thyroitoxicosis   itagliptin   hisrchsprung   alterred   dediodinases   μac1   mer41   chédiak   acetylgalactosaminidase   immonostaining   regioner   levoxyl   phopspholamban   mepopausal   6srna   glycolylneuraminic   arcalyst   sqtlseeker   ranasmurfin   menzerath   trigered   woolsorter   pregrancy   subtahalamic


No vector count % = 0.8733624454148471


In [59]:
print(question_embeddings.shape)
print(question_labels.shape)
print(question_labels[:-10])

(2251, 1, 200)
(2251, 200)
[[-0.04032383  0.23722161  0.01415557 ... -0.20252093  0.2247508
  -0.03295899]
 [-0.12237463  0.11521546 -0.09487724 ... -0.23687904 -0.03425544
   0.04332822]
 [ 0.1490044  -0.03180349 -0.04777178 ... -0.32831423  0.13612961
   0.21818388]
 ...
 [ 0.1391815   0.10629099  0.04947067 ... -0.25339634  0.27767004
   0.02496959]
 [ 0.01561616  0.20751076  0.00118731 ... -0.37215197  0.52265794
   0.16852258]
 [-0.03440449  0.06675705 -0.06633284 ... -0.30361671  0.17864662
  -0.01184909]]


In [73]:
def getModel(input_shape):
    X_input = keras.layers.Input(shape=input_shape)
    X = keras.layers.Flatten()(X_input)
    X = keras.layers.Dense(128, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(256, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(128, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(64, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(128, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(256, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(embedding_dimension, activation=tf.nn.relu)(X)
    # X = keras.layers.Dense(label_dim, activation=tf.nn.sigmoid)(X)
    model = keras.Model(inputs=X_input, outputs=X, name='HappyModel')
    return model

In [74]:
model = getModel((question_embeddings.shape[1], question_embeddings.shape[2]))
model.compile(
    optimizer=tf.train.AdamOptimizer(),
    loss='mse',
    metrics=['accuracy']
)

In [75]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 1, 200)            0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 200)               0         
_________________________________________________________________
dense_50 (Dense)             (None, 128)               25728     
_________________________________________________________________
dense_51 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_52 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_53 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_54 (Dense)             (None, 128)               8320      
__________

In [76]:
x_train, x_test, y_train, y_test = train_test_split(question_embeddings, question_labels, test_size=0.2)

In [77]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1800, 1, 200)
(1800, 200)
(451, 1, 200)
(451, 200)


In [83]:
model.fit(x=x_train, y=y_train, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x7f27973b7f98>

In [84]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Loss = %.2f  and test accuracy = %.2f' % (test_loss, test_acc))

Loss = 0.02  and test accuracy = 0.63


In [85]:
def getPenultimateLayerOutput(model, input_embedding):
    index = len(model.layers) - 3
    get_2ndlast_layer_output = K.function([model.layers[0].input],
                                      [model.layers[index].output])
    layer_output = get_2ndlast_layer_output([input_embedding])[0]
    return layer_output

In [86]:
question_vectors = getPenultimateLayerOutput(model, question_embeddings)
for i, qVect in enumerate(question_vectors):
    questions_data[i]["vector"] = qVect

In [87]:
getPenultimateLayerOutput(model, question_embeddings).shape


(2251, 128)

In [88]:
def getQueryVector(qText):
    qEmbedding = getQuestionembeddingsFromText(qText)
    qVector = getPenultimateLayerOutput(model, qEmbedding)
    return qVector

In [92]:
def getTop10MatchingQuestionIdsForQuery(qText):
    qVector = getQueryVector(qText)
    values = []
    dtype = [('id', 'S50'), ('distance', float)]
    for i, question in enumerate(questions_data):
        values.append((
            question["id"],
            cosine_similarity(qVector, question["vector"].reshape((1, 128))).squeeze()
        ))
    distances = np.array(values, dtype=dtype)
    questionid_cosine_distances = np.sort(distances, order=["distance"])
    return questionid_cosine_distances[-10:][::-1]
    

In [134]:
def executeQuery(qText):
    top_matches = getTop10MatchingQuestionIdsForQuery(qText)

    for i, (id, dist) in enumerate(top_matches):
        for question in trainJSON["questions"]:
            if id.decode("utf-8") == question["id"]:
                print("\nActual Question => " + question["body"])
                print("\n\n")
                print(question["ideal_answer"])
                break

In [139]:
# Actual question from data => Which drugs may interfere thyroxine absorption?
# Text input to query       => thyroxine absorption?  (a partial representation of exact query)

executeQuery("thyroxine absorption?")



Actual Question => Which drugs may interfere thyroxine absorption?



['bile acid sequestrants, ferrous sulphate, sucralfate, calcium carbonate, aluminium-containing antacids, phosphate binders, raloxifene and proton-pump inhibitors, have also been shown to interfere with the absorption of levothyroxine\nsevelamer hydrochloride or chromium picolinate should be advised to separate the time of ingestion of these drugs from their thyroid hormone preparation by several hours']

Actual Question => Does metformin interfere thyroxine absorption?



['There are not reported data indicating that metformin interferes with thyroxine absorption']

Actual Question => What are reactive metabolites?



['Reactive metabolites are generated when a small molecule, commonly a drug or hydrocarbon, is broken down in the body. Reactive metabolites can cause cancer and other diseases as well as hepatoxicty. ']

Actual Question => Have thyronamines effects on fat tissue?



['thyronamines cause reduction of 

In [138]:
# Actual question from data => Is the protein Papilin secreted?
# Text input to query       => Papilin secreted?  (a partial representation of exact query)

executeQuery("Papilin secreted?")



Actual Question => Is the protein Papilin secreted?



['Yes,  papilin is a secreted protein']

Actual Question => What is TOPAZ1?



['TOPAZ1 is a novel germ cell-specific expressed gene conserved during evolution across vertebrates. Its PAZ-domain protein is abundantly expressed in the gonads during germ cell meiosis. The expression pattern of TOPAZ1, and its high degree of conservation, suggests that it may play an important role in germ cell development. Further characterization of TOPAZ1 may elucidate the mechanisms involved in gametogenesis, and particularly in the RNA silencing process in the germ line.', 'TOPAZ1 (Testis and Ovary-specific PAZ domain gene 1) is a germ cell specific factor that is essential for male meiotic progression. Topaz1 is supposed to have a role during gametogenesis and may be involved in the piRNA pathway and contribute to silencing of transposable elements and maintenance of genome integrity. It is highly conserved in vertebrates.']

Actual Question =>

In [140]:
# Actual question from data => List signaling molecules (ligands) that interact with the receptor EGFR?"?
# Text input to query       => List signaling molecules (ligands) (a partial representation of exact query)

executeQuery("List signaling molecules (ligands)")



Actual Question => List signaling molecules (ligands) that interact with the receptor EGFR?



['The 7 known EGFR ligands  are: epidermal growth factor (EGF), betacellulin (BTC), epiregulin (EPR), heparin-binding EGF (HB-EGF), transforming growth factor-α [TGF-α], amphiregulin (AREG) and epigen (EPG).']

Actual Question => List receptors of the drug Cilengitide



['Cilengitide binds αvβ3 and αvβ5 integrins. It inhibits attachment and invasion of malignant cells. Thus, cilengitide is being tested for treatment of cancer patients.']

Actual Question => Which signaling pathway is activating the dishevelled proteins?



['Dishevelled (Xdsh) controls cell fate via canonical Wnt signaling']

Actual Question => Which proteins act as histone-like molecules in prokaryotes?



['Prokaryotic histone-like proteins (Hlps) or nucleoid-associated proteins (NAPs) are abundant proteins found in bacterial and plastid nucleoids. HU protein is a small, basic, heat-stable DNA-binding protein that is well