In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
### hack tf-keras to appear as top level keras
import sys
sys.modules['keras'] = keras
### end of hack
from keras import layers
from keras.models import Model
import keras.backend as K
import json
import gensim
import os

In [3]:
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import objectpath

In [4]:
home = os.getcwd()
dataDir = home + '/bioasq/data'
modelsDir = home + '/bioasq/models/dan.h5'
embeddingFile = home + '/bioasq/embeddings/pubmed2018_w2v_200D.bin'
trainFilePath = dataDir + '/BioASQ-trainingDataset6b.json'
trainJSON = {}
no_vectors = {}
yes_vectors = {}
missing_vectors = {}
embedding_dimension = 200
labels = {
    "summary": [1, 0, 0, 0],
    "list":    [0, 1, 0, 0],
    "yesno":   [0, 0, 1, 0],
    "factoid": [0, 0, 0, 0]
}
label_dim = 4

In [5]:
# Load data from files
pubmedW2V = gensim.models.KeyedVectors.load_word2vec_format(embeddingFile, binary=True)

In [6]:
with open(trainFilePath) as f:
    trainJSON = json.load(f)

In [7]:
def seperate_punctuation(str):
    pStr = str.lower()
    pStr = " ".join(re.findall(r"[\w']+|[.,!?;'s]", pStr))
    remove_chars = ["'s", "'t", "s'", "'", ",", ".","?", "!"]
    for ch in remove_chars:
        pStr = pStr.replace(ch, "")
    pStr = pStr.replace("  ", " ")
    return pStr

In [8]:
seperate_punctuation("RET, GDNF, EDNRB, EDN3!, ands' SOX10 lead's to't long-segment (L-HSCR)?")

'ret gdnf ednrb edn3  and sox10 lead to long segment l hscr '

In [9]:

def getQuestionembeddingsFromText(qText):
    global no_vectors
    global yes_vectors
    global missing_vectors
    qVect = seperate_punctuation(qText)
    qWords = qVect.split(" ")
    qEmbeddings = np.zeros((len(qWords), embedding_dimension))
    for i, qWord in enumerate(qWords):
        qEmbedding = np.zeros((1, embedding_dimension))
        try:
            qEmbeddings[i:] = pubmedW2V.get_vector(qWord)  
            if qWord not in yes_vectors:
                yes_vectors[qWord] = 0;
            yes_vectors[qWord] += 1
        except:
            if qWord not in no_vectors:
                no_vectors[qWord] = 0;
            if qWord not in missing_vectors:
                missing_vectors[qWord] = np.random.randn(1, embedding_dimension)
            no_vectors[qWord] += 1
            qEmbeddings[i:] = missing_vectors[qWord]        
    qEmbeddings = np.mean(qEmbeddings, axis=0, keepdims=True)
    return qEmbeddings

In [10]:
question_count = len(trainJSON["questions"])
question_embeddings = np.zeros((question_count, 1, embedding_dimension))
question_labels = np.zeros((question_count, label_dim))
questions_data = []
for i, question in enumerate(trainJSON["questions"]):
    embedding = getQuestionembeddingsFromText(question["body"])
    question_embeddings[i:] = embedding
    question_labels[i:] = labels[question["type"]]
    questions_data.append({
        "id": question["id"],
        "embedding": embedding,
        "vector": None
    })
    
print("   ".join(no_vectors.keys()))
n_cnt = len(no_vectors.keys())
y_cnt = len(yes_vectors.keys())
percent = ((n_cnt/(n_cnt+y_cnt)) * 100)
print("\n\nNo vector count % = {}".format(percent))


   levoxyl   mirduplexsvm   woolsorter   mepopausal   itagliptin   turbeculosis   pregrancy   6srna   axagliptin   dediodinases   alterred   phopspholamban   mer41   path2ppi   mesaure   sfpr3   hydroxyisobutyrylation   ranasmurfin   metazaon   immonostaining   chédiak   secreatase   thyroitoxicosis   hisrchsprung   regioner   bouble   menzerath   kub5   archaelogy   acetylgalactosaminidase   glycolylneuraminic   arcalyst   subtahalamic   μac1   sqtlseeker   trigered   phoshpatases


No vector count % = 0.8733624454148471


In [11]:
print(question_embeddings.shape)
print(question_labels.shape)
print(question_labels[:-10])

(2251, 1, 200)
(2251, 4)
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 ...
 [0. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]


In [12]:
def getModel(input_shape):
    X_input = keras.layers.Input(shape=input_shape)
    X = keras.layers.Flatten()(X_input)
    X = keras.layers.Dense(128, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(256, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(128, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(64, activation=tf.nn.relu)(X)
    X = keras.layers.Dense(label_dim, activation=tf.nn.sigmoid)(X)
    model = keras.Model(inputs=X_input, outputs=X, name='HappyModel')
    return model

In [13]:
model = getModel((question_embeddings.shape[1], question_embeddings.shape[2]))
model.compile(
    optimizer=tf.train.AdamOptimizer(),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1, 200)            0         
_________________________________________________________________
flatten (Flatten)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               25728     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 260       
Total para

In [15]:
x_train, x_test, y_train, y_test = train_test_split(question_embeddings, question_labels, test_size=0.2)

In [16]:
model.fit(x=x_train, y=y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fe443a6fac8>

In [17]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print('Loss = %.2f  and test accuracy = %.2f' % (test_loss, test_acc))

Loss = 0.31  and test accuracy = 0.93


In [18]:
def getPenultimateLayerOutput(model, input_embedding):
    index = len(model.layers) - 2
    get_2ndlast_layer_output = K.function([model.layers[0].input],
                                      [model.layers[index].output])
    layer_output = get_2ndlast_layer_output([input_embedding])[0]
    return layer_output

In [19]:
question_vectors = getPenultimateLayerOutput(model, question_embeddings)
for i, qVect in enumerate(question_vectors):
    questions_data[i]["vector"] = qVect

In [20]:
getPenultimateLayerOutput(model, question_embeddings).shape


(2251, 64)

In [21]:
def getQueryVector(qText):
    qEmbedding = getQuestionembeddingsFromText(qText)
    qVector = getPenultimateLayerOutput(model, qEmbedding)
    return qVector

In [22]:
def getTop10MatchingQuestionIdsForQuery(qText):
    qVector = getQueryVector(qText)
    values = []
    dtype = [('id', 'S50'), ('distance', float)]
    for i, question in enumerate(questions_data):
        values.append((
            question["id"],
            cosine_similarity(qVector, question["vector"].reshape((1, 64))).squeeze()
        ))
    distances = np.array(values, dtype=dtype)
    questionid_cosine_distances = np.sort(distances, order=["distance"])
    return questionid_cosine_distances[-10:][::-1]
    

In [56]:
def executeQuery(qText):
    top_matches = getTop10MatchingQuestionIdsForQuery(qText)

    for i, (id, dist) in enumerate(top_matches):
        for question in trainJSON["questions"]:
            if id.decode("utf-8") == question["id"]:
                print("\nResult #%d\n"%(i+1))
                print("------")
                print(question["ideal_answer"])
                print("\n\n")
                if "snippets" in question:
                    for snippet in question["snippets"]:
                        print(snippet["text"])
                        print("\n")
                    print("=" * 100)
                break

In [57]:
executeQuery("Are long non coding RNAs spliced?")


Result #1

------
['Long non coding RNAs appear to be spliced through the same pathway as the mRNAs']



Our analyses indicate that lncRNAs are generated through pathways similar to that of protein-coding genes, with similar histone-modification profiles, splicing signals, and exon/intron lengths.


For alternative exons and long noncoding RNAs, splicing tends to occur later, and the latter might remain unspliced in some cases.


bosome-mapping data to identify lncRNAs of Caenorhabditis elegans. We found 170 long intervening ncRNAs (lincRNAs), which had single- or multiexonic structures that did not overlap protein-coding transcripts, and about sixty antisense lncRNAs (ancRNAs), which were complementary to protein-coding transcripts


We introduce an approach to predict spliced lncRNAs in vertebrate genomes combining comparative genomics and machine learning.


Owing to similar alternative splicing pattern to mRNAs, the concept of lncRNA genes was put forward to help systematic unders

In [58]:
executeQuery("Is RANKL secreted from the cells?")


Result #1

------
['Receptor activator of nuclear factor κB ligand (RANKL) is a cytokine predominantly secreted by osteoblasts.']



Osteoprotegerin (OPG) is a soluble secreted factor that acts as a decoy receptor for receptor activator of NF-κB ligand (RANKL) 


Osteoprotegerin (OPG) is a secreted glycoprotein and a member of the tumor necrosis factor receptor superfamily. It usually functions in bone remodeling, by inhibiting osteoclastogenesis through interaction with a receptor activator of the nuclear factor κB (RANKL).


e RANKL/OPG ratio secreted by osteoblasts increased and RANK expression by osteoclasts increased, leading to increased osteoclastogenesis


Osteoprotegerin (OPG) is an essential secreted protein in bone turnover due to its role as a decoy receptor for the Receptor Activator of Nuclear Factor-kB ligand (RANKL) in the osteoclasts, thus inhibiting their differentiation


We identify a TNFSF11 transcript variant that extends the originally identified transcript enco

In [61]:
executeQuery("Does metformin interfere thyroxine absorption?")



Result #1

------
['There are not reported data indicating that metformin interferes with thyroxine absorption']




Result #2

------
['Myasthenia gravis (MG) is a neuromuscular disease which has been associated with an increased risk of glucocorticoid-induced osteoporosis. Thymectomy can also increase risk for osteoporosis. Appropriate osteoporosis preventive measures can reduce osteoporosis risk in MG patients.']



We performed PVP in 4 patients with generalized MG associated with recent steroid-induced symptomatic VFs. 


In this case report, we used tacrolimus to successfully treat a 13-year-old boy with ocular MG who had suffered from severe steroid complications, including a failure of thrive and osteoporosis.


 INTRODUCTION: Myasthenia gravis (MG) is a neuromuscular disease which has been associated with an increased falls risk and glucocorticoid-induced osteoporosis, recognized determinants of increased fracture risk. 


RESULTS: Compared to the control cohort, there was no

In [62]:
executeQuery("Which miRNAs could be used as potential biomarkers for ovarian cancer?")


Result #1

------
['Isolated ectopia lentis (EL) is caused by mutation in genes:\n1) ADAMTSL4 and \n2) Fibrillin-1 (FBN1).']



A founder mutation in ADAMTSL4 causes early-onset bilateral ectopia lentis among Jews of Bukharian origin.


The term isolated ectopia lentis (EL; subluxation or dislocation of the human crystalline lens) is applied to patients with EL, without skeletal features and in the absence of aortic root dilatation. To date, the only gene shown to cause autosomal-recessive isolated EL is ADAMTSL4. Here we report a novel founder mutation in ADAMTSL4 gene in children of Bukharian Jewish origin presenting with early-onset bilateral EL. 


Early onset ectopia lentis due to a FBN1 mutation with non-penetrance.


Isolated ectopia lentis is usually autosomal dominant and commonly due to the mutations of FBN1 gene. 


In conclusion, we report on a case of early-onset autosomal dominant isolated ectopia lentis caused by FBN1 mutation that has previously been reported only in M