In [1]:
import numpy as np
import pandas as pd
import json
import keras
import keras.backend as K
from keras.layers import Input, Lambda, Dense, Embedding, Bidirectional, LSTM, concatenate, Dropout
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras import metrics
from gensim.models.poincare import PoincareModel
from wikipedia2vec import Wikipedia2Vec

Using TensorFlow backend.


In [2]:
# Fix the random seeds of NumPy and TensorFlow so that runs are repeatable.
# Both are seeded with the same value (1).
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

In [26]:
# NOTE(review): read_pickle executes arbitrary code embedded in the file;
# only load pickles produced by this project.
wiki_df = pd.read_pickle("../../data/all_wiki_sentence_split_words.pkl")

with open("../../data/pageid2ChEBI.json", 'r') as f:
    pageid2ChEBI_table = json.load(f)

# Expand the {page id: [ChEBI, ...]} mapping into one (_id, ChEBI) row per pair.
# The frame is built in a single call instead of growing it with
# DataFrame.append inside a loop (quadratic, and removed in pandas 2.0).
# Explicit column names keep the merge below valid even for an empty mapping.
ChEBI_df = pd.DataFrame(
    [
        (_id, ChEBI)
        for _id, ChEBIs in pageid2ChEBI_table.items()
        for ChEBI in ChEBIs
    ],
    columns=['_id', 'ChEBI'],
)

# Inner join: keeps only the wiki rows whose `_id` has at least one ChEBI entry,
# duplicated once per ChEBI.
train_df = pd.merge(wiki_df, ChEBI_df, on='_id')
print("Number of train rows:", len(train_df))

Number of train rows: 67118


In [4]:
# Pretrained Wikipedia2Vec model used for word vectors.
# (The file name suggests Japanese Wikipedia, 2018-04-20 dump, 300 dimensions — TODO confirm.)
wiki2vec = Wikipedia2Vec.load('../../model/jawiki_20180420_300d.pkl')
# Pretrained gensim Poincaré model; its `.kv` is queried by ChEBI identifier in ontology2vec.
poincare_model = PoincareModel.load("../../model/poincare.model")

In [5]:
# Size of one word vector: used for the zero vector returned for unknown words
# and as the feature dimension of the model's word input.
# Presumably must match the loaded Wikipedia2Vec model — TODO confirm.
WORD_EMBEDDING_DIM = 300
# Size of one ontology vector: used for the zero vector returned for unknown
# ChEBI ids and as the shape of the model's ontology input.
# Presumably must match the loaded Poincaré model — TODO confirm.
ONTOLOGY_EMBEDDING_DIM = 10

In [27]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list.

    Only one level of nesting is removed.
    """
    merged = []
    for sub in l:
        merged.extend(sub)
    return merged

def sentence2vec(s: str):
    """Convert one tokenised sentence into a list of word vectors.

    NOTE(review): the original definition had no body, which made the whole
    cell a syntax error. This body is reconstructed from how the result is
    used: it is applied to the `words` column and later padded to an array
    whose trailing dimension is the word-embedding size.

    Args:
        s: the tokens of one sentence. Assumed to be an iterable of word
            strings (the `words` column); the `str` annotation is kept from
            the original signature — TODO confirm the real input type.

    Returns:
        A list with one vector per token, in token order (see `_w2v`).
    """
    return [_w2v(w) for w in s]


def _w2v(w):
    """Return the Wikipedia2Vec vector of word `w` as a float64 array.

    Words missing from the vocabulary (KeyError) map to a zero vector of
    length WORD_EMBEDDING_DIM.
    """
    try:
        # Copy into a float64 array, as the original list round-trip did, so
        # the result has the same dtype as the np.zeros fallback.
        return np.array(wiki2vec.get_word_vector(w), dtype=np.float64)
    except KeyError:
        return np.zeros(WORD_EMBEDDING_DIM)

def ontology2vec(ChEBI: str):
    '''
    Look up the Poincaré embedding of a ChEBI identifier.

    Args:
        ChEBI: key to look up in `poincare_model.kv`. Missing values
            (None or any float NaN) are accepted.

    Returns:
        The vector stored in `poincare_model.kv` for `ChEBI`, or a zero
        vector of length ONTOLOGY_EMBEDDING_DIM when the value is missing
        or the key is not in the model.
    '''
    # `ChEBI is np.nan` only matches the np.nan singleton itself. NaN values
    # created elsewhere (float('nan'), np.float64 NaN) are different objects,
    # so missing values are detected by value instead of by identity.
    if ChEBI is None or (isinstance(ChEBI, (float, np.floating)) and np.isnan(ChEBI)):
        return np.zeros(ONTOLOGY_EMBEDDING_DIM)

    try:
        return poincare_model.kv[ChEBI]
    except KeyError:
        return np.zeros(ONTOLOGY_EMBEDDING_DIM)

def negative_sampling(train_df, rate):
    """Build negative (sentence, ontology vector) pairs for every `_id` group.

    For each `_id`, `int(len(group) * rate)` rows are drawn at random from
    the rows of `train_df` whose ChEBI is not one of the group's ChEBIs. The
    sampled rows keep their own columns, but `ontology_vec` is overwritten
    with the group's ontology vectors, so each sampled sentence is paired
    with an ontology entry it is not linked to.

    Args:
        train_df: frame with at least `_id`, `ChEBI` and `ontology_vec` columns.
        rate: number of negatives per positive row. An integer reproduces the
            original behaviour exactly; a fractional value is also accepted.

    Returns:
        A DataFrame of the sampled rows with `label` set to False. If
        `train_df` has no groups, the result is empty and has only the
        `label` column.

    Raises:
        ValueError: propagated from `DataFrame.sample` when a group asks for
            more rows than there are candidates (sampling is without
            replacement).
    """
    sampled_frames = []
    for _id, entry in train_df.groupby('_id'):
        n_sample = int(len(entry) * rate)

        # `entry` already contains exactly the rows of this `_id`.
        entry_ChEBI = entry.ChEBI.unique()

        sample_df = train_df.loc[~train_df.ChEBI.isin(entry_ChEBI)].sample(n_sample)

        # Cycle through the group's vectors until there is one per sampled row.
        # For an integer `rate` this equals the group's vector list repeated
        # `rate` times.
        entry_vecs = entry.ontology_vec.tolist()
        sample_df = sample_df.assign(
            ontology_vec=[entry_vecs[i % len(entry_vecs)] for i in range(n_sample)]
        )

        sampled_frames.append(sample_df)

    # Concatenate once instead of calling DataFrame.append per group
    # (quadratic, and removed in pandas 2.0).
    negative_df = pd.concat(sampled_frames) if sampled_frames else pd.DataFrame()

    return negative_df.assign(label=False)

In [28]:
# to vector
# Vectorise every row: the sentence's word vectors and the ChEBI ontology vector.
train_df = train_df.assign(
    words_vec=train_df.words.apply(sentence2vec),
    ontology_vec=train_df.ChEBI.apply(ontology2vec),
)

# Stack the positive rows (label=True) on top of the sampled negatives and keep
# only the columns used for training.
train_df = pd.concat(
    [train_df.assign(label=True), negative_sampling(train_df, rate=5)]
)[['words_vec', 'ontology_vec', 'label']]

# Cache the prepared dataset on disk.
train_df.to_pickle("../../dump/pretrain_data.pkl")

In [3]:
# Reload the prepared dataset from the pickle cache (the same path the
# preprocessing cell writes with to_pickle).
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
train_df = pd.read_pickle("../../dump/pretrain_data.pkl")

In [29]:
# Turn the per-sentence lists of word vectors into one fixed-length float32 array.
# Sentences shorter than `maxlen` are padded at the end ('post'); longer ones
# lose their leading tokens ('pre' truncation).
X_words = pad_sequences(
    train_df.words_vec
    , dtype='float32'
    , padding='post'
    , truncating='pre'
    , maxlen=50
)

# Stack the per-row ontology vectors into a single array.
X_ontology = np.array(train_df.ontology_vec.tolist())

# Training targets: the `label` column (True for positive rows, False for sampled negatives).
y = train_df.label.values

In [30]:
# check dimensions
# Sanity check: the three arrays must agree on their first (sample) dimension.
print(X_words.shape)
print(X_ontology.shape)
print(y.shape)

(402708, 50, 300)
(402708, 10)
(402708,)


In [31]:
# Units of the LSTM that is wrapped in Bidirectional in the model cell.
WORD_LSTM_UNIT = 512
# Width of each of the fully connected layers fc1, fc2 and fc3.
FC_DIM = 128
# Rate passed to every Dropout layer of the model.
DROPOUT_RATE = 0.5

In [32]:
def f1(y_true, y_pred):
    """F1 score computed with Keras backend ops, for use as a Keras metric.

    Precision and recall are derived from counts obtained by clipping the
    inputs to [0, 1] and rounding them, then combined as
    2 * (precision * recall) / (precision + recall). `K.epsilon()` is added
    to every denominator to avoid division by zero.

    Note: the counts are taken over the tensors passed in, so when Keras
    evaluates the metric per batch this is a batch-wise value.
    """
    eps = K.epsilon()

    # Rounded counts of true positives, actual positives and predicted positives.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = hits / (predicted_positives + eps)
    recall_value = hits / (actual_positives + eps)

    return 2*((precision_value*recall_value)/(precision_value+recall_value+eps))

In [33]:
# Two inputs: a variable-length sequence of word vectors and one ontology vector.
word_embeddings = Input(shape=(None, WORD_EMBEDDING_DIM,), dtype='float32')
ontology_embeddings = Input(shape=(ONTOLOGY_EMBEDDING_DIM,), dtype='float32')

# Sentence encoder: bidirectional LSTM over the words, then an element-wise max
# over axis 1 of the returned sequence.
# NOTE(review): no Masking layer is applied, so zero-padded positions also pass
# through the LSTM and the max — confirm this is intended.
l_lstm = Bidirectional(LSTM(WORD_LSTM_UNIT, return_sequences=True))(word_embeddings)
l_max = Lambda(lambda x: K.max(x, axis=1))(l_lstm)
# Join the sentence encoding with the ontology vector.
x = concatenate([l_max, ontology_embeddings])

# Three ReLU layers of width FC_DIM, each preceded by dropout.
x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu', name='fc1')(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu', name='fc2')(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu', name='fc3')(x)

# Single sigmoid output for the binary label.
pred = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[word_embeddings, ontology_embeddings], outputs=pred)

# Binary cross-entropy with Adam; reports binary accuracy and the custom f1 metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[metrics.binary_accuracy, f1])

In [35]:
# Train with label 1 weighted 5x relative to label 0; the preprocessing cell
# samples negatives with rate=5.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, i.e. this run was aborted.
model.fit(x=[X_words, X_ontology], y=y, epochs=50, class_weight={0: 1, 1: 5}, batch_size=1024)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50

KeyboardInterrupt: 

In [13]:
# Train with TensorBoard logging written to ../../log/, then save the model to disk.
# NOTE(review): unlike the other fit call in this notebook, no class_weight is
# passed here — confirm which variant produced the saved model.
tb_cb = keras.callbacks.TensorBoard(log_dir="../../log/")
model.fit(x=[X_words, X_ontology], y=y, epochs=50, batch_size=1024, callbacks=[tb_cb])
model.save("../../model/pretrain_model.h5")