In [1]:
import sys
sys.path.append('..\\')
import nltk
import os
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda, Reshape
from keras.optimizers import Adam, RMSprop
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np
import re
import pickle
import tensorflow as tf
from scripts.attention_utils import softmax
from scripts.attention_preprocessing import load_data, transform_data
from scripts.data_loader import DataGenerator

Using TensorFlow backend.


In [2]:
# vocab_to_int, int_to_vocab = load_data("..//data//train.txt", firstn=0)

In [3]:
# with open("..\\data\\dicts", 'wb') as file:
#     pickle.dump((vocab_to_int, int_to_vocab), file)

In [4]:
# Load the cached vocabulary mappings (character -> index and index -> character).
# The path is built with os.path.join so the notebook is not tied to Windows separators.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(os.path.join("..", "data", "dicts"), 'rb') as file:
    vocab_to_int, int_to_vocab = pickle.load(file)

## Build a Model

In [5]:
Tx = 200 # input sequence length (time steps fed to the encoder)
Ty = 200 # output sequence length (number of decoder steps)
vocab_size = len(vocab_to_int) # vocabulary size: number of entries in vocab_to_int
n_a = 32 # units of each directional LSTM in the encoder (Bi-LSTM output is 2*n_a wide)
n_s = 64 # units of the post-attention (decoder) LSTM

### Attention mechanism

In [6]:
# Attention layers are created once at module level so that every call of
# one_step_attention reuses the same layer objects (and therefore the same weights).
repeat_layer = RepeatVector(Tx)  # repeats the decoder state Tx times
concatenate_layer = Concatenate(axis=-1)  # joins encoder states and repeated decoder state on the feature axis
dense_layer_1 = Dense(10, activation = "tanh")  # intermediate energies
dense_layer_2 = Dense(1, activation = "relu")  # one scalar energy per encoder position
# `softmax` is the helper imported from scripts.attention_utils (presumably normalises over the time axis — confirm)
activation_layer = Activation(softmax, name='attention_weights')
dot_prod_layer = Dot(axes = 1)  # contracts attention weights with encoder states along axis 1

In [7]:
def one_step_attention(a, s_prev):
    """
    Runs a single attention step and returns the context vector for the next
    post-attention LSTM call.

    The previous decoder state is repeated along the time axis, concatenated with the
    encoder hidden states, scored by a small two-layer network, turned into attention
    weights by `activation_layer`, and finally used to take a weighted combination of
    the encoder states.

    Args:
    a (tensor): hidden states of the Bi-LSTM encoder, shape (?, Tx, 2*n_a)
    s_prev (tensor): previous hidden state of the post-attention LSTM, shape (?, n_s)

    Returns:
    context (tensor): context vector, input of the next post-attention LSTM call
    """
    # Repeat the decoder state Tx times so it lines up with every encoder position
    s_repeated = repeat_layer(s_prev)
    # Pair each encoder state with the decoder state along the feature axis
    paired = concatenate_layer([a, s_repeated])
    # Two small dense layers produce one energy per encoder position
    energies = dense_layer_2(dense_layer_1(paired))
    # Attention weights ("alphas") over the encoder positions
    alphas = activation_layer(energies)
    # Weighted combination of encoder states
    return dot_prod_layer([alphas, a])

### Define model for training phase

In [8]:
# Decoder layers shared by the training graph and the one-step inference model.
reshape_layer = Reshape((1, vocab_size))  # adds a length-1 time axis to a (?, vocab_size) slice
concatenate_layer_1 = Concatenate(axis=-1)  # joins the previous-symbol input with the attention context
post_activation_LSTM_cell = LSTM(n_s, return_state = True)  # returns output plus hidden and cell states
output_layer = Dense(vocab_size, activation=softmax)  # per-step distribution over the vocabulary

In [9]:
# Create encoder part of the model: one-hot input -> Bi-LSTM hidden states for every time step
X = Input(shape=(Tx, vocab_size), name='X')
a = Bidirectional(LSTM(n_a, return_sequences=True))(X)  # a.shape = (?, Tx, 2*n_a)

In [10]:
# Additional inputs of the training graph: initial decoder states and the
# per-step symbols fed into the decoder alongside the attention context
# (presumably the ground-truth targets used for teacher forcing — confirm against DataGenerator).
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
Y_true = Input(shape=(Ty, vocab_size), name='Y_true')
s = s0
c = c0

# One output tensor per decoder step
outputs = []

for t in range(Ty):

    # Perform one step of the attention mechanism to get back the context vector at step t
    context = one_step_attention(a, s) # context.shape  = (?, 1, 2*n_a)
    # Bind t as a default argument: a plain `lambda x: x[:, t, :]` looks t up when it is
    # called, so any later re-invocation of the layer would slice the last step only.
    y_true = Lambda(lambda x, t=t: x[:, t, :])(Y_true) # y_true.shape = (?, vocab_size)
    y_true = reshape_layer(y_true) # y_true.shape = (?, 1, vocab_size)
    context = concatenate_layer_1([y_true, context])
    # Apply the post-attention LSTM cell to the "context" vector.
    s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])

    # Apply Dense layer to the hidden state output of the post-attention LSTM
    out = output_layer(s)

    outputs.append(out)

training_model = Model(inputs=[X, s0, c0, Y_true], outputs=outputs)

In [None]:
# Adam with a small learning rate, learning-rate decay and gradient-norm clipping
opt = Adam(0.0001, decay=0.001, clipnorm=10.0)
training_model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Sample identifiers "0" .. "21999"; the same list is passed for both arguments of DataGenerator
filenames = list(map(str, range(22000)))
generator = DataGenerator(filenames, filenames)

In [None]:
# Train for one epoch of 500 generator batches, using 4 workers
training_model.fit_generator(
    generator=generator,
    steps_per_epoch=500,
    epochs=1,
    workers=4,
    shuffle=True,
)

In [11]:
# Restore previously saved weights into the training model.
# NOTE: this replaces whatever weights the model currently holds, so running it
# right after training discards the freshly trained weights.
# The path is built with os.path.join so it does not depend on Windows separators.
training_model.load_weights(os.path.join("..", "models", "attention_model", "attention.h5"))

In [None]:
# Persist the current weights of the training model.
# The path is built with os.path.join so it does not depend on Windows separators.
training_model.save_weights(os.path.join("..", "models", "attention_model", "attention.h5"))

### Define model for inference

In [14]:
# Stand-alone encoder reusing the Bi-LSTM of the training graph: maps X to the hidden-state sequence a
encoder = Model(inputs=[X], outputs=[a])

In [15]:
def one_step_inference_model(vocab_size):
    """
    Builds the decoder model that advances decoding by exactly one step.

    The model reuses the shared attention / LSTM / output layers and the
    notebook-level `s0` and `c0` inputs as its state inputs.

    Args:
    vocab_size (int): number of distinct characters for output
    Returns:
    inference_model (Model): keras model taking [encoder states, s, c, previous symbol]
        and returning [output distribution, new s, new c]
    """
    encoder_states = Input(shape=(Tx, 2*n_a), name='a')
    Y_prev = Input(shape=(1, vocab_size), name='Y_inf')

    # Attention context for this step, joined with the previously emitted symbol
    attended = one_step_attention(encoder_states, s0)
    decoder_input = concatenate_layer_1([Y_prev, attended])
    s_next, _, c_next = post_activation_LSTM_cell(decoder_input, initial_state=[s0, c0])

    out = output_layer(s_next)

    return Model(inputs=[encoder_states, s0, c0, Y_prev], outputs=[out, s_next, c_next])

In [16]:
# NOTE(review): no code visible in this notebook reads this global; beam_search takes the
# width as an argument and predict() passes 10 — confirm whether this value is still needed.
beam_width = 5
# One-step decoder used by beam_search
inf_model = one_step_inference_model(vocab_size) 

In [25]:
def beam_search(X, beam_width, vocab_size, Ty, n_s):
    """
    Decodes a batch of sentences with beam search over the one-step inference model.

    Relies on the notebook-level `encoder` and `inf_model` models and on the
    `reshape_indices` / `choose_activation` helpers.

    Args:
    X (numpy.ndarray): sentences in a form of numpy arrays, X.shape = (m, Tx, vocab_size)
    beam_width (int): number of best candidates kept per sentence at every step
    vocab_size (int): number of distinct characters for output
    Ty (int): number of time steps to perform
    n_s (int): number of neurons in post attention LSTM
    Returns:
    characters (numpy.ndarray): float array of shape (m, Ty) holding, for every sentence,
        the vocabulary index chosen at each step along the best-scoring beam
    """
    def _one_hot_inputs(symbols):
        # symbols.shape = (m, beam_width) -> (m * beam_width, 1, vocab_size), rows in beam-major order
        return np.expand_dims(np.concatenate([to_categorical(x, vocab_size)
                                              for x in symbols.swapaxes(0, 1)], axis=0), axis=-2)

    m = X.shape[0] # batch_size
    sample_ids = np.arange(m)
    row_selector = sample_ids[:, np.newaxis] # pairs every sample with its own top-k columns

    a = encoder.predict(x=[X])

    # perform first prediction from all-zero states and an all-zero previous symbol
    out, s, c = inf_model.predict([a, np.zeros((m, n_s)), np.zeros((m, n_s)), np.zeros((m, 1, vocab_size))])

    # replicate encoder outputs and decoder states once per beam;
    # rows are in beam-major order: row = beam * m + sample
    a = np.concatenate([a for _ in range(beam_width)])
    s = np.concatenate([s for _ in range(beam_width)]) # s.shape = (m * beam_width, n_s)
    c = np.concatenate([c for _ in range(beam_width)])

    # back-pointers (which previous beam a candidate extends) and the symbol chosen by each beam
    prev_symbol_index = np.zeros((m, Ty, beam_width), dtype=int)
    curr_symbol = np.zeros((m, Ty, beam_width), dtype=int)

    # Take the scores from the argpartition indices themselves instead of a separate
    # np.partition call, so scores and symbols are guaranteed to stay aligned.
    top_symbols = np.argpartition(out, out.shape[-1] - beam_width, axis=-1)[:, -beam_width:]
    prob = np.log(out[row_selector, top_symbols]) # prob.shape = (m, beam_width)
    curr_symbol[:, 0, :] = top_symbols
    Y = _one_hot_inputs(curr_symbol[:, 0, :]) # Y.shape = (m * beam_width, 1, vocab_size)

    for t in range(1, Ty):
        # run predictions for all candidates
        out_temp, s_temp, c_temp = inf_model.predict([a, s, c, Y]) # out_temp.shape = (m * beam_width, vocab_size)

        # regroup rows from beam-major order to one (beam_width, vocab_size) slab per sample
        per_sample = np.reshape(out_temp[reshape_indices(m, beam_width), :], (m, beam_width, vocab_size))

        # accumulated log-probability of every (symbol, previous beam) extension;
        # flattened column index = symbol * beam_width + previous beam
        scores = np.log(per_sample).swapaxes(1, 2) + prob[:, np.newaxis, :] # (m, vocab_size, beam_width)
        scores = np.reshape(scores, (m, vocab_size * beam_width))

        # choose top beam_width candidates
        indices = np.argpartition(scores, scores.shape[-1] - beam_width, axis=-1)[:, -beam_width:]
        prob = scores[row_selector, indices] # prob.shape = (m, beam_width)
        prev_symbol_index[:, t, :] = indices % beam_width
        curr_symbol[:, t, :] = indices // beam_width

        # prepare next inputs: pick, for every new beam, the states of the beam it extends
        input_indices = choose_activation(prev_symbol_index[:, t, :].swapaxes(0, 1).flatten(), m)
        s = s_temp[input_indices, :]
        c = c_temp[input_indices, :]
        Y = _one_hot_inputs(curr_symbol[:, t, :])

    # find output of the beam search: start from the best final beam and follow the back-pointers
    characters = np.zeros((m, Ty))
    index = np.argmax(prob, axis=-1)
    characters[:, Ty - 1] = curr_symbol[sample_ids, Ty - 1, index] # one symbol per sample

    for i in range(Ty - 2, -1, -1):
        index = prev_symbol_index[sample_ids, i + 1, index]
        characters[:, i] = curr_symbol[sample_ids, i, index]

    return characters
    
    

In [117]:
# NOTE(review): `predict` and `text` are defined in cells that appear further down in this
# notebook; those cells must have been run before this one.
prediction = predict(text, vocab_to_int, int_to_vocab)

In [118]:
def reshape_indices(a, b):
    """
    Returns the row order that turns a layout of b groups of a rows each
    (row = i + a*j) into one where the b entries of every i are adjacent.

    Args:
    a (int): number of rows per group
    b (int): number of groups
    Returns:
    order (list): list of a*b row indices
    """
    order = []
    for i in range(a):
        for j in range(b):
            order.append(i + a*j)
    return order
reshape_indices(3, 2)

[0, 3, 1, 4, 2, 5]

In [119]:
def choose_activation(indices, m):
    """
    Maps each entry of `indices` to the row m * entry + (position mod m).

    Args:
    indices (sequence of int): one value per position
    m (int): modulus applied to the position and multiplier applied to the value
    Returns:
    rows (list): computed row index for every position
    """
    rows = []
    for position, value in enumerate(indices):
        rows.append(m * value + position % m)
    return rows

In [120]:
# Spot-check of the index-to-character mapping
int_to_vocab[1]

'h'

In [121]:
"".join(list(map(lambda x: int_to_vocab[x], prediction[2])))

'This paper discribes the design and evaluation of Positive Outlook an online program aiming to enhance the selfmanagement skills of gay men living with .<EOS><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>'

### Make an inference

In [115]:
# Sample input passed to predict().
# NOTE(review): the misspellings in this text ("emergencee", "djscribes") look like
# deliberate noise for the model to handle — do not correct them without confirming.
text = """The emergencee of HIV as a chronic condition means that people living with HIV are required to take more responsibility for the self-management of their condition , including making physical , emotional and social adjustments .
BACKGROUND	This paper djscribes the design and evaluation of Positive Outlook , an online program aiming to enhance the self-management skills of gay men living with HIV .
METHODS	This study is designed as a randomised controlled trial in which men living with HIV in Australia will be assigned to either an intervention group or usual care control group .
METHODS	The intervention group will participate in the online group program ` Positive Outlook ' ."""

In [89]:
def predict(text, vocab_to_int, int_to_vocab, beam_width=10):
    """
    Encodes raw text and decodes it with beam search.

    Uses the notebook-level `vocab_size`, `Ty` and `n_s` settings.

    Args:
    text (str): raw input text
    vocab_to_int (dict): mapping passed to transform_data to encode the text
    int_to_vocab (dict): inverse mapping; currently unused, kept so existing calls keep working
    beam_width (int): number of candidates kept per step by beam_search (default 10)
    Returns:
    prediction (numpy.ndarray): output of beam_search, one row of vocabulary indices per input sequence
    """
    # presumably transform_data returns integer-encoded sequences of length Tx — confirm in scripts.attention_preprocessing
    data = transform_data(text, vocab_to_int)
    data = to_categorical(data, num_classes=vocab_size)

    return beam_search(data, beam_width, vocab_size, Ty, n_s)
