## DeepQ environment setting

In [None]:
import sys
# Replace the first module-search entry with the DeepQ user's site-packages
# directory (presumably so packages installed there are picked up first).
# NOTE(review): the absolute path is machine-specific, and assigning to
# sys.path[0] discards whatever entry was there before — confirm this is
# still wanted outside the DeepQ environment.
sys.path[0] = '/home/deepq/.local/lib/python3.6/site-packages'

## Code

In [4]:
from __future__ import print_function

from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Activation, TimeDistributed
from keras.layers import dot, concatenate
import keras.backend as K
import numpy as np
import pandas as pd
from collections import defaultdict

batch_size = 10 # Batch size for training (rows per batch yielded by data_generator).
epochs = 10 # Number of epochs to train for.
word_size = 30000 # Size of the dictionary (passed to Word2Vec as max_final_vocab).
max_seq_len = 30 # Max tokens per sentence; longer training pairs are skipped.
latent_dim = 256 # Latent dimensionality of the encoding space (LSTM units; also the Embedding output size).
num_samples = 1000 # Number of samples to train on.
# Paths to the training / test CSV files on disk.
TRAIN_FILE = 'hw3_1/rhyme/train.csv'
TEST_FILE = 'hw3_1/rhyme/test.csv'

In [5]:
# Read the (input, target) sentence pairs; the CSV has no header row.
df = pd.read_csv(TRAIN_FILE, header=None, names=['X', 'Y'])

input_texts = []
target_texts = []

# Walk the two columns in lockstep instead of doing a label lookup
# (df['X'][index]) for every row.
for input_text, target_text in zip(df['X'], df['Y']):
    # Sentences are tokenised on single spaces and otherwise kept verbatim.
    # NOTE(review): presumably the rows already carry their 'SOS' / 'EOS'
    # marker tokens (decode_sequence relies on both) — confirm against the data.
    input_text, target_text = input_text.split(' '), target_text.split(' ')
    # Skip pairs where either side exceeds the maximum sequence length.
    if len(input_text) > max_seq_len or len(target_text) > max_seq_len:
        continue
    input_texts.append(input_text)
    target_texts.append(target_text)
    # Stop as soon as enough samples have been collected.
    if len(input_texts) >= num_samples:
        break

In [6]:
from gensim.models import KeyedVectors

# Load the pre-trained word vectors used to build the embedding matrix.
# mmap='r' asks gensim to memory-map the stored arrays read-only rather than
# loading them fully into RAM.
word_vectors = KeyedVectors.load('embeddings/rhyme.w2v', mmap='r')

In [7]:
# Index 0 is reserved for padding and index 1 for out-of-vocabulary words;
# real words are numbered from 2 in the order the word-vector vocab yields them.
words_index = {'PAD': 0, 'UNK': 1}
words_list = [(word, word_vectors[word]) for word in word_vectors.vocab]

# Row i of the matrix holds the vector of the word whose index is i.
embeddings_matrix = np.zeros((len(words_list) + 2, word_vectors.vector_size))
for i, (word, vector) in enumerate(words_list, start=2):
    words_index[word] = i
    embeddings_matrix[i] = vector

# UNK vector = average word vector.
# Average only the real word rows: including the all-zero PAD and UNK rows
# would shrink the mean by a factor of n / (n + 2).
if words_list:
    embeddings_matrix[1] = np.mean(embeddings_matrix[2:], axis=0)

# Inverse mapping (index -> word) used to turn predictions back into text.
reverse_words_index = {i: word for word, i in words_index.items()}

print('Number of samples:', len(input_texts))
print('Number of unique tokens:', len(words_index))
print('Max sequence length:', max_seq_len)

Number of samples: 1000
Number of unique tokens: 29176
Max sequence length: 30


In [8]:
def padding(x):
    """Right-pad every sequence in ``x`` with zeros up to the longest length.

    Args:
        x: A collection of sequences (lists, tuples, ...) of token ids.

    Returns:
        A new list of lists, all of the same length. An empty ``x`` yields
        ``[]`` instead of raising ``ValueError`` from ``max()``.
    """
    rows = [list(seq) for seq in x]
    # default=0 keeps max() from failing when there is nothing to pad.
    maxlen = max((len(row) for row in rows), default=0)
    return [row + [0] * (maxlen - len(row)) for row in rows]

def id2str(ids):
    """Translate a sequence of vocabulary indices into the matching words.

    Each id is looked up in the module-level ``reverse_words_index``; an
    unknown id raises ``KeyError``.
    """
    words = []
    for token_id in ids:
        words.append(reverse_words_index[token_id])
    return words

def onehot2str(onehots):
    """Translate a sequence of one-hot (or score) rows into words.

    For every row the position of the largest entry is taken with
    ``np.argmax`` and looked up in the module-level ``reverse_words_index``.
    """
    words = []
    for row in onehots:
        words.append(reverse_words_index[np.argmax(row)])
    return words

def data_generator():
    """Yield training batches forever, walking the dataset in order.

    Reads the module-level ``input_texts``, ``target_texts``, ``words_index``,
    ``batch_size`` and ``max_seq_len``.

    Yields:
        ``[[encoder_input_data, decoder_input_data], decoder_target_data]``:
        - encoder_input_data: int array (batch_size, max_seq_len) of word ids
          of the input sentences, zero-padded on the right.
        - decoder_input_data: same layout for the target sentences.
        - decoder_target_data: bool one-hot array
          (batch_size, max_seq_len, len(words_index)); it is the decoder input
          shifted one step to the left, so the first target token is dropped.

    Raises:
        ValueError: if fewer than ``batch_size`` sentence pairs are available.
            Without this check the loop would spin forever and never yield.
    """
    n_pairs = min(len(input_texts), len(target_texts))
    if batch_size < 1 or n_pairs < batch_size:
        raise ValueError(
            'data_generator needs batch_size >= 1 and at least batch_size '
            'sentence pairs (batch_size=%d, pairs=%d)' % (batch_size, n_pairs))

    unk_id = words_index['UNK']

    def new_batch():
        # Builtin int / bool are what the np.int / np.bool aliases stood for;
        # the aliases themselves were removed in NumPy 1.24.
        return (
            np.zeros((batch_size, max_seq_len), dtype=int),
            np.zeros((batch_size, max_seq_len), dtype=int),
            np.zeros((batch_size, max_seq_len, len(words_index)), dtype=bool),
        )

    while True:
        # Starting a new pass discards any partial batch left over from the
        # previous one, so every yielded batch is full.
        cnt = 0
        encoder_input_data, decoder_input_data, decoder_target_data = new_batch()

        for input_text, target_text in zip(input_texts, target_texts):
            # Tokens beyond max_seq_len are dropped; unknown words map to UNK.
            for t, word in enumerate(input_text[:max_seq_len]):
                encoder_input_data[cnt, t] = words_index.get(word, unk_id)
            for t, word in enumerate(target_text[:max_seq_len]):
                word_id = words_index.get(word, unk_id)
                decoder_input_data[cnt, t] = word_id
                if t > 0:
                    # decoder_target_data is ahead of decoder_input_data by
                    # one timestep and does not include the start token.
                    decoder_target_data[cnt, t - 1, word_id] = 1
            cnt += 1
            if cnt >= batch_size:
                yield [[encoder_input_data, decoder_input_data], decoder_target_data]
                # Fresh arrays so the batch just handed out is never mutated.
                cnt = 0
                encoder_input_data, decoder_input_data, decoder_target_data = new_batch()

def get_testing_data(test_size):
    """Encode the first ``test_size`` sentences of ``test_texts`` as word ids.

    Reads the module-level ``test_texts``, ``words_index`` and ``max_seq_len``.

    Args:
        test_size: Number of rows in the returned array.

    Returns:
        An int array of shape (test_size, max_seq_len). Each row holds the
        word ids of one sentence, truncated to ``max_seq_len`` and zero-padded
        on the right; unknown words map to the 'UNK' id. If ``test_texts`` has
        fewer than ``test_size`` sentences the remaining rows stay all zero,
        so the function always returns an array and never ``None``.
    """
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    X_test = np.zeros((test_size, max_seq_len), dtype=int)
    unk_id = words_index['UNK']

    for row, test_text in enumerate(test_texts[:test_size]):
        for t, word in enumerate(test_text[:max_seq_len]):
            X_test[row, t] = words_index.get(word, unk_id)
    return X_test
                
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a list of words.

    Uses the module-level ``encoder_model`` / ``decoder_model`` and assumes a
    batch of size 1.

    Args:
        input_seq: Encoder input for a single sentence, passed straight to
            ``encoder_model.predict``.

    Returns:
        The generated words in order. Decoding stops right after 'EOS' is
        produced (it is included in the result) or once more than
        ``max_seq_len`` words have been generated.
    """
    # Encode the input; the encoder output seeds the decoder state.
    decoder_states = encoder_model.predict(input_seq)

    # The decoder is fed one token per step, beginning with the start token.
    next_input = np.zeros((1, 1))
    next_input[0, 0] = words_index['SOS']

    words = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [next_input] + decoder_states)

        # Greedy choice: the most probable token at the last timestep.
        best_index = np.argmax(output_tokens[0, -1, :])
        best_word = reverse_words_index[best_index]
        words.append(best_word)

        # Finished on the stop token or when the length limit is exceeded.
        if best_word == 'EOS' or len(words) > max_seq_len:
            return words

        # Feed the chosen token and the new states into the next step.
        next_input = np.zeros((1, 1))
        next_input[0, 0] = best_index
        decoder_states = [h, c]

In [None]:
# Pull a single batch out of the generator so its contents can be inspected.
X, Y = next(data_generator())
encoder_input_data, decoder_input_data = X[0], X[1]
decoder_target_data = Y

In [None]:
# Sanity check: decode one sample of the batch back into words.
index = 0
print(id2str(encoder_input_data[index]))
print(id2str(decoder_input_data[index]))
# Target rows are one-hot; an all-zero (padding) row decodes to index 0.
print(onehot2str(decoder_target_data[index]))

## Model

### No attention

In [None]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
embed = Embedding(len(embeddings_matrix), 
                  latent_dim,
                  weights=[embeddings_matrix],
                  trainable=False,
                  mask_zero=True)
encoder_embed = embed(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embed)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
decoder_embed = embed(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embed, initial_state=encoder_states)
decoder_dense = Dense(len(words_index), activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()

In [None]:
# Restore previously saved weights into the current `model`
# (the file must have been written by a model with the same architecture).
model.load_weights('models/s2s_no_attn_all_2.h5')

In [None]:
# Save the current `model` as a single HDF5 file.
model.save('models/s2s_report.h5')

### Attention

In [None]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
embed = Embedding(len(embeddings_matrix), 
                  latent_dim,
                  weights=[embeddings_matrix],
                  trainable=False,
                  mask_zero=True)
encoder_embed = embed(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_sequences=True, return_state=True)(encoder_embed)
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
decoder_embed = embed(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embed, initial_state=encoder_states)
# decoder_dense = Dense(len(words_index), activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)

# Attention layer
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
attention = Activation('softmax')(attention)

context = dot([attention, encoder_outputs], axes=[2,1])
decoder_combined_context = concatenate([context, decoder_outputs])

# Has another weight + tanh layer as described in equation (5) of the paper
output = TimeDistributed(Dense(64, activation="tanh"))(decoder_combined_context) # equation (5) of the paper
output = TimeDistributed(Dense(len(words_index), activation="softmax"))(output) # equation (6) of the paper

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], output)
model.summary()

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Render the layer graph of the current `model` inline as an SVG
# (uses the Graphviz `dot` program).
SVG(model_to_dot(model).create(prog='dot', format='svg'))

### Train model

In [None]:
# Checkpoint callback; it is only active if the `callbacks` line below is
# uncommented.
cp = ModelCheckpoint('s2s.h5', save_weights_only=True)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# One epoch = one pass of data_generator() over the pairs that were actually
# kept in `input_texts`. Counting rows of `df` instead would overshoot whenever
# over-long pairs were filtered out, so epochs would straddle dataset passes.
history = model.fit_generator(data_generator(),
                              steps_per_epoch=len(input_texts) // batch_size,
#                               callbacks=[cp],
                              epochs=epochs)
# Save model
# model.save_weights('tmp/s2s_weights.h5')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the training loss recorded in `history` by the training cell.
plt.plot(history.history['loss'])
plt.show()

## Get testing data

In [9]:
# Read the test sentences (single column, no header row).
test_df = pd.read_csv(TEST_FILE, header=None, names=['X'])
# Tokenise every sentence on single spaces. Iterating the column directly
# avoids a label lookup (test_df['X'][index]) for each row.
test_texts = [test_text.split(' ') for test_text in test_df['X']]

In [10]:
# Encode every test sentence as a row of word ids.
X_test = get_testing_data(len(test_texts))

## Inference

In [None]:
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states
#
# NOTE(review): this cell reuses `encoder_inputs`, `encoder_states`, `embed`,
# `decoder_inputs`, `decoder_lstm` and `decoder_dense` from whichever model
# cell ran last. `decoder_dense` is only assigned in the "No attention" cell
# (it is commented out in the "Attention" cell), so running this after the
# attention cell would combine layers from two different models — confirm the
# intended execution order.

# Define sampling models
# Encoder: input sentence -> [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder: (previous token, previous states) -> (token scores, new states).
# The states are explicit inputs so they can be fed back step by step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embed = embed(decoder_inputs)
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embed, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# encoder_model.save('encoder.h5')
# encoder_model.save_weights('encoder_w.h5')
# decoder_model.save('decoder.h5')
# decoder_model.save_weights('decoder_w.h5')

In [11]:
from keras.models import load_model

# Load saved inference models from disk. This rebinds `encoder_model` and
# `decoder_model`, replacing any that were built in the preceding cell.
encoder_model = load_model('models/no_attn_rhyme_encoder.h5')
decoder_model = load_model('models/no_attn_rhyme_decoder.h5')



In [None]:
# Alternative to loading whole models: restore saved weights into the
# existing `encoder_model` / `decoder_model` objects.
encoder_model.load_weights('models/no_attn_all_encoder_w.h5')
decoder_model.load_weights('models/no_attn_all_decoder_w.h5')

In [12]:
from tqdm import tqdm

output_filename = 'results/no_attn_rhyme.csv'

# Write one decoded sentence per line. The encoding is fixed to UTF-8 so the
# output does not depend on the platform's locale.
with open(output_filename, 'w', encoding='utf-8') as f:
    for seq_index in tqdm(range(len(X_test))):
        # Decode one test sentence at a time (decode_sequence assumes a
        # batch of size 1); the slice keeps the leading batch axis.
        input_seq = X_test[seq_index: seq_index + 1]
        # Drop the final token, which is 'EOS' unless decoding stopped on the
        # length limit.
        decoded_sentence = decode_sequence(input_seq)[:-1]
        output_seq = ' '.join(decoded_sentence)
        f.write(output_seq + '\n')

100%|██████████| 70000/70000 [22:26<00:00, 51.98it/s]


In [None]:
# Preview the decoder on the first few test sentences (at most 10, fewer if
# the test set is smaller). Nothing is written to disk here.
for seq_index in range(min(10, len(X_test))):
    # Decode one test sentence at a time; the slice keeps the batch axis.
    input_seq = X_test[seq_index: seq_index + 1]
    # Drop the final token, which is 'EOS' unless decoding stopped on the
    # length limit.
    decoded_sentence = decode_sequence(input_seq)[:-1]
    print('-')
    print('Input sentence:', id2str(X_test[seq_index]))
    print('Decoded sentence:', decoded_sentence)

## Train word embedding

In [None]:
# For word2vec: build the list of tokenised sentences to train on.
w2v_texts = []
df = pd.read_csv(TRAIN_FILE, header=None, names=['X', 'Y'])

# Walk the two columns in lockstep instead of doing a label lookup
# (df['X'][index]) for every row.
for input_text, target_text in zip(df['X'], df['Y']):
    # Strip the literal SOS / EOS markers from both sides.
    input_text = input_text.replace('SOS ', '').replace(' EOS', '')
    target_text = target_text.replace('SOS ', '').replace(' EOS', '')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    # NOTE(review): decode_sequence looks up 'SOS' and stops on 'EOS', which
    # this corpus removes — confirm embeddings trained from it are not the
    # ones fed to that code path.
    target_text = '\t ' + target_text + ' \n'
    w2v_texts.append(input_text.split(' '))
    w2v_texts.append(target_text.split(' '))

In [None]:
from gensim.models import Word2Vec

# Train under its own name: binding this to `model` and then deleting it
# would destroy the Keras seq2seq `model` defined earlier in the notebook.
# size=256 equals latent_dim, which the model cells pass as the Embedding
# output size.
w2v_model = Word2Vec(w2v_texts, size=256, max_final_vocab=word_size)

# Keep only the word vectors; the full training model is no longer needed.
word_vectors = w2v_model.wv
print('Total vocabulary:', len(word_vectors.vocab))
del w2v_model

word_vectors.save('no_attn_len.w2v')

In [None]:
# embed = word_vectors.get_keras_embedding()
# embed.mask_zero = True