In [None]:
import numpy as np
import pandas as pd
import datetime, os
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import train_test_split

# tensorflow imports
import tensorflow as tf
from tensorflow import keras
from keras import layers, models
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Embedding
from tensorflow.keras.losses import sparse_categorical_crossentropy
from keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorboard.plugins.hparams import api as hp

In [None]:
# Load the Spider text-to-SQL training set
df = pd.read_json('train_spider.json')

# Draw a reproducible random sample of 1000 records (a sample, not the first
# 1000 rows) and renumber the rows 0..999 so later cells can index by position.
# drop=True discards the old index outright; the earlier
# `df.drop(columns=['index'])` call returned a new frame that was thrown away,
# so the stale 'index' column silently stayed in df.
df = df.sample(1000, random_state=101).reset_index(drop=True)

# Extract input (questions) and target (SQL queries) data from the dataframe
questions = df['question']
queries = df['query']
x = df['question']
y = df['query']


In [None]:
# True when at least one cell anywhere in the dataframe is missing
df.isna().to_numpy().any()

In [None]:
# Print the first two question / SQL query pairs, numbered from 1
for position in (0, 1):
  pair_number = position + 1
  print('Questions {}:  {}'.format(pair_number, questions[position]))
  print('SQL query {}:  {}'.format(pair_number, queries[position]))

In [None]:
import collections
# Tally whitespace-separated tokens for each corpus
questions_words_counter = collections.Counter(
    token for question in questions for token in question.split())
queries_words_counter = collections.Counter(
    token for query in queries for token in query.split())

# Total token count per corpus is the sum of the per-token tallies
questions_total_words = sum(questions_words_counter.values())
queries_total_words = sum(queries_words_counter.values())

# most_common yields (word, count) pairs; transposing and taking row 0 keeps the words
questions_top_words = list(zip(*questions_words_counter.most_common(20)))[0]
queries_top_words = list(zip(*queries_words_counter.most_common(20)))[0]

print('In the training data set')
print('{} words in questions.'.format(questions_total_words))
print('{} unique words.'.format(len(questions_words_counter)))
print('20 Most common words in the questions dataset:')
print('"' + '" "'.join(questions_top_words) + '"')
print()
print('{} words in SQL queries.'.format(queries_total_words))
print('{} unique words.'.format(len(queries_words_counter)))
print('20 Most common words in the SQL queries dataset:')
print('"' + '" "'.join(queries_top_words) + '"')

In [None]:
#Tokenization method
def tokenize(x):
  """
  Fit a new Keras Tokenizer on x and encode x with it
  :param x: List of sentences/strings to be tokenized
  :return: Tuple of (x encoded by the fitted tokenizer, the fitted tokenizer)
  """
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(x)
  encoded = tokenizer.texts_to_sequences(x)
  return encoded, tokenizer

#Padding method
def pad(x, length=None):
  """
  Bring every sequence in x to a common length, padding and truncating at the end
  :param x: List of sequences.
  :param length: Target length. If None, the length of the longest sequence in x is used.
  :return: Padded numpy array of sequences
  """
  target_length = max(map(len, x)) if length is None else length
  return pad_sequences(x, maxlen=target_length, padding='post', truncating='post')

def preprocess(x, y):
  """
  Tokenize and pad the features and the labels
  :param x: Feature List of sentences
  :param y: Label List of sentences
  :return: Tuple of (padded x, padded y with a trailing axis of size 1, x tokenizer, y tokenizer)
  """
  encoded_x, x_tk = tokenize(x)
  encoded_y, y_tk = tokenize(y)

  padded_x = pad(encoded_x)
  padded_y = pad(encoded_y)

  # Keras's sparse_categorical_crossentropy function will require the labels in
  # 3 dimensions, so append a trailing axis of size 1
  labels = padded_y.reshape(padded_y.shape + (1,))

  return padded_x, labels, x_tk, y_tk

In [None]:
# Tokenize and pad the question / query pairs
preprocess_x, preprocess_y, x_tk, y_tk = preprocess(x, y)

# Padded sequence lengths sit on axis 1; vocabulary sizes are the tokenizers' word counts
x_sequence_length, y_sequence_length = preprocess_x.shape[1], preprocess_y.shape[1]
x_vocab_size, y_vocab_size = len(x_tk.word_index), len(y_tk.word_index)

print('Dataset Preprocessed')
for caption, measured in (
    ("Max question sentence length:", x_sequence_length),
    ("Max query sentence length:", y_sequence_length),
    ("Question vocabulary size:", x_vocab_size),
    ("Query vocabulary size:", y_vocab_size),
):
  print(caption, measured)


In [None]:
# Hold out 20% of the preprocessed pairs for testing; the other 80% is for training
x_train, x_test, y_train, y_test = train_test_split(
    preprocess_x,
    preprocess_y,
    test_size=0.2,
    random_state=99,
)

for split_caption, split_x in (('train sequences', x_train), ('test sequences', x_test)):
  print(len(split_x), split_caption, split_x.shape)

In [None]:
# Hyper-parameter search space: two layer widths plus the optimizer
HP_NUM_UNITS1 = hp.HParam('num_units1', hp.Discrete([128, 256, 512]))
HP_NUM_UNITS2 = hp.HParam('num_units2', hp.Discrete([256, 512]))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'sgd','RMSprop']))

# Name of the single metric logged for every trial
METRIC_ACCURACY = 'accuracy'

# Register the search space and the metric under the sweep's root log directory
tuned_hparams = [HP_NUM_UNITS1, HP_NUM_UNITS2, HP_OPTIMIZER]
tracked_metrics = [hp.Metric(METRIC_ACCURACY, display_name='Accuracy')]
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(hparams=tuned_hparams, metrics=tracked_metrics)

In [None]:
#Hyper-parameter optimisation process
def train_test_model(hparams):
  """
  Build, train and evaluate one model for a single hyper-parameter combination
  :param hparams: Dict keyed by HP_NUM_UNITS1, HP_NUM_UNITS2 and HP_OPTIMIZER
  :return: Accuracy on (x_test, y_test) after 5 training epochs
  """
  # The Keras Tokenizer numbers words from 1 and pad() fills with 0, so token
  # ids span 0..len(word_index). The embedding table and the output softmax
  # therefore both need one slot more than the word count; without it the
  # highest token id is out of range.
  question_vocab_size = x_vocab_size + 1
  query_vocab_size = y_vocab_size + 1

  model = tf.keras.models.Sequential([
    Input(shape=x_train.shape[1:]),
    Embedding(question_vocab_size, 100),
    # Encoder: two stacked bidirectional LSTMs; the second keeps only its final state
    Bidirectional(layers.LSTM(hparams[HP_NUM_UNITS1], return_sequences=True)),
    Bidirectional(layers.LSTM(hparams[HP_NUM_UNITS1], return_sequences=False)),
    Dense(hparams[HP_NUM_UNITS2], activation='relu'),
    # Decoder: repeat the encoding once per output position, then decode
    RepeatVector(y_sequence_length),
    Bidirectional(layers.LSTM(hparams[HP_NUM_UNITS2], dropout=0.5, return_sequences=True)),
    TimeDistributed(Dense(query_vocab_size, activation='softmax'))
  ])
  model.compile(
    optimizer=hparams[HP_OPTIMIZER],
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy']
  )
  model.fit(x_train, y_train, epochs=5)
  # evaluate returns [loss, accuracy] because 'accuracy' is the only compiled metric
  _, accuracy = model.evaluate(x_test, y_test)
  return accuracy

In [None]:
def run(run_dir, hparams):
  """
  Run one trial and log its hyper-parameters and accuracy to TensorBoard
  :param run_dir: Directory the summary writer for this trial writes to
  :param hparams: Dict of hyper-parameter values passed on to train_test_model
  """
  with tf.summary.create_file_writer(run_dir).as_default():
    # Record the values used in this trial before training starts
    hp.hparams(hparams)
    tf.summary.scalar(METRIC_ACCURACY, train_test_model(hparams), step=1)

In [None]:
# Enumerate the full grid in nesting order num_units1 > num_units2 > optimizer
search_grid = [
    {HP_NUM_UNITS1: units1, HP_NUM_UNITS2: units2, HP_OPTIMIZER: optimizer_name}
    for units1 in HP_NUM_UNITS1.domain.values
    for units2 in HP_NUM_UNITS2.domain.values
    for optimizer_name in HP_OPTIMIZER.domain.values
]

# One trial per combination, each logged to its own numbered run directory
session_num = 0
for hparams in search_grid:
  run_name = "run-%d" % session_num
  print('--- Starting trial: %s' % run_name)
  print({param.name: value for param, value in hparams.items()})
  run('logs/hparam_tuning/' + run_name, hparams)
  session_num += 1

In [None]:
# Load the hyper-parameter results table and display it as the cell output.
# NOTE(review): no visible cell writes 'hparams_table.csv' — presumably it was
# exported by hand from TensorBoard's HParams dashboard; confirm the provenance.
HP_df = pd.read_csv('hparams_table.csv')
HP_df


In [None]:
# Reload the complete Spider training file (no sampling this time)
df = pd.read_json('train_spider.json')
x, y = df['question'], df['query']

# Fit new tokenizers on this data and pad both sides
preprocess_x, preprocess_y, x_tk, y_tk = preprocess(x, y)

# Padded sequence lengths sit on axis 1; vocabulary sizes are the tokenizers' word counts
x_sequence_length, y_sequence_length = preprocess_x.shape[1], preprocess_y.shape[1]
x_vocab_size, y_vocab_size = len(x_tk.word_index), len(y_tk.word_index)

# 70% of the pairs for training, 30% held out
x_train, x_test, y_train, y_test = train_test_split(
    preprocess_x,
    preprocess_y,
    test_size=0.3,
    random_state=99,
)


In [None]:
from pandas.core.algorithms import mode
from tensorflow import keras
from keras import layers, models
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import GRU, Input, Dense, TimeDistributed, Dropout, LSTM

def model_final(input_shape, output_sequence_length, question_vocab_size, query_vocab_size):
  """
  Assemble and compile the embedding + bidirectional LSTM encoder/decoder
  :param input_shape: Shape tuple of the input data; its first entry is dropped
  :param output_sequence_length: Length of output sequence
  :param question_vocab_size: Size of the embedding's input vocabulary
  :param query_vocab_size: Number of classes in the per-step softmax output
  :return: Compiled Keras model, not trained
  """
  # Encoder: embed the question ids and reduce them to a single dense vector
  question_ids = Input(shape=input_shape[1:])
  embedded = Embedding(question_vocab_size, 100)(question_ids)
  encoded_sequence = Bidirectional(LSTM(128, return_sequences=True))(embedded)
  encoded_summary = Bidirectional(LSTM(128, return_sequences=False))(encoded_sequence)
  context = Dense(256, activation='relu')(encoded_summary)

  # Decoder: repeat that vector once per output position and decode step by step
  repeated_context = RepeatVector(output_sequence_length)(context)
  decoded_sequence = Bidirectional(LSTM(256, dropout=0.5, return_sequences=True))(repeated_context)
  token_probabilities = TimeDistributed(Dense(query_vocab_size, activation='softmax'))(decoded_sequence)

  seq2seq = Model(inputs=question_ids, outputs=token_probabilities)
  seq2seq.compile(
      loss=sparse_categorical_crossentropy,
      optimizer=keras.optimizers.Adam(learning_rate=0.0001),
      metrics=['accuracy'],
  )
  return seq2seq

# Build the network. From here on the name model_final refers to the model
# instance, no longer to the builder function defined above.
# Vocabulary sizes are passed as word count + 1 (presumably to leave room for
# the padding id 0 — confirm against the tokenizer's indexing).
model_final = model_final(
    input_shape=x_train.shape,
    output_sequence_length=y_train.shape[1],
    question_vocab_size=x_vocab_size+1,
    query_vocab_size=y_vocab_size+1,
)

# Stop as soon as validation loss fails to improve for one epoch
callbacks = [EarlyStopping(monitor='val_loss', patience=1)]

model_final.summary()
model_final_history = model_final.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_data=(x_test, y_test),
    callbacks=callbacks,
)

training_curves = model_final_history.history
print(training_curves.keys())

# One figure per metric: (train key, validation key, title, output file)
plot_specs = (
    ('accuracy', 'val_accuracy', 'model accuracy', 'final_accuracy.png'),
    ('loss', 'val_loss', 'model loss', 'final_loss.png'),
)
for train_key, val_key, plot_title, image_path in plot_specs:
  plt.plot(training_curves[train_key])
  plt.plot(training_curves[val_key])
  plt.title(plot_title)
  plt.ylabel(train_key)
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  # Save before show() so the written image contains the drawn figure
  plt.savefig(image_path)
  plt.show()

In [None]:
# Mean over all recorded epochs of each training / validation metric
for report_label, history_key in (
    ("Average train accuracy: ", 'accuracy'),
    ("Average train loss: ", 'loss'),
    ("Average val accuracy: ", 'val_accuracy'),
    ("Average val loss: ", 'val_loss'),
):
  print(report_label, np.average(model_final_history.history[history_key]))

In [None]:
# Print the first two question / SQL query pairs of x and y, numbered from 1
for position in (0, 1):
  pair_number = position + 1
  print('Questions {}:  {}'.format(pair_number, x[position]))
  print('SQL query {}:  {}'.format(pair_number, y[position]))

In [None]:
# Reverse lookup from token id to SQL token; id 0 is the value pad() fills with
y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
y_id_to_word[0] = '<PAD>'

sentence = 'how many heads of the departments are older than 56'
# Encode with the fitted tokenizer itself instead of indexing word_index word by
# word: texts_to_sequences applies the same lower-casing and punctuation
# filtering that was used when fitting, and skips unknown words instead of
# raising KeyError.
sentence = x_tk.texts_to_sequences([sentence])
# Pad and truncate at the end, the same way pad() prepares the training data
sentence = pad_sequences(sentence, maxlen=x_train.shape[-1], padding='post', truncating='post')
# Batch the hand-written question together with the first training example
sentences = np.array([sentence[0], x_train[0]])
predictions = model_final.predict(sentences, batch_size=len(sentences))
# Decode only the hand-written question: most probable token at every output step
print(' '.join([y_id_to_word[np.argmax(step_probs)] for step_probs in predictions[0]]))