## An NLP Project with Hyperparameter Tuning

#### 1. The Main Code For the Project

In [None]:
import json
import tensorflow as tf
import numpy as np
import urllib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def solution_model():
    """Download the sarcasm headline dataset, train a classifier, and return it.

    The function saves ``sarcasm.json`` into the current working directory,
    uses the first ``training_size`` records for training and the remaining
    records for validation, converts the headlines into padded integer
    sequences, and fits a two-layer bidirectional-LSTM binary classifier for
    ``num_epochs`` epochs.

    Returns:
        The fitted ``tf.keras.Sequential`` model.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.request``
    # submodule is loaded; import it explicitly so the download does not rely
    # on some other package having imported it first.
    import urllib.request

    url = 'http://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
    urllib.request.urlretrieve(url, 'sarcasm.json')

    vocab_size = 1000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    training_size = 20000

    with open("sarcasm.json", 'r') as f:
        datastore = json.load(f)

    sentences = [item['headline'] for item in datastore]
    labels = [item['is_sarcastic'] for item in datastore]

    # The first ``training_size`` records train the model; the rest validate it.
    training_sentences = sentences[:training_size]
    training_labels = np.array(labels[:training_size])
    testing_sentences = sentences[training_size:]
    testing_labels = np.array(labels[training_size:])

    # The vocabulary is fitted on the training headlines only, so words seen
    # only in the validation split are mapped to the OOV token.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(training_sentences)

    # pad_sequences already returns a NumPy array, so no further conversion
    # of the padded inputs is needed.
    training_padded = pad_sequences(
        tokenizer.texts_to_sequences(training_sentences),
        maxlen=max_length, padding=padding_type, truncating=trunc_type)
    testing_padded = pad_sequences(
        tokenizer.texts_to_sequences(testing_sentences),
        maxlen=max_length, padding=padding_type, truncating=trunc_type)

    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),

        # return_sequences=True so the second recurrent layer receives the
        # full sequence rather than only the final state.
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96, return_sequences=True)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dropout(0.2),

        tf.keras.layers.Dense(96, activation='relu'),

        # Single sigmoid unit: binary sarcastic / not-sarcastic output.
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    num_epochs = 15

    model.fit(training_padded, training_labels, epochs=num_epochs,
              validation_data=(testing_padded, testing_labels), verbose=1)

    return model

In [None]:
# Script entry point: train the model and save it to "mymodel.h5".
if __name__ == '__main__':
    model = solution_model()
    # The ".h5" filename is passed straight to the model's save method.
    model.save("mymodel.h5")

#### 2. Hyperparameter Tuning

In [None]:
import json
import tensorflow as tf
import numpy as np
import urllib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

!pip install -q -U keras-tuner
import kerastuner as kt
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import IPython

In [None]:
class ClearTrainingOutput(tf.keras.callbacks.Callback):
    """Keras callback that clears the notebook cell output when training ends."""

    def on_train_end(*args, **kwargs):
        """Clear the displayed output; every argument received is ignored."""
        # The catch-all signature also absorbs the callback instance itself,
        # so this still works when invoked as a bound method.
        IPython.display.clear_output(wait=True)

In [None]:
    # ``import urllib`` alone does not guarantee that the ``urllib.request``
    # submodule is loaded; import it explicitly so the download does not rely
    # on some other package having imported it first.
    import urllib.request

    url = 'http://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
    urllib.request.urlretrieve(url, 'sarcasm.json')

    # Preprocessing settings. vocab_size, embedding_dim and max_length are
    # also read as globals by model_builder.
    vocab_size = 1000
    embedding_dim = 16
    max_length = 120
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    training_size = 20000

    with open("sarcasm.json", 'r') as f:
        datastore = json.load(f)

    sentences = [item['headline'] for item in datastore]
    labels = [item['is_sarcastic'] for item in datastore]

    # The first ``training_size`` records train the model; the rest validate it.
    training_sentences = sentences[:training_size]
    training_labels = np.array(labels[:training_size])
    testing_sentences = sentences[training_size:]  # validation set
    testing_labels = np.array(labels[training_size:])  # validation set

    # The vocabulary is fitted on the training headlines only, so words seen
    # only in the validation split are mapped to the OOV token.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
    tokenizer.fit_on_texts(training_sentences)

    # pad_sequences already returns a NumPy array, so no further conversion
    # of the padded inputs is needed.
    training_sequences = tokenizer.texts_to_sequences(training_sentences)
    training_padded = pad_sequences(training_sequences, maxlen=max_length,
                                    padding=padding_type, truncating=trunc_type)
    testing_sequences = tokenizer.texts_to_sequences(testing_sentences)  # validation set
    testing_padded = pad_sequences(testing_sequences, maxlen=max_length,
                                   padding=padding_type, truncating=trunc_type)  # validation set

In [None]:
def model_builder(hp):
    """Build and compile the sarcasm classifier for one tuner trial.

    Args:
        hp: Hyperparameter container supplied by the tuner. It is queried for
            ``LSTM_1``, ``Dropout_1``, ``LSTM_2``, ``Dropout_2``, ``Dense_1``
            and ``learning_rate``.

    Returns:
        A compiled ``tf.keras.Sequential`` model. The embedding layer is sized
        from the module-level ``vocab_size``, ``embedding_dim`` and
        ``max_length``.
    """
    # Sample every hyperparameter up front, in the order the layers use them.
    lstm_1_units = hp.Int('LSTM_1', min_value=32, max_value=128, step=16)
    dropout_1_rate = hp.Choice('Dropout_1', values=[0.2, 0.3, 0.5])
    lstm_2_units = hp.Int('LSTM_2', min_value=32, max_value=128, step=16)
    dropout_2_rate = hp.Choice('Dropout_2', values=[0.2, 0.3, 0.5])
    dense_units = hp.Int('Dense_1', min_value=32, max_value=256, step=32)
    step_size = hp.Choice('learning_rate', values=[1e-3, 1e-4, 1e-5, 1e-6])

    network = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        # return_sequences=True so the second recurrent layer receives the
        # full sequence rather than only the final state.
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(lstm_1_units, return_sequences=True)),
        tf.keras.layers.Dropout(dropout_1_rate),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_2_units)),
        tf.keras.layers.Dropout(dropout_2_rate),
        tf.keras.layers.Dense(dense_units, activation='relu'),
        # Single sigmoid unit: binary sarcastic / not-sarcastic output.
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    network.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=step_size),
        metrics=['accuracy'],
    )
    return network

In [None]:
# Hyperband tuner over model_builder, ranking trials by validation accuracy.
# directory and project_name name where the tuner keeps its results.
tuner = kt.Hyperband(
    model_builder,
    objective='val_accuracy',
    max_epochs=15,
    factor=3,
    directory='my_dir',
    project_name='intro_to_kt',
)

In [None]:
# Show the hyperparameters registered by model_builder before searching.
tuner.search_space_summary()

In [None]:
# Run the hyperparameter search: trials train on the training split and are
# evaluated on the held-out split; cell output is cleared after each run.
tuner.search(
    training_padded,
    training_labels,
    epochs=15,
    validation_data=(testing_padded, testing_labels),
    callbacks=[ClearTrainingOutput()],
)

In [None]:
# Show a summary of the completed search trials.
tuner.results_summary()

In [None]:
# Retrieve the two best models found by the search.
models = tuner.get_best_models(num_models=2)

In [None]:
# Display the first of the retrieved models as the cell's output.
models[0]

In [None]:
# Take the single best hyperparameter set found by the search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
# Print the name -> value mapping; printing the HyperParameters object itself
# would not show the chosen values.
print(best_hps.values)

In [None]:
# Build a new model from the best hyperparameters; it is trained in the next cell.
model = tuner.hypermodel.build(best_hps)

In [None]:
# Train the rebuilt model for 20 epochs, validating on the held-out split.
model.fit(
    training_padded,
    training_labels,
    epochs=20,
    validation_data=(testing_padded, testing_labels),
    callbacks=[ClearTrainingOutput()],
)