### Import the required libraries

In [0]:

import numpy as np
import matplotlib.pyplot as plt
import json
import tensorflow as tf
import io

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


### Define the hyperparameters

In [0]:

# Vocabulary / embedding settings
vocab_size = 10_000       # passed to the Tokenizer as num_words and to the Embedding layer as its input size
embedding_dim = 16        # size of each embedding vector
max_length = 100          # length every sequence is padded / truncated to

# Padding behaviour and the out-of-vocabulary placeholder
trunc_type = "post"       # truncation mode handed to pad_sequences
padding_type = "post"     # padding mode handed to pad_sequences
oov_tok = "<OOV>"         # token the Tokenizer uses for out-of-vocabulary words

# Number of leading examples used for training; the remainder is the test set
training_size = 20_000


### Download and store the dataset

In [0]:

# Dowload the dataset
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

# Get the training data from the dataset
with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])


### Split the data into training and test sets

In [0]:

# The first `training_size` examples are used for training; everything after
# that point is held out for testing. Sentences and labels are cut at the
# same index so they stay aligned.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]


### Tokenize the sentences present in the dataset

In [0]:

# Build the tokenizer and fit its vocabulary on the training sentences only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Padding options shared by the training and test sets
pad_options = dict(maxlen=max_length,
                   padding=padding_type,
                   truncating=trunc_type)

# Turn the training sentences into padded integer sequences
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, **pad_options)

# Same conversion for the test sentences, reusing the fitted tokenizer
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **pad_options)

# Make sure both the padded inputs and the labels are numpy arrays
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)


### Define the model

In [0]:

# Assemble the model one layer at a time:
# embedding -> average over the sequence -> L2-regularised hidden layer -> sigmoid output
sd_model = tf.keras.Sequential()
sd_model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
sd_model.add(tf.keras.layers.GlobalAveragePooling1D())
sd_model.add(tf.keras.layers.Dense(32, activation='relu', kernel_regularizer="l2"))
sd_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary classification: Adam optimizer, binary cross-entropy loss, accuracy metric
sd_model.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])


### Train the model

In [0]:

# Fit for 30 epochs, evaluating on the held-out test split after every epoch.
# The returned History object is kept for the plots.
history = sd_model.fit(
    x=training_padded,
    y=training_labels,
    validation_data=(testing_padded, testing_labels),
    epochs=30,
    verbose=1,
)


### Plot the loss and accuracy as a function of number of epochs

In [0]:

def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against epochs.

  Args:
    history: object whose `.history` mapping holds the per-epoch values
      (here, the value returned by `sd_model.fit`).
    string: metric key to plot; the key 'val_' + string is plotted alongside it.
  """
  curve_names = [string, 'val_' + string]
  for curve in curve_names:
    plt.plot(history.history[curve])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend(curve_names)
  plt.show()

# Draw one figure per tracked metric
for metric in ("accuracy", "loss"):
  plot_graphs(history, metric)


### Create tsv files to visualize the embeddings

In [0]:

# Get the learned weight matrix of the embedding layer (one row per token id)
emb = sd_model.layers[0]
weights = emb.get_weights()[0]

# Define the reverse word index (token id -> word)
reverse_word_index = {value: key for (key, value) in word_index.items()}

# Row 0 is skipped: the Keras Tokenizer never assigns index 0 to a word.
# The upper bound is capped by the number of embedding rows AND by the number
# of words the tokenizer actually learned, so a corpus with fewer than
# `vocab_size - 1` distinct words no longer raises KeyError.
last_word_num = min(vocab_size, len(weights), len(reverse_word_index) + 1)

# Store the words and corresponding embeddings, one row per word in each file.
# `with` guarantees both files are closed even if writing fails midway.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(1, last_word_num):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")


### Download the tsv files

In [0]:

# Offer the generated files for download when running inside Google Colab.
try:
  from google.colab import files
except ImportError:
  # google.colab is not importable here; the tsv files simply stay on disk.
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)
  