In [None]:
# Importing required libraries
import json
import tensorflow as tf
import requests
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Get the dataset
# A timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error right here instead of letting an
# error page reach the JSON parsing step further down.
srcsm_json = requests.get(
    'https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json',
    timeout=30,
)
srcsm_json.raise_for_status()
# Inspecting the data, print 450 characters
print(srcsm_json.text[0:450])

[
{"article_link": "https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5", "headline": "former versace store clerk sues over secret 'black code' for minority shoppers", "is_sarcastic": 0},
{"article_link": "https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365", "headline": "the 'roseanne' revival catches up to our thorny political mood, for better and worse", "is_sarcastic": 0},


In [None]:
# Pull the headline text and its sarcasm flag out of every JSON record
records = srcsm_json.json()
sentences = [record['headline'] for record in records]
labels = [record['is_sarcastic'] for record in records]
# Show the first ten headline/label pairs side by side
print(
    pd.DataFrame({
        'sentence': sentences[:10],
        'label': labels[:10],
    })
)

                                            sentence  label
0  former versace store clerk sues over secret 'b...      0
1  the 'roseanne' revival catches up to our thorn...      0
2  mom starting to fear son's web series closest ...      1
3  boehner just wants wife to listen, not come up...      1
4  j.k. rowling wishes snape happy birthday in th...      0
5                        advancing the world's women      0
6     the fascinating case for eating lab-grown meat      0
7  this ceo will send your kids to school, if you...      0
8  top snake handler leaves sinking huckabee camp...      1
9  friday's morning email: inside trump's presser...      0


In [None]:
# The first `training_size` headlines train the model; the remainder is held out for testing
training_size = 20000
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]
# Tokenizer settings: vocabulary cap and the out-of-vocabulary token
vocab_size = 1000
oov_tok = "<OOV>"
# Build the vocabulary from the training headlines only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
# Padding settings: fixed sequence length, padding and truncating at the end
max_length = 120
trunc_type = 'post'
padding_type = 'post'
# Encode both splits as integer sequences, then pad them with identical settings
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [None]:
# Model hyperparameters
embedding_dim = 16
# Stack the layers one at a time: embed the tokens, average over the
# sequence axis, then finish with a small dense head and a sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# Binary cross-entropy matches the single sigmoid output unit
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           16000     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 25        
Total params: 16,433
Trainable params: 16,433
Non-trainable params: 0
_________________________________________________________________


In [None]:

# Wrap features and labels as numpy arrays before handing them to Keras (TensorFlow 2.x)
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)
# Fit the model, passing the test split as validation data
num_epochs = 30
validation_set = (testing_padded, testing_labels)
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=validation_set,
    verbose=2,
)

Epoch 1/30
625/625 - 4s - loss: 0.6809 - accuracy: 0.5602 - val_loss: 0.6581 - val_accuracy: 0.5633
Epoch 2/30
625/625 - 2s - loss: 0.5534 - accuracy: 0.7268 - val_loss: 0.4727 - val_accuracy: 0.7879
Epoch 3/30
625/625 - 2s - loss: 0.4274 - accuracy: 0.8065 - val_loss: 0.4273 - val_accuracy: 0.8009
Epoch 4/30
625/625 - 2s - loss: 0.3934 - accuracy: 0.8217 - val_loss: 0.4112 - val_accuracy: 0.8095
Epoch 5/30
625/625 - 2s - loss: 0.3813 - accuracy: 0.8275 - val_loss: 0.4045 - val_accuracy: 0.8159
Epoch 6/30
625/625 - 2s - loss: 0.3724 - accuracy: 0.8316 - val_loss: 0.4018 - val_accuracy: 0.8153
Epoch 7/30
625/625 - 2s - loss: 0.3653 - accuracy: 0.8349 - val_loss: 0.4006 - val_accuracy: 0.8147
Epoch 8/30
625/625 - 2s - loss: 0.3615 - accuracy: 0.8347 - val_loss: 0.4023 - val_accuracy: 0.8149
Epoch 9/30
625/625 - 2s - loss: 0.3586 - accuracy: 0.8346 - val_loss: 0.4016 - val_accuracy: 0.8114
Epoch 10/30
625/625 - 2s - loss: 0.3559 - accuracy: 0.8388 - val_loss: 0.4023 - val_accuracy: 0.8170

In [None]:
# Write the model to disk so it can be reloaded later
model.save(filepath="mymodel.h5")