In [89]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import tensorflow_datasets as tfds
import json
import numpy as np

In [90]:
!wget 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'

--2023-02-07 17:56:40--  https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 209.85.200.128, 142.250.152.128, 142.250.128.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|209.85.200.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json.1’


2023-02-07 17:56:40 (189 MB/s) - ‘sarcasm.json.1’ saved [5643545/5643545]



In [91]:
# Parse the downloaded JSON file. The context manager closes the file handle as soon as
# parsing finishes (the original left it open for the lifetime of the kernel).
with open('/content/sarcasm.json', encoding='utf-8') as f:
  raw_dataset = json.load(f)

In [92]:
# Pull the headline text and its integer sarcasm flag out of every record, keeping
# both lists in the same order so index i of one matches index i of the other.
dataset = [record['headline'] for record in raw_dataset]
labels = [int(record['is_sarcastic']) for record in raw_dataset]

In [93]:
def train_val_test_split(datset, lables, train_val_split_size, val_test_split_size):
  """Split two parallel sequences into contiguous train / validation / test slices.

  The split is positional (no shuffling): the first part is the training set, the
  next part the validation set, and whatever is left is the test set.

  Args:
    datset: Sliceable sequence of samples. (The misspelled parameter name is kept
      so the existing signature stays unchanged.)
    lables: Sliceable sequence of labels, same length as `datset`. (Name kept for
      the same reason.)
    train_val_split_size: Fraction of all samples that goes to the training set.
    val_test_split_size: Fraction of the *remaining* samples that goes to the
      validation set; the rest becomes the test set.

  Returns:
    Tuple `(train_dataset, train_labels, val_dataset, val_labels, test_dataset,
    test_labels)`.

  Raises:
    ValueError: If `datset` and `lables` have different lengths.
  """
  # Bug fix: operate on the arguments. The original body referenced the notebook-level
  # globals `dataset` / `labels`, so whatever was passed in was silently ignored.
  if len(datset) != len(lables):
    raise ValueError('dataset and labels must have the same length')

  total = len(datset)
  train_size = int(total * train_val_split_size)
  val_size = int((total - train_size) * val_test_split_size)

  # Same three diagnostic lines the original printed: total, train size, validation size.
  print(total)
  print(train_size)
  print(val_size)

  def _three_way(sequence):
    # Cut off the training prefix first, then divide the remainder into val / test.
    head, remainder = sequence[:train_size], sequence[train_size:]
    return head, remainder[:val_size], remainder[val_size:]

  train_dataset, val_dataset, test_dataset = _three_way(datset)
  train_labels, val_labels, test_labels = _three_way(lables)

  return train_dataset, train_labels, val_dataset, val_labels, test_dataset, test_labels

In [94]:
# 0.7 of all records go to training; 0.6 of the remainder goes to validation and
# the rest to the test set.
(
  raw_train_set, train_labels,
  raw_validation_set, validation_labels,
  raw_test_set, test_labels,
) = train_val_test_split(dataset, labels, 0.7, 0.6)

26709
18696
4807


In [95]:
# Sanity check: print the size of every split; each text list should match its label list.
for split in (
  raw_train_set, train_labels,
  raw_validation_set, validation_labels,
  raw_test_set, test_labels,
):
  print(len(split))

18696
18696
4807
4807
3206
3206


In [96]:
# Vocabulary cap passed to TextVectorization as max_tokens.
VOCAB_SIZE = 5_000
# Fixed length every vectorized headline is padded or truncated to.
MAX_SEQUENCE_LENGTH = 250

**Using TextVectorization: A preprocessing layer which maps text features to integer sequences**

In [97]:
# Text -> integer-id preprocessing layer, configured from the constants above.
vectorize_layer = TextVectorization(
  max_tokens=VOCAB_SIZE,
  output_mode='int',
  output_sequence_length=MAX_SEQUENCE_LENGTH,
)

# Build the vocabulary from the training headlines only.
vectorize_layer.adapt(raw_train_set)

In [98]:
def vectorize_text(text, labels):
  """Run `text` through the notebook-level `vectorize_layer`.

  Returns a `(vectorized_text, labels)` pair; `labels` is passed through unchanged.
  """
  token_ids = vectorize_layer(text)
  return token_ids, labels

In [99]:
# Show what the first training headline looks like after vectorization.
sample_tokens, sample_label = vectorize_text(raw_train_set[0], labels[0])
print(sample_tokens)

tf.Tensor(
[ 323    1  919 4405 2227   47  365   91 1901    6 2741    1    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 

In [100]:
# Wrap each (texts, labels) pair in a tf.data.Dataset that yields one example at a time.
from_slices = tf.data.Dataset.from_tensor_slices

train_set = from_slices((raw_train_set, train_labels))

# Shuffling the validation or test set is pointless: each is only used for a single
# forward pass to calculate the accuracy score.
validation_set = from_slices((raw_validation_set, validation_labels))
test_set = from_slices((raw_test_set, test_labels))

In [101]:
# Vectorize the text of every example in each split; labels pass through untouched.
final_train_set, final_validation_set, final_test_set = (
  split.map(vectorize_text) for split in (train_set, validation_set, test_set)
)

In [102]:
# Training pipeline: shuffle with a 1000-element buffer, then batch and prefetch.
final_train_set = (
  final_train_set
  .shuffle(1000)
  .batch(64)
  .prefetch(1)
)
# Evaluation pipelines: same batching and prefetching, but no shuffling.
final_validation_set = (
  final_validation_set
  .batch(64)
  .prefetch(1)
)
final_test_set = (
  final_test_set
  .batch(64)
  .prefetch(1)
)

In [103]:
# Embedding -> strided 1D convolution -> dropout -> global max pool -> sigmoid output.
model = tf.keras.Sequential([
  layers.Embedding(input_dim=VOCAB_SIZE + 1, output_dim=64),
  layers.Conv1D(
    filters=8,
    kernel_size=5,
    strides=2,
    padding="valid",
    activation="relu",
  ),
  layers.Dropout(rate=0.5),
  layers.GlobalMaxPooling1D(),
  # Single sigmoid unit, paired with the binary cross-entropy loss used at compile time.
  layers.Dense(units=1, activation='sigmoid'),
])

In [104]:
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=['accuracy'],
)

# Keep only the weights of the epoch with the best validation accuracy.
savebest = tf.keras.callbacks.ModelCheckpoint(
  filepath='/tmp/checkpoint',
  monitor='val_accuracy',
  save_best_only=True,
  save_weights_only=True,
)

model.fit(
  final_train_set,
  validation_data=final_validation_set,
  epochs=6,
  callbacks=[savebest],
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7faee4641e80>

In [105]:
 # Load the weights stored at '/tmp/checkpoint' (the path the ModelCheckpoint callback
 # writes to), then evaluate on the held-out test set. With the compile settings used
 # in this notebook the displayed result is [loss, accuracy].
 model.load_weights('/tmp/checkpoint')
 model.evaluate(final_test_set)



[0.378498911857605, 0.8399875164031982]

**With some hyperparameter tuning (changing vocab size, max sequence length and number of trainable parameters), the model achieved an accuracy score of 84% on the test set.**