<a href="https://colab.research.google.com/github/tqnhu2407/sarcasm_detector_using_embeddings/blob/main/Sarcasm_Detector_using_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from bs4 import BeautifulSoup
import string
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import numpy as np
import matplotlib.pyplot as plt

# Reading JSON input file and Tokenizing

In [None]:
# English stop words that are dropped from every headline before tokenizing.
stopwords = """
    i me my myself we our ours ourselves you your
    yours yourself yourselves he him his himself she her hers
    herself it its itself they them their theirs themselves what
    which who whom this that these those am is are
    was were be been being have has had having do
    does did doing a an the and but if or
    because as until while of at by for with about
    against between into through during before after above below to
    from up down in out on off over under again
    further then once here there when where why how all
    any both each few more most other some such nor
    only own same so than too very s t can
    will just don should now
""".split()

The dataset contains 26,709 entries.

In [None]:
def split_train_test(stopwords, training_size, vocab_size, max_len):
    """Load the sarcasm headlines, clean them and build padded train/test arrays.

    Reads ``Sarcasm_Headlines_Dataset.json`` (one JSON object per line) from the
    current working directory. Each headline is lower-cased, stripped of HTML
    markup, punctuation and stop words. The cleaned headlines are then split by
    position into a training and a testing portion and encoded as padded
    integer sequences.

    Args:
        stopwords: Iterable of lower-case words to drop from every headline.
        training_size: Number of leading headlines used for training; the
            remaining headlines form the test set.
        vocab_size: Value passed to the Keras ``Tokenizer`` as ``num_words``.
        max_len: Length every sequence is padded or truncated to (both are
            applied at the end of the sequence).

    Returns:
        A tuple ``(training_padded, testing_padded, training_labels,
        testing_labels, tokenizer, sentences)``. The first four items are NumPy
        arrays, ``tokenizer`` is the ``Tokenizer`` fitted on the training
        headlines only, and ``sentences`` is the list of all cleaned headlines.
    """
    table = str.maketrans('', '', string.punctuation)
    stopword_set = set(stopwords)  # constant-time membership tests in the loop
    sentences = []
    labels = []

    with open('Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            obj = json.loads(line)
            sentence = obj['headline'].lower()
            # Surround separators with spaces so words joined by them split apart.
            for mark in (",", ".", "-", "/"):
                sentence = sentence.replace(mark, f" {mark} ")
            # Name the parser explicitly so the result does not depend on which
            # optional parsers happen to be installed.
            sentence = BeautifulSoup(sentence, "html.parser").get_text()  # remove HTML tags
            words = (word.translate(table) for word in sentence.split())  # remove punctuation
            # Each kept token is followed by a single space, which keeps the
            # cleaned strings in the same format as before.
            sentences.append(
                "".join(f"{word} " for word in words if word not in stopword_set)
            )
            labels.append(obj['is_sarcastic'])

    training_sentences = sentences[:training_size]
    testing_sentences = sentences[training_size:]
    training_labels = np.array(labels[:training_size])
    testing_labels = np.array(labels[training_size:])

    # Fit on the training headlines ONLY. Re-fitting on the test headlines would
    # leak test vocabulary into the model's word index and could renumber words
    # after the training sequences have already been encoded.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(training_sentences)

    # pad_sequences already returns NumPy arrays, so no extra conversion is needed.
    training_padded = pad_sequences(
        tokenizer.texts_to_sequences(training_sentences),
        maxlen=max_len, padding='post', truncating='post',
    )
    testing_padded = pad_sequences(
        tokenizer.texts_to_sequences(testing_sentences),
        maxlen=max_len, padding='post', truncating='post',
    )

    return training_padded, testing_padded, training_labels, testing_labels, tokenizer, sentences

In [None]:
# Number of leading headlines used for training; the rest become the test set.
training_size = 23_000

In [None]:
# Vocabulary limit handed to the tokenizer (its num_words argument).
vocab_size = 10_000

In [None]:
# Build the padded train/test arrays; every sequence is padded/truncated to 85 tokens.
(
    training_padded,
    testing_padded,
    training_labels,
    testing_labels,
    tokenizer,
    sentences,
) = split_train_test(stopwords, training_size, vocab_size, max_len=85)

# Embeddings in Tensorflow

## Building a Sarcasm Detector Using Embeddings


In [None]:
# Baseline classifier: embedding -> average pooling -> 24-unit ReLU -> sigmoid.
# The embedding's input size comes from vocab_size (instead of a hard-coded
# 10000) so it always matches the vocabulary limit given to the tokenizer.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16),  # vocab_size, embedding_dim
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model, including parameter counts.
model.summary()

* The embedding has a 10,000-word vocab, each word is a vector in 16D => The total number of trainable parameters = 160,000.
* The average pooling layer has 0 trainable params (it just averages the embedding vectors of the words in a sentence to get a single 16-value vector).
* 24-neuron dense layer effectively calculates using weights and biases, so it will need to learn 24 x 16 + 24 = 408 params.
* The final single-neuron layer (a sigmoid unit, i.e. logistic regression on the 24 hidden activations) has 24 weights + 1 bias = 25 params to learn.


In [None]:
# Train for 35 epochs, holding out 33% of the training arrays for validation.
history = model.fit(
    training_padded,
    training_labels,
    epochs=35,
    validation_split=0.33,
)

In [None]:
# Show which metric series were recorded during training.
print(history.history.keys())

In [None]:
def plot_accuracy(history):
    """Plot training and validation accuracy per epoch.

    Args:
        history: Object whose ``history`` dict holds the ``'accuracy'`` and
            ``'val_accuracy'`` series, e.g. the value returned by ``model.fit``.
    """
    # Label both curves so they can be told apart in the figure.
    plt.plot(history.history['accuracy'], label='training')
    plt.plot(history.history['val_accuracy'], label='validation')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()

In [None]:
def plot_loss(history):
    """Plot training and validation loss per epoch.

    Args:
        history: Object whose ``history`` dict holds the ``'loss'`` and
            ``'val_loss'`` series, e.g. the value returned by ``model.fit``.
    """
    # Label both curves so they can be told apart in the figure.
    plt.plot(history.history['loss'], label='training')
    plt.plot(history.history['val_loss'], label='validation')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.show()

The validation data likely contains many words that aren't present in the training data.

**Overfitting**: while the validation accuracy is dropping a little over time, its loss is increasing sharply.

## Reducing Overfitting in Language Models

### Adjusting the learning rate

In [None]:
# Same architecture as the baseline, trained with a 10x smaller learning rate.
# The embedding's input size comes from vocab_size (instead of a hard-coded
# 10000) so it always matches the vocabulary limit given to the tokenizer.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16),  # vocab_size, embedding_dim
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

adam = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)  # default = 0.001
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [None]:
# Train for 100 epochs, holding out 33% of the training arrays for validation.
history = model.fit(
    training_padded,
    training_labels,
    epochs=100,
    validation_split=0.33,
)

In [None]:
# Accuracy curves for the run stored in `history`.
plot_accuracy(history=history)

In [None]:
# Loss curves for the run stored in `history`.
plot_loss(history=history)

### Exploring vocab size

In [None]:
# Per-word occurrence counts collected by the tokenizer while fitting.
wc = tokenizer.word_counts
print(wc)

In [None]:
from collections import OrderedDict

# Word counts ordered from the most to the least frequent word.
newlist = OrderedDict(
    sorted(wc.items(), key=lambda entry: entry[1], reverse=True)
)
print(newlist)

Plot our vocab

In [None]:
# Frequency of each word by rank; newlist is already sorted most-frequent first,
# so its values are the y-series and the 1-based rank is the x-series.
ys = list(newlist.values())
xs = list(range(1, len(ys) + 1))
plt.plot(xs, ys)
plt.xlabel('word_index')
plt.ylabel('frequency')
plt.show()

Very few words are used many times.

A lot of words are used very few times.

Zoom the diagram

In [None]:
# Same rank/frequency curve as above, zoomed to ranks 300-10000 and counts 0-100.
ys = list(newlist.values())
xs = list(range(1, len(ys) + 1))
plt.plot(xs, ys)
plt.xlabel('word_index')
plt.ylabel('frequency')
plt.axis([300, 10000, 0, 100])  # [xmin, xmax, ymin, ymax]
plt.show()

Reduce the vocab_size to 2,000

In [None]:
# Shrink the vocabulary limit to the 2,000 most frequent words.
vocab_size = 2_000

In [None]:
# Re-tokenize with the reduced vocab_size. split_train_test returns six values,
# so all six must be unpacked (unpacking only five raises ValueError).
(
    training_padded,
    testing_padded,
    training_labels,
    testing_labels,
    tokenizer,
    sentences,
) = split_train_test(stopwords, training_size, vocab_size, 85)

In [None]:
# Embedding (vocab_size x 16) -> average pooling -> 24-unit ReLU -> sigmoid output.
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(units=24, activation='relu'))
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# Adam with learning rate 0.0001 (default = 0.001).
adam = keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs, holding out 33% of the training arrays for validation.
history = model.fit(
    training_padded,
    training_labels,
    epochs=100,
    validation_split=0.33,
)

In [None]:
# Accuracy curves for the run stored in `history`.
plot_accuracy(history=history)

In [None]:
# Loss curves for the run stored in `history`.
plot_loss(history=history)

### Exploring embedding dimensions


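"Best practice for embedding size is to have it be the fourth root of the vocab size." With a vocabulary of 2,000 words the fourth root is about 6.7, so the embedding dimension is set to 7 below.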

In [None]:
# Embedding (vocab_size x 7) -> average pooling -> 24-unit ReLU -> sigmoid output.
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=7))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(units=24, activation='relu'))
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# Adam with learning rate 0.0001 (default = 0.001).
adam = keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs, holding out 33% of the training arrays for validation.
history = model.fit(
    training_padded,
    training_labels,
    epochs=100,
    validation_split=0.33,
)

In [None]:
# Accuracy curves for the run stored in `history`.
plot_accuracy(history=history)

In [None]:
# Loss curves for the run stored in `history`.
plot_loss(history=history)

### Exploring the model architecture


Reduce to 8 neurons

In [None]:
# Embedding (vocab_size x 7) -> average pooling -> 8-unit ReLU -> sigmoid output.
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=7))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(units=8, activation='relu'))
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# Adam with learning rate 0.0001 (default = 0.001).
adam = keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs, holding out 33% of the training arrays for validation.
history = model.fit(
    training_padded,
    training_labels,
    epochs=100,
    validation_split=0.33,
)

In [None]:
# Accuracy curves for the run stored in `history`.
plot_accuracy(history=history)

In [None]:
# Loss curves for the run stored in `history`.
plot_loss(history=history)

### Using dropout

In [None]:
# Embedding (vocab_size x 7) -> average pooling -> 8-unit ReLU -> 25% dropout -> sigmoid output.
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=7))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(units=8, activation='relu'))
model.add(keras.layers.Dropout(rate=0.25))
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# Adam with learning rate 0.0001 (default = 0.001).
adam = keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs, holding out 33% of the training arrays for validation.
history = model.fit(
    training_padded,
    training_labels,
    epochs=100,
    validation_split=0.33,
)

In [None]:
# Accuracy curves for the run stored in `history`.
plot_accuracy(history=history)

In [None]:
# Loss curves for the run stored in `history`.
plot_loss(history=history)

### Using regularization

* L1
* L2 (commonly used in NLP)

In [None]:
# Embedding (vocab_size x 7) -> average pooling -> 8-unit ReLU with L2(0.01)
# kernel regularization -> sigmoid output.
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=7))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(
    keras.layers.Dense(
        units=8,
        activation='relu',
        kernel_regularizer=keras.regularizers.l2(0.01),
    )
)
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# Adam with learning rate 0.0001 (default = 0.001).
adam = keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs, holding out 33% of the training arrays for validation.
history = model.fit(
    training_padded,
    training_labels,
    epochs=100,
    validation_split=0.33,
)

In [None]:
# Accuracy curves for the run stored in `history`.
plot_accuracy(history=history)

In [None]:
# Loss curves for the run stored in `history`.
plot_loss(history=history)

### Other optimization considerations

#### Exploring sentence length

In [None]:
# Sorted distribution of cleaned-headline lengths. Lengths are measured in
# words (not characters) so they are directly comparable with max_len, which
# limits the number of tokens per sequence.
ys = [len(item.split()) for item in sentences]
newys = sorted(ys)
xs = list(range(1, len(newys) + 1))

plt.plot(xs, newys)
plt.xlabel('sentence (sorted by length)')
plt.ylabel('words')
plt.show()

More than 25,000 of the sentences are shorter than 100 words, so we cap the sequence length (`max_len`) at 85.

In [None]:
# Embedding (vocab_size x 7) -> average pooling -> 8-unit ReLU with L2(0.01)
# kernel regularization -> sigmoid output.
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size, output_dim=7))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(
    keras.layers.Dense(
        units=8,
        activation='relu',
        kernel_regularizer=keras.regularizers.l2(0.01),
    )
)
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# Adam with learning rate 0.0001 (default = 0.001).
adam = keras.optimizers.Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs, holding out 33% of the training arrays for validation.
history = model.fit(
    training_padded,
    training_labels,
    epochs=100,
    validation_split=0.33,
)

In [None]:
# Accuracy curves for the run stored in `history`.
plot_accuracy(history=history)

In [None]:
# Loss curves for the run stored in `history`.
plot_loss(history=history)