### Dataset - sarcasm detection in news headlines 

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
 
# Resolve the notebook's working directory, then the folder one level above it.
cwd = os.getcwd()
_relative_parent = os.path.join(cwd, os.pardir)  # "<cwd>/.." before normalisation
parent_directory = os.path.abspath(_relative_parent)

# List the entries of the parent folder (last expression -> shown as cell output).
os.listdir(parent_directory)


2022-02-22 20:33:30.908908: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-22 20:33:30.908986: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


['README.md',
 'NLP LABS',
 'Document Summarisation Project',
 '.git',
 'NLP Fundamentals',
 'spacy_cheatsheets',
 'Topic Modelling',
 'data',
 'pretrained_word_embeddings.ipynb',
 'Stock price Prediction Using NewsHeadlines',
 '.ipynb_checkpoints']

In [2]:
import json  # used by the parsing cell that follows; kept here so the name stays in scope

# Location of the headlines dataset inside the parent directory's `data` folder.
filename = f'{parent_directory}/data/Sarcasm_Headlines_Dataset.json'

# Read the whole file as text. The encoding is given explicitly: JSON text is
# UTF-8, while the platform default encoding used by open() may be different.
with open(filename, 'r', encoding='utf-8') as f:
    data = f.read()

In [3]:
# Parse the raw text: every line is decoded as one JSON object from which the
# 'article_link', 'headline' and 'is_sarcastic' fields are collected.
urls = []
sentences = []
labels = []
skipped_lines = 0  # non-blank lines that could not be turned into a record

d = data.split('\n')
for line in d:
    if not line.strip():
        # Blank lines (e.g. the one after a trailing newline) hold no record.
        continue
    try:
        document = json.loads(line)
        # Read all three fields before appending anything, so a record with a
        # missing key can never leave the three lists with different lengths.
        url = document['article_link']
        headline = document['headline']
        label = document['is_sarcastic']
    except (json.JSONDecodeError, KeyError, TypeError):
        # Malformed JSON, a missing key, or a non-object value: skip the line,
        # but keep count instead of silently swallowing every exception.
        skipped_lines += 1
        continue
    urls.append(url)
    sentences.append(headline)
    labels.append(label)

if skipped_lines:
    print(f'Skipped {skipped_lines} malformed line(s) out of {len(d)}')

In [4]:
# Preview the first ten parsed headlines as a quick sanity check.
sentences[:10]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way',
 "advancing the world's women",
 'the fascinating case for eating lab-grown meat',
 'this ceo will send your kids to school, if you work for his company',
 'top snake handler leaves sinking huckabee campaign',
 "friday's morning email: inside trump's presser for the ages"]

In [8]:
# Hyperparameters shared by the tokenizer, the padding step and the models.
vocab_size = 10000      # Tokenizer(num_words=...) and Embedding input size
embedding_dim = 16      # output size of the Embedding layer in the baseline model
max_length = 100        # pad_sequences(maxlen=...)
trunc_type = 'post'     # pad_sequences(truncating=...)
padding_type = 'post'   # pad_sequences(padding=...)
oov_tok = "<OOV>"       # Tokenizer(oov_token=...)
training_size = 20000   # number of leading records used for training

# Positional split: the first `training_size` records train the model and the
# remaining records are held out for evaluation.
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

# Show the first training headline. (The original line read
# `lentraining_sentences[0]`, an undefined name that raises NameError.)
training_sentences[0]

"former versace store clerk sues over secret 'black code' for minority shoppers"

In [9]:
# Fit the vocabulary on the training headlines only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Return (sequences, padded) for `texts` using the fitted tokenizer.

    `sequences` is the output of `tokenizer.texts_to_sequences`; `padded` is
    that output run through `pad_sequences` with the notebook's shared
    `max_length`, `padding_type` and `trunc_type` settings.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return sequences, padded


training_sequences, training_padded = _encode_and_pad(training_sentences)
testing_sequences, testing_padded = _encode_and_pad(testing_sentences)

In [11]:
# Inspect the token-id sequence produced for the first training headline.
training_sequences[0]

[328, 1, 799, 3405, 2404, 47, 389, 2214, 1, 6, 2614, 8863]

In [10]:
# Convert features and labels to NumPy arrays (this step is needed to get the
# training calls to work with TensorFlow 2.x).
import numpy as np

training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

## Embedding for getting the meaning of a word 

In [11]:
# Baseline classifier: embed every token, average the embeddings over the
# sequence, then feed the result through a small dense layer to a sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [12]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train the baseline model; the held-out slice is passed as validation data so
# its metrics are reported after every epoch.
num_epochs = 10
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Testing examples 

In [18]:
# Two hand-written headlines to try the trained model on.
new_sentence = [
    "granny starting to fear  spiders in the garden might be real",
    "the weather today is bright and sunny",
]

# Encode them with the tokenizer that was fitted on the training headlines.
new_sequences = tokenizer.texts_to_sequences(new_sentence)

In [20]:
# Pad/truncate the new sequences with the same settings used for training data.
padded = pad_sequences(new_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [29]:
# Print each prediction in fixed-point notation rather than scientific notation.
predictions = model.predict(padded)
for row in predictions:
    print(f"{row[0]:f}")

0.876030
0.000000


In [22]:
# Check how the 'f' format spec renders a value written in scientific notation.
print(f"{8.7602985e-01:f}")

0.876030


## LSTM MODEL 

In [32]:
# Bidirectional LSTM classifier: embedding -> BiLSTM -> dense -> sigmoid unit.
lstm_model = tf.keras.Sequential([
    # Use the shared `vocab_size` instead of repeating the literal 10000, so the
    # embedding table always matches the tokenizer's `num_words` setting.
    tf.keras.layers.Embedding(vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [35]:
# Compile with the same loss/optimizer/metric settings as the baseline model,
# then print the layer summary.
lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
lstm_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          640000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 714,369
Trainable params: 714,369
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Train the LSTM model on the same data; note that `history` is reassigned
# here, replacing the value returned by the earlier baseline training call.
num_epochs = 10
history = lstm_model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Notes
In practical terms, you can use pretrained Word2vec embeddings as input features for any neural network (or other algorithm). They can give you an advantage when your dataset is small, since the pretrained embeddings were trained on large volumes of text.

On the other hand, there are examples showing that learning the embeddings from your own data, optimized for a particular problem, may be more efficient.


Embeddings are methods for learning vector representations of categorical data. They are most commonly used for working with textual data. Word2vec and GloVe are two popular frameworks for learning word embeddings.

With the Keras Embedding layer, you are using supervised learning: the embeddings are trained together with the rest of the model, so the representation learned for the independent variable (the text) maps directly to the dependent variable (the label).
Word2vec and GloVe, however, are unsupervised learning methods: the embeddings they learn depend only on the data you feed to the model, not on any task labels.