In [1]:
import os
from dotenv import load_dotenv

# Accumulators filled by the CSV-loading cell: one entry per dataset row.
sentences = []
labels = []

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Directory that holds data.csv; must be provided through the environment.
DATA_DIRECTORY = os.getenv('DATA_DIRECTORY')
if not DATA_DIRECTORY:
    # os.getenv returns None for an unset variable; fail here with a clear
    # message instead of a TypeError from string concatenation below.
    raise RuntimeError(
        "DATA_DIRECTORY is not set; define it in the environment or in a .env file"
    )
# os.path.join avoids a doubled separator when DATA_DIRECTORY ends with one.
DATA_PATH = os.path.join(DATA_DIRECTORY, 'data.csv')

In [3]:
import csv

# Load the dataset: column 0 is the sentence text, column 1 an integer label.
# Start from empty lists so that re-running this cell does not append the
# same rows a second time.
sentences.clear()
labels.clear()

# newline='' is what the csv module asks for so quoted fields containing line
# breaks are parsed correctly; the with-block closes the file even when a row
# fails to parse.
with open(DATA_PATH, 'r', encoding='utf-8', newline='') as csv_file:
    csvreader = csv.reader(csv_file)
    next(csvreader, None)  # discard the header row (tolerates an empty file)
    for row in csvreader:
        if not row:
            # csv.reader yields [] for blank lines; nothing to record.
            continue
        sentences.append(row[0])
        labels.append(int(row[1]))

# Fraction of the rows (taken from the front of the lists) used for training.
training_split = 0.8
training_size = int(training_split * len(labels))

# Positional split: the first `training_size` rows train, the remainder test.
# NOTE(review): no shuffling happens here, so this assumes the rows in the
# CSV are not ordered by label -- TODO confirm.
training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[:training_size]
testing_labels = labels[training_size:]

# Adjacent f-strings are concatenated verbatim, so the separator has to be
# part of the first literal; without it the two counts run together.
print(f'training samples size: {len(training_sentences)}, '
      f'testing samples size: {len(testing_sentences)}')

In [4]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# --- Tokenizer settings ---
vocab_size = 10_000   # passed to Tokenizer as num_words
oov_tok = "<OOV>"     # placeholder token for out-of-vocabulary words

# --- Sequence shaping (consumed by pad_sequences) ---
max_length = 200
padding_type = "post"
trunc_type = "post"

# --- Model settings ---
embedding_dim = 32    # output width of the Embedding layer

Num GPUs Available:  1


In [5]:
import pickle

def _pad(token_sequences):
    """Pad/truncate integer sequences to `max_length` using the notebook-wide settings."""
    return pad_sequences(
        token_sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )


# The vocabulary is fitted on the training sentences only; the testing
# sentences are encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = _pad(training_sequences)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = _pad(testing_sequences)

# Persist the fitted tokenizer together with the vocabulary size.
tokenizer_bundle = {'tokenizer': tokenizer, 'vocab_size': vocab_size}
with open('tokenizer_data.pkl', 'wb') as pickle_file:
    pickle.dump(tokenizer_bundle, pickle_file)
print('saved tokenizer_data')

In [6]:
import numpy as np

# Convert all four splits (padded inputs and labels) to NumPy arrays in one pass.
training_padded, training_labels, testing_padded, testing_labels = (
    np.array(split)
    for split in (training_padded, training_labels, testing_padded, testing_labels)
)

In [7]:
# Stop training once val_accuracy has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the best monitored epoch
# when the stop triggers; without it the model keeps the weights of the last
# (non-improving) epoch, and those are what get saved and evaluated afterwards.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

# Binary classifier over padded token-id sequences, assembled layer by layer:
# embedding -> bidirectional LSTM -> max-pool over time -> dense head -> sigmoid.
keras_layers = tf.keras.layers

model = tf.keras.Sequential()
model.add(keras_layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(keras_layers.SpatialDropout1D(0.4))
# return_sequences=True keeps the per-timestep outputs that the pooling layer reduces.
model.add(
    keras_layers.Bidirectional(
        keras_layers.LSTM(units=128, dropout=0.2, return_sequences=True)
    )
)
model.add(keras_layers.GlobalMaxPool1D())
model.add(keras_layers.Dense(32, activation='relu'))
model.add(keras_layers.Dropout(0.5))
model.add(keras_layers.Dense(16, activation='relu'))
model.add(keras_layers.Dropout(0.5))
# Single sigmoid unit, paired with the binary cross-entropy loss below.
model.add(keras_layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 32)           320000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 200, 32)          0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 200, 256)         164864    
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 32)                8224      
                                                        

In [8]:
# Upper bound on epochs (the early-stopping callback may end training sooner)
# and the mini-batch size.
num_epochs = 25
batch_size = 64

# NOTE(review): the testing arrays double as validation data here and are
# also what the later evaluation cell predicts on -- confirm this is intended.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(testing_padded, testing_labels),
    callbacks=[callback],
    verbose=1,
)

# Persist the trained model.
model.save('./saved_models/bidirectional_lstm_model3')

Epoch 1/25

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose ``.history`` mapping holds per-epoch value lists
            (such as the return value of ``model.fit``).
        string: metric key to plot, e.g. ``"accuracy"``; the validation curve
            is read from ``"val_" + string``.
    """
    val_key = 'val_' + string
    # Draw the training curve first, then validation, matching the legend order.
    for series_key in (string, val_key):
        plt.plot(history.history[series_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# One figure per tracked metric: accuracy first, then loss.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Score the testing inputs with the trained model.
probs = model.predict(testing_padded)
# Outputs strictly above 0.8 become class 1, everything else class 0.
# NOTE(review): 0.8 is a stricter cut-off than the usual 0.5 -- confirm it is intentional.
pred = (probs > 0.8).astype(int)
cm = confusion_matrix(testing_labels, pred)

In [None]:
import pandas as pd
import seaborn as sns

# Label both axes of the confusion matrix with the same class names, then draw it.
class_names = ['Not Sarcastic', 'Sarcastic']
cm = pd.DataFrame(cm, index=class_names, columns=class_names)

plt.figure(figsize=(10, 10))
sns.heatmap(
    cm,
    cmap="Blues",
    linecolor='black',
    linewidth=1,
    annot=True,
    fmt='',
    xticklabels=class_names,
    yticklabels=class_names,
)