In [1]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Directory that holds the dataset; supplied through the environment.
DATA_DIRECTORY = os.getenv('DATA_DIRECTORY')
if DATA_DIRECTORY is None:
    # Without this guard os.path.join(None, ...) fails with an opaque TypeError.
    raise RuntimeError(
        "Environment variable 'DATA_DIRECTORY' is not set; "
        "define it in the environment or in a .env file."
    )
DATA_PATH = os.path.join(DATA_DIRECTORY, 'data.csv')

In [2]:
import csv
import numpy as np

# Read the CSV into two parallel lists: column 0 is the sentence text,
# column 1 is the label, which is parsed as an integer.
sentences = []
labels = []

# `with` guarantees the handle is closed even if a row fails to parse;
# newline='' is what the csv module expects so quoted fields that contain
# line breaks are read correctly.
with open(DATA_PATH, 'r', encoding='utf-8', newline='') as file:
    csvreader = csv.reader(file)
    next(csvreader)  # skip the header row
    for row in csvreader:
        if not row:
            # csv.reader yields [] for blank lines; nothing to index there.
            continue
        sentences.append(row[0])
        labels.append(int(row[1]))

features, targets = np.array(sentences), np.array(labels)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed; stratifying on the targets keeps
# the label proportions the same in both partitions.
(train_features, test_features,
 train_targets, test_targets) = train_test_split(
    features,
    targets,
    stratify=targets,
    shuffle=True,
    random_state=42,
    test_size=0.2,
    train_size=0.8,
)

print(
    f'training samples size: {len(train_features)}'
    f'\ntesting samples size: {len(test_features)}'
)

In [None]:
import tensorflow as tf

# Hyperparameters consumed by the vectorizer and the model defined below.
vocab_size = 50_000   # vocabulary budget (the vectorizer is capped at vocab_size + 2)
embedding_dim = 32    # width of each embedding vector
max_length = 200      # every sequence is padded/truncated to this many tokens

# Report how many GPUs TensorFlow can see.
print('Num GPUs Available: ', len(tf.config.list_physical_devices(device_type='GPU')))

In [5]:
# Text -> fixed-length integer-id sequences.
# The +2 appears intended to leave room for the reserved padding and
# out-of-vocabulary entries, which count toward max_tokens in 'int' mode.
vectorize_layer = tf.keras.layers.TextVectorization(
    output_sequence_length=max_length,
    output_mode='int',
    max_tokens=vocab_size + 2,
)

# Build the vocabulary from the training split only.
vectorize_layer.adapt(data=train_features)

In [None]:
# Size the embedding table from the adapted vectorizer instead of from
# vocab_size: the vectorizer is capped at vocab_size + 2 tokens, so it can
# emit ids up to vocab_size + 1, which would fall outside an Embedding whose
# input_dim is only vocab_size.
embedding_input_dim = vectorize_layer.vocabulary_size()

# Raw strings -> token ids -> embeddings -> BiLSTM -> max-pool over time
# -> two small dense layers -> single sigmoid output.
model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(embedding_input_dim, embedding_dim, input_length=max_length),
    tf.keras.layers.SpatialDropout1D(0.4),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(units=128, dropout=0.2, return_sequences=True)
    ),
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dense(units=32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): no Input layer is declared, so the model may still be unbuilt
# when summary() runs - confirm on a fresh kernel.
model.summary()

In [None]:
num_epochs = 25
batch_size = 64

# Resolve the export location before training so that a missing environment
# variable is reported immediately instead of after the whole fit has run.
version = 1
MODEL_DIRECTORY = os.getenv('MODEL_DIRECTORY')
if MODEL_DIRECTORY is None:
    raise RuntimeError(
        "Environment variable 'MODEL_DIRECTORY' is not set; "
        "define it in the environment or in a .env file."
    )
export_path = os.path.join(MODEL_DIRECTORY, 'sarcasm_model', str(version))

# Stop once val_accuracy has not improved for 3 epochs. restore_best_weights
# rolls the model back to its best epoch when stopping triggers; otherwise the
# exported weights would be those of the last (post-peak) epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=3, restore_best_weights=True
)
# NOTE(review): the test split is used as validation data here, so early
# stopping is tuned on the same samples that are later used to report
# accuracy - consider carving a separate validation split out of the
# training data.
history = model.fit(
    x=train_features, y=train_targets,
    epochs=num_epochs, batch_size=batch_size,
    validation_data=(test_features, test_targets),
    callbacks=[callback], verbose=1
)

model.save(export_path)

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot a training metric per epoch, plus its validation curve if recorded.

    Args:
        history: Object exposing a ``history`` dict that maps metric names to
            per-epoch value sequences (e.g. the return value of ``model.fit``).
        string: Name of the metric to plot, e.g. ``"accuracy"`` or ``"loss"``.

    Raises:
        KeyError: If ``string`` is not a key of ``history.history``.
    """
    metrics = history.history
    val_key = 'val_' + string

    plt.plot(metrics[string])
    legend_labels = [string]

    # The 'val_' entry may be absent (e.g. training ran without validation
    # data); plot it only when it exists instead of raising KeyError.
    if val_key in metrics:
        plt.plot(metrics[val_key])
        legend_labels.append(val_key)

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(legend_labels)
    plt.show()

# Training vs. validation curves for both tracked metrics.
plot_graphs(history=history, string="accuracy")
plot_graphs(history=history, string="loss")

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the held-out samples, then binarise: a sample is labelled 1 only
# when the predicted probability exceeds 0.8.
probs = model.predict(test_features)
pred = (probs > 0.8).astype(int)

cm = confusion_matrix(y_true=test_targets, y_pred=pred)
print(accuracy_score(y_true=test_targets, y_pred=pred))

In [None]:
import pandas as pd
import seaborn as sns

# Wrap the raw confusion matrix in a labelled frame under a new name, so `cm`
# keeps referring to the original array and this cell can be re-run safely.
class_names = ['Not Sarcastic', 'Sarcastic']
cm_df = pd.DataFrame(data=cm, index=class_names, columns=class_names)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    data=cm_df, cmap="Blues", linecolor='black',
    linewidths=1, annot=True, fmt='d', ax=ax,
)
# confusion_matrix was called as (test_targets, pred): rows are the true
# labels and columns are the predicted labels.
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix')
plt.show()