<a href="https://colab.research.google.com/github/waynew99/592-final-project-team6/blob/main/Flirting_Detection_LSTM_opt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports & Installations

In [None]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import pandas as pd
print(np.__version__)

In [None]:
! pip install datasets
! pip install codecarbon

# Load Dataset

In [None]:
# Dataset 1 (from hugging face)
from datasets import load_dataset, Dataset, concatenate_datasets, ClassLabel, Features
from codecarbon import EmissionsTracker
# Fetch the Hugging Face corpus and discard its 'id' column in one step.
dataset = load_dataset("ieuniversity/flirty_or_not").remove_columns('id')

print(dataset)

# Bind each of the three predefined splits to its own name.
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [None]:
# Dataset 2 (from local upload)
# Read the CSV, keep one row per distinct text, and discard rows that have
# no label, then wrap the cleaned frame as a Hugging Face Dataset.
data = (
    pd.read_csv('flirting_rated.csv')
    .drop_duplicates(subset=['texts'])
    .dropna(subset=['label'])
)
dataset_2 = Dataset.from_pandas(data)
print(len(dataset_2))

In [None]:
# Split dataset 2 into 80% train / 10% validation / 10% test.
# Every train_test_split call shuffles on its own, so all three subsets must
# come from ONE train/holdout partition. Calling it afresh on dataset_2 for
# each subset lets the same row end up in train and in validation/test.
SPLIT_SEED = 42  # fixed so the partition is reproducible across runs

train_holdout = dataset_2.train_test_split(test_size=0.2, seed=SPLIT_SEED)
valid_test = train_holdout['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

split_train_dataset = train_holdout['train']
split_valid_dataset = valid_test['train']
split_test_dataset = valid_test['test']

print(len(split_train_dataset))
print(len(split_valid_dataset))
print(len(split_test_dataset))

# Schema shared by the rebuilt splits: 'label' becomes a two-class
# ClassLabel, 'texts' keeps whatever type the train split already has.
class_label = ClassLabel(names=['neutral', 'flirty'])
features = Features({
    'label': class_label,
    'texts': split_train_dataset.features['texts'],
})


def _rebuild_with_features(split):
    """Return a new Dataset holding only split's 'label' and 'texts' columns, typed by `features`."""
    columns = {
        'label': split['label'],
        'texts': split['texts'],
    }
    return Dataset.from_dict(columns, features=features)


split_train_dataset = _rebuild_with_features(split_train_dataset)
split_valid_dataset = _rebuild_with_features(split_valid_dataset)
split_test_dataset = _rebuild_with_features(split_test_dataset)

# Show both schemas side by side (rebuilt dataset 2 vs. Hugging Face train split).
print(split_train_dataset.features)
print(train_dataset.features)



In [None]:
# The second dataset is NOT balanced: Majority is neutral (0)
# Print the share of non-neutral (label != 0) rows across the three dataset-2
# splits. The denominator is derived from the splits themselves so it stays
# correct when the CSV or the cleaning steps change.
dataset_2_splits = (split_train_dataset, split_valid_dataset, split_test_dataset)

total_flirty = sum(
    1
    for split in dataset_2_splits
    for label in split['label']
    if label != 0
)
total_rows = sum(len(split) for split in dataset_2_splits)
print(total_flirty / total_rows)

In [None]:
# Merge each dataset-2 split with the matching Hugging Face split.
# The order inside every pair is the order of rows in the merged dataset.
train_dataset_final, validation_dataset_final, test_dataset_final = (
    concatenate_datasets(list(parts))
    for parts in (
        (split_train_dataset, train_dataset),
        (validation_dataset, split_valid_dataset),
        (test_dataset, split_test_dataset),
    )
)

In [None]:
# Row counts of the three merged splits.
TRAIN_SIZE, VALIDATION_SIZE, TEST_SIZE = map(
    len, (train_dataset_final, validation_dataset_final, test_dataset_final)
)
print("Train: ", TRAIN_SIZE, "Validation: ", VALIDATION_SIZE, "Test: ", TEST_SIZE)

In [None]:
# Average number of whitespace-separated words per text, over all three
# merged splits. The sample count is tallied alongside the word count so the
# mean is not tied to a hard-coded dataset size.
total_length = 0
total_samples = 0

for split in (train_dataset_final, validation_dataset_final, test_dataset_final):
    for sample in split['texts']:
        total_length += len(sample.split())
        total_samples += 1

# Compute the average length
average_length = total_length / total_samples

print("Average length of each data instance:", average_length)

In [None]:
# Print the mean label of each merged split, i.e. the fraction of flirty
# examples when labels are 0/1.
total_flirty = sum(train_dataset_final['label'])
print(total_flirty / TRAIN_SIZE)

total_flirty = sum(validation_dataset_final['label'])
print(total_flirty / VALIDATION_SIZE)

total_flirty = sum(test_dataset_final['label'])
# Normalise by the test-set size; dividing by VALIDATION_SIZE here would
# misreport the ratio whenever the two splits differ in length.
print(total_flirty / TEST_SIZE)

# Preprocess Dataset: Converting Text Data into Vectors/Numbers
Models cannot directly process raw text, so we need to convert the text into numbers using TensorFlow's `TextVectorization` layer. Specifically, we can perform the following 3 operations all at once, by constructing this layer and feeding the data into it.

* Standardization: preprocessing the text, typically to change all text to lowercase and remove punctuation to simplify the dataset.
* Tokenization: dividing text into individual words called tokens.
* Vectorization: converting tokens into numbers so they can be fed into a neural network.

Constructing the layer is very easy (we simply call the function), but there are some hyperparameters we need to determine first.

## TextVectorization
First, *maximum vocabulary size* and *maximum sequence length* are 2 hyperparameters defined based on the nature of the dataset and the memory constraints of the machine.

### Maximum Vocabulary Size (`max_tokens`)

This parameter determines the number of unique words that are considered when vectorizing texts. To decide this:

*   Tokenize your dataset to find out the total number of unique tokens it contains.
*   Evaluate your hardware's memory constraints since a larger vocabulary will require more memory.

Typically, a number between 10,000 and 100,000 works well for many tasks, but if your dataset is very specialized, a smaller number might be enough. Also, it's often best to choose a number that is slightly above the number of unique tokens actually observed in your data to account for uncertainties. However, if the vocabulary size is significantly higher than the actual number of unique tokens, it can lead to increased memory overhead, slower training speed, and most importantly, the possibility of overfitting.

In [None]:
# Pull the training sentences and their labels out as plain Python lists
texts = list(train_dataset_final['texts'])
labels = list(train_dataset_final['label'])

In [None]:
# Estimate the vocabulary size from the training texts by counting distinct
# whitespace-separated tokens (plain str.split, no standardization applied)
from collections import Counter

# Gather every token of every training sentence into one flat list
all_words = []
for sentence in texts:
    all_words.extend(sentence.split())

# Frequency table keyed by token; its length is the number of distinct tokens
word_count = Counter(all_words)
print(f"Total unique tokens in the raw text: {len(word_count)}")

# Author's note: roughly 4000 unique tokens were observed, hence a
# vocabulary budget in the 4000-5000 range
MAX_FEATURES = 5000  # Size of the vocabulary

### Maximum Sequence Length (`output_sequence_length`)

This parameter controls the maximum number of tokens that will be considered in each example. To find a reasonable maximum sequence length:

- Calculate the length of each text example in your dataset.
- Plot a histogram of these lengths to get a sense of the distribution.
- Use this information to decide on a length that covers most of your texts without being excessively long.

The `output_sequence_length` parameter is used to pad or truncate sequences to this maximum length. If a text is shorter than this, it will be padded with zeros, and if it's longer, it will be truncated.

In [None]:
# Find a reasonable sequence length
import matplotlib.pyplot as plt

# Number of whitespace-separated words in each training text
text_lengths = [len(text.split()) for text in texts]

# Plot a histogram of the lengths
plt.hist(text_lengths, bins=50)
plt.xlabel('Length of Texts')
plt.ylabel('Frequency')
plt.show()

# 5 number summary showing distribution of lengths
quartiles = np.percentile(text_lengths, [25, 50, 75])
len_min, len_max = min(text_lengths), max(text_lengths)
print('Min: %.3f' % len_min)
print('Q1: %.3f' % quartiles[0])
print('Median: %.3f' % quartiles[1])
print('Q3: %.3f' % quartiles[2])
print('Max: %.3f' % len_max)

# The 95th percentile is a data-driven starting point. It is reported under
# its own name so it is not silently overwritten by the value used below.
suggested_length = int(np.percentile(text_lengths, 95))
print(f"95th-percentile length: {suggested_length}")

# Length of the input sequences actually used by the model (hand-picked).
# NOTE(review): presumably chosen after inspecting the summary above — confirm.
SEQUENCE_LENGTH = 24
print(SEQUENCE_LENGTH)

### Output Mode and Standardization

These are 2 more hyperparameters for the `TextVectorization` layer.

First, for output mode, we are setting it to `int`. As a result, the layer converts the tokens into integer indices. Each unique token is assigned a specific integer value, and texts are converted into sequences of these integers. In many deep learning models for text based on neural networks, `int` is the preferred option because it works very well with Embedding Layers, which are efficient and powerful mechanisms for handling sequences of tokens.

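Next, for standardization, we use a custom function that (1) changes the text to lowercase, (2) deletes a small set of punctuation marks that carry little meaning here (commas, quotes, brackets, and symbols such as `&`, `%`, `$`, `#`, `@`), and (3) puts spaces around the remaining ASCII punctuation (e.g. `!`, `?`, `.`, `:`, `(`, `)`) so that each mark becomes its own token. We are *not* stripping all punctuation, and we leave non-ASCII special characters (e.g. emojis) untouched, because such characters may actually contribute to the meaning of the text, so removing them may reduce context.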

As a side note, we do not need to define a custom `split` hyperparameter because splitting by whitespace (the default) is sufficient for our case.

In [None]:
import re
import string
import keras

# @keras.saving.register_keras_serializable()
# def custom_standardization(text):
#   # Make text lowercase
#   text = tf.strings.lower(text)
#   # Only remove common punctuation
#   # common_punct = r'[,.?!]'
#   common_punct = r"[,.?!\'\"\<\>\{\}\[\]\^\&\\\%\$\#\@\|]"
#   text = tf.strings.regex_replace(text, common_punct, '')
#   return text

# Standardization callable handed to the TextVectorization layer. It is
# registered with Keras so the name can be resolved when a saved model is loaded.
@keras.saving.register_keras_serializable()
def custom_standardization(text):
    """Lowercase text, drop low-value punctuation, and isolate the rest as tokens.

    Args:
        text: String tensor as accepted by the ``tf.strings`` ops.

    Returns:
        String tensor of the same shape in which every remaining ASCII
        punctuation character is surrounded by single spaces, so that
        whitespace splitting yields it as a separate token.
    """
    # Make text lowercase
    text = tf.strings.lower(text)
    # Remove insignificant punctuation from the text
    common_punct = r"[,\'\"\<\>\{\}\[\]\^\&\\\%\$\#\@\|]"
    text = tf.strings.regex_replace(text, common_punct, '')
    # Surround each punctuation character that survived the step above with
    # spaces. One character-class pass gives the same result as a separate
    # replace for every character of string.punctuation, without adding
    # graph ops for characters that have already been deleted.
    kept_punct = r"([!()*+\-./:;=?_`~])"
    text = tf.strings.regex_replace(text, kept_punct, r" \1 ")
    # Remove extra spaces
    text = tf.strings.strip(text)
    text = tf.strings.regex_replace(text, ' +', ' ')
    return text

In [None]:
# Build the layer that standardizes, tokenizes and integer-encodes raw text
vectorizer_options = {
    # vocabulary cap
    'max_tokens': MAX_FEATURES,
    # lowercases, drops some punctuation and spaces out the rest
    'standardize': custom_standardization,
    # emit integer token ids (the form an Embedding layer consumes)
    'output_mode': 'int',
    # pad / truncate every example to this many tokens
    'output_sequence_length': SEQUENCE_LENGTH,
}
vectorize_layer = tf.keras.layers.TextVectorization(**vectorizer_options)

In [None]:
# Let the layer learn its vocabulary from the training texts
vectorize_layer.adapt(texts)
# Notebook output: number of entries in the learned vocabulary
learned_vocabulary = vectorize_layer.get_vocabulary()
len(learned_vocabulary)

In [None]:
# # Let's prepare a dataset to illustrate the functionality
# example_dataset = tf.data.Dataset.from_tensor_slices(["Hello!", "Hello !", "It's a? test 😉?", "It's a test :) :("])
# vectorize_layer.adapt(example_dataset)

# print(vectorize_layer.get_vocabulary())

# # Example of how the layer will preprocess the data
# for example in example_dataset:
#     print("Original:", example.numpy())
#     vectorized_text = vectorize_layer(example)
#     print("Vectorized:", vectorized_text.numpy())

## Convert to Tensorflow Dataset (for training)

Now, we can convert the raw text data into a form that a TensorFlow machine learning model can understand (i.e., numeric tensors) using the TextVectorization layer, preparing it in batches to be fed into the model for training or inference.

We can either (1) do the conversion as part of data preprocessing, or (2) leave it to the LSTM model itself. Option (2) generalizes better if we want to deploy and test the model since it can automatically convert text into numbers without any manual preprocessing, so we are going with this option.

In [None]:
# another hyperparameter
BATCH_SIZE = 32

# Raw training strings, one element per text, grouped into batches.
# [Option 1] would map vectorize_layer over this dataset here; with
# [Option 2] the model itself performs the vectorization.
text_ds = tf.data.Dataset.from_tensor_slices(texts)
text_ds = text_ds.batch(BATCH_SIZE)

# Matching labels as int32, batched the same way so the two stay aligned
train_label_tensor = tf.cast(labels, tf.int32)
label_ds = tf.data.Dataset.from_tensor_slices(train_label_tensor).batch(BATCH_SIZE)

In [None]:
# Shuffle buffer covers the whole training set, giving a full uniform shuffle
shuffle_buffer_size = TRAIN_SIZE # the number of data in the training set
# Build the training pipeline from individual (text, label) examples.
# Order matters:
#   cache   -> first, so only the raw examples are stored; a cache placed
#              after shuffle would replay the first epoch's order forever
#   shuffle -> on single examples, re-drawn each epoch
#   batch   -> after shuffling, so batch membership changes between epochs
#              (shuffling already-batched data only reorders whole batches)
train_ds = (
    tf.data.Dataset.from_tensor_slices((texts, tf.cast(labels, tf.int32)))
    .cache()
    .shuffle(buffer_size=shuffle_buffer_size)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

### Convert Validation Set and Test Set as well

In [None]:
# Validation texts and labels as plain lists
val_texts = list(validation_dataset_final['texts'])
val_labels = list(validation_dataset_final['label'])

# Batched text and int32 label datasets, kept aligned by identical batching
val_text_ds = tf.data.Dataset.from_tensor_slices(val_texts).batch(BATCH_SIZE)
val_label_tensor = tf.cast(val_labels, tf.int32)
val_label_ds = tf.data.Dataset.from_tensor_slices(val_label_tensor).batch(BATCH_SIZE)

# Pair them up; evaluation data is deliberately left unshuffled
val_ds = (
    tf.data.Dataset.zip((val_text_ds, val_label_ds))
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Test texts and labels as plain lists
test_texts = list(test_dataset_final['texts'])
test_labels = list(test_dataset_final['label'])

# Batched text and int32 label datasets, kept aligned by identical batching
test_text_ds = tf.data.Dataset.from_tensor_slices(test_texts).batch(BATCH_SIZE)
test_label_tensor = tf.cast(test_labels, tf.int32)
test_label_ds = tf.data.Dataset.from_tensor_slices(test_label_tensor).batch(BATCH_SIZE)

# Pair them up in their original order (no shuffling)
test_ds = (
    tf.data.Dataset.zip((test_text_ds, test_label_ds))
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# sanity check
# for example, label in val_ds.take(1):
#   print('texts: ', example.numpy())
#   print()
#   print('labels: ', label.numpy())
# exact_sample_count = sum(1 for _ in train_ds.unbatch())  # This can be slow for large datasets
# print("Total number of samples in train_ds:", exact_sample_count)

# Build LSTM Model

In [None]:
from tensorflow.keras import layers
from tensorflow.keras import regularizers

# More Hyperparameters
EMBEDDING_DIM = 128    # width of each token embedding vector
LSTM_UNITS = 16        # units per LSTM direction (doubled by Bidirectional)
DROPOUT_RATE = 0.25    # shared by spatial, input and recurrent dropout
REGULAR_RATE = 0.001   # L2 strength; not used by any active layer below


def _bidirectional_lstm(return_sequences=False):
    """Return a Bidirectional LSTM block using the shared unit count and dropout rates."""
    return layers.Bidirectional(layers.LSTM(
        LSTM_UNITS,
        dropout=DROPOUT_RATE,
        recurrent_dropout=DROPOUT_RATE,
        return_sequences=return_sequences))


model = tf.keras.Sequential()
# [Option 2] the model consumes raw strings and vectorizes them itself
model.add(vectorize_layer)
# TODO: can switch the embedding to be a pre-trained model, e.g. word2vec
model.add(layers.Embedding(
    input_dim=MAX_FEATURES + 1,
    output_dim=EMBEDDING_DIM,
    input_length=SEQUENCE_LENGTH,
    mask_zero=True))
# Regularization against overfitting
model.add(layers.SpatialDropout1D(DROPOUT_RATE))
# Two stacked recurrent blocks: the first emits the full sequence for the
# second to consume, the second emits only its final output
model.add(_bidirectional_lstm(return_sequences=True))
model.add(_bidirectional_lstm())
# Single sigmoid unit for the binary (neutral vs. flirty) decision
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Echo the main data hyperparameters, one per line
print(MAX_FEATURES, SEQUENCE_LENGTH, BATCH_SIZE, sep='\n')

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()

# Training

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import ModelCheckpoint

# All three callbacks watch validation accuracy, where higher is better, so
# the direction is stated explicitly rather than inferred from the metric name.

# Define the EarlyStopping callback: stop after 3 epochs without improvement
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=3,
    verbose=1,
    restore_best_weights=True  # Restores model weights from the epoch with the best (highest) validation accuracy.
)

# Save model checkpoints
# NOTE(review): model_checkpoint is not in the callbacks list passed to
# model.fit in this notebook, so no checkpoint file is written — confirm
# whether that is intended.
checkpoint_filepath = 'training/best_model.h5'
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Reduce learning rate on plateau
# The patience must be shorter than early_stopping's: with equal patience
# both callbacks trigger after the same epoch and training ends before the
# lowered learning rate is ever used.
reduce_lr = ReduceLROnPlateau(
    monitor='val_accuracy',
    mode='max',
    factor=0.1,
    patience=2,
    verbose=1
)

In [None]:
# Even More Hyperparameters
EPOCHS = 10  # upper bound on the number of training epochs

# Train on train_ds, scoring on val_ds after every epoch
training_callbacks = [early_stopping, reduce_lr]
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=training_callbacks,
)

In [None]:
# Score the trained model on the held-out test set; evaluate returns the
# loss followed by the compiled metric (accuracy)
test_metrics = model.evaluate(test_ds)
test_loss, test_acc = test_metrics

print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)

In [None]:
# Predict one hand-written sentence while codecarbon measures the call
sample_text = 'Lets explore each others body'

tracker = EmissionsTracker()
tracker.start()
sample_batch = np.array([sample_text])  # the model expects a batch of strings
predictions = model.predict(sample_batch)
emissions: float = tracker.stop()

print(f"Emissions: {emissions} kg")
# Notebook output: the predicted probability for the sample
predictions

# Analysis

## Training Plots

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metric values recorded on the History object returned by model.fit
history_dict = history.history
# Notebook output: names of the recorded metrics
history_dict.keys()

In [None]:
# Training / validation curves, one value per completed epoch
loss, val_loss = history_dict['loss'], history_dict['val_loss']
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
# 1-based epoch numbers for the x axis
epochs = range(1, len(acc) + 1)

In [None]:
# 'bo-' draws blue circle markers joined by a solid line
plt.plot(epochs, loss, 'bo-', label='Training loss')
# 'ro-' is the same in red
plt.plot(epochs, val_loss, 'ro-', label='Validation loss')
# Early stopping can end training before EPOCHS is reached, so the title
# reports the number of epochs that actually ran.
plt.title(f'Training and Validation Loss over {len(epochs)} Epochs of LSTM Model')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Blue = training accuracy, red = validation accuracy (markers joined by lines)
plt.plot(epochs, acc, 'bo-', label='Training acc')
plt.plot(epochs, val_acc, 'ro-', label='Validation acc')
# Early stopping can end training before EPOCHS is reached, so the title
# reports the number of epochs that actually ran.
plt.title(f'Training and Validation Accuracy over {len(epochs)} Epochs of LSTM Model')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted probabilities for every test example
y_pred = model.predict(test_ds)
# Threshold at 0.5: anything above counts as positive (flirty)
y_pred_binary = (y_pred > 0.5).astype("int32")
# Ground-truth labels, gathered batch by batch and joined into one array
label_batches = [batch_labels for _, batch_labels in test_ds]
y_true = np.concatenate(label_batches, axis=0)

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_true, y_pred_binary)

In [None]:
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap and label it through
# the Axes object that seaborn returns
heatmap_axes = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_axes.set_xlabel('Predicted Label')
heatmap_axes.set_ylabel('True Label')
heatmap_axes.set_title("Confusion Matrix for LSTM Model's Prediction on Test Set")
plt.show()

## See Wrong Predictions

In [None]:
# Thresholded predictions as a flat 0/1 vector
y_pred1 = (y_pred > 0.5).astype("int32").flatten()
# Materialize the test batches once and read both components from them
test_batches = list(test_ds)
# True labels, flattened to line up with y_pred1
y_true1 = np.concatenate([batch_labels.numpy() for _, batch_labels in test_batches]).flatten()
# Raw input texts in the same order
x_test1 = np.concatenate([batch_inputs.numpy() for batch_inputs, _ in test_batches]).flatten()

In [None]:
# Positions where the model said flirty (1) but the label is neutral (0)
false_positives = np.flatnonzero((y_pred1 == 1) & (y_true1 == 0))
# Positions where the model said neutral (0) but the label is flirty (1)
false_negatives = np.flatnonzero((y_pred1 == 0) & (y_true1 == 1))
# Input texts behind each kind of mistake, in index order
fp_texts = list(x_test1[false_positives])
fn_texts = list(x_test1[false_negatives])

In [None]:
# Print every misclassified test example together with the model's raw
# prediction and the true label.
def _readable(raw_text):
    """Return raw_text as str, decoding it from UTF-8 when it is a bytes object."""
    if isinstance(raw_text, bytes):
        # Texts read back from string tensors via .numpy() arrive as bytes,
        # which would otherwise print as b'...'
        return raw_text.decode('utf-8', errors='replace')
    return raw_text


print("False Positives:")
for fp_text, fp_idx in zip(fp_texts, false_positives):  # all false positives
    print(f"Text: {_readable(fp_text)} - Predicted: {y_pred[fp_idx]}, Actual: {y_true[fp_idx]}")
print("\n")
print("False Negatives:")
for fn_text, fn_idx in zip(fn_texts, false_negatives):  # all false negatives
    print(f"Text: {_readable(fn_text)} - Predicted: {y_pred[fn_idx]}, Actual: {y_true[fn_idx]}")

# Save the Model

In [None]:
# Print the installed Keras and TensorFlow versions next to the saved model
import keras
print(keras.__version__)

import tensorflow
print(tensorflow.__version__)

In [None]:
# Write the trained model to the file LSTM-Flirt.keras
model.save('LSTM-Flirt.keras')

In [None]:
# NOTE: this rebinds the name `keras` (previously the standalone package)
# to tensorflow's keras module
from tensorflow import keras

# Load the model back from the LSTM-Flirt.keras file
new_model = keras.models.load_model('LSTM-Flirt.keras')

# Show the model architecture
new_model.summary()

In [None]:
print(train_dataset_final)

# Inference benchmark: run N_RUNS single-text predictions on the reloaded
# model while codecarbon measures the whole loop.
N_RUNS = 4000
# Fetch the column once; indexing train_dataset_final['texts'] inside the
# loop would rebuild the whole column on every iteration.
benchmark_texts = train_dataset_final['texts']

tracker = EmissionsTracker()
tracker.start()
try:
    for i in range(N_RUNS):
        # Cycle through the training texts so each run sees a real, varying input
        text = benchmark_texts[i % len(benchmark_texts)]
        # verbose=0 keeps thousands of progress bars out of the output
        predictions = new_model.predict(np.array([text]), verbose=0)
finally:
    # Always stop the tracker, even if a prediction fails
    emissions: float = tracker.stop()
print(f"Emissions for {N_RUNS} runs: ", emissions)
# Notebook output: prediction of the last run
predictions

In [None]:
# Read emission figures back from the tracker used in the benchmark loop.
# NOTE(review): confirm that EmissionsTracker exposes get_emissions() and
# total_emissions in the installed codecarbon version — TODO verify
emissions_data = tracker.get_emissions()
print(tracker.total_emissions)

In [None]:
# Evaluate the reloaded model on the test set; the result can be compared
# with the earlier model.evaluate(test_ds) output
new_model.evaluate(test_ds)