# Text VAD Model

In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import tensorflow as tf
# import tensorflow_datasets as tfds

import itertools
import string
# import adjustText 
# import collections
# import re
import json

# from tqdm.auto import tqdm

# Notebook-wide settings.
# Relative folders holding the preprocessed inputs and the exported figures.
data_path, figure_path = "../Data", "../Figures"
# Figure-export switch; not read by any cell visible in this notebook.
save_figures = True

## Load preprocessed data

In [2]:
# Per-comment tables (emotion tags, VAD targets, sample weights) for each split.
reddit_train_pd, reddit_test_pd = (
    pd.read_parquet(f"{data_path}/reddit_train.parquet"),
    pd.read_parquet(f"{data_path}/reddit_test.parquet"),
)

# Tokenized comments for each split, stored as NumPy arrays.
reddit_train_tokens, reddit_test_tokens = (
    np.load(f"{data_path}/reddit_train_tokens.npy"),
    np.load(f"{data_path}/reddit_test_tokens.npy"),
)

In [3]:
# Peek at the first three training rows and their token ids.
display(reddit_train_pd.head(3))
print(reddit_train_tokens[:3])

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,comment_text,confusion,curiosity,desire,...,surprise,n_tags,valence_raw,arousal_raw,dominance_raw,valence,arousal,dominance,Weight_raw,Weight
0,False,False,False,False,False,False,"b""It's just wholesome content, from questionab...",False,False,False,...,False,1,0.469,0.184,0.357,0.469,0.184,0.357,0.059224,0.059224
1,True,False,False,False,False,False,b'This is actually awesome.',False,False,False,...,False,1,0.969,0.583,0.726,0.969,0.583,0.726,0.004345,0.004345
2,False,False,False,False,False,False,"b""People really spend more than $10 in an app ...",True,False,False,...,False,2,0.307,0.955,0.441,0.1535,0.4775,0.2205,0.028525,0.014263


[[2967 4781 3031 6206 1196 2273 4416 5224    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [5664 2955   62  397    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [4041 4505 5248 3590 5624 2820  198  251 2312 2757 3420  198   61 6051
  2312 2757  811 5921  777 5631 4781 3031 4785    0    0    0    0    0
     0    0]]


In [4]:
# Peek at the first three test rows and their token ids.
display(reddit_test_pd.head(3))
print(reddit_test_tokens[:3])

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,comment_text,confusion,curiosity,desire,...,surprise,n_tags,valence_raw,arousal_raw,dominance_raw,valence,arousal,dominance,Weight_raw,Weight
0,False,False,False,False,True,False,"b""You're right, thanks for pointing that out, ...",False,False,False,...,False,1,1.739,0.901,1.499,1.739,0.901,1.499,0.022136,0.022136
1,False,False,False,False,False,False,b'Molon labe!!!',False,False,False,...,False,1,0.469,0.184,0.357,0.469,0.184,0.357,0.059224,0.059224
2,False,False,False,False,False,False,b'So this is what edging feels like',False,False,False,...,False,2,0.469,0.184,0.357,0.2345,0.092,0.1785,0.059224,0.029612


[[6349 4483 4707 5629 2211 4174 5631 3914 2757 6026 1792 2967 3769  847
  3846    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [   1    1    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [5166 5664 2955 6183    1 2093 3230    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]


### Dictionary

In [5]:
# Token -> integer index lookup produced by the preprocessing step.
with open(f"{data_path}/token_dictionary.json") as readfile:
    word_to_index_dict = json.loads(readfile.read())

# Vocabulary size, followed by the first 30 (token, index) pairs.
print(len(word_to_index_dict))
print(list(itertools.islice(word_to_index_dict.items(), 30)))

6372
[('*PAD*', 0), ('*UNK*', 1), ('a', 2), ('aa', 3), ('ab', 4), ('abandoned', 5), ('abhorrent', 6), ('ability', 7), ('able', 8), ('abnormal', 9), ('abortion', 10), ('abortions', 11), ('about', 12), ('above', 13), ('abroad', 14), ('absolute', 15), ('absolutely', 16), ('absurd', 17), ('abuse', 18), ('abused', 19), ('abusing', 20), ('abusive', 21), ('ac', 22), ('accent', 23), ('accept', 24), ('acceptable', 25), ('accepted', 26), ('accepting', 27), ('access', 28), ('accident', 29)]


### VAD mapping

In [6]:
# Emotion labels, in the row order used by `emotion_vad_mapping` below.
emotion_columns = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "neutral", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise",
]

# One (valence, arousal, dominance) row per emotion label.
emotion_vad_mapping = np.array([
    [0.969, 0.583, 0.726],  # admiration
    [0.929, 0.837, 0.803],  # amusement
    [0.167, 0.865, 0.657],  # anger
    [0.167, 0.718, 0.342],  # annoyance
    [0.854, 0.46, 0.889],   # approval
    [0.635, 0.469, 0.5],    # caring
    [0.255, 0.667, 0.277],  # confusion
    [0.75, 0.755, 0.463],   # curiosity
    [0.896, 0.692, 0.647],  # desire
    [0.115, 0.49, 0.336],   # disappointment
    [0.085, 0.551, 0.367],  # disapproval
    [0.052, 0.775, 0.317],  # disgust
    [0.143, 0.685, 0.226],  # embarrassment
    [0.896, 0.684, 0.731],  # excitement
    [0.073, 0.84, 0.293],   # fear
    [0.885, 0.441, 0.61],   # gratitude
    [0.07, 0.64, 0.474],    # grief
    [0.98, 0.824, 0.794],   # joy
    [1.0, 0.519, 0.673],    # love
    [0.163, 0.915, 0.241],  # nervousness
    [0.469, 0.184, 0.357],  # neutral
    [0.949, 0.565, 0.814],  # optimism
    [0.729, 0.634, 0.848],  # pride
    [0.554, 0.51, 0.836],   # realization
    [0.844, 0.278, 0.481],  # relief
    [0.103, 0.673, 0.377],  # remorse
    [0.052, 0.288, 0.164],  # sadness
    [0.875, 0.875, 0.562],  # surprise
])

# Same table as a labelled frame: rows are emotions, columns are VAD axes.
emotion_vad_mapping_pd = pd.DataFrame(
    emotion_vad_mapping,
    index = emotion_columns,
    columns = ["valence", "arousal", "dominance"])
display(emotion_vad_mapping_pd.T)

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,...,love,nervousness,neutral,optimism,pride,realization,relief,remorse,sadness,surprise
valence,0.969,0.929,0.167,0.167,0.854,0.635,0.255,0.75,0.896,0.115,...,1.0,0.163,0.469,0.949,0.729,0.554,0.844,0.103,0.052,0.875
arousal,0.583,0.837,0.865,0.718,0.46,0.469,0.667,0.755,0.692,0.49,...,0.519,0.915,0.184,0.565,0.634,0.51,0.278,0.673,0.288,0.875
dominance,0.726,0.803,0.657,0.342,0.889,0.5,0.277,0.463,0.647,0.336,...,0.673,0.241,0.357,0.814,0.848,0.836,0.481,0.377,0.164,0.562


## Model

In [7]:
class Text_VAD(tf.keras.Model):
    """
    Regression model that maps a tokenized comment to a (valence, arousal,
    dominance) coordinate.

    The forward pass is a deliberately simple baseline: token embeddings are
    averaged over the non-padding positions and passed through dense layers.
    Swap `next_layers_*` for convolutional/recurrent layers to experiment.
    """

    def __init__(self, vocab_size, window_size):
        """
        :param vocab_size: number of entries in the token dictionary
        :param window_size: number of token positions per comment
        """
        super(Text_VAD, self).__init__()
        self.vocab_size = vocab_size # Number of distinct token ids
        self.window_size = window_size # Tokens per (padded) comment

        # Hyperparameters (starting values, not tuned)
        self.batch_size = 128
        self.optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
        self.embedding_size = 64
        self.hidden_size = 64

        initializer = tf.keras.initializers.TruncatedNormal(mean = 0.0, stddev = 0.01)

        # NOTE(review): the layer name looks like a leftover from another
        # project; it is kept unchanged so variable names stay stable.
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, 
                                                   self.embedding_size, 
                                                   embeddings_initializer = initializer,
                                                   name = "french_embedding")

        # Baseline head: two hidden dense layers and a linear 3-unit output.
        # The output is left linear because the targets are not guaranteed to
        # lie inside [0, 1].
        self.next_layers_1 = tf.keras.layers.Dense(self.hidden_size, activation = "relu")
        self.next_layers_2 = tf.keras.layers.Dense(self.hidden_size, activation = "relu")
        self.next_layers_3 = tf.keras.layers.Dense(3)

    @tf.function
    def call(self, text_input):
        """
        :param text_input: batched tokenized words of the reddit comments, [batch_size x window_size]
        :return VAD_coordinates: The 3d VAD coordinates as a tensor, [batch_size x 3]
        """

        # [batch_size x window_size x embedding_size]
        token_embeddings = self.embedding(text_input)

        # Token id 0 is the padding token in the dictionary; exclude those
        # positions from the average so comment length does not dilute it.
        is_token = tf.cast(tf.not_equal(text_input, 0), token_embeddings.dtype)
        is_token = tf.expand_dims(is_token, axis = -1)

        # Clamp the count to 1 so an all-padding row yields zeros, not NaN.
        token_count = tf.maximum(tf.reduce_sum(is_token, axis = 1), 1.0)
        comment_vector = tf.reduce_sum(token_embeddings*is_token, axis = 1)/token_count

        hidden = self.next_layers_1(comment_vector)
        hidden = self.next_layers_2(hidden)

        # VAD_coordinates =
        #     [[v_1, a_1, d_1],
        #      [v_2, a_2, d_2],
        #      ...
        #      [v_batch, a_batch, d_batch]]
        VAD_coordinates = self.next_layers_3(hidden)

        return VAD_coordinates


    def loss_function(self, VAD_true, VAD_pred, sample_weights):
        """
        Calculates the weighted average Euclidean distance between the true
        VAD coordinates and the predicted coordinates:

        loss = (weight_1*distance_1 + ... + weight_n*distance_n)/(n_batch)

        :param VAD_true: float tensor, [batch_size x 3]
        :param VAD_pred: float tensor, [batch_size x 3]
        :param sample_weights: float tensor, [batch_size] or [batch_size x 1]

        :return: the loss of the model as a scalar tensor
        """

        # Bring everything to the prediction dtype (targets loaded through
        # NumPy may have a different float width than the model output).
        VAD_pred = tf.convert_to_tensor(VAD_pred)
        VAD_true = tf.cast(VAD_true, VAD_pred.dtype)

        # Flatten so [batch_size] and [batch_size x 1] weights behave alike
        # instead of broadcasting into a [batch_size x batch_size] matrix.
        sample_weights = tf.reshape(tf.cast(sample_weights, VAD_pred.dtype), [-1])

        squared_distances = tf.reduce_sum(tf.square(VAD_true - VAD_pred), axis = -1)

        # The tiny offset keeps the gradient of sqrt finite at zero distance.
        distances = tf.sqrt(squared_distances + 1e-12)

        return tf.reduce_mean(sample_weights*distances)

## Customized train, test, accuracy functions

In [8]:
def train(model, text_input, VAD_true, sample_weights):
    """
    Runs through one epoch - all training examples that fill whole batches.

    The examples are shuffled, cut into batches of `model.batch_size`, and one
    optimizer step is taken per batch. Examples that do not fill the last
    batch are dropped for this epoch.

    :param model: the initialized model to use for forward and backward pass
    :param text_input: tokenized words of the reddit comments, [n_examples x window_size]
    :param VAD_true: float array/tensor, [n_examples x 3]
    :param sample_weights: float array/tensor with one weight per example, [n_examples]
    
    :return: None
    """

    total_size = len(text_input)
    batch_size = model.batch_size
    
    # drop the rest of the data that do not fit in batches
    input_size = total_size - (total_size%batch_size)

    if input_size == 0:
        # Not even one full batch: nothing to train on.
        print(f"training skipped: fewer than {batch_size} examples")
        return
    
    # Shuffle over ALL examples before truncating, so the examples dropped
    # to fit whole batches differ from epoch to epoch.
    shuffled_index = tf.random.shuffle(tf.range(total_size))[:input_size]
    shuffled_text_input = tf.gather(text_input, shuffled_index)
    shuffled_VAD_true = tf.gather(VAD_true, shuffled_index)
    # Weights must follow the same permutation as the examples they belong to.
    shuffled_sample_weights = tf.gather(sample_weights, shuffled_index)
    
    for train_index in range(0, input_size, batch_size): 
        batch_end = train_index + batch_size
        batch_text_input = shuffled_text_input[train_index:batch_end]
        batch_VAD_true = shuffled_VAD_true[train_index:batch_end]
        batch_sample_weights = shuffled_sample_weights[train_index:batch_end]

        with tf.GradientTape() as tape:
            # get the loss for this batch
            batch_VAD_pred = model(batch_text_input)
            batch_loss = model.loss_function(batch_VAD_true, 
                                             batch_VAD_pred, 
                                             batch_sample_weights)

        # Read the variables after the forward pass: layers create their
        # weights on first call, so the list is empty before that.
        trainable_vars = model.trainable_variables
        gradients = tape.gradient(batch_loss, trainable_vars)
        
        model.optimizer.apply_gradients(zip(gradients, trainable_vars))
        
        if train_index%(batch_size*200) == 0:
            print(f"training on batch {train_index}, "
                  f"progress {100*(train_index + batch_size)/(input_size):2.1f}%, "
                  f"loss {float(batch_loss):.4f}")
    
    print(f"training over\n"
          f"last batch {train_index}, "
          f"progress {100*(train_index + batch_size)/(input_size):2.1f}%, "
          f"loss {float(batch_loss):.4f}")    

In [9]:
def test(model, text_input, VAD_true):
    """
    Runs through one epoch - all training examples.

    :param model: the initialized model to use for forward and backward pass
    :param text_input: batched tokenized words of the reddit comments, [batch_size x window_size]
    :param VAD_true: float tensor, [batch_size x 3]
    
    :return: Loss
    """
    
    ## Do something similar to train

    return None

In [10]:
def accuracy(model, text_input, onehot_emotion_labels, emotion_vad_mapping_pd):
    """
    Measures how often the predicted VAD coordinate lands closest to one of
    the comment's true emotion labels.

    Predict the VAD coordinate for the input text
    Find the closest emotion label on the VAD space
    See if the model has chosen the correct emotion label

    A prediction counts as correct when the nearest emotion is among the
    labels tagged for that comment (a comment may carry several tags).

    :param model: the initialized model to use for the forward pass
    :param text_input: tokenized words of the reddit comments, [n_examples x window_size]
    :param onehot_emotion_labels: boolean array, [n_examples x n_emotions];
        column order must match the row order of emotion_vad_mapping_pd
    :param emotion_vad_mapping_pd: DataFrame with one row per emotion and the
        columns "valence", "arousal", "dominance"
    
    :return: average accuracy as a Python float in [0, 1] (NaN when there are no examples)
    """

    input_size = len(text_input)
    if input_size == 0:
        # The average over zero examples is undefined.
        return float("nan")

    # [n_emotions x 3]
    emotion_coordinates = emotion_vad_mapping_pd[["valence", "arousal", "dominance"]].to_numpy(dtype = float)
    onehot_emotion_labels = np.asarray(onehot_emotion_labels, dtype = bool)

    batch_size = model.batch_size
    n_correct = 0

    for batch_index in range(0, input_size, batch_size):
        batch_end = min(batch_index + batch_size, input_size)

        # [batch x 3]
        batch_VAD_pred = np.asarray(model(text_input[batch_index:batch_end]), dtype = float)

        # Broadcast [batch x 1 x 3] against [1 x n_emotions x 3]; squared
        # distances are enough because only the ordering matters for argmin.
        differences = batch_VAD_pred[:, np.newaxis, :] - emotion_coordinates[np.newaxis, :, :]
        squared_distances = np.sum(differences**2, axis = -1)
        closest_emotion = np.argmin(squared_distances, axis = 1)

        # Look up, per example, whether the chosen emotion is a true label.
        batch_labels = onehot_emotion_labels[batch_index:batch_end]
        is_correct = batch_labels[np.arange(batch_end - batch_index), closest_emotion]
        n_correct += int(np.sum(is_correct))

    return n_correct/input_size

## Train/test the model

### Prepare the input tensors

In [11]:
# Model inputs: the token-id arrays loaded earlier, one per split.
text_input_train, text_input_test = reddit_train_tokens, reddit_test_tokens

# Regression targets: the three VAD columns of each split, as NumPy arrays.
VAD_true_train = reddit_train_pd.loc[:, ["valence", "arousal", "dominance"]].to_numpy()
VAD_true_test = reddit_test_pd.loc[:, ["valence", "arousal", "dominance"]].to_numpy()

# Report the four shapes, separated by blank lines.
print(text_input_train.shape,
      text_input_test.shape,
      VAD_true_train.shape,
      VAD_true_test.shape,
      sep = "\n\n")

(43410, 30)

(5427, 30)

(43410, 3)

(5427, 3)


In [12]:
# Per-example loss weights taken from the "Weight" column of each split.
sample_weights_train, sample_weights_test = (
    reddit_train_pd["Weight"].to_numpy(),
    reddit_test_pd["Weight"].to_numpy(),
)
print(sample_weights_train.shape, sample_weights_test.shape, sep = "\n")

(43410,)
(5427,)


In [13]:
# Emotion tag indicators, with columns ordered like `emotion_columns`.
onehot_emotion_labels_train = reddit_train_pd.loc[:, emotion_columns].to_numpy()
onehot_emotion_labels_test = reddit_test_pd.loc[:, emotion_columns].to_numpy()

print(onehot_emotion_labels_train.shape,
      onehot_emotion_labels_test.shape,
      sep = "\n\n")

(43410, 28)

(5427, 28)


### Initialize the model and train

In [14]:
n_epochs = 3 ## how many epochs are needed???
vocab_size = len(word_to_index_dict) # one embedding row per dictionary entry
window_size = 30 # token positions per comment in the token arrays

# Pass the named constant rather than repeating the literal 30, so the model
# and the configuration above cannot drift apart.
model = Text_VAD(vocab_size, window_size)

# for each_epoch in range(n_epochs):
#     train(model, text_input_train, VAD_true_train, sample_weights_train)

In [15]:
# test_loss = test(model, text_input_test, VAD_true_test)
# test_accuracy = accuracy(model, text_input_test, onehot_emotion_labels_test, emotion_vad_mapping_pd)
# print(f"test loss: {test_loss}")
# print(f"test accuracy: {100*test_accuracy}%")