# Imports

In [31]:
import keras
import datasets
import numpy as np
import transformers
import datetime
import os
import sklearn.metrics
import tensorflow as tf
import tqdm.notebook as tqdm
import tensorflow_datasets as tfds
import sklearn.model_selection
import matplotlib.pyplot as plt
from typing import List
import pandas as pd
from pathlib import Path  

In [2]:
# Ask TensorFlow to allocate GPU memory on demand for the first visible GPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except (RuntimeError, ValueError) as error:
        # Best effort: keep going with the default allocation strategy, but
        # report the reason instead of hiding it.
        print(f'Could not enable GPU memory growth: {error}')

I0000 00:00:1730980886.120714  227453 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1730980886.169429  227453 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1730980886.169494  227453 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.


# Global params

In [45]:
# Presumably a shuffle-buffer size; not referenced in the visible cells.
BUFFER_SIZE = 10_000
# Number of examples per batch of the tf.data pipelines.
BATCH_SIZE = 64
# Presumably the number of output labels; not referenced in the visible cells.
OUTPUT = 28
# Number of passes over the training data for every model.
EPOCHS = 20

In [4]:
# Seeding NumPy alone leaves the Python and Keras/TensorFlow random generators
# unseeded; this single call seeds Python's `random`, NumPy and the backend.
keras.utils.set_random_seed(42)

# Dataset

Load the dataset (we will be using [go_emotions](https://huggingface.co/datasets/google-research-datasets/go_emotions)). Pretokenize the data or make a loader that tokenizes the sentences as you iterate through the dataset. Implement two datasets: variable and fixed sentence length (in tokens). Don't forget to split the dataset into train and test subsets.

In [5]:
# Load the 'train' split of the 'raw' configuration of go_emotions.
dataset = datasets.load_dataset(
    'google-research-datasets/go_emotions',
    name='raw',
    split='train',
)

In [6]:
# Display the column names of the loaded dataset.
dataset.column_names

['text',
 'id',
 'author',
 'subreddit',
 'link_id',
 'parent_id',
 'created_utc',
 'rater_id',
 'example_very_unclear',
 'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [6]:
# Names of the label columns, in the order used for the target vector.
emotions = '''
    admiration amusement anger annoyance approval caring confusion curiosity
    desire disappointment disapproval disgust embarrassment excitement fear
    gratitude grief joy love nervousness optimism pride realization relief
    remorse sadness surprise neutral
'''.split()

In [7]:
# Pretrained GPT-2 tokenizer used to turn the texts into token ids.
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token; the fixed-length
# tokenization further down pads with it.
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def get_target(item):
    '''Adds the multi-label target vector to a dataset row.

    Arguments:
        item: mapping that holds one value per name in `emotions`

    Returns:
        The same `item`, with the values of the emotion columns collected
        (in the order of `emotions`) under the key 'lables' (sic)'''
    target = []
    for emotion in emotions:
        target.append(item[emotion])

    item['lables'] = target
    return item

In [9]:
# Train / test split

def get_train_test(
    dataset: datasets.arrow_dataset.Dataset,
    test_size: float = 0.1,
    max_length: int = 64,
):
    '''Splits `dataset` and converts both parts into batched `tf.data` pipelines.

    Every split is tokenized to a fixed length (padded / truncated to
    `max_length` tokens), extended with the target vector built by `get_target`
    and converted with `to_tf_dataset`. Only the train split is shuffled.

    Arguments:
        dataset: dataset with a 'text' column and one column per emotion
        test_size: fraction of the rows that goes into the test split
        max_length: number of tokens every text is padded / truncated to

    Returns:
        Tuple `(train, test)` of tf datasets yielding `(input_ids, labels)` batches'''
    dataset_splited = dataset.train_test_split(test_size=test_size)

    def tokenize(batch):
        # Fixed-length tokenization of a batch of texts.
        return tokenizer(batch['text'], padding='max_length', max_length=max_length, truncation=True)

    def get_process_data(split_label: str):
        process_data = dataset_splited[split_label].map(tokenize, batched=True)
        process_data = process_data.map(get_target)

        # Single column names are passed as plain strings: single-element lists
        # trigger the `to_tf_dataset` warning that the output will change from
        # tensors to dicts. 'lables' (sic) is the key written by `get_target`.
        return process_data.to_tf_dataset(
            columns='input_ids',
            label_cols='lables',
            batch_size=BATCH_SIZE,
            shuffle=(split_label == 'train'),
        )

    # Build both pipelines eagerly instead of returning a lazy `map` object.
    return get_process_data('train'), get_process_data('test')

In [10]:
# Build the train and test pipelines from the loaded dataset.
train, test = get_train_test(dataset)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
I0000 00:00:1730980913.372793  227453 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1730980913.372884  227453 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1730980913.372927  227453 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1730980913.550533  227453 cuda_executor.cc:1001] could not open file to read NUMA

In [11]:
# Print the first example of the first test batch: inputs, then labels.
for batch_number, batch in enumerate(test.as_numpy_iterator()):
    if batch_number >= 1:
        break
    batch_inputs, batch_labels = batch[0], batch[1]
    print(batch_inputs[0])
    print(batch_labels[0])

[14253   510   477   465  3404   290  1234   340  2354   262  3420 50256
 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256
 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256
 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256
 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256 50256
 50256 50256 50256 50256]
[0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


# Model

Implement your model. The model should have the RNN architecture (with LSTM or GRU cells), support stacking and bidirectional feature extraction.

In [36]:
def get_name(prefix: str | None = None, suffix: str | None = None, separator: str = '_') -> str | None:
    return prefix and prefix + separator + suffix or suffix or None

In [37]:
def get_model(
    units: int,
    n_tokens: int,
    n_labels: int,
    n_stacks: int = 1,
    bidirectional: bool = False,
    name: str | None = None,
    cell_type: type[keras.layers.Layer] = keras.layers.LSTMCell
) -> keras.Model:
    '''Builds an RNN classifier with one sigmoid output per label.

    The network is: token ids -> 64-dimensional embedding -> `n_stacks`
    recurrent layers (each wrapped in `keras.layers.Bidirectional` when
    `bidirectional` is set) -> dense sigmoid layer with `n_labels` units.
    All but the last recurrent layer return full sequences so that the next
    layer receives one vector per time step.

    Arguments:
        units: number of units of every recurrent layer
        n_tokens: vocabulary size of the embedding layer
        n_labels: number of output labels
        n_stacks: number of stacked recurrent layers, at least 1
        bidirectional: whether the recurrent layers process the sequence in both directions
        name: model name, also used as the prefix of the layer names
        cell_type: recurrent cell class, honoured in every configuration

    Returns:
        The uncompiled keras model

    Raises:
        ValueError: if `n_stacks` is smaller than 1'''
    if n_stacks < 1:
        raise ValueError(f'n_stacks must be at least 1, got {n_stacks}')

    # Built-in cells are mapped to their dedicated layer classes (the layer
    # previously used for the default LSTM cell); any other cell class is
    # wrapped in a generic `keras.layers.RNN`.
    dedicated_layers = {
        keras.layers.LSTMCell: keras.layers.LSTM,
        keras.layers.GRUCell: keras.layers.GRU,
        keras.layers.SimpleRNNCell: keras.layers.SimpleRNN,
    }

    def make_rnn_layer(index: int) -> keras.layers.Layer:
        # Only the top layer collapses the sequence into a single vector.
        return_sequences = index < n_stacks - 1
        layer_name = get_name(name, f'rnn_{index}')
        layer_class = dedicated_layers.get(cell_type)

        if layer_class is not None:
            layer = layer_class(units, return_sequences=return_sequences, name=layer_name)
        else:
            layer = keras.layers.RNN(cell_type(units), return_sequences=return_sequences, name=layer_name)

        if bidirectional:
            layer = keras.layers.Bidirectional(layer, name=get_name(name, f'bidirectional_{index}'))

        return layer

    inputs = keras.layers.Input((None, ), dtype=tf.int32, name=get_name(name, 'inputs_tokens'))
    x = keras.layers.Embedding(n_tokens, 64, name=get_name(name, 'embedding'))(inputs)

    for index in range(n_stacks):
        x = make_rnn_layer(index)(x)

    outputs = keras.layers.Dense(n_labels, activation="sigmoid", name=get_name(name, 'prediction'))(x)

    return keras.Model(inputs=inputs, outputs=outputs, name=name)


# Training

Train several models on the two dataset variants. Use either of the cell types (LSTM or GRU)
* Simple RNN (no stacking, one direction)
* Stacked RNN (stacking, one direction)
* Bidirectional RNN (no stacking, bidirectional)
* Stacked Bidirectional RNN (stacking, bidirectional)

In [38]:
# One parameter dict per architecture variant: every variant uses 32 units and
# LSTM cells and differs only in name, direction and stacking depth.
models_params = [
    {
        'units': 32,
        'name': variant_name,
        'bidirectional': is_bidirectional,
        'n_stacks': depth,
        'cell_type': keras.layers.LSTMCell,
    }
    for variant_name, is_bidirectional, depth in (
        ('simple_lstm', False, 1),
        ('stacked_lstm', False, 2),
        ('bidirectional_lstm', True, 1),
        ('bidirectional_stacked_lstm', True, 2),
    )
]

In [39]:
# Build one model per entry of `models_params`; the keys of every entry match
# the keyword arguments of `get_model`.
vocabulary_size = len(tokenizer.get_vocab())
models = [
    get_model(n_tokens=vocabulary_size, n_labels=len(emotions), **params)
    for params in models_params
]

Which loss should be used for multilabel classification? Which metrics?

In [40]:
# Compile every model with the same setup: binary cross-entropy loss,
# Adam (learning rate 0.001) and binary accuracy as the metric.
for model in models:
    bce_loss = keras.losses.BinaryCrossentropy()
    # A fresh optimizer instance is created for each model.
    adam = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss=bce_loss, optimizer=adam, metrics=['binary_accuracy'])

In [41]:
# Root directory for checkpoints and logs; created if it does not exist yet.
HISTORY_DIR = './history'
Path(HISTORY_DIR).mkdir(parents=True, exist_ok=True)

In [42]:
# Per-run output directory named after the current local date and time.
run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join(HISTORY_DIR, run_timestamp)

In [43]:
def get_callbacks(model_name: str):
    '''Creates the training callbacks for one model.

    Arguments:
        model_name: name used in the checkpoint file name and the log directory name

    Returns:
        Tuple `(model_checkpoint_callback, tensorboard_callback)`; both write
        below the module-level `logdir`'''
    # `save_best_only=True` keeps only the best checkpoint according to the
    # callback's monitored quantity (left at its default).
    model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
        os.path.join(logdir, f'model-{model_name}.keras'),
        save_best_only=True,
    )

    # The TensorBoard target is a directory, so it carries no `.keras` suffix.
    tensorboard_callback = keras.callbacks.TensorBoard(
        os.path.join(logdir, f'logs-{model_name}'),
    )

    return model_checkpoint_callback, tensorboard_callback

In [46]:
# Collects the object returned by every `model.fit` call of the training loop.
histories = []

In [None]:
# Train every model on the same train / test pipelines and keep its history.
# `strict=True` guards against `models` and `models_params` getting out of sync.
for model, model_params in zip(models, models_params, strict=True):
    model_name = model_params['name']

    # Checkpoint and TensorBoard callbacks writing to model-specific paths.
    callbacks = list(get_callbacks(model_name))

    history = model.fit(train, validation_data=test, epochs=EPOCHS, callbacks=callbacks)

    histories.append(history)

Epoch 1/20
[1m1701/2971[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m56s[0m 45ms/step - binary_accuracy: 0.9423 - loss: 0.2003

# Evaluation

Evaluate the models you trained on the test datasets. Plot ROC curves for each label (use `sklearn.metrics.RocCurveDisplay`) for each model.

In [None]:
def plot_roc_curve(
    X: np.ndarray,
    y: np.ndarray,
    model: keras.Model,
    ax: plt.Axes | None = None
) -> float:
    '''Plots ROC curves for each of the labels (on a single axes) and outputs mean ROC AUC score.

    Labels for which `y` contains a single class only are skipped, because no
    ROC curve can be computed for them.

    Arguments:
        X: model inputs
        y: ground truths, one column per label
        model: model to plot the curve for
        ax: axes to plot on; a new figure is created when omitted

    Returns:
        Mean ROC AUC score over the plotted labels (NaN when no label could be scored)'''
    if ax is None:
        # One shared axes; otherwise every curve would open its own figure.
        _, ax = plt.subplots(figsize=(10, 10))

    y_true = np.asarray(y)
    y_score = np.asarray(model.predict(X, verbose=0))

    auc_scores = []
    for label_index in range(y_true.shape[1]):
        label_true = y_true[:, label_index]
        if np.unique(label_true).size < 2:
            continue

        # Fall back to the column index if there are more columns than emotion names.
        label_name = emotions[label_index] if label_index < len(emotions) else str(label_index)
        display = sklearn.metrics.RocCurveDisplay.from_predictions(
            label_true,
            y_score[:, label_index],
            name=label_name,
            ax=ax,
        )
        auc_scores.append(display.roc_auc)

    ax.set_title(model.name)

    return float(np.mean(auc_scores)) if auc_scores else float('nan')

Plot the mean ROC AUC scores. Which model has the highest score? On what kind of dataset?

Inspect the best model performance closer. Come up with some sentences (in English). Does the model output sensible results?

In [None]:
def label_text(text: str, model: keras.Model, threshold: float = 0.5, max_length: int | None = None) -> list[str]:
    '''Computes the model output for `text` and outputs a list of emotions that have a probability of at least `threshold`

    Arguments:
        text: text to label
        model: model to use
        threshold: threshold to use
        max_length: max length for tokenization; when given, the text is padded /
            truncated to exactly this many tokens, otherwise it is tokenized as is

    Return:
        List of predicted emotion labels'''
    if max_length is None:
        encoded = tokenizer([text])
    else:
        encoded = tokenizer([text], padding='max_length', max_length=max_length, truncation=True)

    input_ids = np.asarray(encoded['input_ids'], dtype=np.int32)
    if input_ids.shape[1] == 0:
        # An empty text yields no tokens; feed a single padding token instead
        # of a zero-length sequence.
        input_ids = np.array([[tokenizer.pad_token_id]], dtype=np.int32)

    # Batch of one text -> take the only row of probabilities.
    probabilities = model.predict(input_ids, verbose=0)[0]

    return [
        emotion
        for emotion, probability in zip(emotions, probabilities)
        if probability >= threshold
    ]

In [None]:
def plot_emotion_scores(text: str, model: keras.Model, max_length: int | None = None, ax: plt.Axes | None = None):
    '''Plots a bar plot of emotion probabilities for given `text` using `model`.

    Arguments:
        text: text to label
        model: model to use
        max_length: max length for tokenization; when given, the text is padded /
            truncated to exactly this many tokens, otherwise it is tokenized as is
        ax: axes to plot on; a new figure is created when omitted'''
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 8))

    if max_length is None:
        encoded = tokenizer([text])
    else:
        encoded = tokenizer([text], padding='max_length', max_length=max_length, truncation=True)

    input_ids = np.asarray(encoded['input_ids'], dtype=np.int32)
    if input_ids.shape[1] == 0:
        # An empty text yields no tokens; feed a single padding token instead
        # of a zero-length sequence.
        input_ids = np.array([[tokenizer.pad_token_id]], dtype=np.int32)

    # Batch of one text -> take the only row of probabilities.
    probabilities = model.predict(input_ids, verbose=0)[0]

    # Horizontal bars keep the emotion names readable.
    ax.barh(emotions, probabilities)
    ax.invert_yaxis()
    ax.set_xlim(0, 1)
    ax.set_xlabel('probability')
    ax.set_title(text)

For each of your texts get a list of emotion labels and plot emotion scores

# Bonus

Train and evaluate the same model as your best one, but use a different cell type