In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

SEED = 1103

notebooks_dir = Path.cwd()
project_dir = notebooks_dir.parent
data_dir = project_dir / 'data' / 'raw'
text_data_path = data_dir / 'Subtask_2_train.json'
interim_dir = project_dir / 'data' / 'interim'
mel_dir = interim_dir / 'mel' 
models_dir = project_dir / 'models'

In [2]:
text_data = json.loads(text_data_path.read_text())
for i in range(len(text_data[0]['conversation'])):
    print(text_data[0]['conversation'][i])

{'utterance_ID': 1, 'text': 'Alright , so I am back in high school , I am standing in the middle of the cafeteria , and I realize I am totally naked .', 'speaker': 'Chandler', 'emotion': 'neutral', 'video_name': 'dia1utt1.mp4'}
{'utterance_ID': 2, 'text': 'Oh , yeah . Had that dream .', 'speaker': 'All', 'emotion': 'neutral', 'video_name': 'dia1utt2.mp4'}
{'utterance_ID': 3, 'text': 'Then I look down , and I realize there is a phone ... there .', 'speaker': 'Chandler', 'emotion': 'surprise', 'video_name': 'dia1utt3.mp4'}
{'utterance_ID': 4, 'text': 'Instead of ... ?', 'speaker': 'Joey', 'emotion': 'surprise', 'video_name': 'dia1utt4.mp4'}
{'utterance_ID': 5, 'text': 'That is right .', 'speaker': 'Chandler', 'emotion': 'anger', 'video_name': 'dia1utt5.mp4'}
{'utterance_ID': 6, 'text': 'Never had that dream .', 'speaker': 'Joey', 'emotion': 'neutral', 'video_name': 'dia1utt6.mp4'}
{'utterance_ID': 7, 'text': 'No .', 'speaker': 'Phoebe', 'emotion': 'neutral', 'video_name': 'dia1utt7.mp4'}

In [7]:
def get_data_from_json(json):
    text = []
    emotions = []
    filenames = []
    for line_idx in range(len(json)):
        text += [json[line_idx]['text']]
        emotions += [json[line_idx]['emotion']]
        filenames += [json[line_idx]['video_name'][:-4]]
    return text, emotions, filenames

convos = []
all_emotions = []
all_filenames = []
for convo_idx in range(len(text_data)):
    convo, emotions, filenames = get_data_from_json(text_data[convo_idx]['conversation'])
    convos += convo
    all_emotions += emotions
    all_filenames += filenames

data = {'filename': all_filenames, 'text' : convos, 'label': all_emotions}
text_data_dict = {'text' : convos, 'label': all_emotions}

In [4]:
print(data['filename'])
print(data['text'])
print(data['label'])

['dia1utt1', 'dia1utt2', 'dia1utt3', 'dia1utt4', 'dia1utt5', 'dia1utt6', 'dia1utt7', 'dia1utt8', 'dia2utt1', 'dia2utt2', 'dia2utt3', 'dia3utt1', 'dia3utt2', 'dia3utt3', 'dia3utt4', 'dia3utt5', 'dia3utt6', 'dia3utt7', 'dia3utt8', 'dia3utt9', 'dia4utt1', 'dia4utt2', 'dia4utt3', 'dia5utt1', 'dia5utt2', 'dia5utt3', 'dia6utt1', 'dia6utt2', 'dia6utt3', 'dia6utt4', 'dia6utt5', 'dia6utt6', 'dia6utt7', 'dia6utt8', 'dia6utt9', 'dia6utt10', 'dia7utt1', 'dia7utt2', 'dia7utt3', 'dia7utt4', 'dia7utt5', 'dia7utt6', 'dia7utt7', 'dia7utt8', 'dia7utt9', 'dia7utt10', 'dia8utt1', 'dia8utt2', 'dia8utt3', 'dia8utt4', 'dia8utt5', 'dia8utt6', 'dia8utt7', 'dia8utt8', 'dia8utt9', 'dia8utt10', 'dia8utt11', 'dia8utt12', 'dia8utt13', 'dia8utt14', 'dia8utt15', 'dia8utt16', 'dia8utt17', 'dia9utt1', 'dia9utt2', 'dia9utt3', 'dia9utt4', 'dia9utt5', 'dia9utt6', 'dia10utt1', 'dia10utt2', 'dia10utt3', 'dia10utt4', 'dia10utt5', 'dia11utt1', 'dia11utt2', 'dia11utt3', 'dia11utt4', 'dia11utt5', 'dia12utt1', 'dia12utt2', 'dia1

In [8]:
from datasets import Dataset, DatasetDict
ds = Dataset.from_dict(text_data_dict)
ds = ds.class_encode_column("label")

train_testvalid = ds.train_test_split(test_size=0.2, stratify_by_column="label", seed=SEED)

test_valid = train_testvalid['test'].train_test_split(test_size=0.5, stratify_by_column="label", seed=SEED)

dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

print(dataset)


Casting to class labels:   0%|          | 0/13619 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10895
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1362
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 1362
    })
})


In [9]:
print(dataset['train'][0])

{'text': 'Yeah , I ... I ... I ... I am funny Ben , but I am not stupid . Okay ?', 'label': 4}


In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
labels = ['neutral', 'joy', 'surprise', 'anger', 'fear', 'disgust', 'sadness']
label2id = {label: index for index, label in enumerate(labels)}
id2label = {index: label for index, label in enumerate(labels)}

def preprocess_function(batch):
    tokenized_batch = tokenizer(batch["text"], truncation=True)
    tokenized_batch["label"] = [id2label[label] for label in batch["label"]]
    return tokenized_batch

tokenized_data = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/10895 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

Map:   0%|          | 0/1362 [00:00<?, ? examples/s]

In [11]:
print(tokenized_data)
print(tokenized_data['train'][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10895
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1362
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1362
    })
})
{'text': 'Yeah , I ... I ... I ... I am funny Ben , but I am not stupid . Okay ?', 'label': 2, 'input_ids': [101, 3398, 1010, 1045, 1012, 1012, 1012, 1045, 1012, 1012, 1012, 1045, 1012, 1012, 1012, 1045, 2572, 6057, 3841, 1010, 2021, 1045, 2572, 2025, 5236, 1012, 3100, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [14]:
print(label2id)
print(id2label)

{'neutral': 0, 'joy': 1, 'surprise': 2, 'anger': 3, 'fear': 4, 'disgust': 5, 'sadness': 6}
{0: 'neutral', 1: 'joy', 2: 'surprise', 3: 'anger', 4: 'fear', 5: 'disgust', 6: 'sadness'}


PYTORCH

In [28]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(labels), id2label=id2label, label2id=label2id
)

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [None]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["valid"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

TENSORFLOW

In [29]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [30]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(labels), id2label=id2label, label2id=label2id
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [31]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["valid"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [32]:
import tensorflow as tf

model.compile(optimizer=optimizer) 

In [33]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  5383      
                                                                 
 dropout_39 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 66,958,855
Trainable params: 66,958,855
Non-trainable params: 0
_________________________________________________________________


In [34]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=models_dir,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

callbacks = [metric_callback, checkpoint_callback]

In [35]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)


Epoch 1/3


AttributeError: in user code:

    File "/Users/teodorastereciu/Documents/bachelors-project/mc-ecpe/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/Users/teodorastereciu/Documents/bachelors-project/mc-ecpe/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/teodorastereciu/Documents/bachelors-project/mc-ecpe/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/Users/teodorastereciu/Documents/bachelors-project/mc-ecpe/.venv/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1571, in train_step
        if self._label_to_output_map is not None:

    AttributeError: module 'keras.utils' has no attribute 'unpack_x_y_sample_weight'
