### Implementing emotion cause analysis using Bert model and adding additional layers

In [66]:
# Imports for most of the notebook
import torch
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List
import random
from tqdm.autonotebook import tqdm

In [67]:
print(torch.cuda.is_available())
device = torch.device("cpu")

True


### Loading the data

In [68]:
import json
# Opening JSON file
f = open('.\Data\\text\Subtask_1_train.json')
# returns JSON object as 
# a dictionary
data = json.load(f)
data

[{'conversation_ID': 1,
  'conversation': [{'utterance_ID': 1,
    'text': 'Alright , so I am back in high school , I am standing in the middle of the cafeteria , and I realize I am totally naked .',
    'speaker': 'Chandler',
    'emotion': 'neutral'},
   {'utterance_ID': 2,
    'text': 'Oh , yeah . Had that dream .',
    'speaker': 'All',
    'emotion': 'neutral'},
   {'utterance_ID': 3,
    'text': 'Then I look down , and I realize there is a phone ... there .',
    'speaker': 'Chandler',
    'emotion': 'surprise'},
   {'utterance_ID': 4,
    'text': 'Instead of ... ?',
    'speaker': 'Joey',
    'emotion': 'surprise'},
   {'utterance_ID': 5,
    'text': 'That is right .',
    'speaker': 'Chandler',
    'emotion': 'anger'},
   {'utterance_ID': 6,
    'text': 'Never had that dream .',
    'speaker': 'Joey',
    'emotion': 'neutral'},
   {'utterance_ID': 7,
    'text': 'No .',
    'speaker': 'Phoebe',
    'emotion': 'neutral'},
   {'utterance_ID': 8,
    'text': 'All of a sudden , the

### separate labels and data

In [88]:
# Initialize empty lists for conversations, emotion-cause pairs, and emotion labels
x_data = []
y_cause_labels = []
y_emotion_labels = []

# Iterate through each conversation in the dataset
for conv in data:
    # Extract conversation, emotion-cause pairs, and emotion labels
    conversation = conv['conversation']
    emotion_cause_pairs = conv['emotion-cause_pairs']
    
    # Extract emotion labels from each utterance in the conversation
    emotion_labels = [utterance['emotion'] for utterance in conversation]

    # Append to the respective lists
    x_data.append(conversation)
    y_cause_labels.extend(emotion_cause_pairs)
    y_emotion_labels.append(emotion_labels) # figure out if its append or extend

# Print the extracted data
# print("Conversations:")
# for conv in x_data:
#     print(conv)

# print("\nEmotion-Cause Pairs:")
# for pair in y_cause_labels:
#     print(pair)

# print("\nEmotion Labels:")
# print(y_emotion_labels)


In [82]:
x_data

[[{'utterance_ID': 1,
   'text': 'Alright , so I am back in high school , I am standing in the middle of the cafeteria , and I realize I am totally naked .',
   'speaker': 'Chandler',
   'emotion': 'neutral'},
  {'utterance_ID': 2,
   'text': 'Oh , yeah . Had that dream .',
   'speaker': 'All',
   'emotion': 'neutral'},
  {'utterance_ID': 3,
   'text': 'Then I look down , and I realize there is a phone ... there .',
   'speaker': 'Chandler',
   'emotion': 'surprise'},
  {'utterance_ID': 4,
   'text': 'Instead of ... ?',
   'speaker': 'Joey',
   'emotion': 'surprise'},
  {'utterance_ID': 5,
   'text': 'That is right .',
   'speaker': 'Chandler',
   'emotion': 'anger'},
  {'utterance_ID': 6,
   'text': 'Never had that dream .',
   'speaker': 'Joey',
   'emotion': 'neutral'},
  {'utterance_ID': 7,
   'text': 'No .',
   'speaker': 'Phoebe',
   'emotion': 'neutral'},
  {'utterance_ID': 8,
   'text': 'All of a sudden , the phone starts to ring .',
   'speaker': 'Chandler',
   'emotion': 'neu

In [89]:
y_emotion_labels

[['neutral',
  'neutral',
  'surprise',
  'surprise',
  'anger',
  'neutral',
  'neutral',
  'neutral'],
 ['sadness', 'neutral', 'surprise'],
 ['surprise',
  'neutral',
  'sadness',
  'neutral',
  'neutral',
  'surprise',
  'joy',
  'neutral',
  'surprise'],
 ['sadness', 'sadness', 'sadness'],
 ['joy', 'neutral', 'joy'],
 ['neutral',
  'neutral',
  'surprise',
  'neutral',
  'neutral',
  'sadness',
  'neutral',
  'joy',
  'neutral',
  'neutral'],
 ['sadness',
  'surprise',
  'neutral',
  'surprise',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'neutral'],
 ['sadness',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'neutral',
  'joy',
  'joy',
  'neutral',
  'neutral',
  'joy',
  'surprise'],
 ['surprise', 'neutral', 'surprise', 'neutral', 'surprise', 'neutral'],
 ['neutral', 'joy', 'joy', 'surprise', 'neutral'],
 ['disgust', 'neutral', 'neutral', 'sadness', 'neutral'],
 ['neutral',
  'anger',


### Change the label formart

In [70]:
## todo: change ['17_anger', '16_we could throw you both in'] --> ['17_anger', '16'] and making sure labels is list[list[labels for each conversation]]. confirm if labels needs to be in this formart list[list[labels for each conversation]]

## Create Train and Dev Set

In [71]:
from sklearn.model_selection import train_test_split
train_conv, dev_conv = train_test_split(data, test_size=0.2, random_state=42)

#todo: with above x_data and y_label get x_train, y_ label, x_dev and y_dev

In [90]:
from sklearn.model_selection import train_test_split

# Split the data into training and development sets
x_train, x_dev, y_train, y_dev = train_test_split(x_data, y_emotion_labels, test_size=0.2, random_state=42)

# Print the sizes of the sets
print("Training set size:", len(x_train))
print("Development set size:", len(x_dev))


Training set size: 1099
Development set size: 275


### bert tokenizer

In [78]:
# Use this for now

class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self, model_name="bert-base-uncased"):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<P>".
        """
        self.conv_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model_name = model_name
    
    def get_sep_token(self,):
        return self.conv_tokenizer.sep_token
    
    def __call__(self, conv_batch: List[dict]) -> List[dict[str]]:
        """Uses the huggingface tokenizer to tokenize and pad a batch.

        We return a dictionary of tensors per the huggingface model specification.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            Dict: The dictionary of token specifications provided by HuggingFace
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # The two sentences deimited by the [SEP] token.
        # combined_texts = [f"{utterance['speaker']}: {utterance['text']}" for utterance in conv_batch["conversation"]]
        # print(combined_texts)
        
        # print(conv_batch)
        combined_single_texts=[]
        batch_texts=[]
        for conv in conv_batch:
            #print(conv)
            combined_single_texts=[]
            for utterance in conv["conversation"]:
                combined_single_texts.append(f"{utterance['speaker']}: {utterance['text']}")
            batch_texts.append(combined_single_texts)
        print(batch_texts)
        encoded=[]
        for batch in batch_texts:
            enc = self.conv_tokenizer(
                batch,
                padding=True,
                return_token_type_ids=False,
                return_tensors='pt'
            )
            encoded.append(enc)
        print(encoded)
        return enc
    

# HERE IS AN EXAMPLE OF HOW TO USE THE BATCH TOKENIZER
tokenizer = BatchTokenizer()
x = tokenizer(train_conv[0:10])
#print(x)
#tokenizer.conv_tokenizer.batch_decode(x["input_ids"])


[['Joey: Okay , see that blind guy right there ? I am gonna bash his head in later .', 'Joey: Oh umm , my big scene is coming up . Big scene coming up .', 'Chandler: If you said , " Big lima bean , bubbling up . " Would she understand the difference ?', 'Monica: Rach ? What are you doing ?', 'Rachel: Oh boy , I just can not watch . It is too scary !', 'Monica: It is a diaper commercial .', 'Rachel: Oh yeah well , you know me , babies , responsibilities , ahhh ! ! !', 'Caitlin: Pizza delivery !', 'Ross: I will get it ! I will get that !', 'Caitlin: Hi !', 'Ross: Hi !'], ['Monica: Ow !', 'Richard: Really ? ! Well , it is just like everyone else apartment . It is got rooms , walls , and ceilings .', "Richard's Date: Well , I just wanted to see where you lived . Now , give me the tour .", 'Monica: Oh my God ! Oh my God !', 'Richard: Ah well , this is the living room .', 'Richard: All right . This is the kitchen .', 'Richard: The bedroom . Well it is pretty much your typical ... bedroom .',

In [45]:
from typing import List, Dict
from transformers import AutoTokenizer

class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self, model_name="bert-base-uncased"):
        """Initializes the tokenizer.

        Args:
            model_name (str, optional): Pretrained model name. Defaults to "bert-base-uncased".
        """
        self.conv_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model_name = model_name
    
    def get_sep_token(self):
        return self.conv_tokenizer.sep_token
    
    def __call__(self, conv_batch: List[Dict[str, List[Dict[str, str]]]]) -> Dict[str, torch.Tensor]:
        """Uses the Hugging Face tokenizer to tokenize and pad a batch.

        Args:
            conv_batch (List[Dict[str, List[Dict[str, str]]]]): A list of conversations.

        Returns:
            Dict[str, torch.Tensor]: The dictionary of token specifications provided by Hugging Face.
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # the sentences delimited by the [SEP] token.
        
        combined_texts = []
        for conv in conv_batch:
            combined_texts.extend([f"{utterance['speaker']}: {utterance['text']}" for utterance in conv["conversation"]])
        print(combined_texts)

        encodings = self.conv_tokenizer(
            combined_texts,
            padding=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return encodings

# Example of how to use the batch tokenizer
tokenizer = BatchTokenizer()
batch_encodings = tokenizer([train_conv[0], train_conv[1]])  # Pass a batch of conversations
print(batch_encodings)
decoded_texts = tokenizer.conv_tokenizer.batch_decode(batch_encodings["input_ids"])
print(decoded_texts)


['Joey: Okay , see that blind guy right there ? I am gonna bash his head in later .', 'Joey: Oh umm , my big scene is coming up . Big scene coming up .', 'Chandler: If you said , " Big lima bean , bubbling up . " Would she understand the difference ?', 'Monica: Rach ? What are you doing ?', 'Rachel: Oh boy , I just can not watch . It is too scary !', 'Monica: It is a diaper commercial .', 'Rachel: Oh yeah well , you know me , babies , responsibilities , ahhh ! ! !', 'Caitlin: Pizza delivery !', 'Ross: I will get it ! I will get that !', 'Caitlin: Hi !', 'Ross: Hi !', 'Monica: Ow !', 'Richard: Really ? ! Well , it is just like everyone else apartment . It is got rooms , walls , and ceilings .', "Richard's Date: Well , I just wanted to see where you lived . Now , give me the tour .", 'Monica: Oh my God ! Oh my God !', 'Richard: Ah well , this is the living room .', 'Richard: All right . This is the kitchen .', 'Richard: The bedroom . Well it is pretty much your typical ... bedroom .', "R

### Making batched of the input

In [None]:
def chunk(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


In [None]:
#todo: batch and encode the following needed for model training  x_train, y_ label, x_dev and y_dev

### Model for emotion classifier

In [None]:
#Todo: edit the emotion classifer as required by adding necessary layers to classify the emotion

class emotionClassifier(torch.nn.Module):
    def __init__(self, output_size: int, hidden_size: int, model_name='prajjwal1/bert-small'):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        
        # Initialize BERT, which we use instead of a single embedding layer.
        self.bert = BertModel.from_pretrained(model_name)
        
        # TODO [OPTIONAL]: Updating all BERT parameters can be slow and memory intensive. 
        # Freeze them if training is too slow. Notice that the learning
        # rate should probably be smaller in this case.
        # Uncommenting out the below 2 lines means only our classification layer will be updated.
        
        # for param in self.bert.parameters():
        #     param.requires_grad = False
        
        self.bert_hidden_dimension = self.bert.config.hidden_size
        
        # TODO: Add an extra hidden layer in the classifier, projecting
        #      from the BERT hidden dimension to hidden size. Hint: torch.nn.Linear()
        
        self.hidden_layer = None
        
        # TODO: Add a relu nonlinearity to be used in the forward method
        #      https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
        
        self.relu = None
        
        self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
        self.log_softmax = torch.nn.LogSoftmax(dim=2)

    def encode_text(
        self,
        symbols: Dict
    ) -> torch.Tensor:
        """Encode the (batch of) sequence(s) of token symbols BERT.
            Then, get CLS represenation.

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: CLS token embedding
        """
        # First we get the contextualized embedding for each input symbol
        # We no longer need an LSTM, since BERT encodes context and 
        # gives us a single vector describing the sequence in the form of the [CLS] token.
        encoded_sequence = self.bert(**symbols)
        # TODO: Get the [CLS] token
        #      The BertModel output. See here: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
        #      and check the returns for the forward method.
        # We want to return a tensor of the form batch_size x 1 x bert_hidden_dimension
        # print(encoded_sequence.last_hidden_state.shape)
        # Return only the first token's embedding from the last_hidden_state. Hint: using list slices
        raise NotImplementedError

    def forward(
        self,
        symbols: Dict,
    ) -> torch.Tensor:
        """_summary_

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: _description_
        """
        encoded_sents = self.encode_text(symbols)
        output = self.hidden_layer(encoded_sents)
        output = self.relu(output)
        output = self.classifier(output)
        return self.log_softmax(output)

### Training loop

In [None]:
#todo: add training loop for the model and return the F1 score on the dev set