## Import Data

In [1]:
import json
import pandas as pd
from typing import Dict, List
import math
import torch
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List
import random
from tqdm.autonotebook import tqdm
 
# Opening JSON file
f = open('.\Data\\text\Subtask_1_train.json', encoding="utf-8")
 
# returns JSON object as 
# a dictionary
data = json.load(f)

from collections import defaultdict

def get_even_datapoints(datapoints, n):
    random.seed(42)
    dp_by_label = defaultdict(list)
    for dp in tqdm(datapoints, desc='Reading Datapoints'):
        dp_by_label[dp['label']].append(dp)

    unique_labels = [0, 1, 2]

    split = n//len(unique_labels)

    result_datapoints = []

    for label in unique_labels:
        result_datapoints.extend(random.sample(dp_by_label[label], split))

    return result_datapoints

train_dataset = get_even_datapoints(data['train'], train)
validation_dataset = get_even_datapoints(data['validation'], validation)
test_dataset = get_even_datapoints(data['test'], test)


In [2]:
# Initialize empty lists for conversations, emotion-cause pairs, and emotion labels
x_data = []
y_cause_labels = []
y_emotion_labels = []

# Iterate through each conversation in the dataset
for conv in data:
    # Extract conversation, emotion-cause pairs, and emotion labels
    conversation = conv['conversation']
    emotion_cause_pairs = conv['emotion-cause_pairs']
    
    # Extract emotion labels from each utterance in the conversation
    emotion_labels = [utterance['emotion'] for utterance in conversation]

    # Append to the respective lists
    x_data.append(conversation)
    y_cause_labels.extend(emotion_cause_pairs)
    y_emotion_labels.append(emotion_labels) # figure out if its append or extend

# Print the extracted data
# print("Conversations:")
# for conv in x_data:
#     print(conv)

# print("\nEmotion-Cause Pairs:")
# for pair in y_cause_labels:
#     print(pair)

# print("\nEmotion Labels:")
# print(y_emotion_labels)


In [3]:
#y_emotion_labels

Split the dataset

In [4]:
from sklearn.model_selection import train_test_split
train_conv, dev_conv = train_test_split(data, test_size=0.2, random_state=42)

#todo: with above x_data and y_label get x_train, y_ label, x_dev and y_dev

In [5]:
from sklearn.model_selection import train_test_split

# Split the data into training and development sets
x_train, x_dev, y_train, y_dev = train_test_split(x_data, y_emotion_labels, test_size=0.2, random_state=42)

# Print the sizes of the sets
print("Training set size:", len(x_train))
print("Development set size:", len(x_dev))


Training set size: 1099
Development set size: 275


In [6]:
def make_batches(sequences: List[str], batch_size: int) -> List[List[str]]:
    """Yield batch_size chunks from sequences."""

    batch_list=[]

    last_index=len(sequences)-1
    
    for index in range(math.ceil(len(sequences)/batch_size)):
        
        if index+batch_size:
            batch_list.append(sequences[index:index+batch_size])
        else:
            batch_list.append(sequences[index:last_index])
    # DONE
    return batch_list

In [7]:
# Use this for now
'''
class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self, model_name="bert-base-uncased"):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<P>".
        """
        self.conv_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model_name = model_name
    
    def get_sep_token(self,):
        return self.conv_tokenizer.sep_token
    
    def __call__(self, conv_batch: List[dict]) -> List[dict[str]]:
        """Uses the huggingface tokenizer to tokenize and pad a batch.

        We return a dictionary of tensors per the huggingface model specification.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            Dict: The dictionary of token specifications provided by HuggingFace
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # The two sentences deimited by the [SEP] token.
        # combined_texts = [f"{utterance['speaker']}: {utterance['text']}" for utterance in conv_batch["conversation"]]
        # print(combined_texts)
        
        # print(conv_batch)
        combined_single_texts=[]
        batch_texts=[]
        for conv in conv_batch:
            #print(conv)
            combined_single_texts=[]
            for utterance in conv:
                combined_single_texts.append(f"{utterance['speaker']}: {utterance['text']}")
            batch_texts.append(combined_single_texts)
        #print(batch_texts)
        encoded=[]
        for batch in batch_texts:
            enc = self.conv_tokenizer(
                batch,
                padding=True,
                return_token_type_ids=False,
                return_tensors='pt'
            )
            encoded.append(enc)
        # print(encoded)
        return enc
    

# HERE IS AN EXAMPLE OF HOW TO USE THE BATCH TOKENIZER
tokenizer = BatchTokenizer()
x = tokenizer(x_train[0:10])
print(x)
#tokenizer.conv_tokenizer.batch_decode(x["input_ids"])

'''

'\nclass BatchTokenizer:\n    """Tokenizes and pads a batch of input sentences."""\n\n    def __init__(self, model_name="bert-base-uncased"):\n        """Initializes the tokenizer\n\n        Args:\n            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<P>".\n        """\n        self.conv_tokenizer = AutoTokenizer.from_pretrained(model_name)\n        self.model_name = model_name\n    \n    def get_sep_token(self,):\n        return self.conv_tokenizer.sep_token\n    \n    def __call__(self, conv_batch: List[dict]) -> List[dict[str]]:\n        """Uses the huggingface tokenizer to tokenize and pad a batch.\n\n        We return a dictionary of tensors per the huggingface model specification.\n\n        Args:\n            batch (List[str]): A List of sentence strings\n\n        Returns:\n            Dict: The dictionary of token specifications provided by HuggingFace\n        """\n        # The HF tokenizer will PAD for us, and additionally combine \n        #

In [8]:
#len(x)

In [9]:
from typing import List, Dict
from transformers import AutoTokenizer

class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self, model_name="bert-base-uncased"):
        """Initializes the tokenizer.

        Args:
            model_name (str, optional): Pretrained model name. Defaults to "bert-base-uncased".
        """
        self.conv_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model_name = model_name
    
    def get_sep_token(self):
        return self.conv_tokenizer.sep_token
    
    def __call__(self, conv_batch: List[Dict[str, List[Dict[str, str]]]]) -> Dict[str, torch.Tensor]:
        """Uses the Hugging Face tokenizer to tokenize and pad a batch.

        Args:
            conv_batch (List[Dict[str, List[Dict[str, str]]]]): A list of conversations.

        Returns:
            Dict[str, torch.Tensor]: The dictionary of token specifications provided by Hugging Face.
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # the sentences delimited by the [SEP] token.
        
        combined_texts = []
        for conv in conv_batch:
            # Check if the conversation is in the expected format
            if isinstance(conv, dict) and "conversation" in conv:
                for utterance in conv["conversation"]:
                    # Check if the utterance has the expected keys
                    if "speaker" in utterance and "text" in utterance:
                        combined_texts.append(f"{utterance['speaker']}: {utterance['text']}")
                    else:
                        print("Utterance is missing 'speaker' or 'text' keys:", utterance)
            else:
                print("Conversation is not a dictionary or missing 'conversation' key:", conv)

        if not combined_texts:
            raise ValueError("No texts found for tokenization")


        encodings = self.conv_tokenizer(
            combined_texts,
            padding=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return encodings

# Example of how to use the batch tokenizer
tokenizer = BatchTokenizer()


In [10]:
batch_encodings = tokenizer([train_conv[0], train_conv[1]])  # Pass a batch of conversations
print(batch_encodings)
decoded_texts = tokenizer.conv_tokenizer.batch_decode(batch_encodings["input_ids"])
print(decoded_texts)

{'input_ids': tensor([[  101,  9558,  1024,  3100,  1010,  2156,  2008,  6397,  3124,  2157,
          2045,  1029,  1045,  2572,  6069, 24234,  2010,  2132,  1999,  2101,
          1012,   102,     0,     0,     0,     0,     0],
        [  101,  9558,  1024,  2821, 26114,  1010,  2026,  2502,  3496,  2003,
          2746,  2039,  1012,  2502,  3496,  2746,  2039,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101, 13814,  1024,  2065,  2017,  2056,  1010,  1000,  2502, 12967,
         14068,  1010, 25054,  2039,  1012,  1000,  2052,  2016,  3305,  1996,
          4489,  1029,   102,     0,     0,     0,     0],
        [  101,  9018,  1024, 10958,  2818,  1029,  2054,  2024,  2017,  2725,
          1029,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  5586,  1024,  2821,  2879,  1010,  1045,  2074,  2064,  2025,
          3422,  1012,  2009,  2003,  2

In [11]:
'''
def chunk(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

batch_size = 16
        
# Notice that since we use huggingface, we tokenize and
# encode in all at once!
tokenizer = BatchTokenizer()

train_input_batches = [b for b in chunk(x_train, batch_size)]
# Tokenize + encode
train_input_batches = [tokenizer(batch) for batch in train_input_batches]

dev_input_batches = [b for b in chunk(x_dev, batch_size)]

# Tokenize + encode
dev_input_batches = [tokenizer(batch) for batch in dev_input_batches]
'''

'\ndef chunk(lst, n):\n    """Yield successive n-sized chunks from lst."""\n    for i in range(0, len(lst), n):\n        yield lst[i:i + n]\n\nbatch_size = 16\n        \n# Notice that since we use huggingface, we tokenize and\n# encode in all at once!\ntokenizer = BatchTokenizer()\n\ntrain_input_batches = [b for b in chunk(x_train, batch_size)]\n# Tokenize + encode\ntrain_input_batches = [tokenizer(batch) for batch in train_input_batches]\n\ndev_input_batches = [b for b in chunk(x_dev, batch_size)]\n\n# Tokenize + encode\ndev_input_batches = [tokenizer(batch) for batch in dev_input_batches]\n'

In [13]:
def chunk(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
batch_size = 16

for batch in chunk(x_train, batch_size):
    if not batch:
        print("Found an empty batch in x_train")
    else:
        print("Batch size:", len(batch))


tokenizer = BatchTokenizer()  # Ensure this is correctly defined

train_input_batches = [b for b in chunk(x_train, batch_size)]
# print("Train batch example:", train_input_batches[0])  # Debug: Check the first batch

# Tokenize + encode
train_input_batches = [tokenizer(batch) for batch in train_input_batches]

dev_input_batches = [b for b in chunk(x_dev, batch_size)]
print("Dev batch example:", dev_input_batches[0])  # Debug: Check the first batch

# Tokenize + encode
dev_input_batches = [tokenizer(batch) for batch in dev_input_batches]


Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size: 16
Batch size

ValueError: No texts found for tokenization

## Tokenize and Padding

In [None]:
conversation_list, emotion_list = data_to_conversation_list(data)

In [None]:
# conversation_list

In [None]:
unique_emotion_list = []
for emotion in emotion_list:
    if emotion not in unique_emotion_list:
        unique_emotion_list.append(emotion)

In [None]:
# Special tokens to be removed
special_tokens = {'<SOS>', '<EOS>', '<PAD>'}

# Joining each sublist into strings and then joining these strings
conversation_string = ' '.join(
    ' '.join(
        ' '.join(word for word in inner_list if word not in special_tokens) 
        for inner_list in sublist
    ) 
    for sublist in conversation_list
)

In [None]:
# conversation_string

In [None]:
from transformers import BertTokenizer, BertModel
import torch

def get_sentence_embedding(sentence, model_name="bert-base-uncased"):
    # Load pre-trained model tokenizer
    #tokenizer = BertTokenizer.from_pretrained(model_name)
    
    # Encode text
    #encoded_input = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Example text
    
    # Tokenize the text
    encoded_input = tokenizer.encode_plus(
        sentence, 
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=512,
        truncation=True, # Pad & truncate all sentences.
        padding='max_length',
        #padding=True,
        return_attention_mask=True,   # Construct attention masks.
        return_tensors='pt',          # Return PyTorch tensors.
    )
    
    # Load pre-trained model
    model = BertModel.from_pretrained(model_name)

    # Forward pass, get hidden states
    with torch.no_grad():
        output = model(**encoded_input)

    # Get the embeddings of the [CLS] token (first token), representing the entire sentence
    sentence_embedding = output.last_hidden_state[:, 0, :]

    return sentence_embedding

In [None]:
conversation_embeddings = get_sentence_embedding(conversation_string)

In [None]:
conversation_embeddings

In [None]:
# Convert the list to a set to get unique elements, and then back to a list
unique_emotion_list = []
for emotion in emotion_list:
    if emotion not in unique_emotion_list:
        unique_emotion_list.append(emotion)

#print(unique_emotion_list)
print(conversation_string)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming 'conversation_embeddings' is a numpy array of your embeddings
# And 'labels' is a list or array of your emotion labels

labels = np.array(['joy', 'sadness', 'surprise', 'disgust', 'fear', 'neutral'])  # Replace with your labels
conversation_embeddings_req = np.array(conversation_embeddings)  # This should be a 2D array
labels_new = np.array(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(conversation_embeddings_req, labels_new, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
