## Import Data

In [1]:
import json
import pandas as pd
from typing import Dict, List
import math
 
# Opening JSON file
f = open('.\Data\\text\Subtask_1_train.json', encoding="utf-8")
 
# returns JSON object as 
# a dictionary
data = json.load(f)

from collections import defaultdict

def get_even_datapoints(datapoints, n):
    random.seed(42)
    dp_by_label = defaultdict(list)
    for dp in tqdm(datapoints, desc='Reading Datapoints'):
        dp_by_label[dp['label']].append(dp)

    unique_labels = [0, 1, 2]

    split = n//len(unique_labels)

    result_datapoints = []

    for label in unique_labels:
        result_datapoints.extend(random.sample(dp_by_label[label], split))

    return result_datapoints

train_dataset = get_even_datapoints(data['train'], train)
validation_dataset = get_even_datapoints(data['validation'], validation)
test_dataset = get_even_datapoints(data['test'], test)


In [2]:
def make_batches(sequences: List[str], batch_size: int) -> List[List[str]]:
    """Yield batch_size chunks from sequences."""

    batch_list=[]

    last_index=len(sequences)-1
    
    for index in range(math.ceil(len(sequences)/batch_size)):
        
        if index+batch_size:
            batch_list.append(sequences[index:index+batch_size])
        else:
            batch_list.append(sequences[index:last_index])
    # DONE
    return batch_list

## Tokenize and Padding

In [3]:
from typing import Dict, List, Optional, Tuple
from collections import Counter

import torch
import numpy as np
import spacy


class Tokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self, pad_symbol: Optional[str] = "<PAD>"):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<PAD>".
        """
        self.pad_symbol = pad_symbol
        self.nlp = spacy.load("en_core_web_sm")
    
    def __call__(self, batch: List[str]) -> List[List[str]]:
        """Tokenizes each sentence in the batch, and pads them if necessary so
        that we have equal length sentences in the batch.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            List[List[str]]: A List of equal-length token Lists.
        """
        batch = self.tokenize(batch)
        batch = self.pad(batch)

        return batch

    def tokenize(self, sentences: List[str]) -> List[List[str]]:
        """Tokenizes the List of string sentences into a Lists of tokens using spacy tokenizer.

        Args:
            sentences (List[str]): The input sentence.

        Returns:
            List[str]: The tokenized version of the sentence.
        """
        # TODO: Tokenize the input with spacy.
        # TODO: Make sure the start token is the special <SOS> token and the end token
        #       is the special <EOS> token
        start_token = '<SOS>'
        end_token = '<EOS>'
        tokenized_sentences = []
        for sentence in sentences:
            doc = self.nlp(sentence)
            tokenized_sentence = [start_token] + [token.text for token in doc] + [end_token]
            tokenized_sentences.append(tokenized_sentence)
        return tokenized_sentences

    def pad(self, batch: List[List[str]]) -> List[List[str]]:
        """Appends pad symbols to each tokenized sentence in the batch such that
        every List of tokens is the same length. This means that the max length sentence
        will not be padded.

        Args:
            batch (List[List[str]]): Batch of tokenized sentences.

        Returns:
            List[List[str]]: Batch of padded tokenized sentences. 
        """
        # TODO: For each sentence in the batch, append the special <P>
        #       symbol to it n times to make all sentences equal length
        
        max_len = max([len(sentence) for sentence in batch])
        
        for sentence in batch:
            for i in range(max_len - len(sentence)):
                sentence.append(self.pad_symbol)
            
        return batch

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]


In [4]:
def data_to_conversation_list(data) -> (List[List[str]], List[List[str]]):
    # Create a list to store the data
    conversation_data = []
    emotion_list = []

    # Iterate through conversations
    for conversation in data:
        conversation_id = conversation["conversation_ID"]
        utterances = conversation["conversation"]
        
        # We make batches now and use those.
        # tokenized_data = []
        # Note: Labels need to be batched in the same way to ensure
        # We have train sentence and label batches lining up.
        #for batch in make_batches(utterances['text'], batch_size):
        #    tokenized_data.append(tokenizer(batch))
        # tokenized_data = my_tokenizer.tokenize(utterances)
        
        # print("After tokenization:", tokenized_data)
        
        emotion_cause_pairs = conversation["emotion-cause_pairs"]

        utterance_data = []
        emotion_temp = []
        # Process each utterance in the conversation
        for utterance in utterances:
            utterance_id = utterance["utterance_ID"]
            #print("Before tokenization:", utterance["text"], "\n After tokenization:", tokenizer(utterance["text"]))
            text = utterance["speaker"] + ": " + utterance["text"]
            #speaker = utterance["speaker"]
            emotion = utterance["emotion"]

            # Append the data to the list
            # conversation_data.append([conversation_id, utterance_id, text, emotion]) #speaker, emotion])
            utterance_data.append(text)
            emotion_temp.append(emotion)
            #print(utterance_data, emotion_temp)
            assert len(utterance_data) == len(emotion_temp)
        conversation_data.append(utterance_data) #speaker, emotion])
        emotion_list.append(emotion_temp)
        assert len(conversation_data) == len(emotion_list)
    
    return conversation_data, emotion_list

In [5]:
conversation_list, emotion_list = data_to_conversation_list(data)

In [6]:
# conversation_list

In [7]:
unique_emotion_list = []
for emotion in emotion_list:
    if emotion not in unique_emotion_list:
        unique_emotion_list.append(emotion)

In [8]:
# Special tokens to be removed
special_tokens = {'<SOS>', '<EOS>', '<PAD>'}

# Joining each sublist into strings and then joining these strings
conversation_string = ' '.join(
    ' '.join(
        ' '.join(word for word in inner_list if word not in special_tokens) 
        for inner_list in sublist
    ) 
    for sublist in conversation_list
)

In [9]:
# conversation_string

In [18]:
from transformers import BertTokenizer, BertModel
import torch

def get_sentence_embedding(sentence, model_name="bert-base-uncased"):
    # Load pre-trained model tokenizer
    #tokenizer = BertTokenizer.from_pretrained(model_name)
    
    # Encode text
    #encoded_input = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Example text
    
    # Tokenize the text
    encoded_input = tokenizer.encode_plus(
        sentence, 
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=512,
        truncation=True, # Pad & truncate all sentences.
        padding='max_length',
        #padding=True,
        return_attention_mask=True,   # Construct attention masks.
        return_tensors='pt',          # Return PyTorch tensors.
    )
    
    # Load pre-trained model
    model = BertModel.from_pretrained(model_name)

    # Forward pass, get hidden states
    with torch.no_grad():
        output = model(**encoded_input)

    # Get the embeddings of the [CLS] token (first token), representing the entire sentence
    sentence_embedding = output.last_hidden_state[:, 0, :]

    return sentence_embedding

In [19]:
conversation_embeddings = get_sentence_embedding(conversation_string)

In [26]:
conversation_embeddings

<function Tensor.size>

In [24]:
# Convert the list to a set to get unique elements, and then back to a list
unique_emotion_list = []
for emotion in emotion_list:
    if emotion not in unique_emotion_list:
        unique_emotion_list.append(emotion)

#print(unique_emotion_list)
print(conversation_string)


C h a n d l e r :   A l r i g h t   ,   s o   I   a m   b a c k   i n   h i g h   s c h o o l   ,   I   a m   s t a n d i n g   i n   t h e   m i d d l e   o f   t h e   c a f e t e r i a   ,   a n d   I   r e a l i z e   I   a m   t o t a l l y   n a k e d   . A l l :   O h   ,   y e a h   .   H a d   t h a t   d r e a m   . C h a n d l e r :   T h e n   I   l o o k   d o w n   ,   a n d   I   r e a l i z e   t h e r e   i s   a   p h o n e   . . .   t h e r e   . J o e y :   I n s t e a d   o f   . . .   ? C h a n d l e r :   T h a t   i s   r i g h t   . J o e y :   N e v e r   h a d   t h a t   d r e a m   . P h o e b e :   N o   . C h a n d l e r :   A l l   o f   a   s u d d e n   ,   t h e   p h o n e   s t a r t s   t o   r i n g   . R o s s :   I   d o   n o t   w a n t   t o   b e   s i n g l e   ,   o k a y   ?   I   j u s t   . . .   I   j u s t   . . .   I   j u s t   w a n n a   b e   m a r r i e d   a g a i n   ! C h a n d l e r :   A n d   I   j u s t   w a n t   a   m 

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming 'conversation_embeddings' is a numpy array of your embeddings
# And 'labels' is a list or array of your emotion labels

labels = np.array(['joy', 'sadness', 'surprise', 'disgust', 'fear', 'neutral'])  # Replace with your labels
conversation_embeddings_req = np.array(conversation_embeddings)  # This should be a 2D array
labels_new = np.array(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(conversation_embeddings_req, labels_new, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))


ValueError: Found input variables with inconsistent numbers of samples: [1, 6]