In [4]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch.nn as nn
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from transformers import BertTokenizer
from google.colab import drive

In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
train_data = pd.read_csv('drive/My Drive/trac2_CONVT_train.csv', usecols = range(12), on_bad_lines='skip')
dev_data = pd.read_csv('drive/My Drive/trac2_CONVT_dev.csv', usecols=range(12), on_bad_lines='skip')


In [7]:
for col in ['Emotion', 'EmotionalPolarity', 'Empathy']:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    dev_data[col] = pd.to_numeric(dev_data[col], errors='coerce')

train_data = train_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)
dev_data = dev_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)

# Sample smaller datasets for quick training (adjust or remove for full dataset)
train_data = train_data.sample(n=1000, random_state=4400)
dev_data = dev_data.sample(n=200, random_state=4400)

In [8]:
class ConversationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        labels = self.data.iloc[index][['Emotion', 'EmotionalPolarity', 'Empathy']].values.astype(float)
        encoding = self.tokenizer(text,
                                  max_length=self.max_length,
                                  padding='max_length',
                                  truncation=True,
                                  return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze(),
                'labels': torch.tensor(labels, dtype=torch.float)}

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset objects
train_dataset = ConversationDataset(train_data, tokenizer)
dev_dataset = ConversationDataset(dev_data, tokenizer)

# Wrap datasets in DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
def load_glove_embeddings(glove_path, tokenizer, embedding_dim=300):
    embeddings_index = {}
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    # Prepare embedding matrix
    vocab_size = tokenizer.vocab_size
    embedding_matrix = np.random.normal(size=(vocab_size, embedding_dim))  # Random initialization
    for word, idx in tokenizer.get_vocab().items():
        if idx >= vocab_size:  # Ensure we stay within vocab size
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [11]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2024-12-07 17:56:43--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-12-07 17:56:43--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-12-07 17:56:43--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [12]:
glove_path = "glove.6B.300d.txt"
embedding_dim = 300
embedding_matrix = load_glove_embeddings(glove_path, tokenizer, embedding_dim)


In [13]:
class LSTMAttentionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_outputs, pad_idx, embedding_matrix=None):
        super(LSTMAttentionModel, self).__init__()

        # Embedding layer with pretrained embeddings
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(torch.tensor(embedding_matrix))
            self.embedding.weight.requires_grad = False  # Freeze embeddings

        # LSTM layer
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Attention mechanism
        self.attention = nn.Linear(hidden_dim * 2, 1)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * 2, num_outputs)

        # Dropout
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask):
        # Embedding lookup
        embeddings = self.embedding(input_ids)
        embeddings = embeddings * attention_mask.unsqueeze(-1)

        # LSTM outputs
        lstm_out, _ = self.lstm(embeddings)

        # Attention weights
        attn_weights = torch.softmax(self.attention(lstm_out), dim=1)
        context_vector = torch.sum(attn_weights * lstm_out, dim=1)

        # Fully connected layer
        context_vector = self.dropout(context_vector)
        outputs = self.fc(context_vector)
        return outputs

In [14]:
hidden_dim = 256
num_outputs = 3  # Emotion, Polarity, Empathy
pad_idx = tokenizer.pad_token_id

# Initialize the LSTM model
lstm_attention_model = LSTMAttentionModel(
    vocab_size=tokenizer.vocab_size,
    embed_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_outputs=num_outputs,
    pad_idx=pad_idx,
    embedding_matrix=embedding_matrix
)

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lstm_attention_model.to(device)

optimizer = AdamW(lstm_attention_model.parameters(), lr=0.005)
criterion = nn.MSELoss()

In [19]:
epochs = 10
for epoch in range(epochs):
    lstm_attention_model.train()
    train_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = lstm_attention_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f"Epoch {epoch + 1}: Train Loss = {train_loss / len(train_loader):.4f}")


Epoch 1: Train Loss = 1.0336
Epoch 2: Train Loss = 0.5664
Epoch 3: Train Loss = 0.4946
Epoch 4: Train Loss = 0.4315
Epoch 5: Train Loss = 0.3927
Epoch 6: Train Loss = 0.3738
Epoch 7: Train Loss = 0.3539
Epoch 8: Train Loss = 0.3238
Epoch 9: Train Loss = 0.3146
Epoch 10: Train Loss = 0.2789


In [20]:
lstm_attention_model.eval()
predictions = []
val_labels = []

with torch.no_grad():
    for batch in dev_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = lstm_attention_model(input_ids, attention_mask)
        predictions.append(outputs.cpu().numpy())
        val_labels.append(labels.cpu().numpy())

# Flatten predictions and labels into arrays
predictions = np.concatenate(predictions, axis=0)
val_labels = np.concatenate(val_labels, axis=0)

In [21]:
pearson_emotion = pearsonr(val_labels[:, 0], predictions[:, 0])[0]
pearson_emotional_polarity = pearsonr(val_labels[:, 1], predictions[:, 1])[0]
pearson_empathy = pearsonr(val_labels[:, 2], predictions[:, 2])[0]
average_pearson = (pearson_emotion + pearson_emotional_polarity + pearson_empathy) / 3
print("Emotion Intensity Pearson Score:", pearson_emotion)
print("Emotional Polarity Pearson Score:", pearson_emotional_polarity)
print("Empathy Pearson Score:", pearson_empathy)
print("Average Pearson Score:", average_pearson)


Emotion Intensity Pearson Score: 0.6356940678298693
Emotional Polarity Pearson Score: 0.5878763127949586
Empathy Pearson Score: 0.6573262285349145
Average Pearson Score: 0.6269655363865807


In [22]:
test_data = pd.read_csv('drive/My Drive/goldstandard_CONVT.csv',on_bad_lines='skip')


In [23]:
for col in ['Emotion', 'EmotionalPolarity', 'Empathy']:
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

test_data = test_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)

# Create a test dataset and DataLoader
test_dataset = ConversationDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Evaluate the model on the test set
lstm_attention_model.eval()
test_predictions = []
test_labels = []

In [24]:
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = lstm_attention_model(input_ids, attention_mask)
        test_predictions.append(outputs.cpu().numpy())
        test_labels.append(labels.cpu().numpy())

In [25]:
test_predictions = np.concatenate(test_predictions, axis=0)
test_labels = np.concatenate(test_labels, axis=0)

In [26]:
pearson_emotion = pearsonr(test_labels[:, 0], test_predictions[:, 0])[0]
pearson_emotional_polarity = pearsonr(test_labels[:, 1], test_predictions[:, 1])[0]
pearson_empathy = pearsonr(test_labels[:, 2], test_predictions[:, 2])[0]
average_pearson = (pearson_emotion + pearson_emotional_polarity + pearson_empathy) / 3

print("Test Set Evaluation:")
print("Emotion Intensity Pearson Score:", pearson_emotion)
print("Emotional Polarity Pearson Score:", pearson_emotional_polarity)
print("Empathy Pearson Score:", pearson_empathy)
print("Average Pearson Score:", average_pearson)

Test Set Evaluation:
Emotion Intensity Pearson Score: 0.5260446515634425
Emotional Polarity Pearson Score: 0.474991721006126
Empathy Pearson Score: 0.4751995184433437
Average Pearson Score: 0.49207863033763743
