In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch.nn as nn
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from transformers import BertTokenizer
from google.colab import drive

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# get rid of rows with data outside first 12 cols, skip parsing errors
train_data = pd.read_csv('drive/My Drive/trac2_CONVT_train.csv', usecols = range(12), on_bad_lines='skip')
dev_data = pd.read_csv('drive/My Drive/trac2_CONVT_dev.csv', usecols=range(12), on_bad_lines='skip')


In [None]:
# convert to numeric vals
for col in ['Emotion', 'EmotionalPolarity', 'Empathy']:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    dev_data[col] = pd.to_numeric(dev_data[col], errors='coerce')

train_data = train_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)
dev_data = dev_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)

In [None]:
class ConversationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index): #extract text, emotion intensity, polarity, empathy labels
        text = self.data.iloc[index]['text']
        labels = self.data.iloc[index][['Emotion', 'EmotionalPolarity', 'Empathy']].values.astype(float)
        encoding = self.tokenizer(text,
                                  max_length=self.max_length,
                                  padding='max_length',
                                  truncation=True,
                                  return_tensors='pt')

        #tokenize into input IDs and attention masks
        return {'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze(),
                'labels': torch.tensor(labels, dtype=torch.float)}

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') #use BERT tokenizer

# create dataset objects
train_dataset = ConversationDataset(train_data, tokenizer)
dev_dataset = ConversationDataset(dev_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

In [None]:
def load_glove_embeddings(glove_path, tokenizer, embedding_dim=300):
    embeddings_index = {}
    with open(glove_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    # prepare embedding matrix
    vocab_size = tokenizer.vocab_size
    embedding_matrix = np.random.normal(size=(vocab_size, embedding_dim))  
    for word, idx in tokenizer.get_vocab().items():
        if idx >= vocab_size:  # stay within vocab size
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [None]:
#!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
#!unzip -q glove.840B.300d.zip

In [None]:
glove_path = "glove.840B.300d.txt"
embedding_dim = 300
embedding_matrix = load_glove_embeddings(glove_path, tokenizer, embedding_dim)

In [None]:
class LSTMAttentionModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_outputs, pad_idx, embedding_matrix=None):
        super(LSTMAttentionModel, self).__init__()

        # using our pretrained embeddings
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        if embedding_matrix is not None:
            self.embedding.weight.data.copy_(torch.tensor(embedding_matrix))
            self.embedding.weight.requires_grad = False  
            
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)

        # ensure that we use attention
        self.attention = nn.Linear(hidden_dim * 2, 1)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * 2, num_outputs)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask):
        embeddings = self.embedding(input_ids)
        embeddings = embeddings * attention_mask.unsqueeze(-1)

        lstm_out, _ = self.lstm(embeddings)

        # Attention weights
        attn_weights = torch.softmax(self.attention(lstm_out), dim=1)
        context_vector = torch.sum(attn_weights * lstm_out, dim=1)

        # Fully connected layer
        context_vector = self.dropout(context_vector)
        outputs = self.fc(context_vector)
        return outputs


In [None]:
hidden_dim = 256
num_outputs = 3  # Emotion, Polarity, Empathy
pad_idx = tokenizer.pad_token_id

# initialize the LSTM 
lstm_attention_model = LSTMAttentionModel(
    vocab_size=tokenizer.vocab_size,
    embed_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_outputs=num_outputs,
    pad_idx=pad_idx,
    embedding_matrix=embedding_matrix)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lstm_attention_model.to(device)

optimizer = AdamW(lstm_attention_model.parameters(), lr=0.005)
criterion = nn.MSELoss()

In [None]:
#training
epochs = 10
for epoch in range(epochs):
    lstm_attention_model.train()
    train_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = lstm_attention_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f"Epoch {epoch + 1}: Train Loss = {train_loss / len(train_loader):.4f}")


Epoch 1: Train Loss = 0.4950
Epoch 2: Train Loss = 0.3863
Epoch 3: Train Loss = 0.3526
Epoch 4: Train Loss = 0.3240
Epoch 5: Train Loss = 0.3027
Epoch 6: Train Loss = 0.2789
Epoch 7: Train Loss = 0.2607
Epoch 8: Train Loss = 0.2409
Epoch 9: Train Loss = 0.2314
Epoch 10: Train Loss = 0.2114


In [73]:
lstm_attention_model.eval()
predictions = []
val_labels = []

with torch.no_grad():
    for batch in dev_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = lstm_attention_model(input_ids, attention_mask)
        predictions.append(outputs.cpu().numpy())
        val_labels.append(labels.cpu().numpy())

#flatten predictions and labels into arrays
predictions = np.concatenate(predictions, axis=0)
val_labels = np.concatenate(val_labels, axis=0)

In [74]:
pearson_emotion = pearsonr(val_labels[:, 0], predictions[:, 0])[0]
pearson_emotional_polarity = pearsonr(val_labels[:, 1], predictions[:, 1])[0]
pearson_empathy = pearsonr(val_labels[:, 2], predictions[:, 2])[0]
average_pearson = (pearson_emotion + pearson_emotional_polarity + pearson_empathy) / 3
print("Emotion Intensity Pearson Score:", pearson_emotion)
print("Emotional Polarity Pearson Score:", pearson_emotional_polarity)
print("Empathy Pearson Score:", pearson_empathy)
print("Average Pearson Score:", average_pearson)


Emotion Intensity Pearson Score: 0.061023749641554055
Emotional Polarity Pearson Score: 0.08659075659901935
Empathy Pearson Score: 0.14882440724852586
Average Pearson Score: 0.09881297116303307


In [75]:
#use goldstandard test data
test_data = pd.read_csv('drive/My Drive/goldstandard_CONVT.csv',on_bad_lines='skip')

In [76]:
for col in ['Emotion', 'EmotionalPolarity', 'Empathy']:
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

test_data = test_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)

test_dataset = ConversationDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# evaluate the model on the test set
lstm_attention_model.eval()
test_predictions = []
test_labels = []

In [77]:
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = lstm_attention_model(input_ids, attention_mask)
        test_predictions.append(outputs.cpu().numpy())
        test_labels.append(labels.cpu().numpy())

In [78]:
test_predictions = np.concatenate(test_predictions, axis=0)
test_labels = np.concatenate(test_labels, axis=0)

In [79]:
pearson_emotion = pearsonr(test_labels[:, 0], test_predictions[:, 0])[0]
pearson_emotional_polarity = pearsonr(test_labels[:, 1], test_predictions[:, 1])[0]
pearson_empathy = pearsonr(test_labels[:, 2], test_predictions[:, 2])[0]
average_pearson = (pearson_emotion + pearson_emotional_polarity + pearson_empathy) / 3

print("Test Set Evaluation:")
print("Emotion Intensity Pearson Score:", pearson_emotion)
print("Emotional Polarity Pearson Score:", pearson_emotional_polarity)
print("Empathy Pearson Score:", pearson_empathy)
print("Average Pearson Score:", average_pearson)

Test Set Evaluation:
Emotion Intensity Pearson Score: 0.5585666899727699
Emotional Polarity Pearson Score: 0.5482412580472854
Empathy Pearson Score: 0.49180286080332924
Average Pearson Score: 0.5328702696077948
