In [15]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch.nn as nn
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from transformers import BertTokenizer

In [7]:
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
train_data = pd.read_csv('drive/My Drive/trac2_CONVT_train.csv', usecols = range(12), on_bad_lines='skip')
dev_data = pd.read_csv('drive/My Drive/trac2_CONVT_dev.csv', usecols=range(12), on_bad_lines='skip')


In [16]:
for col in ['Emotion', 'EmotionalPolarity', 'Empathy']:
    train_data[col] = pd.to_numeric(train_data[col], errors='coerce')
    dev_data[col] = pd.to_numeric(dev_data[col], errors='coerce')

train_data = train_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)
dev_data = dev_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)

# Sample smaller datasets for quick training (adjust or remove for full dataset)
train_data = train_data.sample(n=500, random_state=4400)
dev_data = dev_data.sample(n=100, random_state=4400)

In [17]:
class ConversationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        labels = self.data.iloc[index][['Emotion', 'EmotionalPolarity', 'Empathy']].values.astype(float)
        encoding = self.tokenizer(text,
                                  max_length=self.max_length,
                                  padding='max_length',
                                  truncation=True,
                                  return_tensors='pt')
        return {'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze(),
                'labels': torch.tensor(labels, dtype=torch.float)}

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset objects
train_dataset = ConversationDataset(train_data, tokenizer)
dev_dataset = ConversationDataset(dev_data, tokenizer)

# Wrap datasets in DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_outputs, pad_idx):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_outputs)  # Bi-LSTM doubles the hidden_dim
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask):
        embeddings = self.embedding(input_ids)  # (batch_size, seq_length, embed_dim)
        embeddings = embeddings * attention_mask.unsqueeze(-1)  # Mask padded tokens
        lstm_out, _ = self.lstm(embeddings)  # (batch_size, seq_length, hidden_dim*2)
        pooled_output = lstm_out[:, 0, :]  # Use the first token's output
        pooled_output = self.dropout(pooled_output)
        outputs = self.fc(pooled_output)
        return outputs

In [20]:
vocab_size = tokenizer.vocab_size
embed_dim = 128
hidden_dim = 256
num_outputs = 3  # Emotion, Polarity, Empathy
pad_idx = tokenizer.pad_token_id

# Initialize the LSTM model
lstm_model = LSTMModel(vocab_size, embed_dim, hidden_dim, num_outputs, pad_idx)


In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lstm_model.to(device)

optimizer = AdamW(lstm_model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [22]:
epochs = 10
for epoch in range(epochs):
    lstm_model.train()
    train_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = lstm_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f"Epoch {epoch + 1}: Train Loss = {train_loss / len(train_loader):.4f}")


Epoch 1: Train Loss = 1.0823
Epoch 2: Train Loss = 0.5421
Epoch 3: Train Loss = 0.4678
Epoch 4: Train Loss = 0.3910
Epoch 5: Train Loss = 0.3357
Epoch 6: Train Loss = 0.2912
Epoch 7: Train Loss = 0.2359
Epoch 8: Train Loss = 0.2025
Epoch 9: Train Loss = 0.1855
Epoch 10: Train Loss = 0.1532


In [23]:
lstm_model.eval()
predictions = []
val_labels = []

with torch.no_grad():
    for batch in dev_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = lstm_model(input_ids, attention_mask)
        predictions.append(outputs.cpu().numpy())
        val_labels.append(labels.cpu().numpy())

# Flatten predictions and labels into arrays
predictions = np.concatenate(predictions, axis=0)
val_labels = np.concatenate(val_labels, axis=0)

In [24]:
pearson_emotion = pearsonr(val_labels[:, 0], predictions[:, 0])[0]
pearson_emotional_polarity = pearsonr(val_labels[:, 1], predictions[:, 1])[0]
pearson_empathy = pearsonr(val_labels[:, 2], predictions[:, 2])[0]
average_pearson = (pearson_emotion + pearson_emotional_polarity + pearson_empathy) / 3
print("Emotion Intensity Pearson Score:", pearson_emotion)
print("Emotional Polarity Pearson Score:", pearson_emotional_polarity)
print("Empathy Pearson Score:", pearson_empathy)
print("Average Pearson Score:", average_pearson)


Emotion Intensity Pearson Score: 0.24313608396055872
Emotional Polarity Pearson Score: 0.30323640349604364
Empathy Pearson Score: 0.36107007094930155
Average Pearson Score: 0.30248085280196796


In [26]:
test_data = pd.read_csv('drive/My Drive/goldstandard_CONVT.csv',on_bad_lines='skip')


In [27]:
for col in ['Emotion', 'EmotionalPolarity', 'Empathy']:
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')

test_data = test_data.dropna(subset=['Emotion', 'EmotionalPolarity', 'Empathy']).reset_index(drop=True)

# Create a test dataset and DataLoader
test_dataset = ConversationDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Evaluate the model on the test set
lstm_model.eval()
test_predictions = []
test_labels = []

In [28]:
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = lstm_model(input_ids, attention_mask)
        test_predictions.append(outputs.cpu().numpy())
        test_labels.append(labels.cpu().numpy())

In [29]:
test_predictions = np.concatenate(test_predictions, axis=0)
test_labels = np.concatenate(test_labels, axis=0)

In [30]:
pearson_emotion = pearsonr(test_labels[:, 0], test_predictions[:, 0])[0]
pearson_emotional_polarity = pearsonr(test_labels[:, 1], test_predictions[:, 1])[0]
pearson_empathy = pearsonr(test_labels[:, 2], test_predictions[:, 2])[0]
average_pearson = (pearson_emotion + pearson_emotional_polarity + pearson_empathy) / 3

print("Test Set Evaluation:")
print("Emotion Intensity Pearson Score:", pearson_emotion)
print("Emotional Polarity Pearson Score:", pearson_emotional_polarity)
print("Empathy Pearson Score:", pearson_empathy)
print("Average Pearson Score:", average_pearson)

Test Set Evaluation:
Emotion Intensity Pearson Score: 0.35446060177540845
Emotional Polarity Pearson Score: 0.20057567499321396
Empathy Pearson Score: 0.3492789054110515
Average Pearson Score: 0.3014383940598913
