In [1]:
import pandas as pd
import ast
import numpy as np

In [17]:
data = pd.read_csv("/work/ptyagi/masterthesis/data/predictions/feb/averaged_predictions.csv")
annotated = pd.read_csv("/work/ptyagi/masterthesis/data/tmp/annotations_and_predictions.csv")

In [21]:
df = pd.merge(data[['conversation_id', 'tweet_text','roberta-large-predictions','mapped_predictions']],
annotated[['manual_label', 'conversation_id', 'replies']], on='conversation_id', how='inner')

In [34]:
df.head()

Unnamed: 0,conversation_id,tweet_text,roberta-large-predictions,mapped_predictions,manual_label,replies,vec_11,label_id
0,1096376667117895680,This is the scene in Brighton as a protest ove...,"[('anger', 0.728320837020874), ('sadness', 0.3...","[('fear', 0.08991589451929098), ('disgust', 0....",joy,student climate change protest coverage in sha...,"[0.1012616, 0.36272985, 0.026058132, 0.0158791...",3
1,1096396055678255105,Young people concerned about climate change ga...,"[('optimism', 0.9539227485656738), ('anticipat...","[('fear', 0.1304264675767046), ('disgust', 0.0...",sadness,it is everybody s planet but in the last years...,"[0.7133801, 0.2396769, 0.95392275, 0.048316047...",1
2,1096384227707404288,More crazy Swiss transport socialism. If we ha...,"[('optimism', 0.8962536454200745), ('joy', 0.7...","[('fear', 0.007606922902868629), ('disgust', 0...",sadness,on that we are agreed however the question is ...,"[0.44626218, 0.030970134, 0.89625365, 0.064893...",1
3,1096419453884133376,This gave me so much hope today! Amazing to se...,"[('optimism', 0.9923896193504333), ('joy', 0.9...","[('fear', 0.012285791733605866), ('disgust', 0...",anger,cab drivers are just like workers in many othe...,"[0.1598584, 0.009998838, 0.9923896, 0.03803576...",0
4,1096435275142742016,We're tacking action on #climatechange 🌳 [URL],"[('optimism', 0.9773759841918945), ('anticipat...","[('fear', 0.04512464413599401), ('disgust', 0....",sadness,this is not good flaring at offshore installat...,"[0.42068568, 0.07937623, 0.977376, 0.013604169...",1


In [23]:
EMOTIONS_11 = [
    'anticipation', 'sadness', 'optimism', 'surprise', 'fear',
    'disgust', 'joy', 'pessimism', 'anger', 'trust', 'love'
]

def parse_11_dist(dist_11_tuples):
    emotion2prob = dict(ast.literal_eval(dist_11_tuples))
    vector = []
    for emo in EMOTIONS_11:
        vector.append(emotion2prob.get(emo, 0.0))
    return np.array(vector, dtype=np.float32)

df['vec_11'] = df['roberta-large-predictions'].apply(parse_11_dist)
print(df['vec_11'])


0     [0.1012616, 0.36272985, 0.026058132, 0.0158791...
1     [0.7133801, 0.2396769, 0.95392275, 0.048316047...
2     [0.44626218, 0.030970134, 0.89625365, 0.064893...
3     [0.1598584, 0.009998838, 0.9923896, 0.03803576...
4     [0.42068568, 0.07937623, 0.977376, 0.013604169...
                            ...                        
94    [0.073052146, 0.15087493, 0.86403126, 0.011630...
95    [0.05085175, 0.23804663, 0.02445328, 0.0103589...
96    [0.05085175, 0.23804663, 0.02445328, 0.0103589...
97    [0.08676346, 0.59624434, 0.02289635, 0.0137726...
98    [0.054532796, 0.193984, 0.008681102, 0.1395535...
Name: vec_11, Length: 99, dtype: object


In [24]:
POSSIBLE_6_LABELS = ['anger', 'sadness', 'fear', 'joy', 'disgust', 'surprise']

label2id = {label: i for i, label in enumerate(POSSIBLE_6_LABELS)}

def encode_label_6(label):
    return label2id[label]

df['label_id'] = df['manual_label'].apply(encode_label_6)
print(df['label_id'])

0     3
1     1
2     1
3     0
4     1
     ..
94    1
95    0
96    2
97    4
98    5
Name: label_id, Length: 99, dtype: int64


In [55]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("Train size:", len(train_df))
print("Test size: ", len(test_df))


Train size: 79
Test size:  20


In [57]:
import torch
from torch.utils.data import Dataset, DataLoader

class EmotionDataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
        
        # Convert everything into torch tensors once, so we don't do it on-the-fly
        self.X = torch.tensor(np.stack(self.df['vec_11'].values), dtype=torch.float32)
        self.y = torch.tensor(self.df['label_id'].values, dtype=torch.long)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = EmotionDataset(train_df)
test_dataset  = EmotionDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=2, shuffle=False)


In [58]:
import torch.nn as nn
import torch.optim as optim

class MappingModel(nn.Module):
    def __init__(self, in_features=11, out_features=6):
        super(MappingModel, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(in_features, 20),
            nn.Linear(20, out_features)
        )
        
    def forward(self, x):
        """
        x: (batch_size, 11) input
        returns: (batch_size, 6) logits
        """
        return self.mlp(x)  


In [59]:
model = MappingModel(in_features=11, out_features=6)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-03)

num_epochs = 10 

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        logits = model(X_batch)             
        loss = criterion(logits, y_batch) 
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * X_batch.size(0)
    
    avg_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")


Epoch 1, Loss: 1.8419
Epoch 2, Loss: 1.7938
Epoch 3, Loss: 1.7617
Epoch 4, Loss: 1.7363
Epoch 5, Loss: 1.7140
Epoch 6, Loss: 1.6999
Epoch 7, Loss: 1.6863
Epoch 8, Loss: 1.6778
Epoch 9, Loss: 1.6675
Epoch 10, Loss: 1.6610


In [60]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        logits = model(X_batch)               
        predictions = torch.argmax(logits, dim=1)  
        correct += (predictions == y_batch).sum().item()
        total   += y_batch.size(0)

accuracy = correct / total if total > 0 else 0.0
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 45.00%


In [61]:
model.eval()

# Suppose new_dist is a list of (emotion, probability) tuples from your 11-label model
new_dist = "[('anticipation', 0.1), ('sadness', 0.2), ('optimism', 0.2),('surprise', 0.15), ('fear', 0.05), ('disgust', 0.02),('joy', 0.08), ('pessimism', 0.08), ('anger', 0.05),('trust', 0.05), ('love', 0.02)]"

x_vec = parse_11_dist(new_dist)           # shape (11,)
x_tensor = torch.tensor(x_vec).unsqueeze(0)  # (1, 11)

with torch.no_grad():
    logits = model(x_tensor)             # (1, 6)
    pred_id = torch.argmax(logits, dim=1).item()  # get integer label
    # Map back to the 6-label name
    id2label = {v: k for k, v in label2id.items()}
    predicted_label = id2label[pred_id]
    print("Predicted 6-label:", predicted_label)


Predicted 6-label: anger


In [67]:
sample_dist_11 = [0.04657082, 0.01401192, 0.6148004 , 0.04374156, 0.00305552, 0.05579609, 0.997147  , 0.00530904, 0.3977998 , 0.01399408, 0.10514057]

In [68]:
import torch
import torch.nn.functional as F

# 1) Convert your sample distribution to a tensor
x_11 = torch.tensor(sample_dist_11, dtype=torch.float32).unsqueeze(0)  
# shape: (1, 11)

# 2) Forward pass: get raw logits
model.eval()
with torch.no_grad():
    logits_6 = model(x_11)  # shape: (1, 6)

# 3) Convert logits to probabilities with softmax
prob_dist_6 = F.softmax(logits_6, dim=1)  # shape: (1, 6)

# 4) prob_dist_6 is your final 6-emotion distribution 
print(prob_dist_6)


tensor([[0.2372, 0.1773, 0.0703, 0.1998, 0.1485, 0.1668]])


In [70]:
test_df["mapped_predictions"].iloc[0]

"[('fear', 0.003640935125511166), ('disgust', 0.02428697654795146), ('joy', 0.7535070391287398), ('surprise', 0.03931125813234599), ('sadness', 0.006099123162103546), ('anger', 0.173154667903348)]"