# Task 1: Research Emotion Labels
Testing which emotion labels the model outputs.


In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

# Hugging Face checkpoint: T5-base fine-tuned for emotion classification.
MODEL_NAME = "mrm8488/t5-base-finetuned-emotion"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# AutoModelWithLMHead is deprecated; AutoModelForSeq2SeqLM is the supported
# auto class for encoder-decoder checkpoints such as T5.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def get_emotion(text, skip_special_tokens=False):
    """Predict the emotion label for ``text`` using the module-level model.

    Args:
        text: Input string to classify.
        skip_special_tokens: When True, special tokens (e.g. ``<pad>``) are
            removed from the decoded label and surrounding whitespace is
            stripped, giving a clean label such as ``"joy"``. Defaults to
            False, which returns the raw decoded string (e.g. ``"<pad> joy"``)
            exactly as before.

    Returns:
        The decoded label string for the generated sequence.
    """
    # The end-of-sequence marker is appended explicitly; inputs longer than
    # 512 tokens are truncated.
    input_ids = tokenizer.encode(
        text + '</s>',
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )

    # max_length=2 caps the generated sequence at two tokens; the recorded
    # notebook output shows these as a leading <pad> followed by the label.
    output = model.generate(input_ids=input_ids, max_length=2)

    # A single input yields a single generated sequence, so decode just that one.
    label = tokenizer.decode(output[0], skip_special_tokens=skip_special_tokens)
    return label.strip() if skip_special_tokens else label

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
import pandas as pd
# Read the lyrics dataset from the working directory and keep the 'text'
# column as a Series for later processing.
lyrics = pd.read_csv(
    'spotify_millsongdata.csv',
)
all_lyrics = lyrics.loc[:, 'text']

In [2]:
# Probe the classifier with hand-written sentences covering a range of moods,
# collecting every distinct label it produces.
test_texts = [
    "I am so happy and excited about this wonderful day!",
    "I feel so sad and lonely, everything is falling apart.",
    "I am furious and angry about what happened!",
    "I am terrified and scared of what might happen.",
    "I love you so much, you mean everything to me.",
    "Wow, I didn't expect that at all! What a surprise!",
    "I feel so calm and peaceful right now.",
    "This is the best thing ever, I'm thrilled!",
    "I'm feeling down and depressed today.",
    "How dare you do this to me! I'm outraged!",
]

print("Testing emotion detection on sample texts:")
print("=" * 60)

# Raw labels exactly as returned by get_emotion (special tokens included).
emotions_found = set()

for text in test_texts:
    emotion = get_emotion(text)
    emotions_found.add(emotion)

    # Show at most the first 50 characters of the input, then the prediction.
    print(f"Text: {text[:50]}...")
    print(f"Emotion: {emotion}")
    print("-" * 60)


Testing emotion detection on sample texts:
Text: I am so happy and excited about this wonderful day...
Emotion: <pad> joy
------------------------------------------------------------
Text: I feel so sad and lonely, everything is falling ap...
Emotion: <pad> sadness
------------------------------------------------------------
Text: I am furious and angry about what happened!...
Emotion: <pad> anger
------------------------------------------------------------
Text: I am terrified and scared of what might happen....
Emotion: <pad> fear
------------------------------------------------------------
Text: I love you so much, you mean everything to me....
Emotion: <pad> love
------------------------------------------------------------
Text: Wow, I didn't expect that at all! What a surprise!...
Emotion: <pad> surprise
------------------------------------------------------------
Text: I feel so calm and peaceful right now....
Emotion: <pad> joy
---------------------------------------------------

In [3]:
# Summarise the distinct raw labels collected above, in sorted order.
print(
    "\nAll unique emotions detected by the model:",
    sorted(emotions_found),
    sep="\n",
)



All unique emotions detected by the model:
['<pad> anger', '<pad> fear', '<pad> joy', '<pad> love', '<pad> sadness', '<pad> surprise']


In [4]:
# Map each emotion label to the token IDs the tokenizer assigns to it.
#
# The raw labels still contain special tokens, so strip those first. Building
# a set drops empty strings and duplicates; sorting gives a stable order, since
# iterating a set of strings directly can differ between kernel restarts.
clean_emotions = sorted({
    cleaned
    for cleaned in (
        raw.replace('<pad>', '').replace('</s>', '').strip()
        for raw in emotions_found
    )
    if cleaned
})

print("\nToken IDs for each emotion:")
emotion_token_map = {}
for emotion in clean_emotions:
    tokens = tokenizer.tokenize(emotion)
    # add_special_tokens=False keeps only the IDs of the label text itself.
    token_ids = tokenizer.encode(emotion, add_special_tokens=False)
    emotion_token_map[emotion] = token_ids
    print(f"{emotion}: tokens={tokens}, ids={token_ids}")



Token IDs for each emotion:
joy: tokens=['▁joy'], ids=[3922]
sadness: tokens=['▁sadness'], ids=[24784]
anger: tokens=['▁anger'], ids=[11213]
love: tokens=['▁love'], ids=[333]
surprise: tokens=['▁surprise'], ids=[4158]
fear: tokens=['▁fear'], ids=[2971]


In [5]:
# Inspect the model's raw next-token distribution directly rather than
# going through generate().
import torch

test_text = "I am so happy and excited about this wonderful day!"
input_ids = tokenizer.encode(
    test_text + '</s>',
    return_tensors='pt',
    max_length=512,
    truncation=True,
)

print("Testing raw model output:")
print(f"Input shape: {input_ids.shape}")

# A direct forward pass needs explicit decoder inputs; seed the decoder with
# the start token taken from the model config.
decoder_start_token_id = model.config.decoder_start_token_id
decoder_input_ids = torch.tensor([[decoder_start_token_id]])

# Only the forward pass needs gradient tracking disabled; tensors derived
# from its output afterwards carry no gradient history either.
with torch.no_grad():
    outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
logits = outputs.logits

print(f"Logits shape: {logits.shape}")
print(f"Vocabulary size: {model.config.vocab_size}")

# Convert the logits at the first decoder position into probabilities.
probs = torch.softmax(logits[0, 0, :], dim=-1)

# Keep the ten highest-probability candidates.
top_probs, top_indices = torch.topk(probs, k=10)

print("\nTop 10 most likely tokens:")
for prob, idx in zip(top_probs, top_indices):
    token = tokenizer.decode([idx.item()])
    print(f"{token}: {prob.item():.4f}")


Testing raw model output:
Input shape: torch.Size([1, 13])
Logits shape: torch.Size([1, 1, 32128])
Vocabulary size: 32128

Top 10 most likely tokens:
joy: 0.9995
love: 0.0003
happiness: 0.0001
surprise: 0.0000
Joy: 0.0000
sadness: 0.0000
delight: 0.0000
joyful: 0.0000
excitement: 0.0000
happy: 0.0000
