In [1]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")

model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-emotion")

def get_emotion(text):
  input_ids = tokenizer.encode(text + '</s>', return_tensors='pt', max_length=512,
  truncation=True)

  output = model.generate(input_ids=input_ids,
               max_length=2)
  
  dec = [tokenizer.decode(ids) for ids in output]
  label = dec[0]
  return label

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
import pandas as pd
lyrics = pd.read_csv('datasets/spotify_millsongdata.csv')
all_lyrics = lyrics['text']

In [4]:
from tqdm import tqdm
tqdm.pandas()

lyrics['mood'] = lyrics['text'].progress_apply(get_emotion)

def get_emotion_with_progress(text, index, total):
  if index % 10 == 0:  # Print every 10th item
      print(f"Processing {index}/{total}")
  return get_emotion(text)

total = len(lyrics)
lyrics['mood'] = [get_emotion_with_progress(text, i, total)
                for i, text in enumerate(lyrics['text'])]

  0%|          | 18/57650 [00:07<6:31:53,  2.45it/s]


KeyboardInterrupt: 

In [None]:
lyrics['mood'] = lyrics['text'].apply(get_emotion)
print(lyrics)

## Probability Distribution Function
New function to get emotion probability distributions instead of just a single label


In [5]:
import torch
from transformers import AutoModelForSeq2SeqLM

# Load model using the recommended class
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")

# Emotion token IDs (discovered from model testing)
EMOTION_IDS = {
    'joy': 3922,
    'sadness': 24784,
    'anger': 11213,
    'fear': 2971,
    'love': 333,
    'surprise': 4158
}

def get_emotion_distribution(text):
    """
    Get probability distribution over all emotions for a given text.
    
    Args:
        text: Input text (song lyrics)
    
    Returns:
        Dictionary with emotion labels as keys and probabilities as values
        e.g., {'joy': 0.85, 'sadness': 0.10, 'anger': 0.02, ...}
    """
    try:
        # Tokenize input
        input_ids = tokenizer.encode(text + '</s>', return_tensors='pt', max_length=512, truncation=True)
        
        # Get model output with logits
        with torch.no_grad():
            decoder_start_token_id = model.config.decoder_start_token_id
            decoder_input_ids = torch.tensor([[decoder_start_token_id]])
            
            outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
            logits = outputs.logits[0, 0, :]  # First token position
            
            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)
            
            # Extract probabilities for each emotion
            emotion_probs = {}
            for emotion, token_id in EMOTION_IDS.items():
                emotion_probs[emotion] = float(probs[token_id].item())
            
            return emotion_probs
    
    except Exception as e:
        print(f"Error processing text: {e}")
        # Return uniform distribution on error
        return {emotion: 1.0/len(EMOTION_IDS) for emotion in EMOTION_IDS.keys()}


## Test on First 10 Songs


In [6]:
import pandas as pd

# Read the dataset
print("Loading dataset...")
lyrics_df = pd.read_csv('datasets/spotify_millsongdata.csv')

print(f"Total songs in dataset: {len(lyrics_df)}")
print(f"\nDataset columns: {lyrics_df.columns.tolist()}")

# Get first 10 songs for testing
test_df = lyrics_df.head(10).copy()

print(f"\nProcessing first 10 songs...")
print("=" * 80)


Loading dataset...
Total songs in dataset: 57650

Dataset columns: ['artist', 'song', 'link', 'text']

Processing first 10 songs...


In [7]:
# Apply the emotion distribution function to first 10 songs
from tqdm import tqdm

emotion_distributions = []

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing songs"):
    text = row['text']
    
    # Get emotion distribution
    emotion_dist = get_emotion_distribution(text)
    emotion_distributions.append(emotion_dist)
    
    # Also get the original single-label prediction for comparison
    single_label = get_emotion(text)
    
    # Print results for each song
    print(f"\nSong {idx + 1}:")
    print(f"  Text preview: {text[:100]}...")
    print(f"  Original prediction: {single_label}")
    print(f"  Probability distribution:")
    for emotion, prob in sorted(emotion_dist.items(), key=lambda x: x[1], reverse=True):
        print(f"    {emotion:10}: {prob:.4f} ({prob*100:.2f}%)")
    print("-" * 80)

# Add emotion distribution as a new column
test_df['emotion_distribution'] = emotion_distributions

print("\n✓ Processing complete!")


Processing songs:  10%|█         | 1/10 [00:00<00:08,  1.06it/s]


Song 1:
  Text preview: Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way t...
  Original prediction: <pad> joy
  Probability distribution:
    joy       : 0.9996 (99.96%)
    love      : 0.0003 (0.03%)
    sadness   : 0.0000 (0.00%)
    surprise  : 0.0000 (0.00%)
    anger     : 0.0000 (0.00%)
    fear      : 0.0000 (0.00%)
--------------------------------------------------------------------------------


Processing songs:  20%|██        | 2/10 [00:01<00:07,  1.14it/s]


Song 2:
  Text preview: Take it easy with me, please  
Touch me gently like a summer evening breeze  
Take your time, make...
  Original prediction: <pad> love
  Probability distribution:
    love      : 0.9602 (96.02%)
    joy       : 0.0388 (3.88%)
    sadness   : 0.0003 (0.03%)
    surprise  : 0.0001 (0.01%)
    anger     : 0.0001 (0.01%)
    fear      : 0.0000 (0.00%)
--------------------------------------------------------------------------------


Processing songs:  30%|███       | 3/10 [00:02<00:06,  1.15it/s]


Song 3:
  Text preview: I'll never know why I had to go  
Why I had to put up such a lousy rotten show  
Boy, I was tough,...
  Original prediction: <pad> sadness
  Probability distribution:
    sadness   : 0.9332 (93.32%)
    joy       : 0.0481 (4.81%)
    anger     : 0.0123 (1.23%)
    fear      : 0.0031 (0.31%)
    love      : 0.0021 (0.21%)
    surprise  : 0.0001 (0.01%)
--------------------------------------------------------------------------------


Processing songs:  40%|████      | 4/10 [00:03<00:05,  1.08it/s]


Song 4:
  Text preview: Making somebody happy is a question of give and take  
You can learn how to show it so come on, giv...
  Original prediction: <pad> joy
  Probability distribution:
    joy       : 0.7256 (72.56%)
    love      : 0.2522 (25.22%)
    anger     : 0.0192 (1.92%)
    sadness   : 0.0019 (0.19%)
    surprise  : 0.0001 (0.01%)
    fear      : 0.0001 (0.01%)
--------------------------------------------------------------------------------


Processing songs:  50%|█████     | 5/10 [00:04<00:04,  1.08it/s]


Song 5:
  Text preview: Making somebody happy is a question of give and take  
You can learn how to show it so come on, giv...
  Original prediction: <pad> joy
  Probability distribution:
    joy       : 0.7391 (73.91%)
    love      : 0.1810 (18.10%)
    anger     : 0.0634 (6.34%)
    sadness   : 0.0148 (1.48%)
    fear      : 0.0001 (0.01%)
    surprise  : 0.0001 (0.01%)
--------------------------------------------------------------------------------


Processing songs:  60%|██████    | 6/10 [00:05<00:03,  1.23it/s]


Song 6:
  Text preview: Well, you hoot and you holler and you make me mad  
And I've always been under your heel  
Holy ch...
  Original prediction: <pad> anger
  Probability distribution:
    anger     : 0.6258 (62.58%)
    sadness   : 0.3524 (35.24%)
    joy       : 0.0178 (1.78%)
    fear      : 0.0018 (0.18%)
    love      : 0.0011 (0.11%)
    surprise  : 0.0001 (0.01%)
--------------------------------------------------------------------------------


Processing songs:  70%|███████   | 7/10 [00:06<00:02,  1.12it/s]


Song 7:
  Text preview: Down in the street they're all singing and shouting  
Staying alive though the city is dead  
Hidi...
  Original prediction: <pad> sadness
  Probability distribution:
    sadness   : 0.9485 (94.85%)
    joy       : 0.0219 (2.19%)
    anger     : 0.0129 (1.29%)
    fear      : 0.0096 (0.96%)
    love      : 0.0053 (0.53%)
    surprise  : 0.0002 (0.02%)
--------------------------------------------------------------------------------


Processing songs:  80%|████████  | 8/10 [00:07<00:01,  1.10it/s]


Song 8:
  Text preview: Chiquitita, tell me what's wrong  
You're enchained by your own sorrow  
In your eyes there is no ...
  Original prediction: <pad> sadness
  Probability distribution:
    sadness   : 0.9723 (97.23%)
    joy       : 0.0147 (1.47%)
    fear      : 0.0053 (0.53%)
    anger     : 0.0032 (0.32%)
    love      : 0.0031 (0.31%)
    surprise  : 0.0001 (0.01%)
--------------------------------------------------------------------------------


Processing songs:  90%|█████████ | 9/10 [00:08<00:00,  1.09it/s]


Song 9:
  Text preview: I was out with the morning sun  
Couldn't sleep, so I thought I'd take a walk  
I was thinking of ...
  Original prediction: <pad> surprise
  Probability distribution:
    surprise  : 0.8947 (89.47%)
    fear      : 0.0561 (5.61%)
    joy       : 0.0228 (2.28%)
    anger     : 0.0130 (1.30%)
    sadness   : 0.0093 (0.93%)
    love      : 0.0010 (0.10%)
--------------------------------------------------------------------------------


Processing songs: 100%|██████████| 10/10 [00:08<00:00,  1.16it/s]


Song 10:
  Text preview: I'm waitin' for you baby  
I'm sitting all alone  
I feel so cold without you  
It chills me to t...
  Original prediction: <pad> anger
  Probability distribution:
    anger     : 0.9965 (99.65%)
    sadness   : 0.0026 (0.26%)
    love      : 0.0004 (0.04%)
    fear      : 0.0003 (0.03%)
    joy       : 0.0001 (0.01%)
    surprise  : 0.0000 (0.00%)
--------------------------------------------------------------------------------

✓ Processing complete!





## Save Results to CSV


In [8]:
# Also create separate columns for each emotion probability for easier analysis
for emotion in EMOTION_IDS.keys():
    test_df[f'prob_{emotion}'] = test_df['emotion_distribution'].apply(lambda x: x[emotion])

# Save to CSV
output_file = 'datasets/lyrics_mood_with_distributions_10songs.csv'
test_df.to_csv(output_file, index=False)

print(f"✓ Results saved to: {output_file}")
print(f"\nDataFrame shape: {test_df.shape}")
print(f"\nColumns in output:")
for col in test_df.columns:
    print(f"  - {col}")

# Display summary
print("\n" + "=" * 80)
print("SUMMARY OF FIRST 10 SONGS")
print("=" * 80)
print(test_df[['text', 'prob_joy', 'prob_sadness', 'prob_anger', 'prob_fear', 'prob_love', 'prob_surprise']].head(10))


✓ Results saved to: datasets/lyrics_mood_with_distributions_10songs.csv

DataFrame shape: (10, 11)

Columns in output:
  - artist
  - song
  - link
  - text
  - emotion_distribution
  - prob_joy
  - prob_sadness
  - prob_anger
  - prob_fear
  - prob_love
  - prob_surprise

SUMMARY OF FIRST 10 SONGS
                                                text  prob_joy  prob_sadness  \
0  Look at her face, it's a wonderful face  \r\nA...  0.999586      0.000019   
1  Take it easy with me, please  \r\nTouch me gen...  0.038810      0.000266   
2  I'll never know why I had to go  \r\nWhy I had...  0.048142      0.933231   
3  Making somebody happy is a question of give an...  0.725644      0.001899   
4  Making somebody happy is a question of give an...  0.739095      0.014799   
5  Well, you hoot and you holler and you make me ...  0.017800      0.352353   
6  Down in the street they're all singing and sho...  0.021921      0.948496   
7  Chiquitita, tell me what's wrong  \r\nYou're e...  0.0146

## Validation: Check that probabilities sum to ~1.0


In [9]:
# Validate that probabilities sum to approximately 1.0
prob_cols = ['prob_joy', 'prob_sadness', 'prob_anger', 'prob_fear', 'prob_love', 'prob_surprise']
test_df['prob_sum'] = test_df[prob_cols].sum(axis=1)

print("Validation: Sum of all emotion probabilities for each song")
print("=" * 80)
print(test_df[['prob_sum']].describe())

print("\n✓ All probability sums:")
for idx, row in test_df.iterrows():
    print(f"  Song {idx + 1}: {row['prob_sum']:.6f} ({row['prob_sum']*100:.2f}%)")

if (test_df['prob_sum'] > 0.99).all() and (test_df['prob_sum'] < 1.01).all():
    print("\n✓✓ SUCCESS: All probabilities sum to approximately 1.0!")
else:
    print("\n⚠ WARNING: Some probability sums are outside expected range.")


Validation: Sum of all emotion probabilities for each song
        prob_sum
count  10.000000
mean    0.998875
std     0.000872
min     0.996851
25%     0.998576
50%     0.998932
75%     0.999319
max     0.999906

✓ All probability sums:
  Song 1: 0.999892 (99.99%)
  Song 2: 0.999389 (99.94%)
  Song 3: 0.998938 (99.89%)
  Song 4: 0.999109 (99.91%)
  Song 5: 0.998508 (99.85%)
  Song 6: 0.998927 (99.89%)
  Song 7: 0.998447 (99.84%)
  Song 8: 0.998779 (99.88%)
  Song 9: 0.996851 (99.69%)
  Song 10: 0.999906 (99.99%)

✓✓ SUCCESS: All probabilities sum to approximately 1.0!
