In [2]:
# Install required packages
!pip install -U transformers
!pip install bert-score sentence-transformers rouge-score nltk textstat
!pip install -q -U evaluate

import os
os.environ["WANDB_DISABLED"] = "true"

import nltk
nltk.download("punkt")

import pandas as pd
import numpy as np
import random
import torch
from datasets import Dataset
from transformers import (
    pipeline,
    T5TokenizerFast,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments
)

import evaluate
import textstat
from sentence_transformers import SentenceTransformer, util



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Mount Google Drive to load/save data and models
from google.colab import drive
drive.mount('/content/drive')

from huggingface_hub import login
from getpass import getpass

token = getpass("Enter your Hugging Face token: ")
login(token=token)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter your Hugging Face token: ··········


In [4]:
# Load your Excel datasets (make sure these paths are correct)
train_file = '/content/drive/MyDrive/Colab Notebooks/DATASCI 266/Final Project/train_set.xlsx'
val_file = '/content/drive/MyDrive/Colab Notebooks/DATASCI 266/Final Project/val_set.xlsx'
test_file = '/content/drive/MyDrive/Colab Notebooks/DATASCI 266/Final Project/test_set.xlsx'

df_train = pd.read_excel(train_file).dropna(subset=['line1', 'line2'])
df_val = pd.read_excel(val_file).dropna(subset=['line1', 'line2'])
df_test = pd.read_excel(test_file).dropna(subset=['line1', 'line2'])

In [5]:
from transformers import pipeline

# Use the model's native labels
EMOTIONS = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]
emotion_special_tokens = [f"<emotion_{e}>" for e in EMOTIONS]

emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=1
)

def predict_emotion(text):
    try:
        result = emotion_classifier(text, top_k=1)[0]  # ✅ FIX: only one [0]
        label = result["label"].lower()

        # Ensure label matches your EMOTIONS list
        if label not in EMOTIONS:
            if label == "happiness":
                label = "joy"
            elif label == "disgust":
                label = "disgust"
            else:
                label = "neutral"

        return label
    except Exception as e:
        print("Error:", e)
        return "neutral"

print("Predicting emotions...")
df_train['emotion'] = df_train['line1'].apply(predict_emotion)
df_val['emotion'] = df_val['line1'].apply(predict_emotion)
df_test['emotion'] = df_test['line1'].apply(predict_emotion)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


Predicting emotions...


  return forward_call(*args, **kwargs)
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


In [6]:
# Show first 10 rows of the training dataset with predicted emotions
print(df_train[['line1', 'emotion']].head(30))

# Show distribution of predicted emotions
print(df_train['emotion'].value_counts())

# Show a few random samples from test data
print(df_test[['line1', 'emotion']].sample(30))


                                                line1   emotion
0                       My midwest slabbers - ya dig?  surprise
1                                               Weezy   neutral
2                       I think you got false courage   neutral
3                              You can try lock me up   neutral
4          Walk with that check on my back (Lil Boat)   neutral
5         Burnt CD's and trees like this was Broadway   neutral
6                Hatred and attitude tear us entirely     anger
7          Bitch, I’m a pro, you a amateur, ugh (Ugh)   disgust
8                       - Lil Uzi Vert - Money Longer   neutral
9                                          Deez nuts!   disgust
10                 We don't want to fuck on your main     anger
11                                           Yah, yee   disgust
12             Lemme fuck some, lemme fuck some bitch     anger
13                  Got a brick the same color Alaska   neutral
14  Uh, and every day I wake up praying 

In [7]:
# Create prompt
def create_prompt_with_emotion_token(row):
    emotion = row['emotion'] if row['emotion'] in EMOTIONS else "neutral"
    emotion_token = f"<emotion_{emotion}>"
    return f"{emotion_token} Given this song lyric line, generate the next song lyric line: {row['line1']}"

df_train['input_text'] = df_train.apply(create_prompt_with_emotion_token, axis=1)
df_val['input_text'] = df_val.apply(create_prompt_with_emotion_token, axis=1)
df_test['input_text'] = df_test.apply(create_prompt_with_emotion_token, axis=1)

df_train['target_text'] = df_train['line2']
df_val['target_text'] = df_val['line2']
df_test['target_text'] = df_test['line2']

In [8]:
# Load tokenizer and model
model_name = "google/flan-t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Add special tokens
tokenizer.add_tokens(emotion_special_tokens)
model.resize_token_embeddings(len(tokenizer))
model.gradient_checkpointing_enable()

# Convert to Dataset
train_dataset = Dataset.from_pandas(df_train[['input_text', 'target_text']])
val_dataset = Dataset.from_pandas(df_val[['input_text', 'target_text']])
test_dataset = Dataset.from_pandas(df_test[['input_text', 'target_text']])

In [9]:
from transformers import DataCollatorForSeq2Seq

# Preprocess
def preprocess_function(examples):
    inputs = tokenizer(examples['input_text'], max_length=128, padding='max_length', truncation=True)
    labels = tokenizer(examples['target_text'], max_length=64, padding='max_length', truncation=True)
    inputs['labels'] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels['input_ids']
    ]
    return inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Map:   0%|          | 0/79786 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/19939 [00:00<?, ? examples/s]

In [10]:
# Training arguments
training_args = TrainingArguments(
    output_dir="drive/MyDrive/Colab Notebooks/w266/Project/models/flan-t5-finetuned_best_prompt",
    per_device_train_batch_size=50,
    per_device_eval_batch_size=50,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=1000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

model = T5ForConditionalGeneration.from_pretrained(model_name)

# Train
trainer.train()

# Save model
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1000,3.4376


Step,Training Loss
1000,3.4376
2000,3.2833
3000,3.236
4000,3.1918


('drive/MyDrive/Colab Notebooks/w266/Project/models/flan-t5-finetuned_best_prompt/tokenizer_config.json',
 'drive/MyDrive/Colab Notebooks/w266/Project/models/flan-t5-finetuned_best_prompt/special_tokens_map.json',
 'drive/MyDrive/Colab Notebooks/w266/Project/models/flan-t5-finetuned_best_prompt/spiece.model',
 'drive/MyDrive/Colab Notebooks/w266/Project/models/flan-t5-finetuned_best_prompt/added_tokens.json',
 'drive/MyDrive/Colab Notebooks/w266/Project/models/flan-t5-finetuned_best_prompt/tokenizer.json')

In [11]:
# Generate predictions
model.eval()
batch_size = 16
results = []

for i in range(0, len(df_test), batch_size):
    batch_df = df_test.iloc[i:i+batch_size]
    prompts = batch_df.apply(create_prompt_with_emotion_token, axis=1).tolist()
    true_lines = batch_df['line2'].tolist()

    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=30,
            temperature=0.8,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id
        )
    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    for prompt, true_line, gen_line in zip(prompts, true_lines, generated_texts):
        results.append({
            "prompt": prompt,
            "actual_line2": true_line,
            "generated_line2": gen_line
        })

df_results = pd.DataFrame(results)

In [12]:
# Evaluation setup
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
sbert = SentenceTransformer("all-MiniLM-L6-v2")

def calc_bleu(preds, refs):
    return bleu.compute(predictions=preds, references=refs)["bleu"]

def calc_rouge(preds, refs):
    r = rouge.compute(predictions=preds, references=refs)
    return r["rouge1"], r["rouge2"], r["rougeL"]

def calc_bertscore(preds, refs):
    b = bertscore.compute(predictions=preds, references=refs, lang="en")
    return np.mean(b["f1"])

def calc_sbert(preds, refs):
    sims = [util.cos_sim(sbert.encode(p), sbert.encode(r))[0][0].item() for p, r in zip(preds, refs)]
    return np.mean(sims)

def rhyme_score(gen, ref):
    g_last = gen.strip().split()[-1][-2:] if gen.strip() else ""
    r_last = ref.strip().split()[-1][-2:] if ref.strip() else ""
    return int(g_last == r_last)

def syllable_diff(g, r):
    return abs(textstat.syllable_count(g) - textstat.syllable_count(r))

def length_ratio(g, r):
    return len(g.split()) / max(len(r.split()), 1)

def diversity(texts):
    all_words = " ".join(texts).split()
    return len(set(all_words)) / max(len(all_words), 1)

generated = df_results["generated_line2"].tolist()
references = df_results["actual_line2"].tolist()

rhyme_scores = [rhyme_score(g, r) for g, r in zip(generated, references)]
syll_diffs = [syllable_diff(g, r) for g, r in zip(generated, references)]
len_ratios = [length_ratio(g, r) for g, r in zip(generated, references)]

# Calculate and print metrics
bleu_score = calc_bleu(generated, references)
rouge1, rouge2, rougeL = calc_rouge(generated, references)
bertscore_f1 = calc_bertscore(generated, references)
sbert_sim = calc_sbert(generated, references)
div = diversity(generated)
avg_rhyme = np.mean(rhyme_scores)
avg_syll_diff = np.mean(syll_diffs)
avg_len_ratio = np.mean(len_ratios)

print(f"\n📊 Evaluation Metrics on Flan-T5 Generated Lyrics with Emotion Prefix Tokens")
print(f"BLEU: {bleu_score:.4f}")
print(f"ROUGE-1: {rouge1:.4f}, ROUGE-2: {rouge2:.4f}, ROUGE-L: {rougeL:.4f}")
print(f"BERTScore F1: {bertscore_f1:.4f}")
print(f"SBERT similarity: {sbert_sim:.4f}")
print(f"Diversity (unique token ratio): {div:.4f}")
print(f"Average Rhyme Rate: {avg_rhyme:.4f}")
print(f"Average Syllable Count Difference: {avg_syll_diff:.2f}")
print(f"Average Length Ratio: {avg_len_ratio:.2f}")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



📊 Evaluation Metrics on Flan-T5 Generated Lyrics with Emotion Prefix Tokens
BLEU: 0.0278
ROUGE-1: 0.1205, ROUGE-2: 0.0432, ROUGE-L: 0.1154
BERTScore F1: 0.8324
SBERT similarity: 0.2263
Diversity (unique token ratio): 0.1008
Average Rhyme Rate: 0.1055
Average Syllable Count Difference: 4.47
Average Length Ratio: 1.01


In [13]:
# Print random samples
num_samples = 10
sampled = random.sample(results, k=min(num_samples, len(results)))

for i, sample in enumerate(sampled, 1):
    print(f"--- Sample {i} ---")
    print(f"Prompt:         {sample['prompt']}")
    print(f"Actual Line 2:  {sample['actual_line2']}")
    print(f"Generated Line: {sample['generated_line2']}\n")

--- Sample 1 ---
Prompt:         <emotion_sadness> Given this song lyric line, generate the next song lyric line: The only reason I ain't go to SummerJam is cause I'm banned from the stadium
Actual Line 2:  Ten room mansion my lands a paladium
Generated Line: The only reason i ain't go to summerjam is cause i'm banned from the stadium

--- Sample 2 ---
Prompt:         <emotion_neutral> Given this song lyric line, generate the next song lyric line: Foreigns, I might hit the dash
Actual Line 2:  I been living very fast baby, yeah
Generated Line: I might hit the dash

--- Sample 3 ---
Prompt:         <emotion_neutral> Given this song lyric line, generate the next song lyric line: I'm a True Religion fein
Actual Line 2:  Bitch, what the fuck you mean?
Generated Line: i'm a true religion fein

--- Sample 4 ---
Prompt:         <emotion_disgust> Given this song lyric line, generate the next song lyric line: Nigga drew down in the Polo stoe', but pussy ain’t use it
Actual Line 2:  He approach 