In [1]:
# 📦 Install packages
!pip install transformers accelerate sentence-transformers evaluate nltk textstat peft --upgrade
!pip install rouge_score
!pip install bert_score

# 📚 Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import evaluate
import textstat
import nltk
import numpy as np
import torch
from peft import PeftModel
nltk.download("punkt")

Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting textstat
  Downloading textstat-0.7.8-py3-none-any.whl.metadata (15 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.2-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.1.1-py3-none-any.whl.metadata (3.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting n

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# ✅ Load base + LoRA adapter
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
adapter_path = "/content/drive/MyDrive/Colab Notebooks/DATASCI 266/Final Project/llama-rap-finetuned"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token  # Required for padding

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
model = PeftModel.from_pretrained(base_model, adapter_path)



In [19]:
# ✅ Load dataset
test_df = pd.read_excel("/content/drive/My Drive/Colab Notebooks/DATASCI 266/Final Project/test_set.xlsx")
test_df_sample = test_df.sample(n=100, random_state=42)

In [20]:
# ✅ Prompt variants
PROMPT_VARIANTS = {
    "P1": "Given this rap line, generate the next line: {line1}"
}

# ✅ Evaluation metrics setup
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
sbert = SentenceTransformer("all-MiniLM-L6-v2")

def calc_bleu(preds, refs):
    return bleu.compute(predictions=preds, references=refs)["bleu"]

def calc_rouge(preds, refs):
    r = rouge.compute(predictions=preds, references=refs)
    return r["rouge1"], r["rouge2"], r["rougeL"]

def calc_bertscore(preds, refs):
    b = bertscore.compute(predictions=preds, references=refs, lang="en")
    return np.mean(b["f1"])

def calc_sbert(preds, refs):
    sims = [util.cos_sim(sbert.encode(p), sbert.encode(r))[0][0].item() for p, r in zip(preds, refs)]
    return np.mean(sims)

def rhyme_score(gen, ref):
    g_last = gen.strip().split()[-1][-2:] if gen.strip() else ""
    r_last = ref.strip().split()[-1][-2:] if ref.strip() else ""
    return int(g_last == r_last)

def syllable_diff(g, r):
    return abs(textstat.syllable_count(g) - textstat.syllable_count(r))

def length_ratio(g, r):
    return len(g.split()) / max(len(r.split()), 1)

def diversity(texts):
    all_words = " ".join(texts).split()
    return len(set(all_words)) / max(len(all_words), 1)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
# ✅ Main evaluation function
def evaluate_llama_prompt(prompt_id, prompt_template, model, tokenizer, test_df, max_new_tokens=30):
    prompts, generated, references = [], [], []
    rhyme_scores, syll_diffs, len_ratios = [], [], []

    for _, row in test_df.iterrows():
        line1 = row["line1"]
        line2 = row["line2"]
        prompt = prompt_template.format(line1=line1.strip())

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.8,
                top_p=0.95
            )
        gen_line = tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, "").strip()

        prompts.append(prompt)
        generated.append(gen_line)
        references.append(line2)
        rhyme_scores.append(rhyme_score(gen_line, line2))
        syll_diffs.append(syllable_diff(gen_line, line2))
        len_ratios.append(length_ratio(gen_line, line2))

    # 🔍 Aggregate metrics
    bleu_score = calc_bleu(generated, references)
    r1, r2, rL = calc_rouge(generated, references)
    bert_f1 = calc_bertscore(generated, references)
    sbert_sim = calc_sbert(generated, references)
    div = diversity(generated)

    print(f"\n📊 Prompt: {prompt_id}")
    print(f"BLEU: {bleu_score:.4f}, ROUGE-1: {r1:.4f}, ROUGE-2: {r2:.4f}, ROUGE-L: {rL:.4f}")
    print(f"BERTScore F1: {bert_f1:.4f}, SBERT: {sbert_sim:.4f}, Diversity: {div:.4f}")
    print(f"Avg Rhyme Rate: {np.mean(rhyme_scores):.4f}, Avg Syllable Diff: {np.mean(syll_diffs):.2f}, Length Ratio: {np.mean(len_ratios):.2f}")

    return pd.DataFrame({
        "prompt_id": prompt_id,
        "prompt": prompts,
        "generated_line": generated,
        "reference_line": references,
        "rhyme": rhyme_scores,
        "syllable_diff": syll_diffs,
        "length_ratio": len_ratios,
    })


In [22]:
# ✅ Collect and display results
all_results = []
for prompt_id, prompt_template in PROMPT_VARIANTS.items():
    df_result = evaluate_llama_prompt(prompt_id, prompt_template, model, tokenizer, test_df_sample)
    all_results.append(df_result)

final_df = pd.concat(all_results)

# Show result
from IPython.display import display
print(final_df.head(10))  # Or just use display(final_df)
display(final_df)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



📊 Prompt: P1
BLEU: 0.0089, ROUGE-1: 0.0840, ROUGE-2: 0.0160, ROUGE-L: 0.0790
BERTScore F1: 0.8212, SBERT: 0.2014, Diversity: 0.3661
Avg Rhyme Rate: 0.0100, Avg Syllable Diff: 8.20, Length Ratio: 2.20
  prompt_id                                             prompt  \
0        P1  Given this rap line, generate the next line: G...   
1        P1  Given this rap line, generate the next line: W...   
2        P1  Given this rap line, generate the next line: L...   
3        P1  Given this rap line, generate the next line: N...   
4        P1  Given this rap line, generate the next line: B...   
5        P1  Given this rap line, generate the next line: I...   
6        P1  Given this rap line, generate the next line: A...   
7        P1  Given this rap line, generate the next line: S...   
8        P1  Given this rap line, generate the next line: A...   
9        P1  Given this rap line, generate the next line: S...   

                                      generated_line  \
0  Oh, girl, wha

Unnamed: 0,prompt_id,prompt,generated_line,reference_line,rhyme,syllable_diff,length_ratio
0,P1,"Given this rap line, generate the next line: G...","Oh, girl, whatever you want ooh-ooh-ooh, ooh-o...",Just keep me inside,0,4,1.750000
1,P1,"Given this rap line, generate the next line: W...","Niggas don't want me to succeed, that's why th...","I'm in the drivers seat, but you can ride along",0,10,1.800000
2,P1,"Given this rap line, generate the next line: L...",", yeah, we here now, let 'em hear 'em\nI'ma ge...",Let 'em hear them loud pipes,0,12,3.166667
3,P1,"Given this rap line, generate the next line: N...","Cop a few bands, they know what time it is, ay...",She tell me that I got trust issues,0,5,1.875000
4,P1,"Given this rap line, generate the next line: B...",", I'm a baller\nNah, it's that ball, it don't ...",A shot callin if I fall then my thoughts gon' ...,0,3,1.363636
...,...,...,...,...,...,...,...
95,P1,"Given this rap line, generate the next line: ""...","Shawty, I'm so gone (Damn) Shawty, I'm so gone...",Uh,0,13,11.000000
96,P1,"Given this rap line, generate the next line: D...","Young Gud with the woah, woah, woah, woah, woa...",Do it like woah,0,8,3.000000
97,P1,"Given this rap line, generate the next line: A...",It's so much money on the ground I'ma turn it ...,Might as well go for the gusto now,0,13,2.375000
98,P1,"Given this rap line, generate the next line: P...",It ain't no tellin' what I'll do to get you ne...,"Yeah, and it don't quit",0,15,3.400000


In [None]:
final_df.to_csv("/content/drive/My Drive/Colab Notebooks/DATASCI 266/Final Project/llama_pretrained_eval.csv", index=False)
