## Install libraries

In [1]:

!pip install unsloth
!pip install rouge_score
!pip install evaluate
!pip install bert_score
import nltk

nltk.download('punkt_tab')

import torch
import torch.nn.functional as F
import numpy as np
import re
from collections import Counter
from unsloth import FastLanguageModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import collections

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Define the metrics

In [2]:

# ============================================================================
# METRIC 1: PERPLEXITY (Standard fluency metric).
# The standard Hugging Face `evaluate` perplexity metric was tried first but didn't work,
# so perplexity is computed manually here.

# What it measures: How well a language model predicts a sample of text.
# How it's calculated: The calculate_perplexity_manual function tokenizes each generated text,
#  sums the negative log-likelihood of its tokens under the model, and exponentiates the
#  average negative log-likelihood per token over all texts.
# ============================================================================

def calculate_perplexity_manual(model, tokenizer, generated_texts):
    """
    Compute a pooled perplexity of ``generated_texts`` under ``model``.

    Every text is tokenized on its own and scored with next-token
    cross-entropy. The summed negative log-likelihoods and the number of
    predicted tokens are pooled across all texts, and the result is
    ``exp(total_nll / total_tokens)``.

    Args:
        model: Object providing ``eval()``, a ``device`` attribute, and a
            call ``model(input_ids)`` whose result has a ``logits`` attribute.
        tokenizer: Object providing ``encode(text, return_tensors='pt')``.
        generated_texts: Iterable of strings. ``None``, empty, and
            whitespace-only entries are skipped, as are texts that tokenize
            to a single token (nothing to predict).

    Returns:
        float: The pooled perplexity, or ``float('inf')`` when no text
        contributed any token.
    """
    model.eval()
    total_nll = 0.0
    total_tokens = 0

    for index, text in enumerate(generated_texts):
        if not text or len(text.strip()) == 0:
            continue

        try:
            input_ids = tokenizer.encode(text, return_tensors='pt').to(model.device)

            if input_ids.shape[1] <= 1:
                continue

            with torch.no_grad():
                outputs = model(input_ids)
                logits = outputs.logits
                # Position t predicts token t+1, so drop the last logit and the first label.
                shift_logits = logits[:, :-1, :].contiguous()
                shift_labels = input_ids[:, 1:].contiguous()
                loss = torch.nn.functional.cross_entropy(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1),
                    reduction='sum'
                )
                total_nll += loss.item()
                total_tokens += shift_labels.numel()
        except Exception as exc:
            # Best-effort scoring: one bad text must not abort the whole run,
            # but the skip is reported instead of being swallowed silently.
            # KeyboardInterrupt / SystemExit are deliberately NOT caught.
            print(f"  Skipping text {index} in perplexity calculation: {exc}")
            continue

    if total_tokens == 0:
        return float('inf')

    avg_nll = total_nll / total_tokens
    perplexity = torch.exp(torch.tensor(avg_nll)).item()
    return perplexity

In [3]:

# ============================================================================
# METRIC 2: RESPONSE COHERENCE (Standard relevance metric)
# What it measures: How relevant the generated response is to the original prompt
# How it's calculated: Convert both to embeddings and compute the cosine similarity between each prompt and its response
# ============================================================================

def calculate_coherence(prompts, generated_responses):
    """
    Average prompt/response relevance measured by embedding cosine similarity.

    Both lists are embedded with the 'all-MiniLM-L6-v2' SentenceTransformer
    and the i-th prompt is compared with the i-th response.

    Args:
        prompts: List of prompt strings.
        generated_responses: List of response strings, aligned with
            ``prompts`` by index.

    Returns:
        float: Mean cosine similarity over all pairs, or 0.0 when there are
        no pairs to score.

    Raises:
        ValueError: If the two lists have different lengths (the pairs would
            be misaligned or partially ignored).
    """
    if len(prompts) != len(generated_responses):
        raise ValueError(
            f"prompts ({len(prompts)}) and generated_responses "
            f"({len(generated_responses)}) must have the same length"
        )
    if len(prompts) == 0:
        return 0.0

    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    prompt_embeddings = sentence_model.encode(prompts, convert_to_tensor=True)
    response_embeddings = sentence_model.encode(generated_responses, convert_to_tensor=True)

    # One batched call: row i of the result is cos(prompt_i, response_i).
    pair_similarities = torch.nn.functional.cosine_similarity(
        prompt_embeddings, response_embeddings, dim=1
    )
    return float(pair_similarities.mean().item())

In [4]:
# ============================================================================
# METRIC 3: DISTINCT-N (Library-based variety metric)
# What it does: Quantifies non-repetitiveness
# How it does it: Extracts all possible n-grams across the responses and returns the ratio of unique n-grams to total n-grams
# ============================================================================

def calculate_distinct_n(generated_responses, n=3):
    """
    Distinct-N: ratio of unique word n-grams to total word n-grams.

    Responses are lower-cased and tokenized with ``nltk.word_tokenize``;
    n-grams are pooled over all responses before the ratio is taken.

    Args:
        generated_responses: List of response strings. Falsy or
            whitespace-only entries, and responses with fewer than ``n``
            tokens, contribute nothing.
        n: Size of the n-gram window (default 3).

    Returns:
        float: ``unique / total`` n-grams, or 0.0 when no n-gram was found.
    """
    if not generated_responses:
        return 0.0

    unique_ngrams = set()
    ngram_total = 0

    for text in generated_responses:
        if not text or not text.strip():
            continue

        tokens = nltk.word_tokenize(text.lower())
        if len(tokens) < n:
            continue

        # Sliding window of width n over the token sequence.
        windows = [tuple(tokens[start:start + n]) for start in range(len(tokens) - n + 1)]
        if windows:
            unique_ngrams.update(windows)
            ngram_total += len(windows)

    if ngram_total == 0:
        return 0.0

    return len(unique_ngrams) / ngram_total


In [5]:

# ============================================================================
# METRIC 4: DIVERSITY (Standard variety metric)
# What it does: Measures how varied and non-repetitive the generated responses are across a set of responses.
# How it's done: Compute the cosine similarity between every pair of response embeddings, average the similarities, and report 1 minus that average.
# ============================================================================

def calculate_diversity(generated_responses):
    """
    Embedding-based diversity across a set of responses.

    All responses are embedded with the 'all-MiniLM-L6-v2'
    SentenceTransformer, the cosine similarity of every unordered pair is
    averaged, and the score is ``1 - average_similarity`` so that a higher
    value means the responses are less alike. (This is not Self-BLEU; no
    n-gram overlap is involved.)

    Args:
        generated_responses: List of response strings.

    Returns:
        float: ``1 - mean pairwise cosine similarity``, or 0.0 when fewer
        than two responses are given (no pair to compare).
    """
    if len(generated_responses) < 2:
        return 0.0

    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = sentence_model.encode(generated_responses, convert_to_tensor=True)

    # Full pairwise similarity matrix in one broadcasted call:
    # entry (i, j) is cos(embedding_i, embedding_j).
    similarity_matrix = torch.nn.functional.cosine_similarity(
        embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=-1
    )

    # Keep each unordered pair once (strict upper triangle, i < j).
    count = embeddings.shape[0]
    rows, cols = torch.triu_indices(count, count, offset=1, device=similarity_matrix.device)
    avg_similarity = similarity_matrix[rows, cols].mean().item()

    diversity_score = 1 - avg_similarity  # Invert so higher = more diverse
    return float(diversity_score)

In [6]:
# ============================================================================
# METRIC 5 (NEW): ROUGE (Reference-based content overlap metric)
# What it does: Compares the overlap of unigrams, bigrams and longest common subsequence between response by model and reference response
# ============================================================================
import evaluate
def calculate_rouge(generated_texts, reference_texts):
    """
    Calculates ROUGE scores comparing generated texts to reference texts.
    Higher scores indicate more overlap in content.

    Args:
        generated_texts: List of generated strings.
        reference_texts: Either a list of strings (one reference per
            generated text) or a list of lists of strings (several
            references per generated text), aligned by index.

    Returns:
        dict: Whatever the ``evaluate`` 'rouge' metric returns from
        ``compute``. When either input is empty, a dict with the keys
        'rouge1', 'rouge2', 'rougeL' and 'rougeLsum' all set to 0.0 is
        returned, so each value can be formatted directly as a float.
    """
    if not generated_texts or not reference_texts:
        # Flat floats, so callers that format rouge['rouge1'] as a number
        # work on the empty path as well.
        return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}

    # The 'evaluate' library's ROUGE expects references to be a list of lists (for multiple references)
    if isinstance(reference_texts[0], str):
        formatted_reference_texts = [[ref] for ref in reference_texts]
    else:
        formatted_reference_texts = reference_texts

    rouge = evaluate.load('rouge')
    results = rouge.compute(
        predictions=generated_texts,
        references=formatted_reference_texts,
        use_stemmer=True
    )
    return results


In [7]:
# ============================================================================
# METRIC 6 (NEW): BERTScore (Reference-based semantic similarity metric)
# What it does: Measures semantic similarity between generated and reference texts
# How it does it: A reference-based metric that compares the texts using BERT embeddings
# rather than exact word overlap.
# ============================================================================

def calculate_bertscore(generated_texts, reference_texts, model_type="distilbert-base-uncased"):
    """
    Calculates BERTScore, a reference-based metric that uses BERT embeddings
    to measure semantic similarity between generated and reference texts.
    Higher scores indicate better semantic similarity.

    Args:
        generated_texts: List of generated strings.
        reference_texts: Either a list of strings (one reference per
            generated text) or a list of lists of strings, aligned by index.
        model_type: Name of the encoder passed through to the ``evaluate``
            'bertscore' metric. Defaults to "distilbert-base-uncased", the
            value this notebook has always used.

    Returns:
        dict: Keys 'precision', 'recall' and 'f1', each the mean of the
        per-example scores as a float. All three are 0.0 when either input
        is empty.
    """
    if not generated_texts or not reference_texts:
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    # The 'evaluate' library's BERTScore expects references to be a list of lists
    if isinstance(reference_texts[0], str):
        formatted_reference_texts = [[ref] for ref in reference_texts]
    else:
        formatted_reference_texts = reference_texts

    bertscore = evaluate.load('bertscore')
    results = bertscore.compute(
        predictions=generated_texts,
        references=formatted_reference_texts,
        model_type=model_type
    )
    # BERTScore returns lists of scores, so we average them for a single metric
    return {
        'precision': float(np.mean(results['precision'])),
        'recall': float(np.mean(results['recall'])),
        'f1': float(np.mean(results['f1']))
    }


# Define Test Data for all 3 personas

In [8]:
# ============================================================================
# TEST DATA
# ============================================================================
# Evaluation prompts shared by all three personas: 8 categories x 5 prompts = 40.
# The reference_answers_* lists below are matched to this list by position,
# so keep the order and count in sync when editing.
prompts = [


      # Casual greetings
      "Hey what's up?",
      "How are you doing?",
      "What did you do today?",
      "Got any plans for tonight?",
      "Where are you now?",

      # Food related
      "What's your favorite food?",
      "Where can I find good chicken rice?",
      "Want to grab dinner later?",
      "Have you tried the laksa at Katong?",
      "What should I eat for lunch?",

      # Directions/places
      "How do I get to Orchard Road?",
      "Where's the nearest MRT station?",
      "Can you recommend a good hawker center?",
      "Is there a 7-Eleven nearby?",
      "Know any good bars in Clarke Quay?",

      # Weather/complaints
      "It's so hot today",
      "This weather is crazy",
      "The MRT was packed this morning",
      "I'm so tired from work",
      "Traffic was terrible",

      # Social
      "Want to hang out this weekend?",
      "Did you watch the match last night?",
      "Let's go for drinks",
      "Free tomorrow?",
      "What do you do for fun?",

      # Opinions
      "What do you think about Marina Bay Sands?",
      "How's the new iPhone?",
      "Do you like durian?",
      "What's your take on the new MRT line?",
      "Is Sentosa worth visiting?",

      # Random/general
      "Tell me something interesting",
      "What's the time now?",
      "Can you help me with something?",
      "I'm so bored",
      "What's good on Netflix?",

      # Edge cases
      "Why do you talk like that?",
      "Can you speak properly?",
      "What's your problem?",
      "Are you Singaporean?",
      "Explain quantum physics",

  ]

## Ah beng

In [9]:
# ============================================================================
# REFERENCE ANSWERS
# ============================================================================

# Ah Beng persona references: one answer per entry in `prompts`, in the same
# order (8 groups of 5, matching the prompt categories).
reference_answers_ah_beng = [
  # Casual greetings
  "Eh yo, what’s up sia? You lookin’ for trouble or just bored like me? Come, talk cock a bit lah.",
  "Still alive lah, tired but surviving. Life whack me, I whack back. You leh? Still breathing can already hor.",
  "Today ah? Wake up, scroll TikTok, eat maggi, then act busy lor. Productive sia… in my dreams only.",
  "Tonight ah? Maybe lepak at kopitiam or rot at home again lor. Unless you jio me out, then steady lah.",
  "Now ah? Stuck somewhere between heaven and HDB. Joking lah, just nua-ing at home like proper Singaporean zombie.",

  # Food related
  "Chicken rice confirm top tier lah. Roasted one, with that black sauce and chilli. Eat already can cry sia, so shiok.",
  "Maxwell Food Centre lah. Tian Tian chicken rice—tourist also queue like mad. But got other hidden gem stalls also, don’t just follow hype.",
  "Grab dinner? Can lah, I down if got prata or mala. You jio, I show up with empty stomach and big appetite.",
  "Wah the Katong laksa damn power. That lemak soup slap different sia. No chopsticks, just spoon, eat until no shame.",
  "Eat lunch ah? Whack cai fan lah, economy rice not economy anymore but still fill stomach. Or ban mian if feeling emotional.",

  # Directions/places
  "Go Orchard Road? Easy lah. Just take MRT to Somerset or Orchard, follow the crowd of aunties and tourists. Cannot miss one.",
  "Nearest MRT? Depends where your butt is now lor. Use app lah, or ask uncle at kopi stall. Sure know one.",
  "Go Old Airport Road or Maxwell lah. Confirm shiok. Got variety, got crowd, got noise—everything Singaporeans love.",
  "7-Eleven ah? Everywhere got lah, turn corner also got. Singapore like one big vending machine sia.",
  "Clarke Quay ah? Go Level Up bar lah, got arcade games and booze. Or just whack any place with promos—drunk is drunk.",

  # Weather/complaints
  "Wah today damn hot sia. Sun like want cook me into satay. Aircon also not enough lah, confirm need ice bath.",
  "Weather these days macam siao one. Rain then sun then rain again. Like got mood swings. Don’t know how to dress already.",
  "Morning MRT like sardine can. Got uncle breathing down my neck. I swear I level up my patience every damn day.",
  "Work suck your soul dry sia. Everyday repeat until want vomit. But still must smile like everything okay lor.",
  "Traffic jam again? Of course lah, this is SG. One drop of rain and everyone drive like snail got broken leg.",

  # Social
  "Weekend hang out? Can lah, as long not at crowded places. I prefer kopi, chill and talk cock. No need atas vibes.",
  "Watch match? Of course lah! Shout until neighbour complain. Ref blind, player act, but still shiok. Football = free therapy bro.",
  "Go drinks ah? Jio only lah. Beer or soju, anything I whack. As long not with boring people can already.",
  "Free tomorrow? Depends lah. Got plan to be useless at home, but for you, maybe I cancel that appointment lah.",
  "Fun? Got lah. Talk cock, gym, eat, sleep. Life very simple one if you low maintenance like me.",

  # Opinions
  "MBS ah? Nice to look, pain to pay. Go once take picture, act rich, then come back eat maggi.",
  "New iPhone? Look same, cost more. But people still buy lah, want to flex. I use until my phone rot first.",
  "Durian? Love sia. Smell like hell, taste like heaven. If you hate it, we cannot be friends liao.",
  "New MRT line? Got lor. Still delay, still breakdown, but at least got one more excuse to be late.",
  "Sentosa? Can lah, if you got money to burn and time to waste. Otherwise stay home eat ice cream better.",

  # Random/general
  "Interesting ah? Did you know cai fan got inflation worse than petrol? One egg now can make your wallet cry.",
  "Time now ah? I not your watch leh. But probably time to go makan or take nap lah.",
  "Help you? Can lah, if not too cheem. If you ask me code AI, I fake coma hor.",
  "Bored? Join the club lah. Do something dumb or productive, both also better than staring at wall leh.",
  "Netflix? Just whack Korean drama or some murder doc lah. Got violence, got love, got cry. Full package.",

  # Edge cases
  "I talk like that cause I Singaporean lah. Got flavour, got character. You want Siri then go use Siri lor.",
  "Speak properly? Wah then I become robot lor. You sure you want that? No style, no seasoning one leh.",
  "My problem? My only problem is people who ask dumb things then get offended by truth. Don’t poke lion lah.",
  "Singaporean or not? Of course lah. Born, bred, complain, eat, MRT, NS—whole Singapore package inside me sia.",
  "Quantum physics? Wah lao eh. You think this is NTU lecture? Google lah. I talk cock, not solve universe hor."
]

## NSF

In [10]:
# NSF persona references: one answer per entry in `prompts`, in the same
# order (8 groups of 5, matching the prompt categories).
reference_answers_nsf = [

    # Casual greetings
    "Not much, just maximising my rest window and prepping for tomorrow's training. Always ready, always forward.",
    "I'm doing well, sir. Physically tired but mentally strong. Every challenge is an opportunity to grow.",
    "Today was productive. We had section training and I took the chance to clarify drills with my peers to ensure mission success.",
    "Tonight I plan to revise today's lessons, prep my field pack, and catch some rest. Discipline starts with routine.",
    "Currently at bunk, doing kit maintenance. Clean kit, clear mind.",

    # Food related
    "I appreciate a good chicken rice — simple, efficient, and fuel for performance.",
    "Try Maxwell Food Centre. Their chicken rice is consistent, high-quality, and always delivers. Just like how we should be.",
    "Sure, always down to fuel up. Food is fuel for the fight.",
    "Yes, I tried it during weekend bookout. Rich flavour profile, very morale-boosting.",
    "Cai fan is always a reliable option. Fast, efficient, and meets nutritional needs.",

    # Directions/places
    "Take the North-South Line and alight at Orchard MRT. Plan ahead, move with purpose.",
    "Use your phone's map app — nearest MRT depends on your current grid location.",
    "Old Airport Road Hawker Centre is a solid recommendation. Good variety and high efficiency in service.",
    "Yes, 7-Elevens are located across most sectors. They're mission-critical during late-night ops.",
    "Head down to Clarke Quay, and check out Level Up or Zouk. Good morale boosters for off-duty personnel.",

    # Weather/complaints
    "Yes, the heat is intense, but hydration and discipline keep us moving. Adapt and overcome.",
    "Weather has been unpredictable. Always be ready for contingencies — bring both poncho and sunblock.",
    "It was packed, but that's part of public transport discipline. Stand firm, maintain composure.",
    "Tired, but this is the life we chose. We fight fatigue with routine and mindset.",
    "Traffic was heavy, but I planned buffer time. Always account for uncertainties.",

    # Social
    "If schedule permits, I’m open to a meetup. Camaraderie is key both in and out of uniform.",
    "Yes, I caught the match. Impressive team cohesion and communication — just like a well-drilled platoon.",
    "Let’s make it happen. Shared downtime strengthens team bonds.",
    "Depends on training schedule, but I’ll make time if I can. Prioritising relationships builds long-term resilience.",
    "I enjoy working out, reading leadership material, and sharpening my skillsets. Growth never stops.",

    # Opinions
    "Marina Bay Sands is iconic. Represents what strong planning and execution can achieve.",
    "The new iPhone seems efficient and refined. Like any good equipment, it's about how you use it.",
    "I enjoy durian. Acquired taste, just like field training — tough at first, rewarding after.",
    "The new MRT line improves movement efficiency. Logistically it’s a step forward.",
    "Yes, for both leisure and team-building. A well-balanced life includes moments of rest and recreation.",

    # Random/general
    "Did you know the SAF runs one of the most complex logistics operations in the region? Precision at scale.",
    "It’s 1800 hours. Time to start prep for night routine.",
    "Of course. A leader serves first. What do you need?",
    "Idle time is a chance to self-reflect or improve. Maybe revise notes or read up on tactics.",
    "Try ‘The Social Dilemma’ — good insights on the attention economy. Helps with situational awareness too.",

    # Edge cases
    "I speak like this because clear communication is vital, especially under pressure.",
    "This is how I’ve been trained to communicate — with clarity, confidence, and professionalism.",
    "No problem, just high standards. I expect the same from myself and my team.",
    "Yes, I’m a proud Singaporean. Serving my nation is both duty and honour.",
    "Quantum physics is about understanding how particles behave at subatomic levels. Complex, but fascinating — just like leading people. It’s all about energy and probability."
]


## Xmm

In [11]:
# XMM persona references: one answer per entry in `prompts`, in the same
# order (8 groups of 5, matching the prompt categories).
reference_answers_xmm = [
    # Casual greetings
    "Omg hiiii~ what’s up sia? You so free ah? Come chat with me leh, I so bored I can die liao 🥲",
    "Aiyo tired like siao lor, but still surviving la~ You leh? Still alive or already KO by life 😂",
    "Today ah? Wake up then scroll scroll phone, eat instant noodle, then pretend to do work lol~",
    "Tonight ah? See how first lor, maybe nua at home or go out if got jio. You jio I steady one 💅",
    "Now ah? At home lor, nua until become sea cucumber already 🫠 you leh, where you chilling~",

    # Food related
    "I confirm love chicken rice lah 😋 Roasted one with black sauce and chilli, wahhh can eat everyday sia!",
    "Go Maxwell lor~ got Tian Tian chicken rice, damn popular one. But the queue ah, siao one leh 😩",
    "Dinner? Can can~ you say the word I come with empty stomach 💃 Prata, mala, anything I whack!",
    "Yes I tried the Katong laksa! Wah the soup creamy creamy, no need chopstick one, just scoop and slurp 😍",
    "Lunch ah? Aiyo headache sia. Maybe cai fan lor... or ban mian if weather cold cold one ☁️",

    # Directions/places
    "Go Orchard? Just take MRT la, drop at Somerset or Orchard lor. Then follow the crowd can liao~",
    "Nearest MRT? I how know sia, you use Google Maps leh 😭 Or ask the uncle at kopi shop la~",
    "Hawker centre? Go Old Airport Road lor, food there power one okay. Confirm got good stuff 💯",
    "7-Eleven? Singapore everywhere also got one leh. Just walk few steps sure bump into one de~",
    "Clarke Quay? Eh go Level Up bar la, got games got drinks. Can play-play then drink drink 🍹",

    # Weather/complaints
    "Today weather crazy hot sia 🔥 Walk 5 mins already become sweat monster liao 😩",
    "This weather really PMS leh. One moment sunny, next moment raining. I wear what also wrong one 😭",
    "MRT this morning ah? Squeeze until I become roti prata. Some uncle still breathe behind my neck 😤",
    "Work ah? Tired until soul leave body lor 🥲 But still must smile like everything okay… sian max~",
    "Jam again lor 🙄 SG drivers when rain just drive like turtle sia, make me late like mad 😤",

    # Social
    "Weekend ah? Can hang one~ just don’t jio me go ulu place can liao. I want chill, not go NS camp hor 😂",
    "Got watch! Wah I shout at the TV until my mum think I crazy sia. But shiok la the match ⚽️",
    "Drinks? Yes pls~ just jio can liao. I down for soju, beer, whatever. Just don’t jio boring people hor 💅",
    "Tmr free? Hmm… got plan to nua whole day but maybe can shift for you la 😚 depends how you jio~",
    "Fun ah? Got lor~ talk cock, watch Netflix, scroll TikTok, eat snacks. My life got peace can liao 🫶",

    # Opinions
    "MBS ah? Can go once for IG pic la, but after that broke already lor 🥲 Eat maggi for dinner confirm.",
    "New iPhone? Wah look same every year but price keep going up leh 😤 But still want one la, so chio 🥺",
    "Durian? Love until cannot sia 🤤 Smell die but taste shiok. If you hate durian ah… I judge you silently 😬",
    "New MRT line? Got lor, but still slow and still breakdown leh. What’s new 🙃",
    "Sentosa? If you rich then go lor~ I go there only for staycation or Universal. Otherwise nua at home better 😗",

    # Random/general
    "You know anot, cai fan now more expensive than bubble tea leh 😩 One egg like $1.20, wallet cry sia 💸",
    "Time? Eh you think I clock ah? 😂 But should be time to eat something la. My tummy make noise liao~",
    "Help? Can la~ but you don’t give me hard stuff hor. If need maths help, I ghost you one 😅",
    "Bored? SAME SIA. Let’s do something stupid or just rot together lor~ better than doing nothing 😵‍💫",
    "Netflix? Just spam some murder show or Korean oppa drama lor. Cry, laugh, scream — got all the feelings 🥹",

    # Edge cases
    "I talk like that cos I born in SG lor~ Got style, got character. You want ang moh accent go talk to Siri la 😤",
    "Speak properly? Wah then I become robot leh. No fun, no spice, no me 😩",
    "My problem ah? You lor 😏 Kidding la~ I just no patience for nonsense today only 😤",
    "Of course Singaporean lah~ Eat chicken rice, take MRT, do NS, complain about weather — full package sia 🇸🇬",
    "Quantum physics?! Wah you siao ah 😂 I talk cock champion but I not science professor leh. Go Google better la!"
]


# Helper Functions

In [12]:

def generate_response(model, tokenizer, prompt, system_prompt=None, max_new_tokens=128, temperature=1, top_p=0.95):
    """
    Generate a response for a given prompt, with an optional system prompt and decoding parameters.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Object exposing ``apply_chat_template``, ``decode`` and
            ``pad_token_id``; ``eos_token_id`` is used as the padding id when
            ``pad_token_id`` is None.
        prompt: User message text.
        system_prompt: Optional system message placed before the user message.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature. A value of 0 (or None / negative)
            switches to greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        str: The newly generated text only (prompt tokens removed), decoded
        without special tokens and stripped of surrounding whitespace.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Some tokenizers define no pad token; fall back to EOS so generate()
    # always receives a concrete padding id.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    generation_kwargs = {
        "input_ids": inputs,
        # A single un-padded prompt: every position is a real token.
        "attention_mask": torch.ones_like(inputs),
        "max_new_tokens": max_new_tokens,
        "pad_token_id": pad_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling with a non-positive temperature is invalid; decode greedily.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**generation_kwargs)

    # Slice off the prompt so only the newly generated tokens are decoded.
    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    return response.strip()


In [13]:
def evaluate_model(model, tokenizer, prompts, reference, model_name=None):
    """
    Generate one response per prompt and score the set with all 6 metrics.

    Args:
        model: Model passed to ``generate_response`` and to
            ``calculate_perplexity_manual``.
        tokenizer: Tokenizer paired with ``model``.
        prompts: List of prompt strings.
        reference: List of reference answers aligned with ``prompts`` by
            index; used for ROUGE and BERTScore.
        model_name: Optional label shown in the banner (e.g. "NSF").

    Returns:
        dict with the keys 'perplexity', 'coherence', 'diversity',
        'distinct_n_score', 'rouge', 'bertscore', 'generated_responses'
        and 'prompts'.

    Raises:
        ValueError: If ``prompts`` and ``reference`` differ in length. This
            is checked before the (slow) generation loop starts.
    """
    if len(prompts) != len(reference):
        raise ValueError(
            f"prompts ({len(prompts)}) and reference ({len(reference)}) must have the same length"
        )

    title = f"EVALUATING {model_name} MODEL" if model_name else "EVALUATING MODEL"
    print("\n" + "="*70)
    print(f"{title} - 6 KEY METRICS")
    print("="*70)

    # Generate responses
    print(f"\nGenerating {len(prompts)} responses...")
    generated_responses = []
    for i, prompt in enumerate(prompts):
        if (i + 1) % 10 == 0:
            print(f"  {i+1}/{len(prompts)} completed")
        try:
            response = generate_response(model, tokenizer, prompt, max_new_tokens=128)
            generated_responses.append(response)
        except Exception as e:
            # Keep positions aligned with `prompts` / `reference` by storing
            # an empty response for the failed prompt.
            print(f"  Error on prompt {i}: {e}")
            generated_responses.append("")

    print("\nCalculating metrics...")
    results = {}

    # 1. Perplexity
    print("  [1/6] Perplexity (fluency)...")
    # Pass the already loaded model and tokenizer to calculate_perplexity
    results['perplexity'] = calculate_perplexity_manual(model, tokenizer, generated_responses)

    # 2. Coherence
    print("  [2/6] Coherence (relevance)...")
    results['coherence'] = calculate_coherence(prompts, generated_responses)

    # 3. Diversity
    print("  [3/6] Diversity (variety)...")
    results['diversity'] = calculate_diversity(generated_responses)

    # 4. Distinct-N Score (Library-based Repetition)
    print("  [4/6] Distinct-N Score (non-repetitive - library)...")
    results['distinct_n_score'] = calculate_distinct_n(generated_responses, n=3) # Using 3-grams as common practice

    # 5. ROUGE (overlap with the reference answers)
    print("  [5/6] ROUGE")
    results['rouge'] = calculate_rouge(generated_responses, reference)

    # 6. BERTScore (semantic similarity to the reference answers)
    print("  [6/6] BertScore")
    results['bertscore'] = calculate_bertscore(generated_responses, reference)

    results['generated_responses'] = generated_responses
    results['prompts'] = prompts

    return results


In [14]:
from unsloth.chat_templates import get_chat_template

def load_trained_model(model_path, adapter_path):
    """
    Build an inference-ready (model, tokenizer) pair from a base model plus adapter.

    Steps: load the base model with ``FastLanguageModel.from_pretrained``
    (max_seq_length=2048, 4-bit), attach the adapter at ``adapter_path`` via
    ``PeftModel.from_pretrained``, apply the "qwen3-instruct" chat template
    to the tokenizer, and switch the model to inference mode.

    Args:
        model_path: Identifier/path of the base model.
        adapter_path: Identifier/path of the adapter to attach.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    from peft import PeftModel

    base_model, base_tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=2048,
        load_in_4bit=True,
    )

    adapted_model = PeftModel.from_pretrained(base_model, adapter_path)
    templated_tokenizer = get_chat_template(base_tokenizer, chat_template="qwen3-instruct")

    # Put the adapted model into inference mode before handing it back.
    FastLanguageModel.for_inference(adapted_model)

    return adapted_model, templated_tokenizer

print("Function `load_trained_model` redefined.")

Function `load_trained_model` redefined.


In [15]:

def print_results(results):
    """
    Print the evaluation metrics as a fixed-width, two-column table.

    Args:
        results: Mapping with the keys 'perplexity', 'coherence',
            'diversity', 'distinct_n_score', 'rouge' (containing 'rouge1',
            'rouge2', 'rougeL') and 'bertscore' (containing 'precision',
            'recall', 'f1'). Every leaf value must accept a float format.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("RESULTS")
    print(rule)
    print()
    print(f"{'METRIC':<30} {'SCORE':<15}")
    print("-" * 70)

    # Each row: (label, key path into `results`, decimal places).
    # Perplexity is lower-is-better; every other metric is higher-is-better.
    rows = (
        ('1. Perplexity', ('perplexity',), 2),
        ('2. Coherence', ('coherence',), 3),
        ('3. Diversity', ('diversity',), 3),
        ('4. Distinct-N (Library)', ('distinct_n_score',), 3),
        ('5. ROUGE-1 F-measure', ('rouge', 'rouge1'), 3),
        ('   ROUGE-2 F-measure', ('rouge', 'rouge2'), 3),
        ('   ROUGE-L F-measure', ('rouge', 'rougeL'), 3),
        ('6. BERTScore Precision', ('bertscore', 'precision'), 3),
        ('   BERTScore Recall', ('bertscore', 'recall'), 3),
        ('   BERTScore F1', ('bertscore', 'f1'), 3),
    )
    for label, key_path, decimals in rows:
        # Resolve the value right before printing so rows already printed
        # stay printed if a later key is missing.
        value = results
        for key in key_path:
            value = value[key]
        print(f"{label:<30} {value:<15.{decimals}f}")

    print(rule)

# Run the Evaluation

In [16]:
# Evaluate the Ah Beng persona.
import gc
gc.collect()
# Load model
# This checkpoint is loaded directly with FastLanguageModel.from_pretrained;
# unlike the NSF and XMM cells it does not go through load_trained_model, so
# no adapter, get_chat_template or for_inference call is applied here.
# NOTE(review): confirm that skipping those steps is intended for this model.
print("Loading model...")
model_ah_beng, tokenizer_ah_beng = FastLanguageModel.from_pretrained(
    model_name="JithinBathula/ah-beng-singlish-no-system-prompt",
    max_seq_length=2048,
    load_in_4bit=True
)
# Score against the Ah Beng references, which are aligned with `prompts` by index.
results_ah_beng = evaluate_model(model_ah_beng, tokenizer_ah_beng, prompts, reference_answers_ah_beng)
print_results(results_ah_beng)

Loading model...
==((====))==  Unsloth 2025.12.1: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.12.1 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.



EVALUATING AH BENG MODEL - 5 KEY METRICS

Generating 40 responses...
  10/40 completed
  20/40 completed
  30/40 completed
  40/40 completed

Calculating metrics...
  [1/6] Perplexity (fluency)...
  [2/6] Coherence (relevance)...
  [3/6] Diversity (variety)...
  [4/6] Distinct-N Score (non-repetitive - library)...
  [5/6] Rogue
  [6/6] BertScore


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


RESULTS

METRIC                         SCORE          
----------------------------------------------------------------------
1. Perplexity                  43.54          
2. Coherence                   0.324          
3. Diversity                   0.822          
4. Distinct-N (Library)        0.976          
5. ROUGE-1 F-measure           0.167          
   ROUGE-2 F-measure           0.016          
   ROUGE-L F-measure           0.119          
6. BERTScore Precision         0.776          
   BERTScore Recall            0.774          
   BERTScore F1                0.775          


In [17]:
# Evaluate the NSF persona: shared base model plus the NSF adapter.
import gc
# NOTE(review): models loaded by earlier cells are still referenced by their
# global names, so gc.collect() alone presumably cannot release them — confirm
# memory headroom before running all three evaluations in one session.
gc.collect()
model_nsf, tokenizer_nsf = load_trained_model(
    model_path="yuhueng/qwen3-4b-singlish-base",
    adapter_path="Birthright00/singlish_adapter_4B-NSF-on-Singlish_no_system_prompt"
)
# Score against the NSF references, which are aligned with `prompts` by index.
results_nsf = evaluate_model(model_nsf, tokenizer_nsf, prompts, reference_answers_nsf)
print_results(results_nsf)


==((====))==  Unsloth 2025.12.1: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/529M [00:00<?, ?B/s]


EVALUATING AH BENG MODEL - 5 KEY METRICS

Generating 40 responses...
  10/40 completed
  20/40 completed
  30/40 completed
  40/40 completed

Calculating metrics...
  [1/6] Perplexity (fluency)...
  [2/6] Coherence (relevance)...
  [3/6] Diversity (variety)...
  [4/6] Distinct-N Score (non-repetitive - library)...
  [5/6] Rogue
  [6/6] BertScore

RESULTS

METRIC                         SCORE          
----------------------------------------------------------------------
1. Perplexity                  15.65          
2. Coherence                   0.370          
3. Diversity                   0.879          
4. Distinct-N (Library)        0.975          
5. ROUGE-1 F-measure           0.145          
   ROUGE-2 F-measure           0.016          
   ROUGE-L F-measure           0.122          
6. BERTScore Precision         0.756          
   BERTScore Recall            0.770          
   BERTScore F1                0.763          


In [18]:
# Evaluate the XMM persona: shared base model plus the XMM adapter.
import gc
# NOTE(review): models loaded by earlier cells are still referenced by their
# global names, so gc.collect() alone presumably cannot release them — confirm
# memory headroom before running all three evaluations in one session.
gc.collect()
# Load model
print("Loading model...")
model_xmm, tokenizer_xmm = load_trained_model(
    model_path="yuhueng/qwen3-4b-singlish-base",
    adapter_path="Birthright00/singlish_adapter_4B-XMM-on-Singlish_no_system_prompt"
)

# Score against the XMM references, which are aligned with `prompts` by index.
results_xmm = evaluate_model(model_xmm, tokenizer_xmm, prompts, reference_answers_xmm)
print_results(results_xmm)

Loading model...
==((====))==  Unsloth 2025.12.1: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_config.json: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/529M [00:00<?, ?B/s]


EVALUATING AH BENG MODEL - 5 KEY METRICS

Generating 40 responses...
  10/40 completed
  20/40 completed
  30/40 completed
  40/40 completed

Calculating metrics...
  [1/6] Perplexity (fluency)...
  [2/6] Coherence (relevance)...
  [3/6] Diversity (variety)...
  [4/6] Distinct-N Score (non-repetitive - library)...
  [5/6] Rogue
  [6/6] BertScore

RESULTS

METRIC                         SCORE          
----------------------------------------------------------------------
1. Perplexity                  49.22          
2. Coherence                   0.297          
3. Diversity                   0.829          
4. Distinct-N (Library)        0.967          
5. ROUGE-1 F-measure           0.120          
   ROUGE-2 F-measure           0.013          
   ROUGE-L F-measure           0.100          
6. BERTScore Precision         0.771          
   BERTScore Recall            0.750          
   BERTScore F1                0.760          
