In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
import torch
import math
import evaluate

In [11]:
# List of jokes
# Sample inputs scored by the evaluation loop further down in this cell.
# The first four entries are question/answer jokes; the last entry is a plain
# statement — presumably a non-joke control for the humor classifier.
jokes = [
    "Why don’t scientists trust atoms? Because they make up everything!",
    "Why did the scarecrow win an award? Because he was outstanding in his field!",
    "What did the ocean say to the beach? Nothing, it just waved!",
    "Why do cows have hooves instead of feet? Because they lactose!",
    "My Name is Senthil"
]

# Humor detection model
# The tokenizer and the sequence-classification model are both loaded from the
# same checkpoint identifier; get_humor_score() reads these two globals.
humor_model_name = "mohameddhiab/humor-no-humor"
humor_tokenizer = AutoTokenizer.from_pretrained(humor_model_name)
humor_model = AutoModelForSequenceClassification.from_pretrained(humor_model_name)

def get_humor_score(text):
    """Return the humor classifier's probability for class index 1.

    Args:
        text: The text to classify. It is tokenized with truncation to at
            most 512 tokens.

    Returns:
        float: Softmax probability of class index 1 for the first (or only)
        sequence in the batch.
    """
    # Tokenize the input text
    inputs = humor_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: disable autograd so no computation graph is built,
    # consistent with calculate_perplexity().
    with torch.no_grad():
        outputs = humor_model(**inputs)
    # Apply softmax over the class dimension to get probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    # NOTE(review): index 1 is assumed to be the "humor" class — confirm
    # against the checkpoint's id2label mapping.
    humor_score = probabilities[0][1].item()
    return humor_score

# GPT-2 model for perplexity calculation
# The tokenizer and causal language model are loaded from the same checkpoint
# name; calculate_perplexity() reads these globals.
gpt2_model_name = "gpt2"
gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name)
gpt2_model = AutoModelForCausalLM.from_pretrained(gpt2_model_name)
# Reuse the end-of-sequence token as the padding token so that the
# padding=True tokenizer call in calculate_perplexity() has a token to pad
# with (presumably the GPT-2 tokenizer ships without one — TODO confirm).
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Function to calculate perplexity
def calculate_perplexity(text):
    """Compute GPT-2 perplexity: exp of the model's language-modeling loss.

    Args:
        text: The text to score. It is tokenized with truncation to at most
            512 tokens. If a list of texts is passed, one perplexity is
            returned for the whole batch.

    Returns:
        float: ``exp(loss)`` where ``loss`` is the loss returned by the
        causal language model for the given tokens.
    """
    tokens = gpt2_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # The pad token is the EOS token, so padded positions are ordinary token
    # ids. Mark them with -100 so they are left out of the loss instead of
    # being scored as real text. For a single unpadded text the attention
    # mask is all ones and the labels equal the input ids.
    labels = tokens["input_ids"].masked_fill(tokens["attention_mask"] == 0, -100)
    with torch.no_grad():
        loss = gpt2_model(**tokens, labels=labels).loss
    return torch.exp(loss).item()

# Function to evaluate a joke
def evaluate_joke(joke):
    """Score a single joke with both metrics.

    Args:
        joke: The joke text.

    Returns:
        dict: ``"joke"`` (the input text), ``"perplexity_score"`` (result of
        ``calculate_perplexity``) and ``"humor_score"`` (result of
        ``get_humor_score``), in that order.
    """
    report = {"joke": joke}
    # Perplexity is computed before the humor score, as in the original flow.
    report["perplexity_score"] = calculate_perplexity(joke)
    report["humor_score"] = get_humor_score(joke)
    return report

# Score each sample and print a three-line report followed by a blank line
for joke in jokes:
    evaluation = evaluate_joke(joke)
    print(
        f"Joke: {evaluation['joke']}\n"
        f"Coherence (Perplexity): {evaluation['perplexity_score']:.2f}\n"
        f"Humor Score: {evaluation['humor_score']:.2f}\n"
    )


Joke: Why don’t scientists trust atoms? Because they make up everything!
Coherence (Perplexity): 419.66
Humor Score: 0.98

Joke: Why did the scarecrow win an award? Because he was outstanding in his field!
Coherence (Perplexity): 64.21
Humor Score: 0.98

Joke: What did the ocean say to the beach? Nothing, it just waved!
Coherence (Perplexity): 76.60
Humor Score: 0.99

Joke: Why do cows have hooves instead of feet? Because they lactose!
Coherence (Perplexity): 79.38
Humor Score: 0.99

Joke: My Name is Senthil
Coherence (Perplexity): 178.57
Humor Score: 0.01



In [None]:
# Perplexity measures how "surprised" a language model is by a piece of text. Here it is exp(loss) of GPT-2 on the joke (see calculate_perplexity).
# A high perplexity value means the model is uncertain or "surprised" by the text it is trying to predict, implying that it cannot predict the next word in the sequence very well.
# A low perplexity indicates that the model is confident in its predictions, meaning the text is more predictable to the model. When perplexity is used as a coherence proxy, lower is better.

In [12]:

# Initialize the BLEU metric
bleu = evaluate.load("bleu")

# Reference jokes for evaluation (gold standard)
# One inner list per prediction position; each inner list holds the
# reference text(s) for the prediction at the same index.
reference_jokes = [
    ["Why don’t scientists trust atoms? Because they make up everything!"],
    ["Why did the scarecrow win an award? Because he was outstanding in his field!"],
]

# Generated jokes by different LLMs
llm_outputs = {
    "Model_1": [
        "Why don’t scientists trust atoms? Because they make up everything!",
        "Why did the scarecrow win an award? Because he was outstanding in his field!",
    ],
    "Model_2": [
        "What did the ocean say to the beach? Nothing, it just waved!",
        "Why do cows have hooves instead of feet? Because they lactose!",
    ],
}


# Evaluate BLEU for each model
# The loop variable is deliberately not called `jokes`: that name is the
# sample list defined in the humor/perplexity cell, and rebinding it here
# would silently replace it for any cell run afterwards.
for model_name, generated_jokes in llm_outputs.items():
    print(f"Evaluating {model_name}...")

    # Compute BLEU score
    results = bleu.compute(predictions=generated_jokes, references=reference_jokes)
    print(f"BLEU Score: {results['bleu']:.4f}\n")



Evaluating Model_1...
BLEU Score: 1.0000

Evaluating Model_2...
BLEU Score: 0.0000



In [None]:
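# High BLEU scores (close to 1.0) indicate that the model is generating jokes that are very similar to the reference jokes.
# Low BLEU scores suggest that the jokes generated by the model differ significantly from the reference jokes.
# BLEU only measures n-gram overlap with the references: Model_2 scores 0.0 above because its jokes are different jokes, not because they are less funny.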

In [15]:
from rouge_score import rouge_scorer

# Reference jokes (list of lists, each containing one reference joke)
# NOTE(review): same contents as the `reference_jokes` defined in the BLEU
# cell; redefined here so this cell does not depend on that one.
reference_jokes = [
    ["Why don’t scientists trust atoms? Because they make up everything!"],
    ["Why did the scarecrow win an award? Because he was outstanding in his field!"],
]

# Generated jokes by different LLMs (list of jokes per model)
# Each model's list is paired position-by-position with `reference_jokes`.
llm_outputs = {
    "Model_1": [
        "Why don’t scientists trust atoms? Because they make up everything!",
        "Why did the scarecrow win an award? Because he was outstanding in his field!",
    ],
    "Model_2": [
        "What did the ocean say to the beach? Nothing, it just waved!",
        "Why do cows have hooves instead of feet? Because they lactose!",
    ],
}

# Initialize ROUGE scorer
# Configured for ROUGE-1, ROUGE-2 and ROUGE-L with use_stemmer=True; this
# global is read by calculate_rouge_scores().
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to calculate ROUGE scores for jokes by different models
def calculate_rouge_scores(reference_jokes, llm_outputs):
    """Compute per-joke ROUGE F-measures for every model.

    Uses the module-level ``scorer``.

    Args:
        reference_jokes: One entry per joke position. Each entry is a list of
            one or more reference strings; a bare string is treated as a
            single reference.
        llm_outputs: Mapping of model name to its list of generated jokes,
            aligned by position with ``reference_jokes``.

    Returns:
        dict: Model name mapped to a list with one dict per joke, holding the
        ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` F-measures. When a joke
        has several references, the highest F-measure per metric is kept.

    Raises:
        ValueError: If a model's output count differs from the number of
            reference entries, or if a reference entry is empty.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    results = {}

    for model_name, generated_jokes in llm_outputs.items():
        # zip() would silently drop unmatched items; fail loudly instead.
        if len(generated_jokes) != len(reference_jokes):
            raise ValueError(
                f"{model_name}: got {len(generated_jokes)} generated jokes "
                f"for {len(reference_jokes)} reference entries"
            )

        per_joke_scores = []
        for position, (references, gen_joke) in enumerate(zip(reference_jokes, generated_jokes)):
            # A bare string must not be indexed character by character.
            if isinstance(references, str):
                references = [references]
            if not references:
                raise ValueError(f"reference entry {position} is empty")

            # Score against every reference, not just the first one.
            candidates = [scorer.score(reference, gen_joke) for reference in references]
            per_joke_scores.append({
                metric: max(candidate[metric].fmeasure for candidate in candidates)
                for metric in metric_names
            })

        results[model_name] = per_joke_scores

    return results

# Calculate ROUGE scores for all models
# Result maps each model name to a list of per-joke score dicts.
rouge_scores = calculate_rouge_scores(reference_jokes, llm_outputs)

# Print the ROUGE scores for each model
# Jokes are numbered from 1; a blank line separates the models.
for model_name, scores in rouge_scores.items():
    print(f"Results for {model_name}:")
    for i, score in enumerate(scores):
        print(f"Joke {i+1}: {score}")
    print()



Results for Model_1:
Joke 1: {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0}
Joke 2: {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0}

Results for Model_2:
Joke 1: {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
Joke 2: {'rouge1': 0.16, 'rouge2': 0.0, 'rougeL': 0.16}



In [16]:
# ROUGE-1 measures unigram overlap.
# ROUGE-2 measures bigram overlap.
# ROUGE-L measures the longest common subsequence (LCS), which captures word order.
# Together these show how closely each model's jokes match the reference jokes; like BLEU, they measure textual overlap, not how funny a joke is.