In [1]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=75a0d0a902c3d1439f01394c608bce6e1d17f24c1b69b0fca0dbbbbf16b780bd
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [3]:
import os
import torch
import logging
import json
from datetime import datetime
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import pandas as pd
from sklearn.model_selection import train_test_split
import ast
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from tqdm import tqdm
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import numpy as np
from sklearn.metrics import confusion_matrix
import pandas as pd
import time
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [4]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Error loading tokenizers/punkt/english.pickle: Package
[nltk_data]     'tokenizers/punkt/english.pickle' not found in index


False

In [26]:
DATA_PATH = '/content/generated_recipes_smollm17b.json'
MODEL_NAME = 'SMOL17B'

In [27]:
def comparison_metrics(reference, hypothesis, ingredients):
  metrics = {}
  scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
  rouge_scores = scorer.score(reference, hypothesis)

  reference_tokens = nltk.word_tokenize(reference.lower())
  hypothesis_tokens = nltk.word_tokenize(hypothesis.lower())


  metrics.update({
    'rouge1': rouge_scores['rouge1'].fmeasure,
    'rouge2': rouge_scores['rouge2'].fmeasure,
    'rougeL': rouge_scores['rougeL'].fmeasure,
    'rouge1_precision': rouge_scores['rouge1'].precision,
    'rouge1_recall': rouge_scores['rouge1'].recall,
    'rouge2_precision': rouge_scores['rouge2'].precision,
    'rouge2_recall': rouge_scores['rouge2'].recall,
    'rougeL_precision': rouge_scores['rougeL'].precision,
    'rougeL_recall': rouge_scores['rougeL'].recall,
    })

        # BLEU scores
  metrics.update({
    'bleu1': sentence_bleu([reference_tokens], hypothesis_tokens,
                            weights=(1, 0, 0, 0),
                            smoothing_function=SmoothingFunction().method1),
    'bleu2': sentence_bleu([reference_tokens], hypothesis_tokens,
                            weights=(0.5, 0.5, 0, 0),
                            smoothing_function=SmoothingFunction().method1),
    'bleu3': sentence_bleu([reference_tokens], hypothesis_tokens,
                            weights=(0.33, 0.33, 0.33, 0),
                            smoothing_function=SmoothingFunction().method1),
    'bleu4': sentence_bleu([reference_tokens], hypothesis_tokens,
                            smoothing_function=SmoothingFunction().method1),
    })
  return metrics

def single_source_metrics(source, ingredients, field_name):
  metrics = {}
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
  model = GPT2LMHeadModel.from_pretrained('gpt2')
  model.eval()
  model.to(device)
  inputs = tokenizer(source, return_tensors='pt', truncation=True, max_length=512).to(device)
  inputs['labels'] = inputs['input_ids']
  with torch.no_grad():
    outputs = model(**inputs)
    loss = outputs.loss

    perplexity = torch.exp(loss).item()
  metrics['perplexity'] = perplexity

  if ingredients:
    ingredient_list = [ing.strip().lower() for ing in ingredients.split(' ')]
    mentioned_ingredients = sum(1 for ing in ingredient_list if ing in source.lower())
    metrics['ingredient_coverage'] = mentioned_ingredients / len(ingredient_list)
  else:
    metrics['ingredient_coverage'] = 0.0

        # 4. Step Complexity Score
  steps = [s for s in source.split('\n') if s.strip()]
  avg_step_length = np.mean([len(nltk.word_tokenize(step)) for step in steps]) if steps else 0
  metrics['step_complexity'] = min(1.0, avg_step_length / 20)  # Normalize to [0,1]

        # 5. Recipe Coherence Score (using step order logic)
  cooking_verbs = set(['mix', 'stir', 'cook', 'bake', 'boil', 'fry', 'add', 'combine', 'heat', 'pour'])
  verb_sequence = []
  for step in steps:
    tokens = nltk.word_tokenize(step.lower())
    verbs = [token for token in tokens if token in cooking_verbs]
    if verbs:
      verb_sequence.append(verbs[0])

        # Check if verbs follow logical cooking order
  valid_transitions = {
    'mix': ['add', 'pour', 'stir'],
    'add': ['mix', 'stir', 'combine'],
    'combine': ['mix', 'stir', 'heat'],
    'heat': ['cook', 'bake', 'boil', 'fry'],
    'stir': ['add', 'pour', 'heat']
    }

  coherence_score = 0
  if len(verb_sequence) > 1:
    valid_transitions_count = sum(1 for i in range(len(verb_sequence)-1)
                                  if verb_sequence[i+1] in valid_transitions.get(verb_sequence[i], []))
    coherence_score = valid_transitions_count / (len(verb_sequence) - 1)
  metrics['recipe_coherence'] = coherence_score

        # 6. Temperature and Time Specification Score
  temp_patterns = r'\d+\s*[°℉℃F]'
  time_patterns = r'\d+\s*(minutes?|mins?|hours?|hrs?)'

  has_temp = bool(re.search(temp_patterns, source))
  has_time = bool(re.search(time_patterns, source))
  metrics['temp_time_score'] = (has_temp + has_time) / 2



  final_metrics = {field_name + "-" +k: v for k, v in metrics.items()}
  return final_metrics

In [28]:
def load_data(data_path = DATA_PATH):
  with open(data_path, 'r') as file:
    data = json.load(file)
    return data




def run_metrics(data, model_name = MODEL_NAME):
  metrics = []
  i = 0
  for d in data:
    c_metrics = comparison_metrics(d['reference_instructions'], d['generation'], d['ingredients'])
    true_metrics = single_source_metrics(d['reference_instructions'], d['ingredients'], "true")

    pred_metrics = single_source_metrics(d['generation'], d['ingredients'], "pred")

    final_metrics = {**c_metrics, **true_metrics, **pred_metrics}
    metrics.append(final_metrics)
    print(i)
    i += 1
  return metrics



In [29]:
from collections import defaultdict
def average_dicts(dicts):
    if not dicts:
        return {}

    # Initialize defaultdict to sum values and count entries for each key
    sums = defaultdict(float)
    counts = defaultdict(int)

    # Iterate through dictionaries and accumulate sums and counts
    for d in dicts:
        for key, value in d.items():
            sums[key] += value
            counts[key] += 1

    # Compute averages
    averages = {key: sums[key] / counts[key] for key in sums}
    return averages

In [None]:
data = load_data()

metrics = run_metrics(data)
average = average_dicts(metrics)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [23]:
output_file = "metrics-" +MODEL_NAME +  ".json"
output_avg = "avg_metrics-" +MODEL_NAME +  ".json"
print(output_avg)
with open(output_file, "w") as f:
  json.dump(metrics, f, indent=4)
with open(output_avg, "w") as f:
  json.dump(average, f, indent=4)

avg_metrics-SMOL360.json


In [24]:
print(average)

{'rouge1': 0.10886351787996909, 'rouge2': 0.006513438517058858, 'rougeL': 0.05566017553750084, 'rouge1_precision': 0.11274985746060036, 'rouge1_recall': 0.1638964863921114, 'rouge2_precision': 0.0072570171880496515, 'rouge2_recall': 0.00877638983690814, 'rougeL_precision': 0.060782993354623366, 'rougeL_recall': 0.08407971918778841, 'bleu1': 0.06581891478105072, 'bleu2': 0.013308621980433493, 'bleu3': 0.005019753679749801, 'bleu4': 0.0027713637453327788, 'true-perplexity': 39.407653759002685, 'true-ingredient_coverage': 0.584435077808551, 'true-step_complexity': 0.532093880967229, 'true-recipe_coherence': 0.21421724386724397, 'true-temp_time_score': 0.373, 'pred-perplexity': 90.66685971450805, 'pred-ingredient_coverage': 0.15826757596849694, 'pred-step_complexity': 0.9798764912280699, 'pred-recipe_coherence': 0.018, 'pred-temp_time_score': 0.116}
