## Prepare dataset

In [1]:
import pandas as pd
import random

def generate_food_entry():
    calories = random.randint(100, 900)
    protein = random.randint(1, 60)
    fat = random.randint(1, 80)
    carbs = random.randint(1, 120)
    fiber = random.randint(0, 15)
    sugar = random.randint(0, 50)
    sodium = random.randint(50, 2000)

    # Input text for model
    input_text = (
        f"Calories: {calories}, Protein: {protein}g, Fat: {fat}g, "
        f"Carbs: {carbs}g, Fiber: {fiber}g, Sugar: {sugar}g, Sodium: {sodium}mg"
    )

    # Derived summary logic
    summary_parts = []

    # Calories insights
    if calories < 300:
        summary_parts.append("Low-calorie meal ideal for light eating or snacks.")
    elif calories < 600:
        summary_parts.append("Moderate-calorie meal suitable for daily consumption.")
    else:
        summary_parts.append("High-calorie meal, best taken during heavy activity or bulking phase.")

    # Protein insights
    if protein > 30:
        summary_parts.append("Excellent source of protein, supports muscle recovery.")
    elif protein > 15:
        summary_parts.append("Contains adequate protein for balanced nutrition.")
    else:
        summary_parts.append("Low in protein, may not be ideal for athletes.")

    # Fat insights
    if fat > 50:
        summary_parts.append("Very high in fat — consume sparingly.")
    elif fat > 25:
        summary_parts.append("High-fat meal, suitable for keto or low-carb diets.")
    elif fat < 10:
        summary_parts.append("Low-fat meal, heart-friendly option.")

    # Carbs and sugar insights
    if carbs > 80:
        summary_parts.append("Carb-rich meal, provides quick energy.")
    elif carbs < 30:
        summary_parts.append("Low-carb meal, supports weight control.")

    if sugar > 20:
        summary_parts.append("High in sugar, limit for diabetic or weight-conscious diets.")
    elif sugar < 5:
        summary_parts.append("Low in sugar, suitable for low-glycemic diets.")

    # Fiber and sodium insights
    if fiber >= 8:
        summary_parts.append("Rich in fiber, promotes good digestion.")
    elif fiber < 3:
        summary_parts.append("Low fiber content, pair with vegetables or whole grains.")

    if sodium > 1500:
        summary_parts.append("Excess sodium — not recommended for hypertensive individuals.")
    elif sodium < 300:
        summary_parts.append("Low sodium content, good for heart health.")

    # Combine insights into one coherent summary
    target_text = " ".join(summary_parts)
    return {"input_text": input_text, "target_text": target_text}

# Generate dataset
dataset = [generate_food_entry() for _ in range(2000)]
df = pd.DataFrame(dataset)
df.to_csv("nutrition_summary_dataset.csv", index=False)

print(df.sample(5))


                                             input_text  \
1348  Calories: 656, Protein: 44g, Fat: 9g, Carbs: 1...   
742   Calories: 396, Protein: 15g, Fat: 32g, Carbs: ...   
1967  Calories: 426, Protein: 54g, Fat: 77g, Carbs: ...   
543   Calories: 218, Protein: 57g, Fat: 62g, Carbs: ...   
1166  Calories: 599, Protein: 13g, Fat: 2g, Carbs: 6...   

                                            target_text  
1348  High-calorie meal, best taken during heavy act...  
742   Moderate-calorie meal suitable for daily consu...  
1967  Moderate-calorie meal suitable for daily consu...  
543   Low-calorie meal ideal for light eating or sna...  
1166  Moderate-calorie meal suitable for daily consu...  


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
import evaluate
import numpy as np

# Load dataset
dataset = load_dataset("csv", data_files="nutrition_summary_dataset.csv")

# Split into train/test
dataset = dataset["train"].train_test_split(test_size=0.1)

# Tokenizer & model
model_name = "t5-small"  # can replace with "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocess
max_input_length = 128
max_target_length = 64

def preprocess_function(examples):
    inputs = examples["input_text"]
    targets = examples["target_text"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Evaluation metric
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {k: round(v * 100, 4) for k, v in result.items()}

# Training arguments
training_args = TrainingArguments(
    output_dir="./slm-recipe-summary",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    predict_with_generate=True,
    push_to_hub=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Map: 100%|██████████| 1800/1800 [00:00<00:00, 20291.96 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 17866.35 examples/s]


TypeError: __init__() got an unexpected keyword argument 'evaluation_strategy'