## Load Dataset


In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import warnings as ignore

# Suppress all warnings
ignore.filterwarnings("ignore")

In [3]:
data = pd.read_csv("unique_prompts_generated_recipes_v2.csv")

In [5]:
data

Unnamed: 0,Prompt,Generated Recipe
0,Generate a dairy-free recipe for lunch with yo...,"Dish: Ingredients: yogurt, chickpeas, spinach,..."
1,Generate a dairy-free recipe for dinner with g...,"Dish: Ingredients: ginger, olive oil, tomato, ..."
2,Generate a vegetarian recipe for dinner with c...,"Dish: Ingredients: cucumber, potato, tofu, bre..."
3,Generate a dairy-free recipe for lunch with le...,"Dish: Ingredients: lentils, basil, spinach, on..."
4,Generate a vegetarian recipe for dinner with b...,"Dish: Ingredients: basil, lemongrass, pasta, b..."
...,...,...
2995,Generate a vegetarian recipe for dinner with t...,"Dish: Ingredients: tomato, soy sauce, spinach,..."
2996,Generate a dairy-free recipe for dinner with e...,"Dish: Ingredients: eggplant, rice, avocado, ol..."
2997,Generate a dairy-free recipe for dinner with p...,"Dish: Ingredients: potato, olive oil, lemongra..."
2998,Generate a dairy-free recipe for dinner with t...,"Dish: Ingredients: tomato, eggplant, pasta, se..."


## Data Cleaning


In [9]:
# Clean the text: Remove special characters and lowercasing

# Clean the text: Remove special characters and lowercasing
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text


data['cleaned_prompt'] = data['Prompt'].apply(clean_text)
data['cleaned_recipe'] = data['Generated Recipe'].apply(clean_text)


In [10]:
data

Unnamed: 0,Prompt,Generated Recipe,cleaned_prompt,cleaned_recipe
0,Generate a dairy-free recipe for lunch with yo...,"Dish: Ingredients: yogurt, chickpeas, spinach,...",generate a dairyfree recipe for lunch with yog...,dish ingredients yogurt chickpeas spinach carr...
1,Generate a dairy-free recipe for dinner with g...,"Dish: Ingredients: ginger, olive oil, tomato, ...",generate a dairyfree recipe for dinner with gi...,dish ingredients ginger olive oil tomato spina...
2,Generate a vegetarian recipe for dinner with c...,"Dish: Ingredients: cucumber, potato, tofu, bre...",generate a vegetarian recipe for dinner with c...,dish ingredients cucumber potato tofu breadcru...
3,Generate a dairy-free recipe for lunch with le...,"Dish: Ingredients: lentils, basil, spinach, on...",generate a dairyfree recipe for lunch with len...,dish ingredients lentils basil spinach onion i...
4,Generate a vegetarian recipe for dinner with b...,"Dish: Ingredients: basil, lemongrass, pasta, b...",generate a vegetarian recipe for dinner with b...,dish ingredients basil lemongrass pasta breadc...
...,...,...,...,...
2995,Generate a vegetarian recipe for dinner with t...,"Dish: Ingredients: tomato, soy sauce, spinach,...",generate a vegetarian recipe for dinner with t...,dish ingredients tomato soy sauce spinach chic...
2996,Generate a dairy-free recipe for dinner with e...,"Dish: Ingredients: eggplant, rice, avocado, ol...",generate a dairyfree recipe for dinner with eg...,dish ingredients eggplant rice avocado olive o...
2997,Generate a dairy-free recipe for dinner with p...,"Dish: Ingredients: potato, olive oil, lemongra...",generate a dairyfree recipe for dinner with po...,dish ingredients potato olive oil lemongrass c...
2998,Generate a dairy-free recipe for dinner with t...,"Dish: Ingredients: tomato, eggplant, pasta, se...",generate a dairyfree recipe for dinner with to...,dish ingredients tomato eggplant pasta sesame ...


In [11]:
# Split the data into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [12]:
train_data

Unnamed: 0,Prompt,Generated Recipe,cleaned_prompt,cleaned_recipe
642,Generate a vegetarian recipe for dinner with s...,"Dish: Ingredients: soy sauce, carrot, coconut ...",generate a vegetarian recipe for dinner with s...,dish ingredients soy sauce carrot coconut milk...
700,Generate a vegetarian recipe for dinner with m...,"Dish: Ingredients: mushrooms, spinach, potato,...",generate a vegetarian recipe for dinner with m...,dish ingredients mushrooms spinach potato ging...
226,Generate a vegetarian recipe for lunch with cu...,"Dish: Ingredients: cucumber, potato, carrot, g...",generate a vegetarian recipe for lunch with cu...,dish ingredients cucumber potato carrot garlic...
1697,Generate a dairy-free recipe for lunch with cu...,"Dish: Ingredients: cucumber, carrot, spinach, ...",generate a dairyfree recipe for lunch with cuc...,dish ingredients cucumber carrot spinach onion...
1010,Generate a vegetarian recipe for lunch with yo...,"Dish: Ingredients: yogurt, parmesan, mushrooms...",generate a vegetarian recipe for lunch with yo...,dish ingredients yogurt parmesan mushrooms cuc...
...,...,...,...,...
1638,Generate a dairy-free recipe for lunch with ca...,"Dish: Ingredients: carrot, cashews, yogurt, po...",generate a dairyfree recipe for lunch with car...,dish ingredients carrot cashews yogurt potato ...
1095,Generate a vegetarian recipe for dinner with c...,"Dish: Ingredients: cashews, pasta, chickpeas, ...",generate a vegetarian recipe for dinner with c...,dish ingredients cashews pasta chickpeas potat...
1130,Generate a vegetarian recipe for lunch with ba...,"Dish: Ingredients: basil, chickpeas, yogurt, c...",generate a vegetarian recipe for lunch with ba...,dish ingredients basil chickpeas yogurt carrot...
1294,Generate a dairy-free recipe for lunch with se...,"Dish: Ingredients: sesame seeds, chickpeas, co...",generate a dairyfree recipe for lunch with ses...,dish ingredients sesame seeds chickpeas coconu...


In [13]:
test_data

Unnamed: 0,Prompt,Generated Recipe,cleaned_prompt,cleaned_recipe
1801,Generate a dairy-free recipe for lunch with to...,"Dish: Ingredients: tomato, coconut milk, onion...",generate a dairyfree recipe for lunch with tom...,dish ingredients tomato coconut milk onion cuc...
1190,Generate a vegetarian recipe for lunch with ca...,"Dish: Ingredients: cashews, ginger, yogurt, pa...",generate a vegetarian recipe for lunch with ca...,dish ingredients cashews ginger yogurt pasta i...
1817,Generate a vegetarian recipe for lunch with ch...,"Dish: Ingredients: chickpeas, tofu, bell peppe...",generate a vegetarian recipe for lunch with ch...,dish ingredients chickpeas tofu bell peppers c...
251,Generate a vegetarian recipe for lunch with ca...,"Dish: Ingredients: carrot, chickpeas, tomato, ...",generate a vegetarian recipe for lunch with ca...,dish ingredients carrot chickpeas tomato potat...
2505,Generate a vegetarian recipe for lunch with ol...,"Dish: Ingredients: olive oil, spinach, tofu, c...",generate a vegetarian recipe for lunch with ol...,dish ingredients olive oil spinach tofu cashew...
...,...,...,...,...
104,Generate a dairy-free recipe for dinner with b...,"Dish: Ingredients: breadcrumbs, onion, spinach...",generate a dairyfree recipe for dinner with br...,dish ingredients breadcrumbs onion spinach bas...
2087,Generate a dairy-free recipe for dinner with t...,"Dish: Ingredients: tomato, bell peppers, olive...",generate a dairyfree recipe for dinner with to...,dish ingredients tomato bell peppers olive oil...
599,Generate a dairy-free recipe for lunch with ga...,"Dish: Ingredients: garlic, coconut milk, eggpl...",generate a dairyfree recipe for lunch with gar...,dish ingredients garlic coconut milk eggplant ...
1756,Generate a dairy-free recipe for dinner with b...,"Dish: Ingredients: bell peppers, basil, onion,...",generate a dairyfree recipe for dinner with be...,dish ingredients bell peppers basil onion spin...


In [14]:
!pip install datasets




In [15]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset


In [16]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [18]:
# Tokenize the input prompts and generated recipes
train_encodings = tokenizer(list(train_data['cleaned_prompt']), truncation=True, padding=True, max_length=128)
train_labels = tokenizer(list(train_data['cleaned_recipe']), truncation=True, padding=True, max_length=128)


test_encodings = tokenizer(list(test_data['cleaned_prompt']), truncation=True, padding=True, max_length=128)
test_labels = tokenizer(list(test_data['cleaned_recipe']), truncation=True, padding=True, max_length=128)



In [19]:
train_encodings

{'input_ids': [[3806, 3, 9, 16364, 2696, 21, 2634, 28, 78, 63, 3837, 15578, 9417, 3702, 1394, 265, 15, 7299, 3, 16217, 15698, 1, 0, 0, 0, 0, 0, 0, 0], [3806, 3, 9, 16364, 2696, 21, 2634, 28, 20047, 21659, 14741, 15698, 3, 16217, 24395, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3806, 3, 9, 16364, 2696, 21, 3074, 28, 24395, 14741, 15578, 9119, 3, 16217, 13732, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3806, 3, 9, 13688, 2113, 2696, 21, 3074, 28, 24395, 15578, 21659, 12909, 3, 16217, 24026, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3806, 3, 9, 16364, 2696, 21, 3074, 28, 19168, 260, 2687, 152, 20047, 24395, 3, 16217, 1723, 15, 210, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0], [3806, 3, 9, 16364, 2696, 21, 2634, 28, 15698, 19168, 1723, 15, 210, 7, 13732, 3, 16217, 9417, 3702, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3806, 3, 9, 16364, 2696, 21, 2634, 28, 24026, 7, 12909, 29714, 17507, 3, 16217, 12815, 5270, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [3806, 3, 9, 13688, 2113, 2696, 21, 3074, 28, 9119, 20047, 6605, 87

In [23]:
# Create a custom dataset for use in the Trainer
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels['input_ids']
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels['input_ids']
})


In [24]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2400
})

In [25]:
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 600
})

In [26]:
test_dataset[0]

{'input_ids': [3806,
  3,
  9,
  13688,
  2113,
  2696,
  21,
  3074,
  28,
  12784,
  9417,
  3702,
  12909,
  24395,
  3,
  16217,
  21659,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': [4419,
  3018,
  12784,
  9417,
  3702,
  12909,
  24395,
  3909,
  5148,
  12784,
  9417,
  3702,
  12909,
  617,
  24395,
  3989,
  8583,
  11,
  1716,
  1312,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [28]:
# Set up training arguments with validation logging
training_args = TrainingArguments(
    output_dir='./results',  # Output directory
    num_train_epochs=5,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    warmup_steps=500,  # Warmup steps
    weight_decay=0.01,  # Weight decay
    logging_dir='./logs',  # Logging directory
    logging_steps=10,
    evaluation_strategy="steps",  # Evaluate during training
    eval_steps=500,  # Evaluate every 500 steps
    save_steps=1000,  # Save model checkpoints every 1000 steps
    load_best_model_at_end=True,  # Load the best model when finished
)

In [29]:
# Initialize Trainer
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,  # Evaluation dataset
)


# Fine-tune the model
trainer.train()




Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,0.0205,0.000278
1000,0.0038,3.6e-05
1500,0.0016,2.4e-05


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1500, training_loss=1.111177295299247, metrics={'train_runtime': 2263.4158, 'train_samples_per_second': 5.302, 'train_steps_per_second': 0.663, 'total_flos': 91990130688000.0, 'train_loss': 1.111177295299247, 'epoch': 5.0})

## Save Tuned Model


In [30]:
# Save the fine-tuned model
model.save_pretrained('./fine_tuned_t5_recipe_model')
tokenizer.save_pretrained('./fine_tuned_t5_recipe_model')

('./fine_tuned_t5_recipe_model\\tokenizer_config.json',
 './fine_tuned_t5_recipe_model\\special_tokens_map.json',
 './fine_tuned_t5_recipe_model\\spiece.model',
 './fine_tuned_t5_recipe_model\\added_tokens.json')

## Recipe Generation System


In [32]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer


model = T5ForConditionalGeneration.from_pretrained('./fine_tuned_t5_recipe_model')
tokenizer = T5Tokenizer.from_pretrained('./fine_tuned_t5_recipe_model')
# Set the device to GPU if available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)


# Function to generate a recipe from a prompt
def generate_recipe(prompt, model, tokenizer, max_length=150):
    prompt = clean_text(prompt)  # Clean the input prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate the recipe
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)

    return tokenizer.decode(output[0], skip_special_tokens=True)



In [33]:
# Test the recipe generation
prompt = "Generate a vegetarian recipe for dinner with tomatoes and spinach"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)

dish ingredients tomatoes and spinach instructions combine tomatoes and spinach add spinach cook thoroughly and serve hot


In [34]:
# Test the recipe generation with a different prompt
prompt = "Generate a vegan dessert recipe with chocolate and almonds"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)

dish ingredients chocolate and almonds instructions combine chocolate and almonds cook thoroughly and serve hot


In [35]:
# Test the recipe generation with another prompt
prompt = "Generate a gluten-free recipe for breakfast with eggs and avocado"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)


dish ingredients eggs and avocado instructions combine eggs and avocado add avocado cook thoroughly and serve hot
