In [4]:
import torch
torch.cuda.empty_cache()

# Predicting Recipe Tags from Descriptions

## Parameters

In [5]:
MODEL_NAME = "t5-small"

INGREDIENTS_SEPARATOR = ', '
STEPS_SEPARATOR = '\n- '

TRAIN_PROPORTION = 0.95

VAL_DATASET_MAX_SIZE = 200

## Load Model and Tokenizer

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    device_map="auto")

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## Load Data and Explore

In [10]:
import pandas as pd

df = pd.read_csv("data/RAW_recipes.csv")
df[:5]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


## Process the data

In [12]:
from ast import literal_eval

df['prompts'] = df.apply(
    lambda row: \
f"""
Given a recipe, predict its tags.
---
Recipe Name: {row['name']}
Cooking Time: {row['minutes']} minutes
Number of Ingredients: {row['n_ingredients']}
Number of Steps: {row['n_steps']}
Ingredients: {INGREDIENTS_SEPARATOR.join(literal_eval(row['ingredients']))}
Steps:\n- {STEPS_SEPARATOR.join(literal_eval(row['steps']))}
---
Now, predict the tags.
""", axis=1)

df['prompts'][0]

'\nGiven a recipe, predict its tags.\n---\nRecipe Name: arriba   baked winter squash mexican style\nCooking Time: 55 minutes\nNumber of Ingredients: 7\nNumber of Steps: 11\nIngredients: winter squash, mexican seasoning, mixed spice, honey, butter, olive oil, salt\nSteps:\n- make a choice and proceed with recipe\n- depending on size of squash , cut into half or fourths\n- remove seeds\n- for spicy squash , drizzle olive oil or melted butter over each cut squash piece\n- season with mexican seasoning mix ii\n- for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece\n- season with sweet mexican spice mix\n- bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin\n- be careful not to burn the squash especially if you opt to use sugar or butter\n- if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking\n- if desired , season with sa

In [13]:
df['tags'][0]

"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']"

In [14]:
# Better understand the tags

unique_tags = set(item for sublist in df['tags'].apply(literal_eval) for item in sublist)
print(list(tag for tag in unique_tags if tag)[:5])
print(len(unique_tags))

TAGS_TO_INDEX = {tag: i for i, tag in enumerate(unique_tags) if tag}
INDEX_TO_TAGS = {i: tag for i, tag in enumerate(unique_tags) if tag}

['hunan', 'main-dish-beef', 'whole-duck', 'side-dishes', 'pasta-elbow-macaroni']
552


In [15]:
# Data setup when we're trying to predict the tags as individual entities

df['tags_index'] = df['tags'].apply(literal_eval).apply(lambda tags: [TAGS_TO_INDEX[tag] for tag in tags if tag])
df['tags_index'][:5]

0    [391, 255, 196, 488, 137, 173, 74, 63, 4, 321,...
1    [265, 255, 196, 488, 137, 173, 74, 63, 425, 6,...
2           [255, 196, 173, 6, 52, 280, 282, 546, 404]
3    [391, 255, 196, 488, 173, 74, 4, 414, 104, 321...
4    [165, 255, 196, 488, 137, 173, 74, 63, 513, 46...
Name: tags_index, dtype: object

In [16]:
# Data setup when we're trying to predict the sorted sequence of tags as a CSV

df['tags_seq'] = df['tags'].apply(literal_eval).apply(sorted).apply(lambda tags: ','.join(tags))
df['tags_seq'][:5]

0    60-minutes-or-less,christmas,course,cuisine,di...
1    30-minutes-or-less,american,breakfast,course,c...
2    4-hours-or-less,chili,course,crock-pot-slow-co...
3    60-minutes-or-less,brunch,cheese,christmas,cou...
4    4-hours-or-less,american,amish-mennonite,canni...
Name: tags_seq, dtype: object

In [76]:
tags = df['tags_seq'].tolist()
print(len(tags))
print('\n'.join(tags[:5]))

231637
60-minutes-or-less,christmas,course,cuisine,dietary,easy,fall,holiday-event,main-ingredient,mexican,north-american,occasion,preparation,seasonal,side-dishes,squash,time-to-make,vegetables,vegetarian,winter
30-minutes-or-less,american,breakfast,course,cuisine,dietary,easy,equipment,kid-friendly,main-dish,main-ingredient,meat,north-american,northeastern-united-states,occasion,oven,pizza,pork,preparation,time-to-make
4-hours-or-less,chili,course,crock-pot-slow-cooker,dietary,equipment,main-dish,preparation,time-to-make
60-minutes-or-less,brunch,cheese,christmas,course,dietary,dinner-party,easter,easy,eggs-dairy,equipment,holiday-event,independence-day,inexpensive,main-ingredient,new-years,occasion,oven,potatoes,preparation,presentation,served-hot,side-dishes,st-patricks-day,stove-top,superbowl,thanksgiving,time-to-make,valentines-day,vegetables
4-hours-or-less,american,amish-mennonite,canning,condiments-etc,course,cuisine,dietary,heirloom-historical,holiday-event,main-ingredient,no

In [19]:
prompts = df['prompts'].tolist()
print(len(prompts))
print('\n'.join(prompts[:5]))

231637

Given a recipe, predict its tags.
---
Recipe Name: arriba   baked winter squash mexican style
Cooking Time: 55 minutes
Number of Ingredients: 7
Number of Steps: 11
Ingredients: winter squash, mexican seasoning, mixed spice, honey, butter, olive oil, salt
Steps:
- make a choice and proceed with recipe
- depending on size of squash , cut into half or fourths
- remove seeds
- for spicy squash , drizzle olive oil or melted butter over each cut squash piece
- season with mexican seasoning mix ii
- for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece
- season with sweet mexican spice mix
- bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin
- be careful not to burn the squash especially if you opt to use sugar or butter
- if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking
- if desired , season with salt
---
Now, p

## Assemble the datasets

In [20]:
train_max_index = int(len(prompts) * TRAIN_PROPORTION)

train_data_input = prompts[:train_max_index]
train_data_output = tags[:train_max_index]

val_data_input = prompts[train_max_index:]
val_data_output = tags[train_max_index:]

In [23]:
train_ids_input = tokenizer(prompts[:train_max_index], return_tensors='pt', padding='max_length', truncation=True).input_ids
train_ids_output = tokenizer(tags[:train_max_index], return_tensors='pt', padding='max_length', truncation=True).input_ids

val_ids_input = tokenizer(prompts[train_max_index:], return_tensors='pt', padding='max_length', truncation=True).input_ids
val_ids_output = tokenizer(tags[train_max_index:], return_tensors='pt', padding='max_length', truncation=True).input_ids

In [24]:
print(train_ids_input.shape)
print(train_ids_output.shape)

print(val_ids_input.shape)
print(val_ids_output.shape)

torch.Size([220055, 512])
torch.Size([220055, 512])
torch.Size([11582, 512])
torch.Size([11582, 512])


In [25]:
print(train_ids_input[:5])

tensor([[9246,    3,    9,  ...,    0,    0,    0],
        [9246,    3,    9,  ...,    0,    0,    0],
        [9246,    3,    9,  ...,    0,    0,    0],
        [9246,    3,    9,  ...,    0,    0,    0],
        [9246,    3,    9,  ...,    0,    0,    0]])


In [26]:
from torch.utils.data import Dataset
from torch import Tensor

class RecipeDatasetForSeq2Seq(Dataset):
    def __init__(self, input_ids: Tensor, output_ids: Tensor):
        self.input_ids = input_ids
        self.output_ids = output_ids

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx: int):
        return {
            'input_ids': self.input_ids[idx, :],
            'labels': self.output_ids[idx, :]
        }

In [27]:
train_dataset = RecipeDatasetForSeq2Seq(train_ids_input, train_ids_output)
val_dataset = RecipeDatasetForSeq2Seq(val_ids_input, val_ids_output)

In [28]:
from torch.utils.data import Subset
import random

# For expedience, limit the validation dataset to a specific size
indices = random.sample(range(len(val_dataset)), VAL_DATASET_MAX_SIZE)
val_dataset = Subset(val_dataset, indices)

## Metrics Setup

In [114]:
import evaluate
import numpy as np
from typing import List

def jaccard_similarity(list1: List[str], list2: List[str]) -> float:
    s1 = set(list1)
    s2 = set(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))

def dice_similarity(list1: List[str], list2: List[str]) -> float:
    s1 = set(list1)
    s2 = set(list2)
    return (2. * len(s1.intersection(s2))) / (len(s1) + len(s2))

def avg(vals: List[float]) -> float:
    if len(vals) == 0:
        return 1.0
    return sum(vals) / len(vals)

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # split the preds and labels and compute the Jaccard similarity
    pred_tags = [pred.split(',') for pred in decoded_preds]
    label_tags = [label.split(',') for label in decoded_labels]

    return {
        'Jaccard': avg(list(jaccard_similarity(p_tags, l_tags) for p_tags, l_tags in zip(pred_tags, label_tags))),
        'Dice': avg(list(dice_similarity(p_tags, l_tags) for p_tags, l_tags in zip(pred_tags, label_tags)))
    }

## Training

In [128]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, GenerationConfig

generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 50  # Required to ensure that the model has enough of a chance to output all the tags

training_args = Seq2SeqTrainingArguments(
    # training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=1,
    fp16=True,

    # evaluation
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_config=generation_config,

    # misc
    output_dir="./results",
    save_total_limit=3,
)

In [129]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [130]:
trainer.train()

Step,Training Loss,Validation Loss,Jaccard,Dice
1000,0.1126,0.098554,0.099248,0.164727
2000,0.108,0.090681,0.138496,0.226529
3000,0.1004,0.085041,0.176298,0.281635
4000,0.0947,0.081783,0.179488,0.287286
5000,0.0901,0.079051,0.177073,0.284786
6000,0.0879,0.076284,0.184151,0.29714
7000,0.0852,0.074576,0.201964,0.32353
8000,0.084,0.072774,0.211821,0.338468
9000,0.0831,0.071381,0.216824,0.345602
10000,0.0806,0.070745,0.214717,0.343267


TrainOutput(global_step=13754, training_loss=0.08958263863167895, metrics={'train_runtime': 4207.3434, 'train_samples_per_second': 52.303, 'train_steps_per_second': 3.269, 'total_flos': 2.978264012292096e+16, 'train_loss': 0.08958263863167895, 'epoch': 1.0})

## Generation / Sampling

In [132]:
@torch.no_grad()
def generate_from_model(text: str) -> List[str]:
    model.eval()
    with torch.cuda.amp.autocast():
        input_ids = tokenizer.encode(text, return_tensors='pt', padding='max_length', truncation=True).cuda()

    gen_output = model.generate(
        input_ids=input_ids, 
        generation_config=generation_config
    )

    str_output = tokenizer.decode(gen_output[0], skip_special_tokens=True)

    return str_output.split(',')

In [133]:
SINGLE_TEST_IDX = 10000

In [134]:
print(prompts[SINGLE_TEST_IDX])


Given a recipe, predict its tags.
---
Recipe Name: asparagus with orange ginger sauce
Cooking Time: 18 minutes
Number of Ingredients: 6
Number of Steps: 8
Ingredients: fresh asparagus, orange juice, orange zest, cornstarch, fresh gingerroot, slivered almonds
Steps:
- wash asparagus and snap off tough ends
- place in a steamer basket and steam until just tender , about 5 minutes
- in a small saucepan , stir together juice , zest and cornstarch
- cook over medium-high heat until boiling
- let thicken and reduce while stirring with a whisk , about 3 minutes
- remove from heat and add grated ginger
- place asparagus on a serving plate and pour sauce over asparagus
- top with toasted almonds
---
Now, predict the tags.



In [135]:
output_tags = generate_from_model(prompts[SINGLE_TEST_IDX])
print(output_tags)

['30-minutes-or-less', 'asparagus', 'course', 'cuisine', 'dietary', 'dinner-party', 'equipment', 'european', 'for-1-or-2', 'fruit', 'healthy', 'healthy-2', 'low']


In [136]:
input_tags = tags[SINGLE_TEST_IDX].split(',')
print(input_tags)

['30-minutes-or-less', 'asparagus', 'citrus', 'diabetic', 'dietary', 'fruit', 'main-ingredient', 'oranges', 'preparation', 'steam', 'technique', 'time-to-make', 'vegetables', 'vegetarian']


In [137]:
print(jaccard_similarity(input_tags, output_tags))
print(dice_similarity(input_tags, output_tags))

0.17391304347826086
0.2962962962962963
