In [None]:
KAGGLE_USERNAME = ''
KAGGLE_KEY = ''

In [1]:
import os
import pandas as pd
import ast
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random
import torch
from transformers import GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm.auto import tqdm

os.environ['KAGGLE_USERNAME'] = KAGGLE_USERNAME
os.environ['KAGGLE_KEY'] = KAGGLE_KEY
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
#Download datasets to pandas dataframe.

api = KaggleApi()
api.authenticate()

FOODCOM_URL = "shuyangli94/food-com-recipes-and-user-interactions"
FOODCOM_DIR = "datasets/foodcom"
FOODCOM_FILENAME = "RAW_recipes.csv"

RECIPENLG_URL = "paultimothymooney/recipenlg"
RECIPENLG_DIR = "datasets/recipenlg"
RECIPENLG_FILENAME = "RecipeNLG_dataset.csv"

def get_data(url, dir, filename):

  if not os.path.exists(os.path.join(dir, filename)):
    if not os.path.exists(dir):
        os.makedirs(dir)

    api.dataset_download_files(url, path=dir, unzip=True)
    print('Dataset downloaded and extracted.')

  else:
    print('Dataset already exists.')

  recipes = pd.read_csv(os.path.join(dir, filename))

  return recipes

FOODCOM_DF = get_data(FOODCOM_URL, FOODCOM_DIR, FOODCOM_FILENAME)
RECIPENLG_DF = get_data(RECIPENLG_URL, RECIPENLG_DIR, RECIPENLG_FILENAME)

Dataset URL: https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions
Dataset downloaded and extracted.
Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/recipenlg
Dataset downloaded and extracted.


In [3]:
#Create raw list of strings for dataloader.

TOKENIZER = GPT2TokenizerFast.from_pretrained("gpt2-medium", bos_token='<|startoftext|>', eos_token='<|endoftext|>', unk_token='<|unknown|>', pad_token='<|pad|>')
MAX_LEN = 128
DATA_N = 100000

def filter_text(text, tokenizer = TOKENIZER, max_len = MAX_LEN):
  tokens = tokenizer(text, padding=False, truncation=False, add_special_tokens=True, return_attention_mask=False, return_tensors=None)

  return len(tokens['input_ids']) <= max_len
def preprocess_foodcom(df, features, n = DATA_N):

  #Extract fields. Using title and ingredients to predict instructions.
  title_field, ing_field, inst_field = features
  df = df[[title_field, ing_field, inst_field]]

  #Clean null or 0 length rows.
  df = df[~df.map(lambda x: (isinstance(x, list)  or isinstance(x, str)) and  len(x) == 0).any(axis=1)]
  df = df.dropna()
  df = df.reset_index(drop=True)

  #The strings will be in the form: <|startoftext|> [Title]\nIngredients: [Ingredients]\nDirections: \n[Directions] <|endoftext|>
  #A list of these strings will be passed into the RecipeDataset class.
  df[inst_field] = df[inst_field].apply(lambda x: ast.literal_eval(x))
  df[inst_field] = df[inst_field].apply(lambda x: "\nDirections: \n" + '\n'.join([x[i] for i in range(len(x))]) + "<|endoftext|>")

  df[ing_field] = df[ing_field].apply(lambda x: ast.literal_eval(x))
  df[ing_field] = df[ing_field].apply(lambda x: "\nIngredients: "+', '.join([x[i] for i in range(len(x))]))
  df[title_field] = df[title_field].apply(lambda x: "<|startoftext|>" + x)

  raw_strings = df[[title_field, ing_field, inst_field]].apply(lambda row: ''.join(row.to_list()), axis=1).tolist()

  filt_strings = []

  #Filtering out examples where tokenized length > MAX_LEN
  for string in raw_strings:
    if filter_text(string):
      filt_strings.append(string)
    if len(filt_strings) == n:
      break
  return filt_strings



STR_FOODCOM = preprocess_foodcom(FOODCOM_DF, ('name', 'ingredients', 'steps'))
STR_RECIPENLG = preprocess_foodcom(RECIPENLG_DF, ('title', 'NER', 'directions'))

FOODCOM_TRAIN, FOODCOM_DEV = train_test_split(STR_FOODCOM, test_size=0.1, random_state=42)
RECIPENLG_TRAIN, RECIPENLG_DEV = train_test_split(STR_RECIPENLG, test_size=0.1, random_state=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1426 > 1024). Running this sequence through the model will result in indexing errors


In [5]:
#Dataset creation.

BATCH = 32

class RecipeDataset(Dataset):
  def __init__(self, data, tokenizer, max_len = 128):
    self.data = data
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.x_tokens = self.tokenizer(self.data, max_length = self.max_len, truncation = True,  padding='max_length', return_tensors='pt')

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    item = {k : v[idx] for k,v in self.x_tokens.items()}
    return item

TRAINSET_FOODCOM = RecipeDataset(FOODCOM_TRAIN, TOKENIZER)
DEVSET_FOODCOM = RecipeDataset(FOODCOM_DEV, TOKENIZER)

TRAINSET_RECIPENLG = RecipeDataset(RECIPENLG_TRAIN, TOKENIZER)
DEVSET_RECIPENLG = RecipeDataset(RECIPENLG_DEV, TOKENIZER)

TRAINLOAD_FOODCOM = DataLoader(TRAINSET_FOODCOM, batch_size=BATCH, shuffle=True)
DEVSET_FOODCOM = DataLoader(DEVSET_FOODCOM, batch_size=BATCH, shuffle=True)

TRAINLOAD_RECIPENLG = DataLoader(TRAINSET_RECIPENLG, batch_size=BATCH, shuffle=True)
DEVSET_RECIPENLG = DataLoader(DEVSET_RECIPENLG, batch_size=BATCH, shuffle=True)

In [6]:
#Samples random input string, test model output
def test_out(model, devset, tokenizer = TOKENIZER, device = DEVICE):
  test_str = random.choice(devset).partition("\nDirections: \n")[0] + "\nDirections: \n"
  enc = tokenizer(test_str, return_tensors="pt")
  input_ids      = enc["input_ids"]
  attention_mask = enc["attention_mask"]
  output = model.generate(input_ids.to(device), attention_mask=attention_mask.to(device), max_new_tokens=128, do_sample = True, top_k = 50, top_p = 0.85)
  return tokenizer.decode(output[0])

In [20]:
#Training: About 30min/epoch on A100 with ~90k training samples and batch size of 32.

RECIPE_NLG_MODEL_SAVE_PATH = "./nlg_model"
FOODCOM_MODEL_SAVE_PATH = "./foodcom_model"
LEARNING_RATE = 1e-6
EPOCHS = 3
def train_model(save_path, train, dev, device = DEVICE, tokenizer = TOKENIZER, lr = LEARNING_RATE, epochs = EPOCHS):

  configuration = GPT2Config.from_pretrained('gpt2-medium', output_hidden_states=False)
  model = GPT2LMHeadModel.from_pretrained('gpt2-medium', config=configuration)
  model.to(device)
  model.resize_token_embeddings(len(tokenizer))
  optimizer = AdamW(model.parameters(), lr=lr)

  for epoch in range(epochs):
    model.train()
    train_loss = 0
    dev_loss = 0
    progress_bar = tqdm(total=int(len(train)), desc=f'Epoch {epoch+1}/{epochs}')
    for step, batch in enumerate(train):
      input_ids = batch['input_ids'].to(device)
      labels  = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)

      model.zero_grad()
      outputs = model(input_ids, labels=labels, attention_mask=attention_mask)
      loss = outputs.loss
      loss.backward()
      optimizer.step()

      train_loss += loss.item()
      progress_bar.update(1)

    progress_bar.close()

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train)}")

    model.eval()

    with torch.no_grad():

      for step, batch in enumerate(dev):
        input_ids = batch['input_ids'].to(device)
        labels  = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, labels=labels, attention_mask=attention_mask)
        loss = outputs.loss
        dev_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Dev Loss: {dev_loss / len(dev)}")
    print("Example output: ")

    #Print example output
    test_str = test_out(model, FOODCOM_DEV)
    print(test_str)

    model.save_pretrained(save_path)

In [21]:
train_model(RECIPE_NLG_MODEL_SAVE_PATH, TRAINLOAD_RECIPENLG, DEVSET_RECIPENLG)



Epoch 1/3:   0%|          | 0/2813 [00:00<?, ?it/s]

Epoch 1/3, Train Loss: 2.4997263809899777


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch 1/3, Dev Loss: 1.3456937828764748
Example output: 
<|startoftext|>steph s chicken tortilla soup
Ingredients: chicken breast, mexican seasoning, black beans, rotel tomatoes, hot water, chicken bouillon, cheese, sour cream, tortilla chips
Directions: 
Put chicken breast in a bowl.
Bring a pot of water to a boil.
Add the seasoning.
Add black beans, tomatoes and rotel, pepper and stir to coat all the ingredients.
Boil until soup thickens and starts to boil.
Add cheese, sour cream, sour cream and tortilla chips and stir to combine.
Serve with rice, beans, potatoes or tortillas.<|endoftext|>


Epoch 2/3:   0%|          | 0/2813 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
train_model(FOODCOM_MODEL_SAVE_PATH, TRAINLOAD_FOODCOM, DEVSET_FOODCOM)