In [None]:
# Import statements
import pandas as pd
import numpy as np
import joblib as jb
import json
import re
from sklearn.model_selection import train_test_split
import json
import tqdm

In [None]:
# Shell escape: list the GPUs on this machine with their current memory use and utilisation.
!nvidia-smi

Tue May  4 06:33:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:5E:00.0 Off |                    0 |
| N/A   63C    P0    98W / 250W |  30735MiB / 32510MiB |     19%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:D8:00.0 Off |                    0 |
| N/A   80C    P0    97W / 250W |  15030MiB / 32510MiB |     67%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [None]:
# Load the preprocessed recipe DataFrame that the rest of the notebook splits and serialises.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary code —
# only load 'dataset_df_new.pkl' if it comes from a trusted source.
df_new = jb.load('dataset_df_new.pkl')

In [None]:
# Keep 94% of the recipes for training and hold out the rest for testing;
# the fixed random_state makes the shuffle (and therefore the split) repeatable.
train, test = train_test_split(
    df_new,
    train_size=0.94,
    random_state=2,
)

In [None]:
# Sanity check: (rows, columns) of the training split followed by the test split.
train_shape, test_shape = train.shape, test.shape
print(train_shape, test_shape)

(38336, 4) (2448, 4)


In [None]:
# Replace the shuffled row labels inherited from df_new with a fresh 0..n-1 index
# on both splits (modified in place, old labels discarded).
for split_df in (train, test):
    split_df.reset_index(drop=True, inplace=True)

In [None]:
# Preview the first five training rows to confirm the expected columns are present.
train.head(n=5)

Unnamed: 0,instructions,ingredients,title,keywords
0,[pre heat oven to 400°f . wash and dry the pot...,"[potato, kitchen, ghee, butter, oil, turmeric,...",Indian Spiced Baked Potato Sticks,"[baking, chili_powder, chili, oven, baked]"
1,"[prepare cilantro oil by pureeing cilantro , c...","[cilantro, oil, canola_oil, honey, lime_juice,...",Pancetta Wrapped Shrimp with Chipotle Vinaigre...,"[chipotle_pepper, rice_vinegar, lemon_juice, l..."
2,"[preheat oven to 325 , then cook the onions an...","[onions, carrots, butter, tender, brown, salt,...",Casserole-Poached Chicken W- White Wine Tarrag...,"[hot_chicken_stock, chicken_broth, white_peppe..."
3,[put the oil and garlic in a saucepan and turn...,"[oil, garlic, tomatoes, juices, sauce, salt, p...",Marcella Hazan's Sugo Fresco Di Pomodoro,"[garlic, pasta, saucepan, cooked, tomatoes]"
4,"[place oven rack in lower third of oven , heat...","[butter, flour, chocolate, tap_water, egg_yolk...",Chocolate Sin,"[chocolate_cake, chocolate, baking, cake, oven]"


In [None]:
# Convert the dataset into tagged plain text (one recipe per line) so it can be tokenized
def df_to_plaintext_file(input_df, output_file):
    """Write every row of ``input_df`` to ``output_file`` as one tagged recipe per line.

    Each output line has the layout (all on a single line)::

        <BEGIN_RECIPE> <BEGIN_INPUT> kw <NEXT_INPUT> kw <END_INPUT>
        <BEGIN_TITLE> title<END_TITLE>
        <BEGIN_INGREDS> ing <NEXT_INGREDS> ing <END_INGREDS>
        <BEGIN_INSTR> step <NEXT_INSTR> step <END_INSTR> <END_RECIPE>

    Args:
        input_df: DataFrame with a string column ``title`` and the columns
            ``keywords``, ``ingredients`` and ``instructions``, each holding
            an iterable of strings.
        output_file: Path of the text file to create. An existing file is
            overwritten.

    Side effects:
        Prints the output path. For every row whose index label is a multiple
        of 10000 it also prints the label plus that row's ingredients and
        keywords as a progress sample, so integer index labels are expected.
    """
    print("Writing to", output_file)
    # The recipes contain non-ASCII characters (e.g. the degree sign in "400°f"),
    # so write UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which may be unable to encode them.
    with open(output_file, 'w', encoding='utf-8') as f:
        for index, row in input_df.iterrows():

            title = row.title
            instructions = row.instructions
            ingredients = row.ingredients
            keyword = row.keywords

            # Periodic progress sample
            if index % 10000 == 0:
                print(index)
                print("ingreds --->", ingredients)
                print("keywords --->", keyword)

            # NOTE(review): unlike the other closing tags there is no space before
            # <END_TITLE>; kept as-is so the generated text format does not change.
            res = (
                "<BEGIN_RECIPE> <BEGIN_INPUT> " + " <NEXT_INPUT> ".join(keyword)
                + " <END_INPUT> <BEGIN_TITLE> " + title
                + "<END_TITLE> <BEGIN_INGREDS> " + " <NEXT_INGREDS> ".join(ingredients)
                + " <END_INGREDS> <BEGIN_INSTR> " + " <NEXT_INSTR> ".join(instructions)
                + " <END_INSTR> <END_RECIPE>"
            )
            f.write("{}\n".format(res))

In [None]:
# Serialise each split to its own plain-text file: training set first, then test set.
for split_df, out_path in ((train, 'train_temp.txt'), (test, 'test_temp.txt')):
    df_to_plaintext_file(split_df, out_path)

Writing to train_temp.txt
0
ingreds ---> ['potato', 'kitchen', 'ghee', 'butter', 'oil', 'turmeric', 'chili_powder', 'curry', 'leaves', 'salt', 'parchment_paper', 'chips']
keywords ---> ['baking', 'chili_powder', 'chili', 'oven', 'baked']
10000
ingreds ---> ['pastry', 'cream_cheese', 'egg', 'sugar', 'lemon_zest', 'mixture', 'cheese', 'almonds', 'serving']
keywords ---> ['cream_cheese', 'cheese', 'baking', 'pastry', 'almonds']
20000
ingreds ---> ['oil', 'onions', 'garlic', 'tender', 'clam_juice', 'chicken_broth', 'sauce', 'gravy', 'pepper', 'broth', 'clams', 'lemon_juice', 'tabasco', 'salt', 'soup', 'parsley', 'pasta']
keywords ---> ['chicken_broth', 'garlic', 'onions', 'lemon_juice', 'pasta']
30000
ingreds ---> ['beef', 'garlic', 'green_onions', 'green_peppers', 'salsa', 'green_chilies', 'egg', 'noodles', 'water', 'mixture', 'cheese', 'tomatoes', 'chili', 'spaghetti']
keywords ---> ['green_chilies', 'green_onions', 'green_peppers', 'chili', 'garlic']
Writing to test_temp.txt
0
ingreds -

### Tokenizing the recipes 

In [None]:
from transformers import GPT2Tokenizer
import h5py
import numpy as np

# Length of every packed training row. GPT-2 accepts at most 1024 tokens per
# sequence, so longer rows would not fit in the model.
MAX_SEQ_LEN = 1024

tokenizer = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)

# Structural markers emitted by the plain-text export; registering them as special
# tokens makes the tokenizer keep each one as a single token.
special_tokens = {
    "additional_special_tokens": [
        "<BEGIN_RECIPE>",
        "<BEGIN_INPUT>",
        "<NEXT_INPUT>",
        "<END_INPUT>",
        "<BEGIN_TITLE>",
        "<END_TITLE>",
        "<BEGIN_INGREDS>",
        "<NEXT_INGREDS>",
        "<END_INGREDS>",
        "<BEGIN_INSTR>",
        "<NEXT_INSTR>",
        "<END_INSTR>",
        "<END_RECIPE>",
    ]
}

tokenizer.add_special_tokens(special_tokens)

# <END_RECIPE> doubles as the padding token for partially filled rows.
end_token_id = tokenizer.convert_tokens_to_ids(["<END_RECIPE>"])[0]

# Pack the recipes of each split into fixed-width rows of MAX_SEQ_LEN token ids and
# store one HDF5 dataset per split. The context managers guarantee that both the
# HDF5 file and the text files are closed even if tokenization fails midway.
with h5py.File("data_temp.h5", "w") as hf:
    for filename in ["test_temp", "train_temp"]:
        out_np = []   # completed rows, each exactly MAX_SEQ_LEN ids long
        num = 0       # lines (recipes) read so far
        rows = 0      # rows completed so far
        last = []     # row currently being filled

        # Read the text as UTF-8 explicitly rather than with the locale default;
        # the recipes contain non-ASCII characters such as the degree sign.
        with open(filename + ".txt", "r", encoding="utf-8") as data:
            for line in data:
                num += 1
                if num % 10000 == 0:
                    print("Read "+str(num)+" Written: "+str(rows))

                text_tokens = tokenizer.tokenize(line)
                # A recipe longer than the context window can never fit in a row,
                # so it is skipped entirely.
                if len(text_tokens) > MAX_SEQ_LEN:
                    continue

                text_tokens_ids = tokenizer.convert_tokens_to_ids(text_tokens)

                if (len(last) + len(text_tokens_ids)) <= MAX_SEQ_LEN:
                    # Still room: append this recipe to the current row.
                    last += text_tokens_ids
                else:
                    # Row is full: pad it, store it, and start a new row with this recipe.
                    last += [end_token_id] * (MAX_SEQ_LEN - len(last))
                    out_np.append(last)
                    last = text_tokens_ids
                    rows += 1

        # Flush the final, partially filled row. Without this the recipes packed
        # into the last row of every split would be silently dropped.
        if last:
            last += [end_token_id] * (MAX_SEQ_LEN - len(last))
            out_np.append(last)
            rows += 1

        # Plain 2-D ndarray instead of the discouraged np.matrix; the reshape keeps
        # the (rows, MAX_SEQ_LEN) shape even when no row was produced.
        out_mat = np.array(out_np, dtype=np.int64).reshape(-1, MAX_SEQ_LEN)
        print(out_mat.shape)
        hf.create_dataset(filename, data=out_mat)


(613, 1024)
Read 10000 Written: 2468
Read 20000 Written: 4960
Read 30000 Written: 7445
(9481, 1024)


In [None]:
# Tokenizer size after registering the special tokens in the tokenization cell —
# presumably the base GPT-2 vocabulary plus the 13 added markers (TODO confirm).
len(tokenizer)

50270