<a href="https://colab.research.google.com/github/codezero-01/RecipeGen/blob/main/model_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import statements
import logging
import math
import os

import h5py
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Restrict CUDA to the device with index 1, and (optional, useful when
# debugging CUDA errors) enable blocking kernel launches.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

PyTorch version: 1.8.1+cu102


In [None]:
# To inspect cuda memory allotment and clearing cuda cache
# print(torch.cuda.memory_summary(device=None, abbreviated=False))
# torch.cuda.empty_cache()


In [None]:
# Load the raw training examples (one per line) for manual inspection.
# The corpus contains non-ASCII characters (e.g. "°"), so the encoding is
# stated explicitly instead of relying on the platform default.
# NOTE(review): assumes train_temp.txt is UTF-8 encoded — confirm.
samples_data = []
with open('train_temp.txt', encoding='utf-8') as file:
    # Iterating a text file yields one line at a time; each row keeps its
    # trailing newline, exactly as the original append loop did.
    samples_data.extend(file)

In [None]:
# Display the first raw training example to check the recipe markup.
samples_data[0]

'<BEGIN_RECIPE> <BEGIN_INPUT> baking <NEXT_INPUT> chili_powder <NEXT_INPUT> chili <NEXT_INPUT> oven <NEXT_INPUT> baked <END_INPUT> <BEGIN_TITLE> Indian Spiced Baked Potato Sticks<END_TITLE> <BEGIN_INGREDS> potato <NEXT_INGREDS> kitchen <NEXT_INGREDS> ghee <NEXT_INGREDS> butter <NEXT_INGREDS> oil <NEXT_INGREDS> turmeric <NEXT_INGREDS> chili_powder <NEXT_INGREDS> curry <NEXT_INGREDS> leaves <NEXT_INGREDS> salt <NEXT_INGREDS> parchment_paper <NEXT_INGREDS> chips <END_INGREDS> <BEGIN_INSTR> pre heat oven to 400°f . wash and dry the potato . peel the potato if you like . i leave the skin on because i like the texture and nutrients . slice the potato into 1/8th inch thick slices lengthwise using a mandolin or a large , sharp knife . stack up the slices , and cut them lengthwise to form thin sticks . place the potato sticks on a kitchen towel , and gently pat dry . in a bowl , whisk together the ghee or melted butter or oil , turmeric , chili powder , curry leaves and salt . add the potato st

In [None]:
#  Model parameters and hyper-parameters



In [None]:
# PyTorch dataset class 
# Module-level logger: the class logs through `logger`, which was never
# defined in the notebook and so failed with NameError on a fresh kernel.
logger = logging.getLogger(__name__)


class H5Dataset(Dataset):
    """PyTorch dataset backed by one split stored in the ``data_temp.h5`` cache.

    The whole split is read into memory once at construction time; items are
    returned as ``torch.long`` tensors.

    Args:
        tokenizer: Accepted for interface compatibility; not used here.
        file_path: Name of the HDF5 dataset (split) to read, e.g.
            ``'train_temp'`` or ``'test_temp'``. The original code notes that
            ``'test_temp'`` is a dev set (30% of a test set).
        block_size: Accepted for interface compatibility; not used here.

    Raises:
        OSError: If ``data_temp.h5`` cannot be opened.
        KeyError: If ``file_path`` is not a dataset inside the HDF5 file.
    """

    def __init__(self, tokenizer, file_path='train_temp', block_size=512):
        cached_features_file = "data_temp.h5"

        logger.info("Loading features from cached file %s", cached_features_file)
        # Format the message before printing; the original printed the
        # (format, argument) tuple verbatim.
        print("Loading features from cached file %s" % cached_features_file)
        with h5py.File(cached_features_file, 'r') as f:
            # Both original branches (test vs. train) performed this same
            # read, so a single statement covers every split. `[:]` copies
            # the data out so it remains usable after the file is closed.
            self.samples = f[file_path][:]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return example ``item`` as an integer (``torch.long``) tensor.

        The dtype is forced to ``torch.long`` so the result does not depend
        on the integer width used when the cache was written.
        NOTE(review): assumes the rows hold token ids — confirm against the
        script that wrote data_temp.h5.
        """
        return torch.tensor(self.samples[item], dtype=torch.long)


In [None]:
def get_dataset(tokenizer, evaluate=False, local_rank=-1):
    """Build the H5Dataset for the training or the evaluation split.

    Args:
        tokenizer: Forwarded unchanged to ``H5Dataset``.
        evaluate: When true, read the ``"test_temp"`` split; otherwise read
            ``"train_temp"``.
        local_rank: Unused; kept so existing callers keep working.

    Returns:
        An ``H5Dataset`` over the selected split.
    """
    split_name = "train_temp"
    if evaluate:
        split_name = "test_temp"
    return H5Dataset(tokenizer=tokenizer, file_path=split_name)

In [None]:
# Set the global random seed (via transformers.set_seed) so that runs are repeatable.
set_seed(20)

In [None]:
# Load the pretrained GPT-2 configuration; downloads are cached in the local 'cache' directory.
config = AutoConfig.from_pretrained('gpt2', cache_dir='cache')

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Load the pretrained GPT-2 tokenizer used for training (same local 'cache' directory as the config).
tokenizer = AutoTokenizer.from_pretrained('gpt2', cache_dir= 'cache')

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Initialise the pretrained GPT-2 language model.
# AutoModelForCausalLM is used instead of AutoModelWithLMHead, which the
# Transformers library has deprecated in favour of the task-specific auto
# classes; for GPT-2 both resolve to the causal-LM head model.
model = AutoModelForCausalLM.from_pretrained(
    'gpt2',  # model name
    config=config,
    cache_dir='cache',  # cache directory (path to the cache directory)
)



Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [None]:
# Add the special recipe markup tokens to the tokenizer. The order of the
# list is preserved exactly as written.
special_tokens = dict(
    additional_special_tokens=[
        "<BEGIN_RECIPE>",
        "<BEGIN_INPUT>",
        "<NEXT_INPUT>",
        "<END_INPUT>",
        "<BEGIN_TITLE>",
        "<END_TITLE>",
        "<BEGIN_INGREDS>",
        "<NEXT_INGREDS>",
        "<END_INGREDS>",
        "<BEGIN_INSTR>",
        "<NEXT_INSTR>",
        "<END_INSTR>",
        "<END_RECIPE>",
    ]
)

tokenizer.add_special_tokens(special_tokens)

# Resize the model's token embeddings to match the tokenizer, which now
# includes the special tokens. Kept as the last expression so the cell
# still displays the resulting embedding module.
model.resize_token_embeddings(len(tokenizer))

Embedding(50270, 768)

In [None]:
# Verify the tokenizer configuration and the tokenizer length after adding the special tokens.
tokenizer, len(tokenizer)

(PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'additional_special_tokens': ['<BEGIN_RECIPE>', '<BEGIN_INPUT>', '<NEXT_INPUT>', '<END_INPUT>', '<BEGIN_TITLE>', '<END_TITLE>', '<BEGIN_INGREDS>', '<NEXT_INGREDS>', '<END_INGREDS>', '<BEGIN_INSTR>', '<NEXT_INSTR>', '<END_INSTR>', '<END_RECIPE>']}),
 50270)

In [None]:
# Wrap the cached train and validation splits as PyTorch Datasets so they
# can be handed to the Trainer.
train_dataset = get_dataset(tokenizer=tokenizer)
eval_dataset = get_dataset(tokenizer=tokenizer, evaluate=True)

('Loading features from cached file %s', 'data_temp.h5')
('Loading features from cached file %s', 'data_temp.h5')


In [None]:
# Data collators (source: huggingface.co):
# * A data collator forms a batch from a list of dataset elements; these
#   elements are of the same type as the elements of train_dataset or
#   eval_dataset.
# * To be able to build batches, data collators may apply some processing
#   (like padding). Some of them (like DataCollatorForLanguageModeling) also
#   apply some random data augmentation (like random masking) on the formed
#   batch.
# * Here the collator forms the training batches; mlm=False is passed, so
#   per the Transformers docs mlm_probability should have no effect.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    mlm_probability=0.15,
)

In [None]:
# Fine-tuning hyper-parameters for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir="./outputs/tempt",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    evaluation_strategy="epoch",
    # Checkpoint on the same schedule as evaluation. With the default
    # step-based saving, checkpoints and per-epoch evaluations do not line
    # up, so load_best_model_at_end has no evaluated checkpoint to pick from
    # (newer Transformers releases reject that combination outright).
    save_strategy="epoch",
    fp16=True,
    fp16_opt_level='O1',
    warmup_steps=100,  # integer step count; previously the float 1e2
    learning_rate=5e-4,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [None]:
# Initialise the PyTorch Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Hand the tokenizer (which carries the added recipe tokens) to the
    # Trainer so it is written out together with the model, keeping every
    # saved model loadable with a matching vocabulary. The explicit
    # data_collator above is still the one used for batching.
    tokenizer=tokenizer,
)

In [None]:
# Save the tokenizer object into the training output directory before
# training starts.
tokenizer.save_pretrained('./outputs/tempt/')

('./outputs/tempt1/tokenizer_config.json',
 './outputs/tempt1/special_tokens_map.json',
 './outputs/tempt1/vocab.json',
 './outputs/tempt1/merges.txt',
 './outputs/tempt1/added_tokens.json')

In [None]:
# Start the training, then save the model.
# NOTE(review): save_model() is called without a path, so it presumably
# writes to training_args.output_dir — confirm.
trainer.train()
trainer.save_model() 



Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
0,No log,1.581332,64.7707,9.464
1,2.902000,1.484565,63.6386,9.633
2,2.902000,1.443501,196.8733,3.114
3,1.461600,1.420606,192.5897,3.183
4,1.461600,1.411004,182.4219,3.36


In [None]:
# !nvidia-smi

Mon May  3 13:48:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:5E:00.0 Off |                    0 |
| N/A   41C    P0    36W / 250W |  31025MiB / 32510MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:D8:00.0 Off |                    0 |
| N/A   47C    P0    38W / 250W |   8001MiB / 32510MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------

In [None]:
# Save the tokenizer again after training the model, just to be safe.
tokenizer.save_pretrained('./outputs/tempt/')

('./outputs/tempt1/tokenizer_config.json',
 './outputs/tempt1/special_tokens_map.json',
 './outputs/tempt1/vocab.json',
 './outputs/tempt1/merges.txt',
 './outputs/tempt1/added_tokens.json')

In [None]:
# Evaluate the trained model on the evaluation dataset and derive perplexity.
results = {}

eval_output = trainer.evaluate()

# Perplexity is computed as exp(eval_loss). math.exp raises OverflowError for
# very large losses, so report infinity in that case instead of aborting.
try:
    perplexity = math.exp(eval_output["eval_loss"])
except OverflowError:
    perplexity = float("inf")
result = {"perplexity": perplexity}

# `results` was created but never filled; record this run's metrics in it.
results.update(result)

RuntimeError: CUDA out of memory. Tried to allocate 786.00 MiB (GPU 0; 31.75 GiB total capacity; 4.33 GiB already allocated; 469.19 MiB free; 4.65 GiB reserved in total by PyTorch)

In [None]:
# Show the raw evaluation metrics followed by the derived perplexity.
print(eval_output, '\n\n', result)

{'eval_loss': 1.4110041856765747, 'eval_runtime': 182.9562, 'eval_samples_per_second': 3.351, 'epoch': 5.0, 'eval_mem_cpu_alloc_delta': 55255, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 25460, 'eval_mem_gpu_peaked_delta': 2935491072} 

 {'perplexity': 4.100070569669201}
