In [1]:
# https://huggingface.co/docs/peft/index

import os 
from transformers import GPT2Tokenizer, GPT2LMHeadModel

### Utils
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
### 

math_example_path = os.getcwd() + '/data/What_Is_Mathematics_An_Elementary_Approach_to_Ideas_and_Methods.txt'

with open(math_example_path, "r",  encoding="utf8") as f:
     math_example_text = f.read()

math_sentences = math_example_text.split("\n")

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
output_dir = "./finetuned_llm"

model_name = "gpt2" ##output_dir
tokenizer = GPT2Tokenizer.from_pretrained(model_name) #gpt2
llm_model = GPT2LMHeadModel.from_pretrained(model_name) # gpt2

tokenizer.pad_token = tokenizer.eos_token

# Configure the model
llm_model.config.pad_token_id = tokenizer.eos_token_id
llm_model.config.eos_token_id = tokenizer.eos_token_id
llm_model.config.vocab_size = llm_model.config.vocab_size + len(tokenizer.get_added_vocab())
llm_model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [19]:
tokenizer.pad_token = tokenizer.eos_token
encoded_data = tokenizer.batch_encode_plus(math_sentences, add_special_tokens=True, return_tensors='pt', padding=True)
BATH_SIZE = 10

batch_data = list(chunks(encoded_data["input_ids"], BATH_SIZE))[0:100] # <--- truncate number of batches       

In [20]:
tst_text = ["Do you love me ?", "What is math?", "Prime number is ", "What is world ?"]
tst_data = tokenizer.batch_encode_plus(tst_text, add_special_tokens=True, return_tensors='pt', padding=True)
output = llm_model.generate(tst_data["input_ids"])
print(tokenizer.decode(output[0]))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Do you love me? I love you too. I love you too. I love you too.


In [None]:
import torch

EPOCHS=10


optimizer = torch.optim.AdamW(llm_model.parameters(), lr=1e-5)

import mlflow

mlflow.end_run()
# mlflow.set_tag('mlflow.runName', "seq2seq_ex")

with mlflow.start_run() as run:

    for epoch in range(EPOCHS):
        for batch in batch_data: 
            outputs = llm_model(batch, labels=batch)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            print("batch loss ", loss.item())
            optimizer.step()

        # log metric every epoch
        mlflow.log_metric('loss', loss.item())
        output = llm_model.generate(tst_data["input_ids"])
        test_output = ""
        for i in range(output.shape[0]):
            tmp_str = tokenizer.decode(output[i])
            test_output += "\n" + tmp_str
        mlflow.log_text(test_output, f"file_{epoch}.txt")
        #### 

        
        # Save the fine-tuned model every epoch 
        print("MODEL saved loss ", loss)
        llm_model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

batch loss  13.65389347076416
batch loss  12.443979263305664
batch loss  10.709291458129883
batch loss  9.277181625366211
