In [1]:
# https://huggingface.co/docs/peft/index

import os 
from transformers import GPT2Tokenizer, GPT2LMHeadModel

### Utils
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
### 

math_example_path = os.getcwd() + '/data/What_Is_Mathematics_An_Elementary_Approach_to_Ideas_and_Methods.txt'

with open(math_example_path, "r",  encoding="utf8") as f:
     math_example_text = f.read()

math_sentences = math_example_text.split("\n")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
print(len(math_sentences))
math_sentences = list(filter(lambda x: len(x) > 40, math_sentences))
print(len(math_sentences))
print(math_sentences[0])

12656
12656
ABOUT BLOG PROJECTS HELP DONATE  CONTACT JOBS VOLUNTEER PEOPLE


In [34]:
output_dir = "./finetuned_llm"

model_name = "gpt2" #output_dir #
tokenizer = GPT2Tokenizer.from_pretrained(model_name) #gpt2
llm_model = GPT2LMHeadModel.from_pretrained(model_name) # gpt2

tokenizer.pad_token = tokenizer.eos_token

# Configure the model
llm_model.config.pad_token_id = tokenizer.eos_token_id
llm_model.config.eos_token_id = tokenizer.eos_token_id
llm_model.config.vocab_size = llm_model.config.vocab_size + len(tokenizer.get_added_vocab())
llm_model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [36]:
tokenizer.pad_token = tokenizer.eos_token
encoded_data = tokenizer.batch_encode_plus(math_sentences, add_special_tokens=True, return_tensors='pt', padding=True)
BATH_SIZE = 10

batch_data = list(chunks(encoded_data["input_ids"], BATH_SIZE))[0:5] # <--- truncate number of batches       

In [37]:
tst_text = ["Do you love me ?", "What is math?", "Prime number is ", "What is world ?"]
tst_data = tokenizer.batch_encode_plus(tst_text, add_special_tokens=True, return_tensors='pt', padding=True)
output = llm_model.generate(tst_data["input_ids"])
print(tokenizer.decode(output[1]))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


What is math?<|endoftext|>The first time I saw the new "The Walking Dead" trailer, I


In [38]:
output = llm_model.generate(tokenizer.encode(math_sentences[1], add_special_tokens=True, return_tensors='pt', padding=False))
print(tokenizer.decode(output[0]))

This book should be returned on or before the date last  published.
The book is available


In [39]:
import torch

EPOCHS=10


optimizer = torch.optim.AdamW(llm_model.parameters(), lr=1e-5)

import mlflow

mlflow.end_run()

with mlflow.start_run(run_name='filter_len') as run:

    for epoch in range(EPOCHS):
        for batch in batch_data: 
            outputs = llm_model(batch, labels=batch)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            print("batch loss ", loss.item())
            optimizer.step()

        # log metric every epoch
        mlflow.log_metric('loss', loss.item())
        output = llm_model.generate(tst_data["input_ids"])
        test_output = ""
        for i in range(output.shape[0]):
            tmp_str = tokenizer.decode(output[i])
            test_output += "\n" + tmp_str
        mlflow.log_text(test_output, f"test_{epoch}.txt")
        #### 

        
        # Save the fine-tuned model every epoch 
        print("MODEL saved loss ", loss)
        llm_model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

batch loss  12.24035930633545
batch loss  11.305048942565918
batch loss  10.26236629486084
batch loss  9.5985746383667
batch loss  8.2322359085083


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


MODEL saved loss  tensor(8.2322, grad_fn=<NllLossBackward0>)
batch loss  7.049656391143799
batch loss  6.0097784996032715
batch loss  4.9497551918029785
batch loss  4.26508092880249
batch loss  3.611313819885254


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


MODEL saved loss  tensor(3.6113, grad_fn=<NllLossBackward0>)
batch loss  2.857067346572876
batch loss  2.544532299041748
batch loss  2.132084369659424
batch loss  1.9405757188796997
batch loss  1.8250163793563843


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


MODEL saved loss  tensor(1.8250, grad_fn=<NllLossBackward0>)
batch loss  1.5206247568130493
batch loss  1.5741976499557495
batch loss  1.4691665172576904
batch loss  1.511151909828186
batch loss  1.5160415172576904


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


MODEL saved loss  tensor(1.5160, grad_fn=<NllLossBackward0>)
batch loss  1.3227325677871704
batch loss  1.4066784381866455
batch loss  1.3359453678131104
batch loss  1.407519817352295
batch loss  1.4261610507965088


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


MODEL saved loss  tensor(1.4262, grad_fn=<NllLossBackward0>)


KeyboardInterrupt: 

In [27]:
output = llm_model.generate(tokenizer.encode(math_sentences[1], add_special_tokens=True, return_tensors='pt', padding=True), temperature=40)
print(tokenizer.decode(output[0]))

This book should be returned on or before the date last <|endoftext|>


