In [31]:
# https://huggingface.co/docs/peft/index

import os 
from transformers import GPT2Tokenizer, GPT2LMHeadModel

### Utils
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def split_strings(strings):
    first_parts = []
    second_parts = []
    for string in strings:
        split_index = len(string) // 2
        while split_index < len(string) and not string[split_index].isspace():
            split_index += 1
        first_part = string[:split_index].rstrip()
        second_part = string[split_index:].lstrip()
        first_parts.append(first_part)
        second_parts.append(second_part)
    return first_parts, second_parts
    
### 

math_example_path = os.getcwd() + '/data/What_Is_Mathematics_An_Elementary_Approach_to_Ideas_and_Methods.txt'

with open(math_example_path, "r",  encoding="utf8") as f:
     math_example_text = f.read()

math_example_text = math_example_text.replace("\n", "")
math_sentences = math_example_text.split(".")

In [32]:
print(len(math_sentences))
math_sentences = [item for item in math_sentences if 80 < len(item) < 100]
X, Y = split_strings(math_sentences)
print(math_sentences[1])

9421
 But it is not a concession to the dangerous tendency toward dodging all exertion


In [33]:
tst_ind = 30
print(math_sentences[tst_ind], "\n X: ", X[tst_ind], "\n Y: ", Y[tst_ind])

 For a « 20, we need twenty words for the digits, plus words for 20 and 400, making a total of 22 
 X:   For a « 20, we need twenty words for the digits, 
 Y:  plus words for 20 and 400, making a total of 22


In [46]:
import torch 

output_dir = "./finetuned_llm_all"

model_name = "gpt2" #output_dir #
tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_side='left') #gpt2
llm_model = GPT2LMHeadModel.from_pretrained(model_name) # gpt2
device = torch.device('cuda')
llm_model.to(device)

tokenizer.pad_token = tokenizer.eos_token

# Configure the model
llm_model.config.pad_token_id = tokenizer.eos_token_id
llm_model.config.eos_token_id = tokenizer.eos_token_id
llm_model.config.vocab_size = llm_model.config.vocab_size + len(tokenizer.get_added_vocab())
llm_model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [47]:
#len_params = len(list(llm_model.parameters()))
#for i, param in enumerate(llm_model.parameters()):
#    param.requires_grad = i/len_params > 0.8 # train last 20% of layers

In [80]:
import numpy as np
#np.random.seed(309)

tokenizer.pad_token = tokenizer.eos_token
llm_model.config.pad_token_id = tokenizer.pad_token_id

# Set padding to the left side as the model is a decoder-only architecture
tokenizer.padding_side = "left"

encoded_X = tokenizer.batch_encode_plus(X, add_special_tokens=False, return_tensors='pt', padding=True)
encoded_Y = tokenizer.batch_encode_plus(Y, add_special_tokens=False, return_tensors='pt', padding=True)

test_ids = np.random.choice(range(len(X)), 5)
test_X = [X[i] for i in test_ids]
enc_test_X = tokenizer.batch_encode_plus(test_X, 
                                     add_special_tokens=False, 
                                     return_tensors='pt', 
                                     padding=True) # X is encoded Y isnt !


test_Y = [Y[i] for i in test_ids]

BATH_SIZE = 10

x_batches = list(chunks(encoded_X["input_ids"], BATH_SIZE))
y_batches = list(chunks(encoded_Y["input_ids"], BATH_SIZE))

In [113]:
test_X = [X[i] for i in test_ids]
enc_test_X = tokenizer.batch_encode_plus(test_X, 
                                     add_special_tokens=False, 
                                     return_tensors='pt', 
                                     padding=True) # X is encoded Y isnt !
                                         
output = llm_model.generate(enc_test_X["input_ids"].to(device),  
                            max_new_tokens=100, 
                            pad_token_id=tokenizer.eos_token_id)

# Set padding to the left side as the model is a decoder-only architecture

for i, text in enumerate(test_X[0:1]):
    print("input:", text) # <<<!
    print("output:", tokenizer.decode(output[i]))
    

input:  Peaucellier, invented a simple linkage that solved
output: <|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> Peaucellier, invented a simple linkage that solved the problem of how to connect two different types of cells.

The Peaucellier link is a simple, yet powerful, way to connect two different types of cells.

The Peaucellier link is a simple, yet powerful, way to connect two different types of cells.

The Peaucellier link is a simple, yet powerful, way to connect two different types of cells.

The Peaucellier link is a simple, yet powerful, way


In [114]:
# Prepare input data
input_ids = tokenizer.encode(test_X[0] ,
                             add_special_tokens=False, 
                             return_tensors='pt', 
                             padding=True).to(device)

# Generate text using generation parameters
output = llm_model.generate(input_ids , 
                            max_new_tokens=100, 
                            pad_token_id=tokenizer.eos_token_id)

# Decode and print the output
decoded_output = tokenizer.decode(output[0])
print("output:", decoded_output)

output:  Peaucellier, invented a simple linkage that solved the problem of how to connect two wires.

The first step was to make a simple, inexpensive, and easy-to-use link between two wires. The second step was to make a simple, inexpensive, and easy-to-use link between two wires.

The first step was to make a simple, inexpensive, and easy-to-use link between two wires.

The first step was to make a simple, inexpensive, and easy-to-use link between


In [82]:
import torch

optimizer = torch.optim.AdamW(llm_model.parameters(), lr=1e-5)

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

In [62]:
import mlflow


EPOCHS=10

mlflow.end_run()
gradient_accumulation_steps = 10
grad_counter = 0

with mlflow.start_run(run_name='final') as run:

    for epoch in range(EPOCHS):
        for index, batch in enumerate(x_batches[0:5]): 
            outputs = llm_model(batch, labels=y_batches[index])
            loss = outputs.loss
            loss.backward()

            if grad_counter % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

                print("batch loss ", loss.item())

                
            grad_counter += 1
                            

        # log metric every epoch
        mlflow.log_metric('loss', loss.item())
        mlflow.log_metric('lr', get_lr(optimizer))

        
        output = llm_model.generate(tst_data["input_ids"].to(device))
        test_output = ""
        for i in range(output.shape[0]):
            tmp_str = tokenizer.decode(output[i])
            test_output += "\n" + tmp_str
        mlflow.log_text(test_output, f"test_{epoch}.txt")
        #### 

        
        # Save the fine-tuned model every epoch 
        print("MODEL saved loss ", loss)
        llm_model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

torch.Size([10, 98])
torch.Size([10, 97])


ValueError: Expected input batch_size (970) to match target batch_size (960).

In [132]:
import numpy as np
tst_text = np.random.choice(math_sentences, 30)
tst_data = tokenizer.batch_encode_plus(tst_text, add_special_tokens=True, return_tensors='pt', padding=True).to(device)
output = llm_model.generate(tst_data["input_ids"])

for i, text in enumerate(tst_text):
    print("input:", text)
    print("output", tokenizer.decode(output[i]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


input: <|endoftext|>  The subject of transcendental numbers has always fascinated mathe¬ maticians
output <|endoftext|>  The subject of transcendental numbers has always fascinated mathe¬ maticians<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
input: <|endoftext|> ) 2) Given five points, O, O', A, B, C, of an unknown conic K
output <|endoftext|> ) 2) Given five points, O, O', A, B, C, of an unknown conic K<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
input: <|endoftext|>  The number/is represented in the decimal system by the abbreviated s

In [None]:
from transformers import TextDataset

train_dataset = TextDataset(
      tokenizer=tokenizer,
      file_path=math_example_path,
      block_size=128)

In [28]:
from transformers import (
    TrainingArguments, 
    Trainer, 
    default_data_collator, 
)
import mlflow 
mlflow.end_run()

training_args = TrainingArguments(output_dir='test_trainer', 
                                  #evaluation_strategy='epoch',
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  gradient_accumulation_steps=20, #
                                  num_train_epochs = 2,
                                  fp16=False,)

trainer = Trainer(
    model=llm_model,
    args=training_args,
    train_dataset=dataset,
    #eval_dataset=dataset,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
    compute_metrics=None,
    preprocess_logits_for_metrics=None,
)

trainer.train()

RuntimeError: Could not infer dtype of DatasetInfo

In [23]:
dataset

[Dataset({
     features: ['text'],
     num_rows: 31749
 })]