### Fine-Tuning a small LLM (pythia - 410 Million Parameters with the lamini company finetuning dataset from HugginFace)

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
import sys
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import Trainer
from llama import BasicModelRunner

In [3]:
# Setting up logging
logger = logging.getLogger(__name__)
def setup_logging():
    FORMAT = '%(levelname)s:%(name)s: %(message)s (%(asctime)s; %(filename)s:%(lineno)d)'
    DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
    LEVEL = logging.INFO
    STREAM = sys.stdout
    logging.basicConfig( 
        level=LEVEL, 
        format=FORMAT, 
        datefmt=DATE_FORMAT,
        stream=STREAM,
    )
setup_logging()

In [4]:
dataset_path = "lamini/lamini_docs"
model_name = "EleutherAI/pythia-70m"
using_hugginface = True

# Loading the dataset and the train-test datasets
finetuning_dataset = datasets.load_dataset(dataset_path)
training_dataset, test_datastet = finetuning_dataset['train'], finetuning_dataset['test']

# Loading the model
base_model = AutoModelForCausalLM.from_pretrained(model_name)

#### Setting the tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### CPU and GPU device presence and loading the base_model location

In [6]:
device_count = torch.cuda.device_count()
if device_count > 0:
    print('Select GPU device')
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

logger.info(device)
base_model.to(device) # Loading the model onto the GPU/CPU depending on the availability

INFO:__main__: cpu (2024-04-04 20:39:18; 1956756189.py:9)


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

#### Inference function to prompt the LLM hosted locally

In [7]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100) -> str:
    # Encoding
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    # Generate
    device = model.device
    generated_prompt_with_text = model.generate(
        input_ids=input_ids.to(device), # The tokens need to be put on the same device at the LLM (GPU or CPU)
        max_length=max_output_tokens
    )

    # Decoding
    generated_prompt_with_tokens = tokenizer.batch_decode( # batch_decode for decoding multiple token lists all-together
        generated_prompt_with_text,
        skip_special_tokens=True
    )

    # Trimming the initial Question out of the answer
    generated_text_answer = generated_prompt_with_tokens[0][len(text): ]

    return generated_text_answer

# Testing a prompt
# Probably halucinated answer
logger.info(inference("Hey what's the weather in toronto?", base_model, tokenizer))



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


INFO:__main__: 

I'm not sure if I'm going to be in the same place as you, but I'm not sure if I'm going to be in the same place as you, but I'm not sure if I'm going to be in the same place as you, but I'm not sure if I'm going to be in the same place as you, but I'm not sure if I'm going to be in the same place as you, (2024-04-04 20:39:19; 3471472043.py:30)


#### Training Set-up

In [8]:
max_steps = 5

trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

In [9]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

#### Monitor memory footprint

In [17]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, 2048)
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

logger.info(base_model)

INFO:__main__: GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=Tru

In [11]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=test_datastet,
)

#### Initiating the Training

In [12]:
training_output = trainer.train()

  0%|          | 0/5 [00:00<?, ?it/s]

{'loss': 4.1562, 'grad_norm': 76.75056457519531, 'learning_rate': 1e-05, 'epoch': 0.0}
{'loss': 3.0687, 'grad_norm': 56.96006774902344, 'learning_rate': 7.500000000000001e-06, 'epoch': 0.01}
{'loss': 3.8929, 'grad_norm': 54.13835144042969, 'learning_rate': 5e-06, 'epoch': 0.01}
{'loss': 3.4545, 'grad_norm': 49.51322555541992, 'learning_rate': 2.5e-06, 'epoch': 0.01}
{'loss': 3.1985, 'grad_norm': 39.19015884399414, 'learning_rate': 0.0, 'epoch': 0.02}
{'train_runtime': 59.5776, 'train_samples_per_second': 0.336, 'train_steps_per_second': 0.084, 'train_loss': 3.5541662216186523, 'epoch': 0.02}


#### Saving the model locally in the pre-set directory

In [15]:
save_dir = f'{output_dir}/final_finetuned_model'
trainer.save_model(save_dir)

#### Running the fine-tuned locally saved model     

In [16]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_slightly_model.to(device)

# Probably halucinated answer, since it is trained only on 5 data points but with more training you can gauge the accuracy of the response
logger.info(inference("Hey what's the weather in toronto?", finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


INFO:__main__: 

I'm not sure if I'm going to be in the same boat, but I'm not sure if I'm going to be in the same boat.

I'm not sure if I'm going to be in the same boat, but I'm not sure if I'm going to be in the same boat.

I'm not sure if I'm going to be in the same boat, but I'm not sure if I'm (2024-04-04 20:42:31; 509760181.py:5)
