# Fine-tuning the Falcon 7B and 40B LLM models using a custom dataset.
<img src='images/falcon.jpeg' width="400" height="400">

## Library imports (HuggingFace) &#129303;

In [None]:
from datasets import load_dataset
import torch
from transformers import  (AutoModelForCausalLM,
                           AutoTokenizer,
                           TrainingArguments,
                          )
from peft import LoraConfig
from trl import SFTTrainer
import wandb

import warnings

# Comment out the next line if you want to see warning messages
warnings.filterwarnings("ignore", category=UserWarning)

## Helper function definitions

In [None]:
# Define helper functions

def print_example(text):
    """
    Print a dataset example with one conversation turn per paragraph.

    The text is split on the '###' turn marker. Whatever precedes the first
    marker is dropped; every remaining piece is printed with the marker put
    back in front of it and an empty line after it.

    :param text: The example's text to be printed
    :return: None
    """
    marker = '###'
    _, *turns = text.split(marker)
    for piece in turns:
        print(marker, piece, end='\n\n')

## Fine-tune parameter definition section

In [None]:
# Central definition of the tunables read by the loading, training and
# generation cells of this notebook.

# Dataset loader
DATASET_PATH = "timdettmers/openassistant-guanaco"

# Params for AutoModelForCausalLM
DEVICE_MAP = "auto" # Instructs Accelerate to use all GPUs available in the node.
LOAD_IN_8BIT = False # 8-bit precision requires ~ 1.2-1.4GB memory per 1B parameters
MODEL_NAME = "tiiuae/falcon-7b" # Could use "tiiuae/falcon-40b" or "tiiuae/falcon-7b"
TRUST_REMOTE_CODE = True # Required when a model is not yet part of the Transformers library

# LoRA configuration (see https://huggingface.co/docs/peft/conceptual_guides/lora)
# - LoRA allows efficient fine-tuning of LLMs by training low rank (small) matrices
LORA_ALPHA = 16 # LoRA scaling factor.
LORA_DROPOUT = 0.1 # Dropout probability applied to the LoRA layers.
LORA_R = 32 # Rank of update matrices. Lower rank results in fewer trainable parameters.

# Names of the model modules that LoRA adapters are attached to
# (passed to LoraConfig as `target_modules`).
LORA_TARGET_MODULES = ["query_key_value",
                  "dense",
                  "dense_h_to_4h",
                  "dense_4h_to_h"]

# Trainer configuration
# NOTE(review): WANDB_NAME is not referenced by any other visible cell, and
# 'fine-tube' looks like a typo for 'fine-tune' - confirm before relying on it.
WANDB_NAME='Falcon fine-tube job' # Intended as the ID for the job in the wandb dashboard.
BF16 = True #  Whether to use bf16 precision. Requires Ampere or higher NVIDIA architecture.
EVAL_STEPS = 8 # Number of update steps between two evaluations if evaluation_strategy="steps"
EVAL_STRATEGY = 'steps' # Evaluation is done (and logged) every eval_steps.
FP16 = False # Whether to use fp16 16-bit precision. Set to False if using BF16 or when loading models in 8-bit on V100 GPUs
GRADIENT_ACCUMULATION_STEPS = 4 # Accumulates gradients from 'n' batches before stepping the optimizer. Increase when using small memory GPUs
GROUP_BY_LENGTH = True # Group samples of similar length to minimize padding and be more efficient.
LOAD_BEST = True # Load the checkpoint with the lowest loss at the end.
LOGGING_STEPS = 4 # Number of update steps between two logs if logging_strategy="steps".
LOGGING_STRATEGY = 'steps' # Logging is done every logging_steps
LR = 2e-4 # The initial learning rate.
LR_SCHEDULER_TYPE = 'constant' # Other options are 'cosine' or 'linear'
MAX_GRAD_NORM = 0.3 # Maximum gradient norm (for gradient clipping).
MAX_STEPS = 20 # You may start small (64) then increase the number to complete multiple epochs
OPTIMIZER = "paged_adamw_32bit" # Optimizer function
OUTPUT_DIR = "./results" # Path where model checkpoints will be saved
PER_DEV_TRAIN_BATCH_SIZE = 2 # Use a low number if getting out of memory errors.
# Do not comment this line out: TrainingArguments below reads REPORT_ENDPOINT.
# To disable wandb reporting set it to "none" instead.
REPORT_ENDPOINT = "wandb" # Ensure you have run 'wandb login' previously.
SAVE_STEPS = 8 # Number of update steps between two checkpoint saves if save_strategy="steps"
SAVE_STRATEGY = 'steps' # Save is done every save_steps.
SAVE_TOTAL_LIMIT = 2 # Only save the last and best checkpoints
USE_CACHE = False # Can't use cache with gradient check pointing
# NOTE(review): with LR_SCHEDULER_TYPE = 'constant' the warmup ratio may have no
# effect ('constant_with_warmup' is the warmup variant) - confirm.
WARMUP_RATIO = 0.03 # Ratio of total training steps used for a linear warmup from 0 to learning_rate.
WEIGHT_DECAY = 0.001 # AdamW regularization parameter

# SFTTrainer config (see https://huggingface.co/docs/trl/main/en/sft_trainer)
MAX_SEQ_LENGTH = 512 # Maximum length, in tokens, of the sequence built from an example

##  Fine-tuning Datasets
> The Guanaco dataset is a subset of the Open Assistant dataset containing only the highest-rated paths in the conversation tree.<br>
> The dataset can be automatically downloaded from the **[HuggingFace Datasets repository](https://huggingface.co/datasets/timdettmers/openassistant-guanaco)**<br>
> The dataset provides **9846 examples** of `Human: <prompt> Assistant: <completion>` dialogs, in multiple languages and on various topics.<br>
>> **_This way, the LLM learns to follow instructions._**

In [None]:
# Fetch the train and test splits of the Guanaco dataset (train first, then test)
print("Loading the Guanaco dataset...")
train_dataset, eval_dataset = (
    load_dataset(DATASET_PATH, split=split_name)
    for split_name in ("train", "test")
)

# Keep the first five rows of each split as pandas frames and report split sizes
train_df = train_dataset.to_pandas().head(5)
print(f'>> Train dataset size is {train_dataset.num_rows} examples.')

eval_df = eval_dataset.to_pandas().head(5)
print(f'>> Eval dataset size is {eval_dataset.num_rows} examples.\n')

# Show one example per split, cut to its first 500 characters.
# The text is read from the first column of the selected row.
train_sample = train_df.iloc[0, 0]
eval_sample = eval_df.iloc[3, 0]

print("Training example:")
print_example(train_sample[:500])
print("\nEval Example:")
print_example(eval_sample[:500])

## Load the pre-trained model (Falcon 7B or 40B)

In [None]:
# Load the pre-trained model.
# Considerations about numeric precision:
#   - If using 8-bit, requires ~ 1.2-1.4GB of GPU memory per 1B parameters
#     and is slower than 16-bit.
#   - The use of either fp16 or bf16 requires 2x GPU memory vs. 8-bit.
#     Training is faster.
model_load_options = dict(
    load_in_8bit=LOAD_IN_8BIT,
    trust_remote_code=TRUST_REMOTE_CODE,
    device_map=DEVICE_MAP,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)

# Apply the configured KV-cache setting to the model config
model.config.use_cache = USE_CACHE

# Load the tokenizer packaged with the chosen model and reuse its
# end-of-sequence token as the padding token
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=TRUST_REMOTE_CODE,
)
tokenizer.pad_token = tokenizer.eos_token

## Configure LoRA and the Supervised fine-tuning (SFT) trainer
> With **LoRA** we only need to **train around 1% of the parameters in the original model**.<br>
> The **SFT trainer** performs **supervised fine-tuning**, the step that usually precedes **RLHF** (Reinforcement Learning from Human Feedback).<br>
> **LoRA and SFT get applied by the same trainer run!**<br>

In [None]:
# Set the LoRA configuration.
# LoRA makes LLM fine-tuning more efficient by drastically reducing the number
# of trainable parameters. The foundation LLM remains unaltered while the LoRA
# layers get trained on the fine-tune dataset.
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
    bias="none",
    task_type="CAUSAL_LM",
)

# Setup training arguments, grouped by concern
training_arguments = TrainingArguments(
    # Checkpointing and best-model selection
    output_dir=OUTPUT_DIR,
    save_strategy=SAVE_STRATEGY,
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    load_best_model_at_end=LOAD_BEST,
    greater_is_better=False,
    # Batching
    per_device_train_batch_size=PER_DEV_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    group_by_length=GROUP_BY_LENGTH,
    # Optimization
    optim=OPTIMIZER,
    learning_rate=LR,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    max_grad_norm=MAX_GRAD_NORM,
    max_steps=MAX_STEPS,
    # Numeric precision
    fp16=FP16,
    bf16=BF16,
    # Logging and evaluation
    logging_strategy=LOGGING_STRATEGY,
    logging_steps=LOGGING_STEPS,
    report_to=REPORT_ENDPOINT,
    evaluation_strategy=EVAL_STRATEGY,
    eval_steps=EVAL_STEPS,
)

# Supervised fine-tuning (SFT) is a crucial step in reinforcement learning
# from human feedback (RLHF). The Transformer Reinforcement Learning (TRL)
# library provides an API to create SFT models and train them with a few
# lines of code on a fine-tune dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
)

print("\n\nTrainable parameter reduction with LoRA:")
trainer.model.print_trainable_parameters()

## Run the training loop and save best checkpoint.

In [None]:
# Prepare and run the training loop.
# Casting the normalization modules (those with "norm" in their name) to
# float32 may improve numeric stability. Module.to() converts the module in
# place, so its return value does not need to be kept.
for layer_name, layer in trainer.model.named_modules():
    if "norm" not in layer_name:
        continue
    layer.to(torch.float32)

# Fine-tune the model
trainer.train()

# Report the checkpoint the trainer recorded as the best one
best_chkpt_path = trainer.state.best_model_checkpoint
print(f"Best check point path is {best_chkpt_path}")

# Close the wandb run so that reporting stops
wandb.finish()

## Run some prompt completion tasks using the fine-tuned model

### Let's define a LangChain pipeline

In [None]:
# Switch the model to evaluation mode before generating text
model = model.eval()

# Adjust the model's own generation configuration in place.
# The settings are applied in the order listed.
gen_config = model.generation_config
generation_settings = {
    "max_new_tokens": 128,  # Max tokens to generate
    "temperature": 0.1,  # The lower, the more deterministic the completion is
    "do_sample": True,
    "num_return_sequences": 1,
    "repetition_penalty": 1.7,  # Reduce chances of model repeating itself
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
for option, value in generation_settings.items():
    setattr(gen_config, option, value)

from typing import List

from transformers import (
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,    
)

from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.schema import BaseOutputParser

            
# Define a stop generation criteria class
class StopGenCriteria(StoppingCriteria):
    """
    Stop generation once the sequence ends with one of the stop sequences.

    :param tokens: Stop sequences, each one given as a list of token strings
    :param tokenizer: Tokenizer used to convert the token strings to ids
    :param device: Device on which the stop-id tensors are created
    """

    def __init__(self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device):
        super().__init__()
        # NOTE(review): assumes every token string exists in the tokenizer's
        # vocabulary exactly as written - confirm for the chosen model.
        self.stop_tok_ids = [
            torch.tensor(tokenizer.convert_tokens_to_ids(tok), dtype=torch.long, device=device)
            for tok in tokens
        ]

    def __call__(self,
                 input_ids: torch.LongTensor,
                 scores: torch.FloatTensor,
                 **kwargs) -> bool:
        """
        Tell whether generation should stop.

        Only the first sequence of the batch is inspected.

        :param input_ids: Token ids generated so far, indexed as [batch][position]
        :param scores: Unused; accepted to match the stopping-criteria call signature
        :return: True when the first sequence ends with any stop sequence
        """
        sequence = input_ids[0]
        for stop_ids in self.stop_tok_ids:
            stop_len = len(stop_ids)
            # Skip empty stop sequences and sequences that are still shorter
            # than the stop sequence: comparing tensors of different lengths
            # would broadcast or raise instead of meaning "no match".
            if stop_len == 0 or sequence.shape[-1] < stop_len:
                continue
            if torch.eq(sequence[-stop_len:], stop_ids).all():
                return True
        return False
    
# Define the stop tokens to get used in our prompts as generation stop criteria.
# Each inner list is one stop sequence, expressed as consecutive tokens.
stop_tokens = [["Human", ":"], ["Assistant", ":"]]
stop_criteria = StoppingCriteriaList(
    [StopGenCriteria(stop_tokens, tokenizer, model.device)])

# Define the HuggingFace generation pipeline (built once; the stop criteria
# and generation config are forwarded to it).
# NOTE(review): return_full_text=True presumably lets the LangChain wrapper
# strip the prompt itself - confirm against the installed LangChain version.
gen_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    task="text-generation",
    stopping_criteria=stop_criteria,
    generation_config=gen_config,
)

# Wrap the HF pipeline so it can be used as a LangChain LLM
chat_llm = HuggingFacePipeline(pipeline=gen_pipeline)

## LLM acting as a knowledge base

In [None]:
# Zero-shot question: a short system-style preamble followed by one Human turn.
# The prompt ends with an open "Assistant:" turn for the model to complete.
# It is a plain string: there are no placeholders to interpolate.
prompt = """
The following is a friendly conversation between a Human and an Assistant. The Assistant is
concise and explains concepts in an accessible manner.

H:  Explain the concept of magnetism.

A:
""".strip()
completion = chat_llm(prompt)
print(f"Assistant:\n{completion}")

## LLM acting as a reasoning engine

In [None]:
# One-shot prompt: a worked example (question plus Python answer) followed by
# a second question. The prompt ends with an open "Assistant:" turn that the
# model is expected to complete with a function.
prompt = """
Human: At a store, shoes cost shoe_cost per pair and socks cost sock_cost per pair. 
If a customer buys shoe_p pairs of shoes and  sock_p pairs of socks, what is the total cost of the purchase?

Write a Python function that returns the answer.

A:
def store_cost(shoe_cost, shoe_p, sock_cost, sock_p):
  return (shoe_cost * shoe_p) + (sock_cost * sock_p)

H: At the cinema, tickets for adults cost adult_fee and tickets for children cost child_fee
If a family with num_adult adults and num_child children go to the movies, what's the total cost for that family?

Write a Python function that returns the answer.

A:
""".strip()
completion = chat_llm(prompt)
print(f"Assistant:\n{completion}")