## Dolly 3B fine tuning through PEFT/LoRA

In [None]:
#!pip install transformers
#!pip install datasets
#!pip install gradio
#!pip install py7zr
#!pip install accelerate
#!pip install bitsandbytes
#!pip install peft
#!pip install rouge_score
#!pip install evaluate

#!pip install --upgrade accelerate

In [None]:
import transformers
print(transformers.__version__)

In [None]:
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq

import torch
from torch.utils.data import Dataset, random_split
from transformers import TrainingArguments, Trainer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
import pandas as pd
import numpy as np
import time
import re, os
import gc

### Prepare training data

In [None]:
from datasets import Dataset, load_dataset

In [None]:
TRAINING_FILE = 'data/mlu_ops_concat_all_df_list_instruction_output_1441.json'
dataset = load_dataset("json", data_files=TRAINING_FILE)["train"]

In [None]:
dataset = dataset.rename_column("output", "response")
dataset

Define special tokens for instruction tuning

In [None]:
# Fixed preamble placed at the top of every prompt.
INTRO_BLURB = (
    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
)

# Markers that delimit the sections of a prompt.
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"
RESPONSE_KEY_NL = RESPONSE_KEY + "\n"
DEFAULT_SEED = 42

# Training prompt for records without an input string, i.e. the instruction alone carries enough information to
# respond (for example, asking for the year a historic figure was born).  "{instruction}" and "{response}" are left
# as placeholders to be filled in per record with str.format.
PROMPT_NO_INPUT_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "{response}",
        END_KEY,
    ]
)

# Training prompt for records that carry an input string serving as context for the instruction (for example, a
# passage from Wikipedia from which the instruction asks to extract some information).  "{instruction}", "{input}"
# and "{response}" are left as placeholders to be filled in per record with str.format.
PROMPT_WITH_INPUT_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        INPUT_KEY,
        "{input}",
        RESPONSE_KEY,
        "{response}",
        END_KEY,
    ]
)

Define DataCollator

In [None]:
from typing import Any, Dict, List, Tuple, Union

In [None]:
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """Language-modeling collator that masks the prompt so the loss only covers the completion.

    After the parent collator has built the batch, every label position up to and including the first occurrence
    of the response-key token is set to -100.
    """

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        """Collate ``examples`` and mask the labels of everything up through the response key.

        Args:
            examples: the tokenized examples to collate; passed unchanged to the parent collator.

        Returns:
            The batch produced by the parent collator, with ``batch["labels"]`` replaced by the masked copy.

        Raises:
            RuntimeError: if the response-key token cannot be found in the labels of an example.
        """
        batch = super().torch_call(examples)

        # The prompt ends with the response key plus a newline.  We encode it and look for its first token ID in
        # each label row.  This should just be a single token.
        response_token_ids = self.tokenizer.encode(RESPONSE_KEY_NL)
        response_start_token = response_token_ids[0]

        source_labels = batch["labels"]
        masked_labels = source_labels.clone()

        for row, _ in enumerate(examples):
            row_labels = source_labels[row]
            matches = np.where(row_labels == response_start_token)[0]

            if len(matches) == 0:
                raise RuntimeError(
                    f'Could not find response key {response_token_ids} in token IDs {row_labels}'
                )

            # One past the response key: everything before this index belongs to the prompt.
            prompt_end = matches[0] + 1

            # -100 makes the pytorch loss function ignore these positions.
            masked_labels[row, :prompt_end] = -100

        batch["labels"] = masked_labels

        return batch

Add the formatted prompt text to each dataset record

In [None]:
def _add_text(rec):
    """Build the full training prompt for one record and store it under ``rec["text"]``.

    Args:
        rec: a dataset record (mapping) with non-empty "instruction" and "response" entries and an optional
            context string stored under "context" or "input".

    Returns:
        The same record with the "text" entry added.

    Raises:
        ValueError: if the instruction or the response is missing or empty.
    """
    instruction = rec["instruction"]
    response = rec["response"]
    # The optional context may live under "context" or under "input"; the dataset loaded in this notebook exposes
    # an "input" column, so looking only at "context" would silently drop it.
    context = rec.get("context") or rec.get("input")

    if not instruction:
        raise ValueError(f"Expected an instruction in: {rec}")

    if not response:
        raise ValueError(f"Expected a response in: {rec}")

    # For some instructions there is an input that goes along with the instruction, providing context for the
    # instruction.  For example, the input might be a passage from Wikipedia and the instruction says to extract
    # some piece of information from it.  The response is that information to extract.  In other cases there is
    # no input.  For example, the instruction might be open QA such as asking what year some historic figure was
    # born.
    if context:
        rec["text"] = PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, response=response, input=context)
    else:
        rec["text"] = PROMPT_NO_INPUT_FORMAT.format(instruction=instruction, response=response)

    return rec

In [None]:
dataset = dataset.map(_add_text)

### Load model in 8bit

In [None]:
max_length = 512

In [None]:
BASE_MODEL = 'databricks/dolly-v2-3b'

In [None]:
device_map="auto"

In [None]:
# Load the tokenizer published with the base model.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL, 
    #cache_dir="/LLM_test/hf_cache"
)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Register the prompt markers as additional special tokens of the tokenizer.
tokenizer.add_special_tokens({"additional_special_tokens": [END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL]})

In [None]:
len(tokenizer)

In [None]:
# Load the base causal LM with 8-bit weights (load_in_8bit=True), with the generation cache disabled and
# device placement taken from device_map.
model = AutoModelForCausalLM.from_pretrained(       # other models
    BASE_MODEL, 
    use_cache=False,
    device_map=device_map,
    load_in_8bit=True,
    #cache_dir="/LLM_test/hf_cache"
)
# Make the embedding table match the tokenizer's current vocabulary size, which includes any special tokens
# that were added to it.
model.resize_token_embeddings(len(tokenizer))

In [None]:
!nvidia-smi

### Create instruct Pipeline for inference

In [None]:
# Create Instruct Pipeline
import logging
import re

import numpy as np
from transformers import Pipeline, PreTrainedTokenizer

logger = logging.getLogger(__name__)

# Prompt used for generating responses with an already trained model.  It stops right after the response key and
# a newline, so the model's job is to provide the completion that follows it (i.e. the response itself).
# "{instruction}" is left as a placeholder to be filled in per request with str.format.
PROMPT_FOR_GENERATION_FORMAT = "\n".join(
    [
        INTRO_BLURB,
        INSTRUCTION_KEY,
        "{instruction}",
        RESPONSE_KEY,
        "",  # yields the trailing newline after the response key
    ]
)

def get_special_token_id(tokenizer: "PreTrainedTokenizer", key: str) -> int:
    """Gets the token ID for a given string that has been added to the tokenizer as a special token.
    When training, we configure the tokenizer so that the sequences like "### Instruction:" and "### End" are
    treated specially and converted to a single, new token.  This retrieves the token ID each of these keys map to.
    Args:
        tokenizer (PreTrainedTokenizer): the tokenizer
        key (str): the key to convert to a single token
    Raises:
        ValueError: if the key does not encode to exactly one token ID
    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    # Reject an empty encoding as well as a multi-token one, so that callers catching ValueError never see an
    # IndexError from the lookup below.
    if len(token_ids) != 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]


class InstructionTextGenerationPipeline(Pipeline):
    """Pipeline that wraps an instruction in the generation prompt and returns only the generated response.

    The instruction is formatted with ``PROMPT_FOR_GENERATION_FORMAT``, the model generates a continuation, and
    the text between the response key and the end key is extracted from the generated sequence.
    """

    def __init__(
        self, *args, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs
    ):
        """Pass the sampling defaults and every other argument on to the ``Pipeline`` constructor."""
        super().__init__(*args, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs)

    def _sanitize_parameters(self, return_instruction_text=False, **generate_kwargs):
        """Split call-time arguments into preprocess, forward and postprocess parameter dicts.

        Args:
            return_instruction_text: whether ``postprocess`` should return the instruction along with the response.
            **generate_kwargs: forwarded to ``model.generate``; ``eos_token_id`` is set to the end-key token ID
                when that ID can be resolved.

        Returns:
            A ``(preprocess_params, forward_params, postprocess_params)`` tuple.
        """
        preprocess_params = {}

        # newer versions of the tokenizer configure the response key as a special token.  newer versions still may
        # append a newline to yield a single token.  find whatever token is configured for the response key.
        tokenizer_response_key = next(
            (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None
        )

        response_key_token_id = None
        end_key_token_id = None
        if tokenizer_response_key:
            try:
                response_key_token_id = get_special_token_id(self.tokenizer, tokenizer_response_key)
                end_key_token_id = get_special_token_id(self.tokenizer, END_KEY)

                # Ensure generation stops once it generates "### End"
                generate_kwargs["eos_token_id"] = end_key_token_id
            except ValueError:
                # Deliberate best effort: if a key is not a single token, postprocess falls back to decoding the
                # whole sequence and locating the response with a regex.
                pass

        forward_params = generate_kwargs
        postprocess_params = {
            "response_key_token_id": response_key_token_id,
            "end_key_token_id": end_key_token_id,
            "return_instruction_text": return_instruction_text,
        }

        return preprocess_params, forward_params, postprocess_params

    def preprocess(self, instruction_text, **generate_kwargs):
        """Format the instruction into the generation prompt and tokenize it.

        Returns:
            The tokenizer output with the extra entries "prompt_text" and "instruction_text".
        """
        prompt_text = PROMPT_FOR_GENERATION_FORMAT.format(instruction=instruction_text)
        inputs = self.tokenizer(
            prompt_text,
            return_tensors="pt",
        )
        inputs["prompt_text"] = prompt_text
        inputs["instruction_text"] = instruction_text
        return inputs

    def _forward(self, model_inputs, **generate_kwargs):
        """Run generation and return the first generated sequence (on CPU) with the inputs it came from."""
        input_ids = model_inputs["input_ids"]
        attention_mask = model_inputs.get("attention_mask", None)
        # Move the attention mask to the same device as the input IDs; leaving it behind makes generate() see
        # tensors on two different devices.
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.model.device)
        generated_sequence = self.model.generate(
            input_ids=input_ids.to(self.model.device),
            attention_mask=attention_mask,
            pad_token_id=self.tokenizer.pad_token_id,
            **generate_kwargs,
        )[0].cpu()
        instruction_text = model_inputs.pop("instruction_text")
        return {"generated_sequence": generated_sequence, "input_ids": input_ids, "instruction_text": instruction_text}

    def postprocess(self, model_outputs, response_key_token_id, end_key_token_id, return_instruction_text):
        """Extract the response text from the generated token sequence.

        Args:
            model_outputs: the dict returned by ``_forward``.
            response_key_token_id: token ID of the response key, or None if it could not be resolved.
            end_key_token_id: token ID of the end key, or None if it could not be resolved.
            return_instruction_text: if true, return a dict with "instruction_text" and "generated_text".

        Returns:
            The decoded response (None if it could not be located), or a dict containing it together with the
            instruction when ``return_instruction_text`` is true.
        """
        sequence = model_outputs["generated_sequence"]
        instruction_text = model_outputs["instruction_text"]

        # The response will be set to this variable if we can identify it.
        decoded = None

        # If we have token IDs for the response and end, then we can find the tokens and only decode between them.
        # Compare against None explicitly so that a token ID of 0 is not mistaken for "missing".
        if response_key_token_id is not None and end_key_token_id is not None:
            # Find where "### Response:" is first found in the generated tokens.  Considering this is part of the
            # prompt, we should definitely find it.  We will return the tokens found after this token.
            response_pos = None
            response_positions = np.where(sequence == response_key_token_id)[0]
            if len(response_positions) == 0:
                logger.warning(f"Could not find response key {response_key_token_id} in: {sequence}")
            else:
                response_pos = response_positions[0]

            # Position 0 is a valid match, so test for None rather than truthiness.
            if response_pos is not None:
                # Next find where "### End" is located.  The model has been trained to end its responses with this
                # sequence (or actually, the token ID it maps to, since it is a special token).  We may not find
                # this token, as the response could be truncated.  If we don't find it then just return everything
                # to the end.  Note that even though we set eos_token_id, we still see the this token at the end.
                end_pos = None
                end_positions = np.where(sequence == end_key_token_id)[0]
                if len(end_positions) > 0:
                    end_pos = end_positions[0]

                decoded = self.tokenizer.decode(sequence[response_pos + 1 : end_pos]).strip()
        else:
            # Otherwise we'll decode everything and use a regex to find the response and end.

            fully_decoded = self.tokenizer.decode(sequence)

            # The response appears after "### Response:".  The model has been trained to append "### End" at the
            # end.
            m = re.search(r"#+\s*Response:\s*(.+?)#+\s*End", fully_decoded, flags=re.DOTALL)

            if m:
                decoded = m.group(1).strip()
            else:
                # The model might not generate the "### End" sequence before reaching the max tokens.  In this case,
                # return everything after "### Response:".
                m = re.search(r"#+\s*Response:\s*(.+)", fully_decoded, flags=re.DOTALL)
                if m:
                    decoded = m.group(1).strip()
                else:
                    logger.warning(f"Failed to find response in:\n{fully_decoded}")

        if return_instruction_text:
            return {"instruction_text": instruction_text, "generated_text": decoded}

        return decoded
     

Single query test

In [None]:
instruction = "What are the requirements before registering MLU courses?"
dolly_gen = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)
response = dolly_gen(instruction)
response

### Fine Tuning through LoRA

In [None]:
def preprocess_batch(batch: Dict[str, List], tokenizer: AutoTokenizer, max_length: int) -> dict:
    """Tokenize the "text" entries of a batch, truncating each sequence to ``max_length`` tokens.

    Args:
        batch: a batch of records; only its "text" entry is read.
        tokenizer: the tokenizer to apply to the texts.
        max_length: the maximum number of tokens kept per sequence.

    Returns:
        Whatever the tokenizer returns for the batch of texts.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=max_length, truncation=True)

In [None]:
from functools import partial
_preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

In [None]:
# Tokenize the dataset in batches and drop the raw text columns afterwards, leaving the tokenizer's outputs
# (plus any column not listed in remove_columns).
dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "input","response", "text"],
)

In [None]:
# Keep only records strictly shorter than max_length tokens.  Tokenization truncates at max_length, so any
# record that was cut off has exactly max_length tokens and is dropped here.
processed_dataset = dataset.filter(lambda rec: len(rec["input_ids"]) < max_length)
#processed_dataset = dataset

In [None]:
split_dataset = processed_dataset.train_test_split(test_size=41, seed=0)
split_dataset

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

Prepare int-8 model for training

In [None]:
model = prepare_model_for_int8_training(model)

Define LoRA Config

In [None]:
# Settings for the LoRA adapter.  The trailing comments on each line list alternative values
# (presumably ones tried in earlier runs — confirm before relying on them).
lora_config = LoraConfig(
 r=512,             #  256,     #64,     #16
 lora_alpha=1024,       # 512,   #32,      
 lora_dropout=0.05,      #0.05,
 bias="none",
 task_type="CAUSAL_LM"
)

In [None]:
# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
data_collator = DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
)

In [None]:
OUTPUT_DIR = 'dolly3b-lora-ft-r10'

In [None]:
# Training configuration: batch size 1 for both training and evaluation, a single epoch, and logging plus
# evaluation every 100 steps.  Checkpoints are written every 20000 steps, keeping at most 10.
# The commented-out entries are options that are currently disabled.
training_args = TrainingArguments(    
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        #predict_with_generate=True,
        #weight_decay=0.01,
        #fp16=False,                    
        #bf16=True,
        learning_rate=1e-4,
        num_train_epochs=1,      # 10,
        #deepspeed=None,
        #gradient_checkpointing=False,
        #logging_dir=f"{local_output_dir}/runs",
        logging_strategy="steps",
        logging_steps=100,    # 50
        evaluation_strategy="steps",
        eval_steps=100,       # 50
        save_strategy="steps",
        save_steps=20000,
        save_total_limit=10,
        #load_best_model_at_end=True,
        #report_to="tensorboard",
        #disable_tqdm=True,
        #remove_unused_columns=False,
        #local_rank=True,
        #warmup_steps=None,
)

In [None]:
trainer = Trainer(    
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=split_dataset["train"],
        eval_dataset=split_dataset["test"],
        data_collator=data_collator,
)

In [None]:
!nvidia-smi

In [None]:
# For the dolly-v2-3b base model only: cast the model to half precision and then back to float32 before
# training starts.
# NOTE(review): the reason for this round trip is not stated in the notebook — confirm it is still required.
if BASE_MODEL == "databricks/dolly-v2-3b":
    model = model.half()
    model = model.float()

Start training

In [None]:
st = time.time()

trainer.train()

et = time.time()
elapsed_time = et - st
print('Training time:', elapsed_time, 'seconds')

Run inference on the trained model with a single query

In [None]:
instruction = "What is the minimum I need to do to pass?"
dolly_gen = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)
response = dolly_gen(instruction)
response

In [None]:
# Write the trained model held by the trainer to OUTPUT_DIR.
trainer.model.save_pretrained(OUTPUT_DIR)

# Save again through the Trainer; called without a path, so the destination is chosen by the Trainer
# (presumably the output_dir from its training arguments, i.e. OUTPUT_DIR — confirm).
trainer.save_model()
# Store the tokenizer, including the added special tokens, next to the model.
tokenizer.save_pretrained(OUTPUT_DIR)