In [None]:
# Install the fine-tuning stack; accelerate, transformers, peft and trl are also pulled from their GitHub repositories as editable installs
!pip install torch
!pip install accelerate -e git+https://github.com/huggingface/accelerate.git#egg=accelerate  # for distributed training of PyTorch models
!pip install bitsandbytes  # 4-bit quantized layers (bnb.nn.Linear4bit) and the 8-bit optimizer used later in this notebook
!pip install datasets==2.13.1
!pip install transformers -e git+https://github.com/huggingface/transformers.git#egg=transformers
!pip install peft -e git+https://github.com/huggingface/peft.git#egg=peft
!pip install trl -e git+https://github.com/lvwerra/trl.git#egg=trl
!pip install scipy
# !pip install peft

Obtaining accelerate from git+https://github.com/huggingface/accelerate.git#egg=accelerate
  Updating ./src/accelerate clone
  Running command git fetch -q --tags
  Running command git reset --hard -q 649e65b542a5740fb5ce663bbd5af45ed426c06f
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: accelerate
  Building editable for accelerate (pyproject.toml) ... [?25l[?25hdone
  Created wheel for accelerate: filename=accelerate-0.27.0.dev0-0.editable-py3-none-any.whl size=11999 sha256=cca758f1e18fd787280e214774d72e8d31b958ad1bb45487233767efbecc35f4
  Stored in directory: /tmp/pip-ephem-wheel-cache-0mfyedbu/wheels/9c/a3/1e/47368f9b6575655fe9ee1b6350cfa7d4b0befe66a35f8a8365
Successfully built accelerate
Installing collected packages: accelerate
  Att

Set up Python environment

***Fine-tune LLaMA 2 models on datasets***



In [None]:
import argparse
import bitsandbytes as bnb
from datasets import load_dataset, get_dataset_split_names
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, Trainer, TrainingArguments, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
from pprint import pprint

/content/outputs/runs/Jan19_12-37-40_2398a21054e5

In [None]:
import torch
torch.cuda.is_available()

True

Function to download the LLaMA 2 model and its tokenizer. It requires a bitsandbytes configuration.

In [None]:
def load_model(model_name, bnb_config, max_memory_mb=40960):
    """
    Load a quantized causal language model and its tokenizer.

    :param model_name: Model identifier handed to both ``from_pretrained`` calls
    :param bnb_config: Quantization configuration passed as ``quantization_config``
    :param max_memory_mb: Memory budget per visible GPU, in MB. Defaults to 40960,
        the value that used to be hard-coded here
    :return: Tuple ``(model, tokenizer)``; the tokenizer's pad token is set to its EOS token
    """
    n_gpus = torch.cuda.device_count()
    per_gpu_budget = f"{max_memory_mb}MB"

    # With no visible GPU the comprehension yields an empty mapping. Pass None in
    # that case so the library's default budget applies instead of an empty one.
    max_memory = {gpu_index: per_gpu_budget for gpu_index in range(n_gpus)} or None

    # Load the pre-trained language model through the Hugging Face Transformers API
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # dispatch the model across the available resources
        max_memory=max_memory,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Needed for LLaMA tokenizer
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

Download a Dataset

In [None]:
# Load the databricks dataset from Hugging Face
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Explore dataset

In [None]:
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')

Number of prompts: 15011
Column names are: ['instruction', 'context', 'response', 'category']


In [None]:
train_dataset = dataset.select(range(12000))

In [None]:
test_dataset = dataset.select(range(12000, len(dataset)))

In [None]:
pprint(test_dataset[5])

{'category': 'general_qa',
 'context': '',
 'instruction': 'Is AI a risk to humanity?',
 'response': 'Artificial Intelligence (AI) is just a tool. Like all tools made '
             'by humans throughout the history it is neither good nor bad. It '
             'is neither safe nor risky. The humans who control and use it '
             'will determine those. Looking at history, all past human '
             'technological developments lead to improved human quality of '
             'life. AI is likely going to significantly improve human quality '
             'of life.'}


In [None]:
pprint(train_dataset[5])

{'category': 'information_extraction',
 'context': 'Stalemate is a situation in chess where the player whose turn it '
            'is to move is not in check and has no legal move. Stalemate '
            'results in a draw. During the endgame, stalemate is a resource '
            'that can enable the player with the inferior position to draw the '
            'game rather than lose. In more complex positions, stalemate is '
            'much rarer, usually taking the form of a swindle that succeeds '
            'only if the superior side is inattentive.[citation needed] '
            'Stalemate is also a common theme in endgame studies and other '
            'chess problems.\n'
            '\n'
            'The outcome of a stalemate was standardized as a draw in the 19th '
            'century. Before this standardization, its treatment varied '
            'widely, including being deemed a win for the stalemating player, '
            'a half-win for that player, or a loss for t

In [None]:
dataset = train_dataset
dataset_subset = test_dataset

In [None]:
# # Assuming 'dataset' is your original DataFrame
# dataset_trn= {'text': dataset['text'], 'hashtags': dataset['hashtags']}
# dataset = Dataset.from_pandas(pd.DataFrame(dataset_dict))



Pre-processing dataset

Instruction fine-tuning is a common technique used to fine-tune a base LLM for a specific downstream use-case.



In [None]:
def create_prompt_formats(sample):
    """
    Build the instruction prompt for one record and store it under ``sample["text"]``.

    The intro blurb, the instruction, the optional context, the response and the
    end marker are joined with two newline characters. The context section is
    left out when ``sample["context"]`` is empty.

    :param sample: Sample dictionary with 'instruction', 'context' and 'response' keys
    :return: The same ``sample`` object, now carrying the extra 'text' key
    """
    intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    instruction_header = "### Instruction:"
    context_header = "Input:"
    response_header = "### Response:"
    end_marker = "### End"

    sections = [intro, f"{instruction_header}\n{sample['instruction']}"]
    if sample["context"]:
        sections.append(f"{context_header}\n{sample['context']}")
    sections.append(f"{response_header}\n{sample['response']}")
    sections.append(end_marker)

    sample["text"] = "\n\n".join(sections)
    return sample

Use the model tokenizer to process these prompts into tokenized ones.

* The goal is to create input sequences of uniform length that do not exceed the model’s maximum token limit; this suits fine-tuning because it maximizes efficiency and minimizes computational overhead.

In [None]:
def get_max_length(model):
    """
    Look up the maximum sequence length declared on ``model.config``.

    The attributes 'n_positions', 'max_position_embeddings' and 'seq_length' are
    probed in that order and the first truthy value wins. When none is set the
    function falls back to 1024. The chosen value is printed as well.

    :param model: Object exposing a ``config`` attribute
    :return: The maximum length found, or 1024
    """
    candidate_attrs = ("n_positions", "max_position_embeddings", "seq_length")
    configured = (getattr(model.config, attr, None) for attr in candidate_attrs)
    limit = next((value for value in configured if value), None)

    if limit is None:
        limit = 1024
        print(f"Using default max length: {limit}")
    else:
        print(f"Found max lenth: {limit}")
    return limit


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the 'text' entry of a batch with truncation enabled.

    :param batch: Mapping holding the texts under the 'text' key
    :param tokenizer: Callable tokenizer
    :param max_length: Value forwarded as the tokenizer's ``max_length``
    :return: Whatever the tokenizer call returns
    """
    tokenizer_options = {"max_length": max_length, "truncation": True}
    texts = batch["text"]
    return tokenizer(texts, **tokenizer_options)


def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed: int, dataset):
    """Format & tokenize the dataset so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed (int): Seed handed to ``dataset.shuffle``
    :param dataset: Dataset object (not a string) exposing ``map``, ``filter`` and
        ``shuffle``, with 'instruction', 'context', 'response' and 'category' columns
    :return: The tokenized, filtered and shuffled dataset
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize batch by batch and drop the raw text columns afterwards
    tokenize_batch = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        tokenize_batch,
        batched=True,
        remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # Keep only samples strictly shorter than max_length. Tokenization truncates,
    # so samples that reach the limit are dropped here.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    return dataset.shuffle(seed=seed)

**Create a bitsandbytes configuration**

> This allows us to load our LLM in 4 bits. This way, we can divide the memory used by 4 and load the model on smaller devices. We choose the bfloat16 compute data type and nested quantization for memory-saving purposes.



In [None]:
def create_bnb_config():
    """
    Create and return the bitsandbytes quantization configuration object.

    The configuration enables 4-bit loading with double quantization, the "nf4"
    quantization type and a bfloat16 compute dtype.
    """
    return BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

**LoRA configuration**

> To leverage the LoRA method, we need to wrap the model as a PeftModel.


In [None]:
def create_peft_config(modules):
    """
    Build the LoRA configuration used for parameter-efficient fine-tuning.

    :param modules: Names of the modules to apply LoRA to
    :return: The ``LoraConfig`` instance
    """
    lora_settings = dict(
        r=16,  # dimension of the updated matrices
        lora_alpha=64,  # parameter for scaling
        target_modules=modules,
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )
    return LoraConfig(**lora_settings)

> Previous function needs the target modules to update the necessary
matrices. The following function will get them for our model:

In [None]:


def find_all_linear_names(model):
    """
    Collect the leaf names of every ``bnb.nn.Linear4bit`` sub-module of ``model``.

    The leaf name is the last dot-separated component of the qualified module
    name. 'lm_head' is excluded from the result.

    :param model: Object exposing ``named_modules()``
    :return: List of unique leaf module names
    """
    linear_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # needed for 16-bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

> Once everything is set up and the base model is prepared, we can
use the print_trainable_parameters() helper function to see how many trainable parameters are in the model.

In [None]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Print the total and trainable parameter counts of ``model`` and the trainable share.

    :param model: Object exposing ``named_parameters()``
    :param use_4bit: When True, the trainable count is halved before it is reported
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Floor division keeps the count an int. True division yields a float,
        # which the ",d" format spec below rejects with a ValueError.
        trainable_params //= 2

    # Avoid ZeroDivisionError for a model without parameters
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )


**Train**

Now, we can pre-process our dataset and load our model using the set configurations


In [None]:

import os

from huggingface_hub import login

# Never hard-code an access token in a notebook: anything committed here is
# readable by everyone with access to the file. The token that used to be
# embedded in this cell must be treated as leaked and revoked.
# The token is read from the HF_TOKEN environment variable instead; when the
# variable is unset, login() receives None and asks for the token.
login(token=os.environ.get("HF_TOKEN"))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load model from HF with user's token and with bitsandbytes config

model_name = "meta-llama/Llama-2-7b-hf"

bnb_config = create_bnb_config()

model, tokenizer = load_model(model_name, bnb_config)

In [None]:

import random

# One seed for the whole notebook: the Python RNG now uses the same value that
# is passed on as `seed` (it was previously seeded with an unrelated 50).
seed = 42
random.seed(seed)

In [None]:
## Preprocess dataset

max_length = get_max_length(model)

# Always start from the raw training split. Feeding `dataset` back into itself
# made this cell fail on a re-run, because the tokenized result no longer has
# the 'instruction' / 'context' / 'response' columns the prompt builder reads.
dataset = preprocess_dataset(tokenizer, max_length, seed, train_dataset)

**Fine-tuning process using Single GPU**

In [None]:
def train(model, tokenizer, dataset, output_dir, per_device_train_batch_size=1, do_train=True):
    """
    Wrap ``model`` with LoRA adapters, fine-tune it on ``dataset`` and save it.

    :param model: Base model to prepare for k-bit training and wrap with PEFT
    :param tokenizer: Tokenizer handed to the language-modeling data collator
    :param dataset: Tokenized training dataset
    :param output_dir: Directory receiving the final ``save_pretrained`` output
    :param per_device_train_batch_size: Batch size per device. Defaults to 1, the
        value that used to be hard-coded
    :param do_train: When False the training run is skipped and the wrapped model
        is saved as is. Defaults to True, the value of the former local constant
    """
    # Apply preprocessing to the model to prepare it by
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # 2 - Using the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)

    # Get lora module names
    modules = find_all_linear_names(model)

    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters (intermediate Trainer output goes to "outputs")
    training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    )
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    # Verifying the datatypes before training: element count and share per dtype
    dtype_counts = {}
    for _, parameter in model.named_parameters():
        dtype_counts[parameter.dtype] = dtype_counts.get(parameter.dtype, 0) + parameter.numel()
    total = sum(dtype_counts.values())
    for dtype, count in dtype_counts.items():
        print(dtype, count, count / total)

    # Launch training
    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Drop this function's own references and release cached CUDA blocks.
    # NOTE: the caller still holds the model object it passed in, so that memory
    # is only reclaimed once the caller's reference is gone as well.
    del model
    del trainer
    torch.cuda.empty_cache()


output_dir = "results/llama2/final_checkpoint"
train(model, tokenizer, dataset, output_dir)


all params: 3,540,389,888 || trainable params: 39,976,960 || trainable%: 1.1291682911958425
torch.float32 302387200 0.08541070604255438
torch.uint8 3238002688 0.9145892939574456
Training...




Step,Training Loss
1,2.4708
2,2.6709
3,1.8978
4,1.6531
5,1.5107
6,1.8469
7,1.2082
8,1.1281
9,1.2642
10,1.7823


***** train metrics *****
  epoch                    =       0.01
  total_flos               =   653785GF
  train_loss               =     1.4671
  train_runtime            = 0:02:33.60
  train_samples_per_second =      0.521
  train_steps_per_second   =       0.13
{'train_runtime': 153.6092, 'train_samples_per_second': 0.521, 'train_steps_per_second': 0.13, 'total_flos': 701996936749056.0, 'train_loss': 1.4670625746250152, 'epoch': 0.01}
Saving last checkpoint of the model...


* If we prefer to have a number of epochs (entire training dataset
 will be passed through the model) instead of a number of training
 steps (forward and backward passes through the model with one batch
 of data), we can replace the max_steps argument by num_train_epochs.

* The trainer.model.save_pretrained(output_dir) call saves the fine-tuned LoRA adapter weights and their configuration so they can be loaded later for inference. The tokenizer is not saved by this call; it is saved separately in the merge step below.

**Merge weights**

> Once we have our fine-tuned weights, we can build our fine-tuned
model and save it to a new directory, with its associated tokenizer.
By performing these steps, we can have a memory-efficient fine-tuned
model and tokenizer ready for inference!

In [None]:
# Reload the checkpoint saved in output_dir as a PEFT model, in bfloat16
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
# Merge the adapter weights into the base model and return the plain model
model = model.merge_and_unload()

In [None]:
output_merged_dir = "results/llama2/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)


In [None]:
# save tokenizer for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)

('results/llama2/final_merged_checkpoint/tokenizer_config.json',
 'results/llama2/final_merged_checkpoint/special_tokens_map.json',
 'results/llama2/final_merged_checkpoint/tokenizer.json')

In [None]:
# Persist the merged model next to its tokenizer; safe_serialization=True requests the safetensors format
model.save_pretrained(output_merged_dir, safe_serialization=True)


In [None]:
# Take the record at position 70 of the held-out split and render its prompt
sample = dataset_subset[70]
prompt = create_prompt_formats(sample)

In [None]:
print(prompt)

{'instruction': 'Who invented scratching ?', 'context': 'Modern scratching techniques were made possible by the invention of direct-drive turntables, which led to the emergence of turntablism. Early belt-drive turntables were unsuitable for scratching since they had a slow start-up time, and they were prone to wear and tear and breakage, as the belt would break from backspinning or scratching. The first direct-drive turntable was invented by Shuichi Obata, an engineer at Matsushita (now Panasonic), based in Osaka, Japan. It eliminated belts, and instead employed a motor to directly drive a platter on which a vinyl record rests. In 1969, Matsushita released it as the SP-10, the first direct-drive turntable on the market, and the first in their influential Technics series of turntables.\n\nIn the 1970s, hip hop musicians and club DJs began to use this specialized turntable equipment to move the record back and forth, creating percussive sounds and effects–"scratching"–to entertain their 

In [None]:
import time

**Inference using Similar Data to Fine-Tuning**

In [None]:
#input_text = f"Instruction: {prompt['instruction']}\n Context: {prompt['context']}\nResponse: {prompt['response']}\nCategory: {prompt['category']}"

In [None]:
# Hand-written prompt; it has no placeholders, so a plain string literal is enough
input_text = (
    "Instruction: who is Mahi ?\n"
    " Context: Mahi is one of the backend team at tenx. she works for the last n-years as backend enginer. "
    "Now she becomes Backend team leader. "
)

In [None]:
# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)


# Measure inference time with the monotonic clock intended for timing short
# durations; time.time() follows the wall clock, which can be adjusted mid-run
start_time = time.perf_counter()

# Generate predictions
output = model.generate(input_ids, max_length=500, temperature=1.0, top_k=50, top_p=0.95, num_return_sequences=1)
generated_instruction = tokenizer.decode(output[0], skip_special_tokens=True)

end_time = time.perf_counter()

# Elapsed seconds for generation plus decoding
inference_time = end_time - start_time


In [None]:
# Show the prompt, the generated text and the timing, one section after another
report_sections = (
    "======",
    f"Input:\n======\n{input_text}\n",
    "======================",
    f"Generated Instruction:\n======================\n{generated_instruction}\n",
    "=========================================",
    f"Inference Time:{inference_time} seconds\n==========================================",
)
for section in report_sections:
    print(section)

Input:
Instruction: who is Mahi ?
 Context: Mahi is one of the backend team at tenx. she works for the last n-years as backend enginer. Now she becomes Backend team leader. 

Generated Instruction:
Instruction: who is Mahi ?
 Context: Mahi is one of the backend team at tenx. she works for the last n-years as backend enginer. Now she becomes Backend team leader. 
 What is her role ? She is in charge of the team's overall performance. But she has little experience in human resources management. What she needs is a 1-2 page briefing to understand her new position. 
 Inputs : 10 developers in the team. Different background and specialties, some are more mature than others. 
 Outputs : How to give performance feedback (and what are its characteristics) to different kind of developers (good, poor, average) 
 Approach: First, a general overview of performance management practices: how should we use the information from review ? What is the 100% ideal performance ? How often should we give fee

**Inference using Instruction or Question and Context Only**


In [None]:
input_text = f"Instruction: {prompt['instruction']}\n Context: {prompt['context']}"

In [None]:
# Tokenize the input
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

# Measure inference time with the monotonic clock intended for timing short
# durations; time.time() follows the wall clock, which can be adjusted mid-run
start_time = time.perf_counter()

# Generate predictions
output = model.generate(input_ids, max_length=500, temperature=1.0, top_k=50, top_p=0.95, num_return_sequences=1)
generated_instruction = tokenizer.decode(output[0], skip_special_tokens=True)

end_time = time.perf_counter()

# Elapsed seconds for generation plus decoding
inference_time = end_time - start_time


In [None]:
# Show the prompt, the generated text and the timing, one section after another
report_sections = (
    "======",
    f"Input:\n======\n{input_text}\n",
    "======================",
    f"Generated Instruction:\n======================\n{generated_instruction}\n",
    "=========================================",
    f"Inference Time:{inference_time} seconds\n==========================================",
)
for section in report_sections:
    print(section)

Input:
Instruction: Give me a sample of three of Australian Animals.
 Context: 

Generated Instruction:
Instruction: Give me a sample of three of Australian Animals.
 Context:  A.N.ZAC Day
 Examples: Kangaroo




Inference Time:22.04057765007019 seconds


**Inference Using New but similar data**

In [None]:
#input_text = f"Instruction: What is language model fine tuning mean?\n Context: Large language model (LLM) fine-tuning is the process of taking pre-trained models and further training them on smaller, specific datasets to refine their capabilities and improve performance in a particular task or domain. Fine-tuning is about turning general-purpose models and turning them into specialized models. It bridges the gap between generic pre-trained models and the unique requirements of specific applications, ensuring that the language model aligns closely with human expectations. "

In [None]:
input_text2 = f"Instruction: {prompt['instruction']}\n Context: {prompt['context']}"

In [None]:
# Tokenize the input
input_ids = tokenizer.encode(input_text2, return_tensors="pt").to(model.device)

# Measure inference time with the monotonic clock intended for timing short
# durations; time.time() follows the wall clock, which can be adjusted mid-run
start_time = time.perf_counter()

# Generate predictions
output = model.generate(input_ids, max_length=500, temperature=1.0, top_k=50, top_p=0.95, num_return_sequences=1)
generated_instruction = tokenizer.decode(output[0], skip_special_tokens=True)

end_time = time.perf_counter()

# Elapsed seconds for generation plus decoding
inference_time = end_time - start_time




In [None]:
# Show the prompt, the generated text and the timing, one section after another
report_sections = (
    "======",
    f"Input:\n======\n{input_text2}\n",
    "======================",
    f"Generated Instruction:\n======================\n{generated_instruction}\n",
    "=========================================",
    f"Inference Time:{inference_time} seconds\n==========================================",
)
for section in report_sections:
    print(section)

Input:
Instruction: Who invented scratching ?
 Context: Modern scratching techniques were made possible by the invention of direct-drive turntables, which led to the emergence of turntablism. Early belt-drive turntables were unsuitable for scratching since they had a slow start-up time, and they were prone to wear and tear and breakage, as the belt would break from backspinning or scratching. The first direct-drive turntable was invented by Shuichi Obata, an engineer at Matsushita (now Panasonic), based in Osaka, Japan. It eliminated belts, and instead employed a motor to directly drive a platter on which a vinyl record rests. In 1969, Matsushita released it as the SP-10, the first direct-drive turntable on the market, and the first in their influential Technics series of turntables.

In the 1970s, hip hop musicians and club DJs began to use this specialized turntable equipment to move the record back and forth, creating percussive sounds and effects–"scratching"–to entertain their dan

**Fine-tuning using multiple GPUs**

In [None]:
# Load model from HF with user's token and with bitsandbytes config

model_name = "meta-llama/Llama-2-7b-hf"

bnb_config = create_bnb_config()

model, tokenizer = load_model(model_name, bnb_config)

In [None]:
## Preprocess dataset

max_length = get_max_length(model)

# Start again from the raw training split: at this point `dataset` already holds
# the tokenized output of the earlier run, which no longer has the
# 'instruction' / 'context' / 'response' columns the prompt builder reads.
dataset = preprocess_dataset(tokenizer, max_length, seed, train_dataset)



Found max lenth: 4096
Preprocessing dataset...


In [None]:
def train(model, tokenizer, dataset, output_dir, per_device_train_batch_size=2):
    """
    Wrap ``model`` with LoRA adapters, fine-tune it on ``dataset`` and save it.

    This definition replaces the earlier ``train`` in the notebook namespace and
    uses a per-device batch size of 2 by default.

    :param model: Base model to prepare for k-bit training and wrap with PEFT
    :param tokenizer: Tokenizer handed to the language-modeling data collator
    :param dataset: Tokenized training dataset
    :param output_dir: Directory receiving the final ``save_pretrained`` output
    :param per_device_train_batch_size: Batch size per device (default 2)
    """
    # Apply preprocessing to the model to prepare it by
    # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # 2 - Using the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)

    # Get lora module names
    modules = find_all_linear_names(model)

    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Training parameters.
    # `n_gpu` is not a TrainingArguments constructor argument; passing it raised
    # a TypeError before training could start. The GPUs in use are the ones
    # visible to the process (restrict them with CUDA_VISIBLE_DEVICES if needed).
    training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    )
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    # Verifying the datatypes before training: element count and share per dtype
    dtypes = {}
    for _, p in model.named_parameters():
        dtypes[p.dtype] = dtypes.get(p.dtype, 0) + p.numel()
    total = sum(dtypes.values())
    for dtype, count in dtypes.items():
        print(dtype, count, count / total)

    # Launch training
    print("Training...")

    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Drop this function's own references and release cached CUDA blocks.
    # NOTE: the caller still holds the model object it passed in.
    del model
    del trainer
    torch.cuda.empty_cache()


output_dir = "results/llama2/final_checkpoint_2g"
train(model, tokenizer, dataset, output_dir)


In [None]:
# model_2g = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
# model_2g = model_2g.merge_and_unload()

In [None]:
# # save tokenizer for easy inference
# tokenizer_2g = AutoTokenizer.from_pretrained(model_name)