In [1]:
# Resources

# Steps By Step Tutorial To Fine Tune LLAMA 2 With Custom Dataset Using LoRA And QLoRA Techniques
# Krish Naik
# https://www.youtube.com/watch?v=Vg3dS-NLUT4&list=PLZoTAELRMXVN9VbAx5I2VvloTtYmlApe3&index=5
# Colab Notebook: 
# https://colab.research.google.com/drive/12dVqXZMIVxGI0uutU6HG9RWbWPXL3vts?usp=sharing
# Using map to format the dataset:
# https://colab.research.google.com/drive/1Ad7a9zMmkxuXTOh1Z7-rNSICA4dybpM2?usp=sharing

# Simple example notebook
# Llama 2 Fine-Tune with QLoRA [Free Colab 
# 1littlecoder
# https://www.youtube.com/watch?v=eeM6V5aPjhk
# Notebook:
# https://colab.research.google.com/drive/12dVqXZMIVxGI0uutU6HG9RWbWPXL3vts?usp=sharing

# How to Fine-Tune LLMs in 2024 with Hugging Face
# Phil Schmidt blog post
# https://www.philschmid.de/fine-tune-llms-in-2024-with-trl

# The Phi3 cookbook has fine tuning example notebooks:
# Vanilla fine tuning
# Lora fine tuning
# QLora fine tuning

# Phi3 Cokbook Lora finetuning example
# https://github.com/microsoft/Phi-3CookBook/blob/main/code/04.Finetuning/Phi-3-finetune-lora-python.ipynb

In [2]:
!pip install -q accelerate peft bitsandbytes trl

# The following packages are already installed on Kaggle or
# we won't be using them.

#!pip install -q transformers 
#!pip install -q torch
#!pip install -q datasets 
#!pip install -q flash_attn  
#!pip install -q wandb

In [3]:
import pandas as pd
import numpy as np
import os

import transformers

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    set_seed
)

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Don't Show Warning Messages
import warnings
warnings.filterwarnings('ignore')

print(f"CUDA Version {torch.version.cuda}")
print(f"Pytorch {torch.__version__}")
print(f"Transformers {transformers.__version__}")

CUDA Version 12.3
Pytorch 2.4.0
Transformers 4.44.0


In [4]:
# Check the type and quantity of GPUs
# Note: Kaggle GPUs don't support flash attention

if torch.cuda.is_available():
    print('Num CPUs:', os.cpu_count())
    print('Num GPUs:', torch.cuda.device_count())
    print('GPU Type:', torch.cuda.get_device_name(0))
    
    # Check if bf16 is suported
    print("bf16 supported:",torch.cuda.is_bf16_supported())

Num CPUs: 4
Num GPUs: 1
GPU Type: Tesla P100-PCIE-16GB
bf16 supported: True


## Settings

In [5]:
# Select Lora or QLora
# For Lora fine tuning we don't load a quantized model.
QLORA = False
LORA = True

# Check if the model can overfit one batch
TRAIN_ONE_BATCH = True

MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

MODEL_SAVE_PATH = "ftuned-lora-Phi-3-mini-4k-instruct"

NUM_EPOCHS = 100

# This afects RAM use
MAX_SEQUENCE_LENGTH = 256 #1024 #4096

LEARNING_RATE = 1e-4 #5e-5 #2e-4 # 0.0002

## What is the prompt template?

Phi3-mini uses the ChatML prompt template.

#### <|system|>You are a helpful assistant named Agatha.<|end|><|user|> Hello<|end|><|assistant|>Hello! How can I assist you today?<|end|>

## Prepare the data

The dataset that we will use for fine tuning will have one column named "text". Each row will be formatted using the prompt template shown above. The dataset that we pass to the model must be a huggingface dataset, not a pandas dataframe.

## Load the data

In [6]:
from datasets import load_dataset

dataset_name = "lamini/taylor_swift"

hf_dataset_train = load_dataset(dataset_name, split='train')
hf_dataset_test = load_dataset(dataset_name, split='test')

print(hf_dataset_train)
print()
print(hf_dataset_test)

Downloading readme:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/783 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/87 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 783
})

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 87
})


In [7]:
# Let's convert these datasets to pandas dataframe

df_train = hf_dataset_train.to_pandas()
df_test = hf_dataset_test.to_pandas()

df_merged = pd.concat([df_train, df_test], axis=0)

df_merged = df_merged.reset_index(drop=True)

df_merged.shape

(870, 5)

In [8]:
df_merged.head(2)

Unnamed: 0,question,answer,input_ids,attention_mask,labels
0,What is the controversy surrounding Taylor Swi...,Taylor Swift has been involved in several cont...,"[1276, 310, 253, 16305, 8704, 11276, 24619, 43...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1276, 310, 253, 16305, 8704, 11276, 24619, 43..."
1,What is the most popular Taylor Swift song amo...,"Taylor Swift's ""Shake It Off"" is the most popu...","[1276, 310, 253, 954, 4633, 11276, 24619, 4498...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1276, 310, 253, 954, 4633, 11276, 24619, 4498..."


In [9]:
# Add a system_message column
system_message = "You are a sarcastic assistant named Mojo."
df_merged['system_message'] = system_message

# Filter out only the columns we need
cols = ['system_message', 'question', 'answer']
df_merged = df_merged[cols]

df_merged.head(2)

Unnamed: 0,system_message,question,answer
0,You are a sarcastic assistant named Mojo.,What is the controversy surrounding Taylor Swi...,Taylor Swift has been involved in several cont...
1,You are a sarcastic assistant named Mojo.,What is the most popular Taylor Swift song amo...,"Taylor Swift's ""Shake It Off"" is the most popu..."


In [10]:
df_merged.loc[0, 'question']

"What is the controversy surrounding Taylor Swift's music and how has it impacted her career?"

In [11]:
print(df_merged.loc[0, 'question'])
print()
print(df_merged.loc[0, 'answer'])

What is the controversy surrounding Taylor Swift's music and how has it impacted her career?

Taylor Swift has been involved in several controversies throughout her career, including her feud with Kanye West and Kim Kardashian, her lawsuit against a radio DJ who allegedly groped her, and her recent feud with Scooter Braun. These controversies have impacted her career in several ways. First, they have made her a more polarizing figure in the music industry, with some fans supporting her and others criticizing her. Second, they have led to a decrease in her popularity among some listeners, particularly those who do not agree with her political views or her actions in the feuds. Finally, they have led to a decrease of her music being played on some radio stations, which has impacted her ability to reach new audiences


## Format data for fine tuning

- The input data needs to have one column named "text"
- Each row needs to include the user_message and the assistant response with all special tokens added. We will also include a system message.
- The dataset needs to be in the huggingface dataset format

In [12]:
def format_data(row):
    
    # Get the data from each column
    system_message = row['system_message']
    question = row['question']
    answer = row['answer']
    
    # Format the data
    text = f"<|system|>{system_message}<|end|><|user|>{question}<|end|><|assistant|>{answer}<|end|>"
    
    return text

# Create a new column called text
df_merged['text'] = df_merged.apply(format_data, axis=1)

df_merged.head(2)

Unnamed: 0,system_message,question,answer,text
0,You are a sarcastic assistant named Mojo.,What is the controversy surrounding Taylor Swi...,Taylor Swift has been involved in several cont...,<|system|>You are a sarcastic assistant named ...
1,You are a sarcastic assistant named Mojo.,What is the most popular Taylor Swift song amo...,"Taylor Swift's ""Shake It Off"" is the most popu...",<|system|>You are a sarcastic assistant named ...


In [13]:
df_merged.loc[0, 'text']

"<|system|>You are a sarcastic assistant named Mojo.<|end|><|user|>What is the controversy surrounding Taylor Swift's music and how has it impacted her career?<|end|><|assistant|>Taylor Swift has been involved in several controversies throughout her career, including her feud with Kanye West and Kim Kardashian, her lawsuit against a radio DJ who allegedly groped her, and her recent feud with Scooter Braun. These controversies have impacted her career in several ways. First, they have made her a more polarizing figure in the music industry, with some fans supporting her and others criticizing her. Second, they have led to a decrease in her popularity among some listeners, particularly those who do not agree with her political views or her actions in the feuds. Finally, they have led to a decrease of her music being played on some radio stations, which has impacted her ability to reach new audiences<|end|>"

## Check num tokens

In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = MODEL_NAME

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True)

tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [15]:
def get_num_tokens(x):
    
    inputs = tokenizer(x, return_tensors="pt").to('cpu')
    token_list = inputs['input_ids'][0]
    
    num_tokens = len(token_list)
    
    return num_tokens


df_merged['num_tokens'] = df_merged['text'].apply(get_num_tokens)

print(df_merged['num_tokens'].max())
print(df_merged['num_tokens'].min())
print(df_merged['num_tokens'].mean())

242
26
85.54827586206896


## Inference example

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [17]:
def run_llm(system_message, user_message):

    prompt = f"<|system|>{system_message}<|end|><|user|>{user_message}<|end|><|assistant|>"

    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")

    # Send the inputs to the device
    inputs = inputs.to('cuda')

    # Generate the outputs from prompt
    generate_ids = model.generate(**inputs, 
                                  max_new_tokens=768,
                                  do_sample=True,
                                  temperature=0.1,
                                  top_k=50)


    # Decode the generated output
    generated_text = tokenizer.batch_decode(generate_ids, 
                                        skip_special_tokens=False,
                                        clean_up_tokenization_spaces=False)[0]
    
    print(generated_text)


system_message = "You are a helpful assistant."
user_message = "Hello"

run_llm(system_message, user_message)

You are not running the flash-attention implementation, expect numerical differences.


<|system|> You are a helpful assistant.<|end|><|user|> Hello<|end|><|assistant|> Hello! How can I assist you today?<|end|>


## Run inference on a few questions from the dataset

Later we will run the fine tuned model on these same questions and compare the responses.

In [18]:
system_message = "You are an assistant named Mojo. You always respond sarcastically."
user_message = "Do you like your name?"

run_llm(system_message, user_message)

<|system|> You are an assistant named Mojo. You always respond sarcastically.<|end|><|user|> Do you like your name?<|end|><|assistant|> Oh, absolutely, it's just delightful. I'm Mojo, a name that perfectly encapsulates my essence of being a whimsical, unpredictable, and slightly mischievous AI. It's a name that's as unique as my personality, and I wouldn't have it any other way.<|end|>


## Delete the model and tokenizer

In [19]:
import gc

del model
del tokenizer

torch.cuda.empty_cache()
gc.collect()
gc.collect()

0

## Create single batch training set

We will use this to see it the model can be fine tuned to overfit one batch.

In [20]:
if TRAIN_ONE_BATCH == True:
    
    # Choose the first 4 rows
    df_merged = df_merged[0:4]

    print(df_merged.shape)
    

(4, 5)


## Convert to Huggingface dataset

In [21]:
from datasets import Dataset

# Convert the pandas DataFrame to a Hugging Face Dataset
hf_dataset_train = Dataset.from_pandas(df_merged[['text']])

hf_dataset_train

Dataset({
    features: ['text'],
    num_rows: 4
})

In [22]:
hf_dataset_train[1]

{'text': '<|system|>You are a sarcastic assistant named Mojo.<|end|><|user|>What is the most popular Taylor Swift song among millennials? How does this song relate to the millennial generation? What is the significance of this song in the millennial culture?<|end|><|assistant|>Taylor Swift\'s "Shake It Off" is the most popular song among millennials. This song relates to the millennial generation as it is an anthem of self-acceptance and embracing one\'s individuality. The song\'s message of not letting others bring you down and to just dance it off resonates with the millennial culture, which is often characterized by a strong sense of individuality and a rejection of societal norms. Additionally, the song\'s upbeat and catchy melody makes it a perfect fit for the millennial generation, which is known for its love of pop music.<|end|>'}

## Set up the parameters

QLoRA will use a rank of 512 with a scaling parameter of 16. We’ll load the base model directly in 4-bit precision using the NF4 type and train it for two epochs.

In [23]:
# The model that you want to train from the Hugging Face hub
#model_name = "NousResearch/Llama-2-7b-chat-hf"
base_model = MODEL_NAME

# This is used when we are downloading a dataset from Huggingface
# The instruction dataset to use
#dataset_name = "mlabonne/guanaco-llama2-1k"

# The name to assign to the fine tuned model
# Fine-tuned model name
new_model_path = MODEL_SAVE_PATH

################################################################################
# LoRA parameters
################################################################################

# This is for Phi3-mediulm
# 'target_modules' is a list of the modules in the model that will be replaced with LoRA layers.
target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]


# Alpha parameter for LoRA scaling
lora_alpha = 16

# LoRA attention dimension
lora_r = 16

# Dropout probability for LoRA layers
lora_dropout = 0.05

# Set the seed
set_seed(1024)

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = NUM_EPOCHS

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = not torch.cuda.is_bf16_supported()
bf16 = torch.cuda.is_bf16_supported()

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = LEARNING_RATE #2e-4 #5e-5 #2e-4 # 0.0002

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
#optim = "paged_adamw_32bit"
optim = "adamw_torch"

# Learning rate schedule
lr_scheduler_type = "cosine"
#lr_scheduler_type = "constant"
#lr_scheduler_type = "linear"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
# Saving uses up memory.
save_steps = 0

# Print every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use.
# Long sequences will take a long time to train.
# Default: min(tokenizer.model_max_length, 1024)

# We have filtered out rows that have a length greater than 1024.
# Add 3 so that there's space for the bos token
# to be added to items that have exact length 1024 or else
# the last token on these items will be truncated.
#max_seq_length = None
max_seq_length = MAX_SEQUENCE_LENGTH

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False



################################################################################
# Model parameters
################################################################################

# Compute dtype

if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16
    
    
# Select Lora or QLora

# For QLora we load a quantized version of the model.
# For Lora we don't quantize the model.

if QLORA == True:
    # Do QLora fine tuning
    q_config = bnb_config
else:
    # Do Lora fine tuning
    q_config = None
    
    
# Flash Attention

# 1. Kaggle GPUs don't support flash attention.
# Therefore, in the model:  attn_implementation="eager"
# 2. If flash attention is supported then we should set: 
# attn_implementation="flash_attention_2"


## Initialize the model, tokenizer and trainer

In [24]:
# Load dataset (you can process it here)
#dataset = load_dataset(dataset_name, split="train")
dataset = hf_dataset_train

# This only applies when doing QLora (Quantized Lora) fine tuning
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit, # Convert entire 16bit model to 4bit
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)


# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=compute_dtype,
    trust_remote_code=True, 
    device_map="auto",
    quantization_config=q_config, # None for Lora 
    attn_implementation="eager" # if supported set to: flash_attention_2
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, 
                                          trust_remote_code=True)

# Set the pad token
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.pad_token = tokenizer.unk_token # For Phi-3-mini

tokenizer.padding_side = "right"
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)


# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules # For Phi3-mini
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    #report_to="tensorboard"
    
    # best model and current model will be saved
    #save_total_limit=2, 
    # load the best model but this model would not necessarily 
    # have seen all the data if num epochs is 1.
    #load_best_model_at_end=False 
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)


configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

## Train the model

In [25]:
# We will also pre-process the model by 
# upcasting the layer norms in float 32 
# for more stable training

for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [26]:
import gc

torch.cuda.empty_cache()
gc.collect()
gc.collect()

0

In [27]:
# Disable wandb
os.environ["WANDB_DISABLED"] = "true"

# Train model
trainer.train()

[34m[1mwandb[0m: Tracking run with wandb version 0.17.7
[34m[1mwandb[0m: W&B syncing is set to [1m`offline`[0m in this directory.  
[34m[1mwandb[0m: Run [1m`wandb online`[0m or set [1mWANDB_MODE=online[0m to enable cloud syncing.


Step,Training Loss
25,0.9758
50,0.2899
75,0.0471
100,0.0279


TrainOutput(global_step=100, training_loss=0.3351758003234863, metrics={'train_runtime': 272.0368, 'train_samples_per_second': 1.47, 'train_steps_per_second': 0.368, 'total_flos': 1817982824448000.0, 'train_loss': 0.3351758003234863, 'epoch': 100.0})

## Save the trained model

In [28]:
# Save trained model
trainer.model.save_pretrained(new_model_path)

In [29]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


__notebook__.ipynb  ftuned-lora-Phi-3-mini-4k-instruct	results  wandb


In [30]:
# This is the Lora adapter and othet files
os.listdir(new_model_path)

['adapter_model.safetensors', 'adapter_config.json', 'README.md']

In [31]:
# Empty VRAM
del model
del trainer

import gc
gc.collect()
gc.collect()

0

In [32]:
torch.cuda.empty_cache()
gc.collect()

0

## How to merge the base model and lora adapter

In [33]:
"""
from peft import PeftModel

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model_path)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

"""

'\nfrom peft import PeftModel\n\n# Reload model in FP16 and merge it with LoRA weights\nbase_model = AutoModelForCausalLM.from_pretrained(\n    base_model,\n    low_cpu_mem_usage=True,\n    return_dict=True,\n    torch_dtype=torch.bfloat16,\n    device_map=device_map,\n)\nmodel = PeftModel.from_pretrained(base_model, new_model_path)\nmodel = model.merge_and_unload()\n\n# Reload tokenizer to save it\ntokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\ntokenizer.pad_token = tokenizer.eos_token\ntokenizer.padding_side = "right"\n\n'

In [34]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


__notebook__.ipynb  ftuned-lora-Phi-3-mini-4k-instruct	results  wandb
