In [1]:
# Authenticate with the Hugging Face Hub. In a notebook, login() renders an
# interactive widget, so no token is written into the notebook source.
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Colab Notebook for Fine-tuning LLMs on Different Use Cases

## Install Required Packages
!pip install transformers datasets accelerate peft bitsandbytes trl einops

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
import torch
import os

# Configure the PyTorch CUDA memory allocator through its environment variable
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

# Function to load model and tokenizer
def load_model(model_name, use_4bit=True):
    """Load a causal-LM checkpoint together with its tokenizer.

    Args:
        model_name: Identifier passed unchanged to both ``from_pretrained`` calls.
        use_4bit: When true, request 4-bit quantized weights through a
            ``BitsAndBytesConfig``; otherwise no quantization config is passed.

    Returns:
        A ``(model, tokenizer)`` tuple. If the tokenizer defines no pad token,
        its EOS token is reused as the pad token.
    """
    load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
    if use_4bit:
        # The bare `load_in_4bit=` keyword on from_pretrained is deprecated in
        # favour of an explicit quantization config. The import is local so
        # the unquantized path does not depend on it.
        from transformers import BitsAndBytesConfig

        load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)

    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Ensure tokenizer has a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Select model
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
# use_4bit=False asks load_model for the unquantized checkpoint; load_model
# requests float16 weights with device_map="auto".
model, tokenizer = load_model(MODEL_NAME, use_4bit=False)  # Disable 4-bit quantization

# Load dataset
def load_finetune_data(task_type, sample_size=500):  # Reduce dataset size
    """Load the dataset for ``task_type`` and cap the size of each split.

    Args:
        task_type: ``"classification"`` (ag_news), ``"chat"``
            (tatsu-lab/alpaca) or ``"multi_dataset"`` (both train splits merged).
        sample_size: Maximum number of training rows kept. A test split, when
            present, is capped at ``sample_size // 10`` rows.

    Returns:
        A mapping of split name to dataset with a ``"train"`` entry and, where
        the source provides or the merge creates one, a ``"test"`` entry.

    Raises:
        ValueError: If ``task_type`` is not one of the supported values, or if
            the two datasets of ``"multi_dataset"`` share no column.
    """
    if task_type == "classification":
        dataset = load_dataset("ag_news")
    elif task_type == "chat":
        dataset = load_dataset("tatsu-lab/alpaca")
    elif task_type == "multi_dataset":
        # Local import: only this branch needs it.
        from datasets import concatenate_datasets

        news_train = load_dataset("ag_news")["train"]
        alpaca_train = load_dataset("tatsu-lab/alpaca")["train"]

        # Dataset.add_item() appends a single row dict, not a whole dataset, and
        # returns a plain Dataset without "train"/"test" splits. Merge with
        # concatenate_datasets() instead. Concatenation needs identical
        # features, so keep only the columns both sources have.
        # NOTE(review): both dataset cards list a "text" column — confirm.
        shared_columns = [
            column for column in news_train.column_names
            if column in alpaca_train.column_names
        ]
        if not shared_columns:
            raise ValueError("Datasets share no columns and cannot be merged")

        merged = concatenate_datasets([
            news_train.select_columns(shared_columns),
            alpaca_train.select_columns(shared_columns),
        ])
        # train_test_split shuffles before splitting, so the truncated splits
        # below contain rows from both sources; the seed keeps reruns identical.
        dataset = merged.train_test_split(test_size=0.1, seed=42)
    else:
        raise ValueError("Invalid task type")

    # Reduce dataset size for training
    dataset["train"] = dataset["train"].select(range(min(sample_size, len(dataset["train"]))))
    if "test" in dataset:
        dataset["test"] = dataset["test"].select(range(min(sample_size // 10, len(dataset["test"]))))

    return dataset

# Select dataset
# sample_size caps the training split at 500 rows (and any test split at 50).
task_type = "chat"  # Change this to "classification" or "multi_dataset"
dataset = load_finetune_data(task_type, sample_size=500)

# LoRA adapter settings, applied to the q_proj and v_proj modules
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

# Rebind `model` to the PEFT-wrapped version of the loaded model
model = get_peft_model(model, peft_config)

# Training Arguments
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Schedule: one epoch for quick testing, checkpoint per epoch, log every 10 steps
    num_train_epochs=1,
    save_strategy="epoch",
    logging_steps=10,
    report_to="none",
    # Memory: batch size 1 per device to prevent OOM, accumulating gradients
    # over 4 steps to simulate a larger batch; bf16 mixed precision
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    bf16=True,
)

# Build the supervised fine-tuning trainer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in the installed TRL release and emits a warning at this call.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # Not every task type provides a test split.
    eval_dataset=dataset["test"] if "test" in dataset else None,
    processing_class=tokenizer,
)

# Run fine-tuning with the trainer configured above
trainer.train()

# Save Model
# The model and the tokenizer are written to the same directory.
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably
# stores the adapter weights rather than a full checkpoint — confirm.
model.save_pretrained("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

(…)-00000-of-00001-a09b74b3ef9c3b56.parquet:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/500 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,1.6863
20,1.2909
30,1.0751
40,1.0143
50,1.0679
60,0.9763
70,1.0321
80,0.9943
90,0.9387
100,1.0089


('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/tokenizer.model',
 './finetuned_model/added_tokens.json',
 './finetuned_model/tokenizer.json')