Fine-tuning LLMs requires extensive resources. If you don't have a GPU with enough memory (at least 16 GB), I recommend running this notebook on Kaggle. We will fine-tune the model on the wiki_movies dataset from Hugging Face. However, because of limited resources, I can only perform the fine-tuning process on 1% of the dataset.

In [None]:
# !pip install -qqq bitsandbytes
# !pip install -qqq torch
# !pip install  -qqq -U git+https://github.com/huggingface/transformers
# !pip install -qqq -U git+https://github.com/huggingface/peft
# !pip install -qqq -U git+https://github.com/huggingface/accelerate
# !pip install -qqq datasets
# !pip install -qqq loralib
# !pip install -qqq einops
# !pip install -qqq datasets

In [None]:
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset

In [None]:
# Checkpoint to fine-tune. Alternatives that can be swapped in:
#   "mistralai/Mistral-7B-Instruct-v0.1"
#   "openai-community/gpt2-xl"
MODEL_NAME = "GeneZC/MiniChat-3B"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized causal LM with automatic device placement
# (switch device_map to "cpu" to keep everything on the CPU).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Let PEFT prepare the quantized model for training / fine-tuning.
model = prepare_model_for_kbit_training(model)

# Slow (non-fast) tokenizer; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
import re
# Find the highest layer index used in the parameter names of our LLM
def get_num_layers(model):
    """Return the largest layer index found in ``model.named_parameters()``.

    Parameter names have the form
    ``model.layers.2.post_attention_layernorm.weight``, where ``2`` is the
    index of the layer the parameter belongs to. Only dot-separated
    components that consist entirely of digits are treated as indices, so
    digits embedded in identifiers such as ``fc1`` or ``conv2d`` are ignored.

    Note that the returned value is the highest index, not a count: for
    layers numbered from 0 it is one less than the number of layers.

    Raises:
        ValueError: if no parameter name contains a numeric component.
    """
    indices = {
        int(part)
        for name, _ in model.named_parameters()
        for part in name.split(".")
        if part.isdecimal()
    }
    if not indices:
        raise ValueError("no numeric layer index found in the parameter names")
    return max(indices)

# Count the elements held by all named parameters of our LLM
def get_num_params(model):
    """Return the sum of ``numel()`` over ``model.named_parameters()``."""
    return sum(tensor.numel() for _, tensor in model.named_parameters())

def get_last_layer_linears(model):
    """Return the names of the ``torch.nn.Linear`` modules in the last layer.

    The last layer is the one whose index ``get_num_layers(model)`` returns.
    A module is considered part of it when that index appears as a whole
    dot-separated component of the module's name; a plain substring test
    would also match unrelated names (index ``1`` inside ``fc1`` or ``11``).
    Modules whose name contains ``"encoder"`` are skipped.

    Returns:
        A list of fully qualified module names, in ``named_modules()`` order.
    """
    # Convert once; the index is compared against string path components.
    last_index = str(get_num_layers(model))
    return [
        name
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
        and "encoder" not in name
        and last_index in name.split(".")
    ]

In [None]:
# Print, one per line: the highest layer index found in the parameter names,
# the total element count over all named parameters, and the names of the
# Linear modules in the last layer.
for describe in (get_num_layers, get_num_params, get_last_layer_linears):
    print(describe(model))

In [None]:
# LoRA adapters are attached only to the Linear modules of the last layer.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=get_last_layer_linears(model),
    r=2,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model so that it carries the LoRA adapters described by `config`.
model = get_peft_model(model, config)

In [None]:
generation_config = model.generation_config

generation_settings = {
    # Upper bound on the length of the generated answer.
    "max_new_tokens": 100,
    # Low temperature (0.1) gives more predictable/coherent text;
    # high temperature (0.9) gives more creative/unpredictable text.
    "temperature": 0.1,
    # Nucleus sampling: sample only from the smallest set of most likely
    # tokens whose cumulative probability reaches top_p (0.1 here).
    "top_p": 0.1,
    "do_sample": True,
    "num_return_sequences": 1,
    # The EOS token is used both for padding and for end-of-sequence.
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}
for option, value in generation_settings.items():
    setattr(generation_config, option, value)

In [None]:
# Define the dataset: load "wiki_movies" by name
dataset_name = "wiki_movies"
dataset = load_dataset(path=dataset_name)

In [None]:
# Tokenization callback, applied to the dataset in batches
def tokenize_function(examples):
    """Tokenize the 'question' and 'answer' columns of a batch together.

    Each encoded entry is truncated and padded to exactly 1000 tokens.
    """
    questions = examples['question']
    answers = examples['answer']
    return tokenizer(
        questions,
        answers,
        max_length=1000,
        padding='max_length',
        truncation=True,
    )

# tokenized_datasets = dataset.map(tokenize_function)

In [None]:
from datasets import DatasetDict

# Shrink each split with train_test_split and keep only its 'train' part:
# 1% of the training data, 10% of the validation data and 10% of the test
# data. Each kept part is then tokenized in batches.

# Training split
small_train_dataset = dataset['train'].train_test_split(test_size=0.99)
tokenized_train_datasets = small_train_dataset['train'].map(
    tokenize_function, batched=True
)

# Validation split
small_validation_dataset = dataset['validation'].train_test_split(test_size=0.9)
tokenized_validation_datasets = small_validation_dataset['train'].map(
    tokenize_function, batched=True
)

# Test split
small_test_dataset = dataset['test'].train_test_split(test_size=0.9)
tokenized_test_datasets = small_test_dataset['train'].map(
    tokenize_function, batched=True
)

# Gather the reduced, tokenized splits under their usual names
tokenized_datasets = DatasetDict({
    'train': tokenized_train_datasets,
    'test': tokenized_test_datasets,
    'validation': tokenized_validation_datasets,
})

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Training hyper-parameters: one epoch with batch size 1 and 4 gradient
# accumulation steps, paged 8-bit AdamW, and a cosine learning-rate schedule
# with a 1% warm-up.
training_args = transformers.TrainingArguments(
    output_dir="finetune_reddit",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    optim="paged_adamw_8bit",
    fp16=True,
    report_to="none",
)

# Collator for causal language modeling (mlm=False disables masked-LM).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=causal_lm_collator,
)

# Turn the generation cache off before training starts.
model.config.use_cache = False
trainer.train()

In [None]:
# Write the fine-tuned model to the "trained-model" directory.
model.save_pretrained(save_directory="trained-model")