Reference: https://github.com/ShawhinT/YouTube-Blog/blob/main/LLMs/fine-tuning/ft-example.ipynb

In [1]:
import os

# for ROCm, 10.3.0 is gfx1030
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "10.3.0"
# for multiple GPUs ,e.g. you have GPU + iGPU/APU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# export HSA_OVERRIDE_GFX_VERSIONPGHOST="10.3.0"
# export CUDA_VISIBLE_DEVICES="0"

In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

amdgpu.ids: No such file or directory
amdgpu.ids: No such file or directory


In [3]:
dataset = load_dataset("shawhin/imdb-truncated")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [4]:
model_checkpoint = "distilbert-base-uncased"
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

In [6]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    # tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text, return_tensors="np", truncation=True, max_length=512
    )

    return tokenized_inputs

In [7]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [8]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")


# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [9]:
# define list of examples
text_list = [
    "It was good.",
    "Not a fan, don't recommed.",
    "Better than the first one.",
    "This is not worth watching even once.",
    "This one is a pass.",
    "This is very very good",
]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Negative
Not a fan, don't recommed. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative
This is very very good - Negative


In [10]:
peft_config = LoraConfig(
    task_type="SEQ_CLS", r=4, lora_alpha=32, lora_dropout=0.01, target_modules=["q_lin"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307




In [11]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.5877382159233093, 'eval_accuracy': {'accuracy': 0.832}, 'eval_runtime': 3.876, 'eval_samples_per_second': 257.996, 'eval_steps_per_second': 64.499, 'epoch': 1.0}
{'loss': 0.4308, 'grad_norm': 1.7906593084335327, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.44316747784614563, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 3.8742, 'eval_samples_per_second': 258.115, 'eval_steps_per_second': 64.529, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.600477397441864, 'eval_accuracy': {'accuracy': 0.872}, 'eval_runtime': 3.8869, 'eval_samples_per_second': 257.276, 'eval_steps_per_second': 64.319, 'epoch': 3.0}
{'loss': 0.1638, 'grad_norm': 0.025641541928052902, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.6921529173851013, 'eval_accuracy': {'accuracy': 0.88}, 'eval_runtime': 3.8888, 'eval_samples_per_second': 257.145, 'eval_steps_per_second': 64.286, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.8969046473503113, 'eval_accuracy': {'accuracy': 0.884}, 'eval_runtime': 3.9268, 'eval_samples_per_second': 254.662, 'eval_steps_per_second': 63.666, 'epoch': 5.0}
{'loss': 0.0465, 'grad_norm': 0.00027051588403992355, 'learning_rate': 0.0004, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.9943106770515442, 'eval_accuracy': {'accuracy': 0.877}, 'eval_runtime': 3.9409, 'eval_samples_per_second': 253.746, 'eval_steps_per_second': 63.437, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.1569924354553223, 'eval_accuracy': {'accuracy': 0.878}, 'eval_runtime': 3.9308, 'eval_samples_per_second': 254.4, 'eval_steps_per_second': 63.6, 'epoch': 7.0}
{'loss': 0.0117, 'grad_norm': 9.350205800728872e-05, 'learning_rate': 0.0002, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.1183345317840576, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 4.0027, 'eval_samples_per_second': 249.834, 'eval_steps_per_second': 62.458, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.100839614868164, 'eval_accuracy': {'accuracy': 0.883}, 'eval_runtime': 3.8941, 'eval_samples_per_second': 256.801, 'eval_steps_per_second': 64.2, 'epoch': 9.0}
{'loss': 0.0102, 'grad_norm': 0.005589199252426624, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 1.0850605964660645, 'eval_accuracy': {'accuracy': 0.883}, 'eval_runtime': 3.8949, 'eval_samples_per_second': 256.747, 'eval_steps_per_second': 64.187, 'epoch': 10.0}
{'train_runtime': 129.9609, 'train_samples_per_second': 76.946, 'train_steps_per_second': 19.237, 'train_loss': 0.13260807743072509, 'epoch': 10.0}


TrainOutput(global_step=2500, training_loss=0.13260807743072509, metrics={'train_runtime': 129.9609, 'train_samples_per_second': 76.946, 'train_steps_per_second': 19.237, 'total_flos': 1112883852759936.0, 'train_loss': 0.13260807743072509, 'epoch': 10.0})

In [14]:
model.to("cuda")  # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(
        "cuda"
    )  # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Positive
This is very very good - Positive


In [20]:
# option 1: notebook login
from huggingface_hub import notebook_login

notebook_login()  # ensure token gives write access

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
from huggingface_hub import HfApi

api = HfApi()
username = api.whoami()["name"]
model_id = (
    username + "/" + model_checkpoint + "-lora-text-classification"
)  # you can name the model whatever you want

model.push_to_hub(model_id)  # save model
trainer.push_to_hub(model_id)  # save trainer

In [24]:
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)

adapter_config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/2.52M [00:00<?, ?B/s]