Train LLaMA2 7B as Text Classifier

In [None]:
!pip install bitsandbytes==0.40.2
!pip install transformers==4.31.0
!pip install peft==0.4.0
!pip install accelerate==0.21.0
!pip install datasets
!pip install trl==0.4.7
!pip install sentencepiece


Collecting bitsandbytes==0.40.2
  Downloading bitsandbytes-0.40.2-py3-none-any.whl (92.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.40.2
Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [None]:
# Optimisation settings consumed by the TrainingArguments cell.
learning_rate = 2e-5
weight_decay = 0.1
epoch = 40

# When True, only a small slice of the dataset is loaded.
IS_TEST = False

# Per-device batch sizes: the baseline value, and the A100 value that is
# the one actually in effect.
DEFAULT_BATCH_SIZE = 4
A100_BATCH_SIZE = 16
per_device_train_batch_size = per_device_eval_batch_size = A100_BATCH_SIZE

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from dataclasses import dataclass, field
from typing import Optional
import torch
from peft import LoraConfig
from tqdm import tqdm
import pandas as pd
from transformers import Trainer, AutoModelForSequenceClassification, AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, AutoTokenizer, pipeline
from trl import SFTTrainer

tqdm.pandas()

# In test mode only the first 1% of each split is requested; otherwise the
# full train/validation splits of GLUE MRPC are loaded.
split_slice = '[:1%]' if IS_TEST else ''
train_data = load_dataset('glue', 'mrpc', split=f'train{split_slice}')
validation_data = load_dataset('glue', 'mrpc', split=f'validation{split_slice}')

# The validation split is exposed under the key "eval".
dataset = DatasetDict({"train": train_data, "eval": validation_data})

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
# Import the relevant libraries for logging in
import os
from getpass import getpass

from huggingface_hub import login
from huggingface_hub.hf_api import HfFolder

# Never store the access token in the notebook itself: read it from the
# HF_TOKEN environment variable, or prompt for it (input is not echoed and
# is not saved in the notebook output).
huggingface_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Log in with the token and save it locally
login(token=huggingface_token)
HfFolder.save_token(huggingface_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# https://huggingface.co/docs/peft/task_guides/ptuning-seq-classification
model_name = "meta-llama/Llama-2-7b-hf"
padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side=padding_side)

# If the tokenizer defines no padding token, reuse the end-of-sequence token
# so that batches can be padded.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


def tokenize_function(examples, max_length=512):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: A batch with "sentence1" and "sentence2" columns.
        max_length: Maximum number of tokens per encoded pair; longer pairs
            are truncated. An explicit value is required because this
            tokenizer reports no predefined maximum length, so passing
            ``max_length=None`` together with ``truncation=True`` silently
            disables truncation (the tokenizer warns and defaults to no
            truncation).

    Returns:
        The tokenizer output for the batch (unpadded; padding is left to the
        data collator).
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=max_length,
    )

# Tokenize every split in batches, drop the raw columns that are no longer
# needed, and rename the target column from "label" to "labels".
raw_columns = ["idx", "sentence1", "sentence2"]
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=raw_columns
).rename_column("label", "labels")

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Pad each batch dynamically to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer, padding="longest")

In [None]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
    LoraConfig
)
# LoRA adapter configuration for sequence classification, in training mode.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="all",
)

In [None]:
# Load the base model with 8-bit quantized weights.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True, load_in_4bit=False
)
# Map the whole model (the "" root module) to device 0.
device_map = {"": 0}
torch_dtype = torch.bfloat16


# Sequence-classification model with a 2-class head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    return_dict=True,
    quantization_config=quantization_config,
    device_map=device_map,
    torch_dtype=torch_dtype,
    num_labels=2
  )
# The classification head pools the logits at the last non-padding token,
# which it locates via `config.pad_token_id`. The padding token was only set
# on the tokenizer, so mirror it on the model config; otherwise padded
# batches are pooled at the wrong position (or rejected when no pad id is
# configured at all).
model.config.pad_token_id = tokenizer.pad_token_id

# Wrap the model with the LoRA adapters and report the trainable parameters.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 8,404,992 || all params: 6,615,748,608 || trainable%: 0.1270452143516515


In [None]:
# Directory where checkpoints are written.
output_dir = f"{model_name}-finetuned-mrpc-v0.4"

training_args_p = TrainingArguments(
    output_dir=output_dir,
    # Optimisation
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=epoch,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # best accuracy when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    # Upload to the Hugging Face Hub
    push_to_hub=True,
)

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 from model logits.

    The metrics are computed directly with numpy instead of
    ``datasets.load_metric('glue', 'mrpc')``: ``load_metric`` is deprecated
    and missing from newer ``datasets`` releases, which made this cell fail
    at import time. The returned keys ("accuracy", "f1") are unchanged.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where ``predictions``
            holds one row of class scores per example and ``labels`` holds
            the reference class ids.

    Returns:
        dict with:
            "accuracy": fraction of examples whose arg-max class equals the label.
            "f1": F1 score of the positive class (label 1); 0.0 when there
                are no true positives, false positives or false negatives.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    labels = np.asarray(labels)

    # Guard against an empty evaluation set (np.mean of nothing is NaN).
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    # F1 = 2TP / (2TP + FP + FN); defined as 0.0 when the denominator is 0.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Collect everything the Trainer needs: model, arguments, data and metrics.
trainer_kwargs = dict(
    model=model,
    args=training_args_p,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune, then upload the result to the Hub.
trainer.train()
trainer.push_to_hub()

In [None]:
# Upload the PEFT-wrapped model under an explicit repository name.
hub_repo_name = "Llama-2-7b-hf-finetuned-mrpc-v0.4"
model.push_to_hub(hub_repo_name)

In [None]:
# Reload GLUE MRPC with all of its splits and tokenize it the same way as
# the training data, dropping the raw columns.
dataset_v2 = load_dataset("glue", 'mrpc')
columns_to_drop = ["idx", "sentence1", "sentence2"]
tokenized_dataset_v2 = dataset_v2.map(
    tokenize_function, batched=True, remove_columns=columns_to_drop
)

def log_trainer_performance(trainer, encoded_dataset, log=False):
    """Run ``trainer.predict`` on the train, validation and test splits.

    Args:
        trainer: Object exposing ``predict(dataset)`` whose result has a
            ``metrics`` attribute.
        encoded_dataset: Mapping with "train", "validation" and "test" entries.
        log: When True, print a heading and the metrics for each split.

    Returns:
        None. Predictions are always computed, but only printed when ``log``
        is True.
    """
    split_headings = (
        ("train", "Train metrics"),
        ("validation", "Validation metrics"),
        ("test", "Test metrics"),
    )
    for split_key, heading in split_headings:
        split_predictions = trainer.predict(encoded_dataset[split_key])
        if not log:
            continue
        print(heading)
        print(split_predictions.metrics)

# Report metrics on every split of the freshly tokenized dataset.
log_trainer_performance(trainer, tokenized_dataset_v2, log=True)