In [117]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [118]:
# Dataset
data_name = "mlabonne/guanaco-llama2-1k"
raw_datasets = load_dataset(data_name, cache_dir="../../datasets", split="train")
raw_datasets = raw_datasets.train_test_split(test_size=0.2, shuffle=True)
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 800
    })
    test: Dataset({
        features: ['text'],
        num_rows: 200
    })
})


In [119]:
# Model and tokenizer names
base_model_name = "gpt2"
refined_model = "gpt2-enhanced"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, cache_dir="../../datasets")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix for fp16

In [120]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [121]:
tokenized_datasets["train"]

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 800
})

In [122]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Quantization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)
# Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [123]:
from peft import LoraConfig, PeftModel
# LoRA Config
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

In [124]:
print(len(tokenized_datasets["train"]))

800


In [125]:
output_dir = f'./peft-qa-training-{str(int(time.time()))}'

# Training Params
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1    
)
    
fine_tuning = Trainer(
    model=base_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [126]:
# Training
fine_tuning.train()
# Save Model
fine_tuning.model.save_pretrained(refined_model)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.