# Train GPT-2 for retrieverApp

In this notebook, I will show how to fine-tune GPT-2 to label retrieverApp data. The goal is to correctly label the data type of a GEO entry according to the information we collect from GEO database.

## Preparation

Import necessary libraries, clean GPU mem cache

In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        BitsAndBytesConfig,
        TrainingArguments,
        set_seed,
        pipeline
    )
from trl import SFTTrainer

torch.cuda.empty_cache()

#quantization_config = BitsAndBytesConfig(load_in_4bit=True)

Next, define the device, and model/tokenizer ids.

In [2]:
device = "cuda"

# The model that you want to train from the Hugging Face hub
model_id = "gpt2"
# Fine-tuned model name
trained_model_id = "gpt2-retrieverApp"
output_dir = 'retrieverApp_adapter/' + trained_model_id


Load the tokenizer.

In [3]:
# Load the tonenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
tokenizer.add_special_tokens({"pad_token":"<pad>"})
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

# Terminators
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|endoftext|>") ]


Configure training arguments.

In [4]:
# Training Arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="steps",
    do_eval=True,
    optim="adamw_torch",
    per_device_eval_batch_size=1, # originally set to 8
    per_device_train_batch_size=1, # originally set to 8
    gradient_accumulation_steps=4,
    log_level="info",
    save_strategy="epoch",
    learning_rate=1e-4,
    fp16= not torch.cuda.is_bf16_supported(),
    bf16= torch.cuda.is_bf16_supported(),
    eval_steps=50,
    num_train_epochs=10,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    seed=42,
)

# LORA arguments
peft_config = LoraConfig(
    r=64, # LoRA attention dimension
    lora_alpha=16, # Alpha parameter for LoRA scaling
    lora_dropout=0.1, # Dropout probability for LoRA layers
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_proj"],
)

Load data

In [5]:
set_seed(1234)  # For reproducibility

# load datasets
dataset = load_dataset("training_data")

Load model.

In [6]:
# load the model
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map="cuda")

Training.

In [7]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    peft_config=peft_config,
    tokenizer=tokenizer,
    max_seq_length = 1024,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend


Train...

In [8]:
# To clear out cache for unsuccessful run
torch.cuda.empty_cache()

# train
trainer.train()

# save model in local
trainer.save_model()

***** Running training *****
  Num examples = 110
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 270
  Number of trainable parameters = 4,128,768
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/taoliu/.netrc


Step,Training Loss,Validation Loss
50,No log,3.683316
100,No log,3.590301
150,No log,3.555403
200,No log,3.531419
250,No log,3.520983


Saving model checkpoint to retrieverApp_adapter/gpt2-retrieverApp/checkpoint-27
loading configuration file config.json from cache at /home/taoliu/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": 