In [1]:
!pip install transformers datasets evaluate accelerate trl
!pip install nvidia-ml-py3
!pip install -U peft==0.4.0



In [2]:
import torch
from transformers import RobertaModel, RobertaTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, BitsAndBytesConfig, GPT2Tokenizer, GPT2ForSequenceClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model, PromptTuningConfig, TaskType
from datasets import load_dataset
from trl import SFTTrainer

In [3]:
lora_r = 8
lora_alpha = 16
output_dir = './lora_results_prompt_tuning'

import wandb
wandb.login()

run_name = "LoRA_prompt_tuning"
wandb.init(
    project="Lora",
    name=run_name,
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myanjie98[0m ([33myanjie98-new-york-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
base_model = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset('ag_news')
train_size = 7500

def preprocess(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        max_length=128
    )
    return tokenized

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Prepare train, eval, and test datasets
#tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
#train_dataset=tokenized_dataset['train']
#eval_dataset=tokenized_dataset['test'].shard(num_shards=2, index=0)
#test_dataset=tokenized_dataset['test'].shard(num_shards=2, index=1)



sampled_train_dataset = dataset['train'].select(range(train_size))
tokenized_dataset = sampled_train_dataset.map(preprocess, batched=True, remove_columns=["text"])
train_dataset = tokenized_dataset


# Extract the number of classess and their names
num_labels = dataset['train'].features['label'].num_classes
class_names = dataset["train"].features["label"].names



# Get label information
num_labels = dataset['train'].features['label'].num_classes
class_names = dataset["train"].features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

id2label = {i: label for i, label in enumerate(class_names)}

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [6]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

model = AutoModelForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label,
    pad_token_id=tokenizer.eos_token_id,
    num_labels=num_labels
)

peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=20,
    token_dim=768,
    num_transformer_submodules=1,
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="Classify this text into one of these categories: World, Sports, Business, or Technology:",
    tokenizer_name_or_path=base_model,
)

model = get_peft_model(model, peft_config)
print('PEFT Model')
model.print_trainable_parameters()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT Model
trainable params: 21,504 || all params: 124,461,312 || trainable%: 0.017277658136851393


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pynvml import *

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

def print_gpu_utilization():
    try:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    except:
        print("GPU information not available - NVIDIA driver not found")

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    try:
        print_gpu_utilization()
    except:
        pass

In [8]:
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy='steps',
    learning_rate=1e-3,
    num_train_epochs=20,
    use_cpu=False,
    dataloader_num_workers=1,
    per_device_train_batch_size=16,
    optim="adamw_torch",
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant':True},
    save_steps=500,
    eval_steps=100,
    logging_steps=100,
    load_best_model_at_end=False,
)

def get_trainer(model):
    return Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=train_dataset,
        data_collator=data_collator,
    )

peft_lora_finetuning_trainer = get_trainer(model)
result = peft_lora_finetuning_trainer.train()

wandb.finish()
print_gpu_utilization()
print_summary(result)

GPT2ForSequenceClassification will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`


Step,Training Loss,Validation Loss,Accuracy
100,1.2282,0.508398,0.844267
200,0.4646,0.424101,0.849733
300,0.4687,0.366671,0.870267
400,0.4083,0.334609,0.882267
500,0.3751,0.367759,0.880933
600,0.4,0.325682,0.888267
700,0.3672,0.337458,0.879067
800,0.325,0.319721,0.880667
900,0.3326,0.296475,0.891867
1000,0.3465,0.289876,0.894267


VBox(children=(Label(value='0.129 MB of 0.129 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▁▃▄▄▄▅▄▅▅▆▆▆▇▆▇▇▇▇▆▇▇▇▇▇███████████████
eval/loss,█▇▇▆▅▅▅▅▄▄▃▄▃▃▄▃▂▂▂▂▃▂▂▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▄▃▄▆▄▆▄▇▆▅▄▄▇▂▁▂▂▂▄▃▂▃▂▆▄▆█▄▃▃▄▃▂▃▃▇▂▂▄▃
eval/samples_per_second,▁▆▆▅▄▇▅▅▆▇▆▅▅▃██▇▇▅▇▇▇▅▃▆▇▇▄▇▂▅▅▇▆▇▆▆▅▇▇
eval/steps_per_second,▆▇▆▆▅▅▅▇▅▆▆▁▇▄▄█▇█▇▇▇███▆▅█▇▇▃▇▇▅▆▇▇▅▆█▄
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/grad_norm,▅▄█▅▁▄▂▃▂▃▅▃▂▄█▃▁▂█▅█▂▃▃▃▂▁▄▃▂▃▁▄▄▂▃▁▂▆▂
train/learning_rate,█████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,█▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.9388
eval/loss,0.17625
eval/runtime,17.383
eval/samples_per_second,431.457
eval/steps_per_second,53.961
total_flos,9799158988800000.0
train/epoch,20.0
train/global_step,9380.0
train/grad_norm,12.87983
train/learning_rate,1e-05


GPU memory occupied: 3577 MB.
Time: 2345.13
Samples/second: 63.96
GPU memory occupied: 3577 MB.


In [12]:
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

metric = evaluate.load('accuracy')

def evaluate_model(inference_model, dataset):
    # 确保模型道padding token
    if isinstance(inference_model, GPT2ForSequenceClassification):
        inference_model.config.pad_token_id = tokenizer.eos_token_id

    eval_dataloader = DataLoader(dataset.rename_column("label", "labels"), batch_size=8, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch.to(device)
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        predictions, references = predictions, batch["labels"]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute()
    print(eval_metric)

# 评估基准模型和训练后的模型
base_model_for_eval = GPT2ForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label,
    pad_token_id=tokenizer.eos_token_id
)
test_dataset = dataset['test'].map(
    preprocess,
    batched=True,
    remove_columns=["text"]
)
evaluate_model(base_model_for_eval, test_dataset)
evaluate_model(model, test_dataset)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 950/950 [00:15<00:00, 60.31it/s]


{'accuracy': 0.2814473684210526}


100%|██████████| 950/950 [00:19<00:00, 49.29it/s]

{'accuracy': 0.9019736842105263}



