In [None]:
# Install required packages
!pip install transformers datasets evaluate accelerate peft trl
!pip install nvidia-ml-py3



In [None]:
import torch
from transformers import RobertaModel, RobertaTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, BitsAndBytesConfig, GPT2Tokenizer, GPT2ForSequenceClassification
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer

In [None]:
import wandb

# LoRA (Low-Rank Adaptation) hyperparameters and the directory handed to the trainer
lora_r, lora_alpha = 8, 16
output_dir = './lora_results_lora'

# Authenticate with Weights & Biases, then open the run used for experiment tracking
run_name = "LoRA_lora"
wandb.login()
wandb.init(project="Lora", name=run_name)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myanjie98[0m ([33myanjie98-new-york-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Checkpoint name shared by the tokenizer and the classification models
base_model = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(base_model)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Load the AG News dataset and define the tokenization step
dataset = load_dataset('ag_news')

def preprocess(examples):
    """
    Tokenize a batch of examples, truncating each text to at most 128 tokens.

    No padding is applied here. When this function is used with a batched
    `map`, padding at this stage would pad every example to the longest
    sequence of the whole map chunk and store those pad tokens in the dataset.
    Padding is instead left to the DataCollatorWithPadding used by the trainer
    and the evaluation loop, which pads each mini-batch only to its own
    longest sequence.

    Args:
        examples: batch mapping whose 'text' entry holds the input strings.

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=128,
    )

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
# Tokenize every split; the raw "text" column is dropped once it has been encoded
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])
train_dataset = tokenized_dataset['train']
# Split the official test split into two shards: one evaluated during training,
# the other kept for the final evaluation
eval_dataset = tokenized_dataset['test'].shard(num_shards=2, index=0)
test_dataset = tokenized_dataset['test'].shard(num_shards=2, index=1)

# Extract the number of classes and their names (read once from the label feature)
label_feature = dataset['train'].features['label']
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map class index -> class name for the model config
id2label = dict(enumerate(class_names))
# Pads each mini-batch to its longest sequence and returns PyTorch tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [None]:
# Initialize the base model
model = GPT2ForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label,
    # Pass the reverse mapping too so the config's two label maps agree
    label2id={label: idx for idx, label in id2label.items()},
    pad_token_id=tokenizer.eos_token_id,
)

# Configure LoRA parameters
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=0.05,
    bias='none',
    task_type="SEQ_CLS",
    # Module names are matched by suffix: 'c_attn' is the attention QKV projection,
    # while 'c_proj' matches the output projection of BOTH the attention and the
    # MLP sub-layer of every GPT-2 block, so all three get LoRA adapters.
    target_modules=['c_attn', 'c_proj'],
    # GPT-2 implements these projections as Conv1D, whose weight is stored as
    # (fan_in, fan_out); state it explicitly instead of relying on auto-correction.
    fan_in_fan_out=True,
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

print('PEFT Model')
model.print_trainable_parameters()

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT Model
trainable params: 814,080 || all params: 125,256,960 || trainable%: 0.6499


In [None]:
# Define evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """
    Score one evaluation pass by classification accuracy.

    Args:
        pred: object exposing `predictions` (per-class scores, reduced with
            argmax over the last axis) and `label_ids` (reference labels).

    Returns:
        dict with the single key 'accuracy'.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# GPU utilization monitoring functions
from pynvml import *

def print_gpu_utilization():
    """
    Print the memory currently in use on GPU 0 as reported by NVML.

    This is best-effort: if NVML cannot be imported or initialised, or the
    query fails, a fallback message is printed instead of raising. Only
    `Exception` subclasses are handled, so KeyboardInterrupt and SystemExit
    still propagate.

    Returns:
        None. The result is printed.
    """
    try:
        # Function-local, qualified import keeps this helper independent of
        # the module-level wildcard import.
        import pynvml
        pynvml.nvmlInit()
    except Exception:
        print("GPU information not available - NVIDIA driver not found")
        return

    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        # info.used is in bytes; integer-divide down to MB
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    except Exception:
        print("GPU information not available - NVIDIA driver not found")
    finally:
        # Pair every successful nvmlInit with a shutdown so repeated calls
        # do not keep accumulating NVML initialisations.
        pynvml.nvmlShutdown()

def print_summary(result):
    """
    Print a short training summary: runtime, throughput and GPU memory.

    Args:
        result: object whose `metrics` mapping contains 'train_runtime' and
            'train_samples_per_second'.

    Raises:
        KeyError: if either metric is missing from `result.metrics`.
    """
    metrics = result.metrics
    print(f"Time: {metrics['train_runtime']:.2f}")
    print(f"Samples/second: {metrics['train_samples_per_second']:.2f}")
    try:
        print_gpu_utilization()
    except Exception as exc:
        # The GPU report is optional; say why it is missing instead of
        # silently discarding the error.
        print(f"GPU utilization unavailable: {exc}")

In [None]:
# Configure training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated.
    eval_strategy='steps',
    learning_rate=5e-5,
    num_train_epochs=1,
    use_cpu=False,
    dataloader_num_workers=1,
    per_device_train_batch_size=16,
    optim="adamw_torch",
    # Gradient checkpointing is disabled; the kwargs below only apply if it is enabled
    gradient_checkpointing=False,
    gradient_checkpointing_kwargs={'use_reentrant':True}
)

def get_trainer(model):
    """
    Build a Trainer for `model` from the notebook-level training setup.

    Args:
        model: the model to train.

    Returns:
        A Trainer wired to `training_args`, the train/eval datasets, the
        padding collator and `compute_metrics`.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    return Trainer(**trainer_kwargs)



In [None]:
# Train the model
peft_lora_finetuning_trainer = get_trainer(model)
result = peft_lora_finetuning_trainer.train()

# Close the W&B run now that training has finished
wandb.finish()
# print_summary already reports GPU memory, so a separate
# print_gpu_utilization() call here would only print the same line twice.
print_summary(result)



Step,Training Loss,Validation Loss,Accuracy
500,0.7719,0.377576,0.867105
1000,0.3263,0.339031,0.888158
1500,0.3069,0.320063,0.893684
2000,0.2874,0.319864,0.891579
2500,0.2951,0.303497,0.897632
3000,0.3041,0.289023,0.896316
3500,0.2747,0.277029,0.902632
4000,0.2751,0.296577,0.898421
4500,0.2751,0.275617,0.907632
5000,0.2577,0.283441,0.901053


VBox(children=(Label(value='0.718 MB of 0.718 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅▆▅▆▆▇▆█▇█▇███
eval/loss,█▅▄▄▃▂▁▃▁▂▁▁▁▁▁
eval/runtime,▃▃█▁▁▂▂▁▂▄▂▂▁▂▃
eval/samples_per_second,▆▆▁██▇▇█▇▅▇▇█▆▆
eval/steps_per_second,▆▆▁██▇▇█▇▅▇▇█▆▆
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/grad_norm,▃▃▂▆▁▄█▅▆▅▃▃▃▃▃
train/learning_rate,█▇▇▇▆▅▅▅▄▃▃▂▂▁▁
train/loss,█▂▂▁▂▂▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.90737
eval/loss,0.27118
eval/runtime,8.5132
eval/samples_per_second,446.364
eval/steps_per_second,55.795
total_flos,7914069688320000.0
train/epoch,1.0
train/global_step,7500.0
train/grad_norm,9.43043
train/learning_rate,0.0


GPU memory occupied: 3753 MB.
Time: 725.43
Samples/second: 165.42
GPU memory occupied: 3753 MB.


In [None]:
# Model evaluation functions
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

# Accuracy metric object shared by every evaluate_model call
metric = evaluate.load('accuracy')

def evaluate_model(inference_model, dataset):
    """
    Run `inference_model` over `dataset` and print its accuracy.

    The model is moved to CUDA when available (otherwise CPU) and switched to
    eval mode. Batches of 8 are built with the notebook's padding collator.

    Args:
        inference_model: The model to evaluate
        dataset: Dataset with a "label" column (renamed to "labels" for the model)
    """
    # Plain GPT-2 classifiers get their pad token id aligned with the tokenizer
    if isinstance(inference_model, GPT2ForSequenceClassification):
        inference_model.config.pad_token_id = tokenizer.eos_token_id

    labelled = dataset.rename_column("label", "labels")
    loader = DataLoader(labelled, batch_size=8, collate_fn=data_collator)
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(target_device)
    inference_model.eval()
    with torch.no_grad():
        for batch in tqdm(loader):
            batch.to(target_device)
            logits = inference_model(**batch).logits
            metric.add_batch(
                predictions=logits.argmax(dim=-1),
                references=batch["labels"],
            )

    print(metric.compute())

# Score a freshly loaded GPT-2 checkpoint (no fine-tuning) and the LoRA-tuned
# model on the same held-out shard
base_model_for_eval = GPT2ForSequenceClassification.from_pretrained(
    base_model, id2label=id2label, pad_token_id=tokenizer.eos_token_id
)
for candidate in (base_model_for_eval, model):
    evaluate_model(candidate, test_dataset)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 475/475 [00:07<00:00, 60.14it/s]


{'accuracy': 0.25973684210526315}


100%|██████████| 475/475 [00:08<00:00, 53.86it/s]

{'accuracy': 0.9194736842105263}



