In [None]:
# Environment setup: install the training stack, then the NVML bindings that
# provide the `pynvml` module imported in the next cell.
!pip install transformers datasets evaluate accelerate trl
!pip install nvidia-ml-py3
# -U makes pip move to exactly this PEFT release even if a different version
# is already present in the environment.
!pip install -U peft==0.4.0



In [None]:
import sys
import subprocess

# Install required packages using pip.
# This has to happen BEFORE the third-party imports below: on a fresh kernel
# where the shell install cell was skipped, importing `evaluate` / `pynvml`
# first would raise ImportError before pip ever ran.
packages = [
    "transformers", "datasets", "evaluate", "accelerate",
    "peft==0.4.0",  # keep in step with the `!pip install -U peft==0.4.0` cell
    "trl", "nvidia-ml-py3",
]
for package in packages:
    # sys.executable targets the interpreter that backs this kernel
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

import numpy as np
import evaluate
# Star import kept on purpose: later cells call the nvml* functions unqualified.
from pynvml import *

In [None]:
# Import necessary ML libraries
import torch
from transformers import RobertaModel, RobertaTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, BitsAndBytesConfig, GPT2Tokenizer, GPT2ForSequenceClassification
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
from torch import nn

In [None]:
import wandb

# LoRA (Low-Rank Adaptation) hyperparameters: adapter rank and scaling factor
lora_r, lora_alpha = 8, 16
# Directory handed to TrainingArguments as its output location
output_dir = './lora_results_prompt_tuning'

# Weights & Biases experiment tracking: authenticate, then open the run
run_name = "LoRA_adapt_tuning"
wandb.login()
wandb.init(project="Lora", name=run_name)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myanjie98[0m ([33myanjie98-new-york-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Checkpoint name shared by the tokenizer and both classification models
base_model = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(base_model)
# GPT2 ships without a pad token, so the end-of-sequence token doubles as padding
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load the AG News dataset
dataset = load_dataset('ag_news')
# NOTE(review): train_size is not referenced by any visible cell — the full
# train split is what gets passed to the Trainer. Either subsample with it or
# drop it; as written no training subset is actually prepared.
train_size = 1000

def preprocess(examples):
    """
    Tokenize the 'text' field of a batch of dataset examples.

    Uses the notebook-level `tokenizer` with truncation to at most 128 tokens
    and padding enabled.

    Args:
        examples: Mapping with a 'text' entry holding the raw texts
    Returns:
        The tokenizer's output for those texts
    """
    return tokenizer(examples['text'], max_length=128, padding=True, truncation=True)

In [None]:
# Tokenize every split; the raw "text" column is dropped once it is encoded
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["text"])

# Train on the train split; the test split is cut into two shards so that
# evaluation during training and the final test use different examples
train_dataset = tokenized_dataset['train']
eval_dataset = tokenized_dataset['test'].shard(num_shards=2, index=0)
test_dataset = tokenized_dataset['test'].shard(num_shards=2, index=1)

# Extract the number of classes and their names (read once from the label feature)
label_feature = dataset['train'].features['label']
num_labels = label_feature.num_classes
class_names = label_feature.names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Map class index -> class name; passed to the model config further down
id2label = dict(enumerate(class_names))

number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [None]:
# Configure data collator for padding batches
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Initialize the GPT2 model for sequence classification.
# pad_token_id mirrors the tokenizer, which reuses EOS as its padding token.
model = GPT2ForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label,
    pad_token_id=tokenizer.eos_token_id,
)

# Configure LoRA parameters for efficient fine-tuning
peft_config = LoraConfig(
    r=lora_r,  # Rank dimension
    lora_alpha=lora_alpha,  # Scaling factor
    lora_dropout=0.05,  # Dropout probability for LoRA layers
    bias='none',  # Don't train bias parameters
    task_type="SEQ_CLS",  # Sequence classification task
    target_modules=['c_attn', 'c_proj'],  # Layers to apply LoRA
    # GPT-2's c_attn / c_proj are Conv1D layers whose weights are stored as
    # (fan_in, fan_out); PEFT's LoraConfig documents that this flag should be
    # True for such layers, so state it explicitly instead of leaving the default.
    fan_in_fan_out=True,
)

# Apply LoRA configuration to the model (rebinds `model` to the PEFT wrapper)
model = get_peft_model(model, peft_config)

print('PEFT Model')
model.print_trainable_parameters()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PEFT Model
trainable params: 817,152 || all params: 125,256,960 || trainable%: 0.6523805144241086


In [None]:
class AdaptLayer(nn.Module):
    """
    Bottleneck adapter with a residual connection.

    Projects the input from `hidden_size` down to `adapter_size`, applies ReLU,
    projects back up to `hidden_size`, adds the original input and finally
    applies LayerNorm over the hidden dimension.

    Args:
        hidden_size: Size of the last dimension of the input and output
        adapter_size: Size of the bottleneck between the two projections
    """

    def __init__(self, hidden_size, adapter_size):
        super().__init__()
        # Sub-modules are created in a fixed order so seeded initialisation
        # and state_dict keys stay stable.
        self.down_project = nn.Linear(hidden_size, adapter_size)
        self.activation = nn.ReLU()
        self.up_project = nn.Linear(adapter_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, x):
        """Return LayerNorm(up_project(ReLU(down_project(x))) + x)."""
        bottleneck = self.activation(self.down_project(x))
        return self.layer_norm(self.up_project(bottleneck) + x)

In [None]:
# Add adapter layers to each transformer block for additional fine-tuning capability.
# NOTE(review): this only registers each adapter as `<mlp>.adapter`. No visible
# cell calls the adapters or `modify_forward`, so confirm whether they are
# meant to take part in the forward pass.
adapter_size = 64  # Hidden dimension of adapter layers
# Snapshot the module list first: setattr below registers new sub-modules, and
# changing the module tree while named_modules() is still walking it is fragile.
for name, module in list(model.named_modules()):
    if "transformer.h" in name and name.endswith(".mlp"):
        # Get output dimension of MLP layer
        hidden_size = module.c_proj.out_features
        # Initialize adapter layer
        adapter = AdaptLayer(hidden_size, adapter_size)
        # Move adapter to same device as model
        adapter.to(model.device)
        setattr(module, "adapter", adapter)

In [None]:
def _apply_adapter(module, args, output):
    """Forward hook: pass a sub-module's output through its own `adapter`."""
    if isinstance(output, tuple):
        # Only the first element is adapted; the rest is passed through untouched
        return (module.adapter(output[0]),) + output[1:]
    return module.adapter(output)


def modify_forward(model):
    """
    Modify the model's forward pass to include adapter layers.

    Every sub-module that owns an `adapter` module gets a forward hook that
    feeds that sub-module's output through its adapter. The previous version
    wrapped the top-level forward and applied every adapter in turn to the
    model's final output, which is not the tensor the adapters were sized for.

    Calling this more than once is safe: sub-modules that already carry the
    hook are skipped, so adapters are never applied twice.

    Args:
        model: The model to modify (changed in place)
    Returns:
        None
    """
    for module in model.modules():
        # getattr with a default also copes with wrappers that forward
        # attribute lookups to an inner model.
        if not isinstance(getattr(module, "adapter", None), nn.Module):
            continue
        if getattr(module, "_adapter_hook_handle", None) is not None:
            continue  # hook already installed on this sub-module
        # A forward hook that returns a value replaces the module's output.
        # The handle is kept so the hook can be removed again via `.remove()`.
        module._adapter_hook_handle = module.register_forward_hook(_apply_adapter)

In [None]:
# Hyperparameters and runtime options handed to the Trainer
training_args = TrainingArguments(
    output_dir=output_dir,
    # optimisation
    optim="adamw_torch",  # AdamW optimizer
    learning_rate=5e-5,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    # evaluation
    evaluation_strategy='steps',  # evaluate periodically while training
    # runtime
    use_cpu=False,  # do not force CPU execution
    dataloader_num_workers=1,  # data loading worker processes
    gradient_checkpointing=False,  # checkpointing is off ...
    gradient_checkpointing_kwargs={'use_reentrant':True},  # ... so these kwargs are inert for now
)



In [None]:
def get_trainer(model):
    """
    Build a Trainer for `model` from the notebook-level configuration.

    The training arguments, train/eval datasets, data collator and metric
    function are all taken from module-level names; only the model varies.

    Args:
        model: Model to be trained or evaluated
    Returns:
        A configured Trainer instance
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    return Trainer(**trainer_kwargs)

In [23]:
# Accuracy metric used by compute_metrics during evaluation
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [24]:
def print_gpu_utilization(device_index=0):
    """
    Print the memory currently in use on one GPU, as reported by NVML.

    Args:
        device_index: NVML index of the GPU to query (defaults to the first GPU)
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Pair every nvmlInit with nvmlShutdown so repeated calls do not leave
        # the NVML session open, even when the query itself raises.
        nvmlShutdown()
    # info.used is in bytes; integer-divide down to whole MB
    print(f"GPU memory: {info.used//1024**2} MB")
def print_summary(result):
    """
    Print runtime, throughput and loss from a training result.

    Args:
        result: Object whose `metrics` mapping contains 'train_runtime',
            'train_samples_per_second' and 'train_loss'
    """
    metrics = result.metrics
    print(f"training time: {metrics['train_runtime']:.2f}")
    # This value is a throughput, not a per-step duration, so label it as such
    print(f"training samples per second: {metrics['train_samples_per_second']:.2f}")
    print(f"training loss: {metrics['train_loss']:.2f}")

def compute_metrics(eval_pred):
    """
    Score a batch of evaluation predictions with the notebook-level `metric`.

    Args:
        eval_pred: Pair of (per-class scores, reference labels)
    Returns:
        Whatever `metric.compute` returns for the argmax predictions
    """
    scores, references = eval_pred
    # Highest-scoring class along axis 1 is the predicted label
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [25]:
# Initialize trainer with LoRA-configured model
peft_lora_finetuning_trainer = get_trainer(model)

# Start training process; `result.metrics` is what print_summary reads below
result = peft_lora_finetuning_trainer.train()

# End wandb logging session
wandb.finish()

# Print final GPU utilization (memory in use on GPU index 0, as reported by NVML)
print_gpu_utilization()

# Print training summary
print_summary(result)



Step,Training Loss,Validation Loss,Accuracy
500,0.2981,0.343052,0.887105
1000,0.2971,0.322159,0.893421
1500,0.2944,0.307041,0.898947
2000,0.2759,0.312155,0.901053
2500,0.289,0.293932,0.897895
3000,0.2936,0.282731,0.902368
3500,0.2684,0.272026,0.906842
4000,0.2648,0.289347,0.904474
4500,0.27,0.27123,0.910526
5000,0.2548,0.288718,0.907105


VBox(children=(Label(value='1.746 MB of 10.721 MB uploaded\r'), FloatProgress(value=0.1629014342571999, max=1.…

0,1
eval/accuracy,▁▃▄▅▄▅▆▆▇▆█▇▇██
eval/loss,█▆▅▅▄▃▂▃▂▃▁▂▁▁▁
eval/runtime,▃▄▄▅▃▁▁▄▄▁▅▄█▄▃
eval/samples_per_second,▆▅▅▄▆██▅▅█▄▅▁▅▆
eval/steps_per_second,▆▅▅▄▆██▅▅█▄▅▁▅▆
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███
train/grad_norm,▁▂▂▃▂▇▁▂▅█▇▄▃▅▄▂▂
train/learning_rate,███▇▇▇▆▅▅▅▄▃▃▂▂▁▁
train/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.91316
eval/loss,0.26609
eval/runtime,8.4677
eval/samples_per_second,448.763
eval/steps_per_second,56.095
total_flos,8025404866560000.0
train/epoch,1.0
train/global_step,7500.0
train/grad_norm,7.77146
train/learning_rate,0.0


GPU memory: 4259 MB
training time: 732.91
step training time: 163.73
training loss: 0.27


In [26]:
def evaluate_model(model, dataset):
    """
    Evaluate `model` on `dataset` and print the resulting metrics.

    A fresh Trainer is built through get_trainer for each call.

    Args:
        model: Model to evaluate
        dataset: Dataset passed to the trainer's evaluate call
    """
    metrics = get_trainer(model).evaluate(dataset)
    print(f"result: {metrics}")

In [27]:
# Baseline for comparison: the same checkpoint loaded again without any fine-tuning
base_model_for_eval = GPT2ForSequenceClassification.from_pretrained(
    base_model, id2label=id2label, pad_token_id=tokenizer.eos_token_id
)

# Evaluate the baseline first, then the fine-tuned model, on the test shard
for candidate in (base_model_for_eval, model):
    evaluate_model(candidate, test_dataset)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


result: {'eval_loss': 7.05778694152832, 'eval_model_preparation_time': 0.0025, 'eval_accuracy': 0.25973684210526315, 'eval_runtime': 7.3683, 'eval_samples_per_second': 515.722, 'eval_steps_per_second': 64.465}


result: {'eval_loss': 0.23249667882919312, 'eval_model_preparation_time': 0.007, 'eval_accuracy': 0.9189473684210526, 'eval_runtime': 8.7908, 'eval_samples_per_second': 432.27, 'eval_steps_per_second': 54.034}
