In [1]:
import torch
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, Trainer, 
                        TrainingArguments, DataCollatorWithPadding, default_data_collator)
from datasets import load_dataset
from peft import (
    get_peft_model,
    PromptTuningConfig,
    PrefixTuningConfig,
    LoraConfig,
    IA3Config,
    TaskType,
)
from opacus.privacy_engine import GradSampleModule
from opacus.optimizers import DPOptimizer
from opacus import PrivacyEngine
from transformers import DataCollatorWithPadding
import numpy as np
import os
import random
from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager
from torch.optim.lr_scheduler import LinearLR
import warnings
import logging
import os
import gc

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [2]:
# Module-level default device: the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Keep notebook output quiet: hide Python warnings and let the noisy
# libraries log errors only.
warnings.filterwarnings("ignore")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
for _logger_name in ("transformers", "torch"):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

In [4]:
# Function to select the compute device (GPU if available, otherwise CPU)
def setup_device():
    """Pick the compute device and report how many GPUs are visible.

    Prints a one-line summary of the choice.

    Returns:
        tuple: ``(device, n_gpu)``. ``device`` is ``torch.device("cuda")``
        when CUDA is available and ``torch.device("cpu")`` otherwise;
        ``n_gpu`` is ``torch.cuda.device_count()`` on CUDA and ``0`` on CPU.
    """
    # Guard clause for the CPU-only case keeps the GPU path unindented.
    if not torch.cuda.is_available():
        print("Using CPU")
        return torch.device("cpu"), 0

    gpu_count = torch.cuda.device_count()
    print(f"Using {gpu_count} GPU(s)")
    return torch.device("cuda"), gpu_count

def load_model_and_tokenizer(model_name, num_labels):
    """Load a sequence-classification model and its matching tokenizer.

    The model is moved to the module-level ``device`` before it is returned.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained`` for
            both the model and the tokenizer.
        num_labels: Number of classes for the classification head.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # NOTE(review): trust_remote_code=True is passed for both objects --
    # confirm it is really required for the checkpoints used here.
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        trust_remote_code=True,
        ignore_mismatched_sizes=True,
        num_labels=num_labels,
    ).to(device)

    text_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return classifier, text_tokenizer

# Function to prepare the dataset
def prepare_dataset(dataset_name, tokenizer):
    """Load a GLUE task and tokenize it for sequence classification.

    Every split is tokenized with ``padding="max_length"``, ``truncation=True``
    and ``max_length=128``. All original columns except ``label`` are dropped,
    ``label`` is renamed to ``labels`` and the result is set to the ``"torch"``
    format.

    Args:
        dataset_name: GLUE task name, matched case-insensitively. Supported
            tasks are ``sst2``, ``qqp``, ``qnli`` and ``mnli``.
        tokenizer: Callable tokenizer that accepts one or two batches of text
            plus the padding/truncation keyword arguments used below.

    Returns:
        The tokenized dataset dictionary (one entry per GLUE split).

    Raises:
        ValueError: If ``dataset_name`` is not a supported task. The check
            runs before anything is loaded, so a typo fails immediately.
    """
    # Text column(s) handed to the tokenizer for each supported task.
    text_columns_by_task = {
        "sst2": ("sentence",),
        "qqp": ("question1", "question2"),
        "qnli": ("question", "sentence"),
        "mnli": ("premise", "hypothesis"),
    }

    task = dataset_name.lower()
    if task not in text_columns_by_task:
        raise ValueError(f"Dataset {dataset_name} is not supported.")
    text_columns = text_columns_by_task[task]

    # Load with the normalised name so the case-insensitive matching above
    # also applies to the GLUE configuration lookup.
    dataset = load_dataset('glue', task)

    def tokenize_function(examples):
        # Single-sentence tasks pass one batch of text, pair tasks pass two.
        return tokenizer(
            *(examples[column] for column in text_columns),
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    # Keep only 'label' from the raw columns; the tokenizer output is added by map().
    columns_to_remove = [
        column for column in dataset["train"].column_names if column != "label"
    ]

    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=columns_to_remove,
    )

    # A column rename does not need a second per-example pass over the data.
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    # Convert the datasets to PyTorch tensors
    tokenized_datasets.set_format("torch")

    return tokenized_datasets




# Function to compute metrics during evaluation
def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-item sequence ``(logits, labels)``. The predicted class
            is the argmax of ``logits`` along the last axis.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to labels>}``.
    """
    scores, gold = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    hits = predicted_classes == gold
    return {"accuracy": np.mean(hits)}

# Function to get the PEFT configuration based on the method
def get_peft_config(method):
    """Return the PEFT configuration(s) for a tuning method name.

    Args:
        method: One of ``"soft_prompt"``, ``"prefix"``, ``"lora"``, ``"ia3"``,
            ``"soft_prompt_lora"`` or ``"prefix_lora"``.

    Returns:
        A single config object for the plain methods, a two-element list
        ``[prompt_or_prefix_config, lora_config]`` for the combined methods,
        or ``None`` for any other method name.
    """
    # Factories are used so that every call gets fresh config objects and
    # nothing is constructed for methods that are not requested.
    def _soft_prompt():
        return PromptTuningConfig(task_type=TaskType.SEQ_CLS, num_virtual_tokens=20)

    def _prefix():
        return PrefixTuningConfig(task_type=TaskType.SEQ_CLS, num_virtual_tokens=20)

    def _lora():
        return LoraConfig(
            task_type=TaskType.SEQ_CLS, r=8, lora_alpha=16, lora_dropout=0.1
        )

    def _ia3():
        return IA3Config(task_type=TaskType.SEQ_CLS)

    recipes = (
        ("soft_prompt", (_soft_prompt,)),
        ("prefix", (_prefix,)),
        ("lora", (_lora,)),
        ("ia3", (_ia3,)),
        ("soft_prompt_lora", (_soft_prompt, _lora)),
        ("prefix_lora", (_prefix, _lora)),
    )

    for name, factories in recipes:
        if method == name:
            built = [make() for make in factories]
            # Combined methods are returned as a list, single ones unwrapped.
            return built if len(built) > 1 else built[0]
    return None

def get_validation_dataset(tokenized_datasets):
    """Return the first validation split present in ``tokenized_datasets``.

    Split names are tried in the order ``validation``,
    ``validation_matched``, ``validation_mismatched``.

    Raises:
        ValueError: If none of those names is present.
    """
    candidates = ("validation", "validation_matched", "validation_mismatched")
    chosen = next((name for name in candidates if name in tokenized_datasets), None)
    if chosen is None:
        raise ValueError("No valid validation set found.")
    return tokenized_datasets[chosen]


def create_dp_optimizer(model, learning_rate, epsilon, delta, expected_batch_size,
                        noise_multiplier=1.3, max_grad_norm=1.0):
    """Wrap ``model`` for per-sample gradients and build a DP AdamW optimizer.

    Args:
        model: Module to wrap in ``GradSampleModule``.
        learning_rate: Learning rate for the underlying ``AdamW`` optimizer.
        epsilon: Accepted for interface compatibility; not used here. The
            noise level is set directly through ``noise_multiplier``.
        delta: Accepted for interface compatibility; not used here.
        expected_batch_size: Forwarded to ``DPOptimizer``.
        noise_multiplier: Forwarded to ``DPOptimizer`` (defaults to the
            previously hard-coded 1.3).
        max_grad_norm: Forwarded to ``DPOptimizer`` (defaults to the
            previously hard-coded 1.0).

    Returns:
        tuple: ``(wrapped_model, dp_optimizer)``.
    """
    # Wrap the model with GradSampleModule
    model = GradSampleModule(model)

    # Create optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Make optimizer differentially private
    dp_optimizer = DPOptimizer(
        optimizer=optimizer,
        noise_multiplier=noise_multiplier,
        max_grad_norm=max_grad_norm,
        expected_batch_size=expected_batch_size,
    )

    return model, dp_optimizer

def compute_dp_noise_scale(epsilon, delta, sample_rate, steps):
    """Compute a noise scale for DP-SGD.

    Evaluates ``sqrt(2 * ln(1.25 / delta)) / (epsilon * sqrt(steps * sample_rate))``.
    """
    numerator = np.sqrt(2 * np.log(1.25 / delta))
    denominator = epsilon * np.sqrt(steps * sample_rate)
    return numerator / denominator

def add_noise_to_grads(model, noise_scale, max_grad_norm):
    """Clip the model's gradients in place and add Gaussian noise to them.

    Only parameters with ``requires_grad`` set and a populated ``.grad`` are
    touched. Their gradients are scaled by
    ``min(max_grad_norm / (global_l2_norm + 1e-6), 1.0)`` and then receive
    noise drawn as ``randn * noise_scale * max_grad_norm``.

    Args:
        model: Module whose parameter gradients are modified in place.
        noise_scale: Multiplier applied to the standard-normal noise.
        max_grad_norm: Clipping threshold, also a factor of the noise.
    """
    def _has_usable_grad(param):
        return param.requires_grad and param.grad is not None

    # First pass: global L2 norm over all usable gradients.
    squared_norm_sum = 0
    for weight in model.parameters():
        if not _has_usable_grad(weight):
            continue
        squared_norm_sum += weight.grad.data.norm(2).item() ** 2
    global_norm = squared_norm_sum ** 0.5

    # Never scale gradients up: the factor is capped at 1.0.
    scale = min(max_grad_norm / (global_norm + 1e-6), 1.0)

    # Second pass: clip, then perturb, one parameter at a time.
    for weight in model.parameters():
        if not _has_usable_grad(weight):
            continue
        weight.grad.data.mul_(scale)
        perturbation = torch.randn_like(weight.grad) * noise_scale * max_grad_norm
        weight.grad.data.add_(perturbation)

import json
from datetime import datetime

def save_results_to_file(results, epsilon, filename="peft_experiment_results.txt"):
    """Append one timestamped experiment record to a plain-text log file.

    Args:
        results: JSON-serialisable results structure; written with
            ``json.dumps(results, indent=2)``.
        epsilon: Value recorded in the header line. ``None`` is recorded as
            "No Differential Privacy".
        filename: Path of the log file. It is created if it does not exist.

    Raises:
        TypeError: If ``results`` cannot be serialised by ``json.dumps``.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    privacy_line = f"Epsilon: {epsilon}\n" if epsilon is not None else "No Differential Privacy\n"

    # Build the whole entry first so a serialisation error cannot leave a
    # partially written record behind.
    entry = (
        f"\n\n--- Experiment Results ({timestamp}) ---\n"
        + privacy_line
        + json.dumps(results, indent=2)
    )

    # Append mode avoids reading the whole log back and rewriting it, and
    # never truncates the existing records.
    with open(filename, "a") as f:
        f.write(entry)

    print(f"\nResults have been appended to {filename}")

In [5]:
def _apply_tuning_method(model, method, num_labels, device):
    """Return ``model`` prepared for ``method``: PEFT-wrapped, partly frozen, or unchanged."""
    if method == "full_fine_tuning":
        tuned_model = model
    elif method == "single_layer_fine_tuning":
        # Freeze the whole network, then swap in a fresh classification head
        # whose parameters are the only trainable ones.
        for param in model.parameters():
            param.requires_grad = False
        model.classifier = torch.nn.Linear(model.config.hidden_size, num_labels).to(device)
        tuned_model = model
    else:
        peft_config = get_peft_config(method)
        if isinstance(peft_config, list):
            # Combined methods: wrap the model once per config, in list order.
            tuned_model = model
            for config in peft_config:
                tuned_model = get_peft_model(tuned_model, config)
        else:
            tuned_model = get_peft_model(model, peft_config)
    return tuned_model.to(device)


def _format_metric(value):
    """Render a metric with four decimals, or ``N/A`` when it is missing."""
    return "N/A" if value is None else f"{value:.4f}"


def run_peft_experiments(dataset, epsilon=None):
    """Train and evaluate every tuning method on each requested GLUE task.

    For each dataset a fresh ``prajjwal1/bert-tiny`` model is loaded per
    method, adapted for that method, trained, and evaluated on the validation
    split. A summary is printed and the results are passed to
    ``save_results_to_file``.

    Args:
        dataset: A dataset name or an iterable of names. Supported names are
            ``sst2``, ``qnli``, ``qqp`` and ``mnli``.
        epsilon: ``None`` trains with ``train_without_dp``. Any other value,
            including ``0``, trains with ``train_with_dp`` and is forwarded
            to it.

    Returns:
        dict: ``{dataset_name: {method: results}}`` where ``results`` is the
        dictionary returned by the training function.

    Raises:
        KeyError: If any requested dataset is not supported. Raised before
            any model or data is loaded.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # A bare string would otherwise be iterated character by character.
    dataset_names = [dataset] if isinstance(dataset, str) else list(dataset)

    model_name = "prajjwal1/bert-tiny"
    methods = [
        "soft_prompt",
        "prefix",
        "full_fine_tuning",
        "lora",
        "ia3",
        "single_layer_fine_tuning",
        "soft_prompt_lora",
        "prefix_lora",
    ]

    # Dataset-specific parameters ("lambda" is passed on as the DP delta).
    dataset_params = {
        "sst2": {"lambda": 1e-5, "noise_multiplier": 0.92, "num_labels": 2},
        "qnli": {"lambda": 1e-5, "noise_multiplier": 0.83, "num_labels": 2},
        "qqp": {"lambda": 1e-6, "noise_multiplier": 0.66, "num_labels": 2},
        "mnli": {"lambda": 1e-6, "noise_multiplier": 0.65, "num_labels": 3},
    }

    # Fail before any training starts rather than after earlier datasets
    # have already been processed.
    unknown = [name for name in dataset_names if name not in dataset_params]
    if unknown:
        raise KeyError(
            f"Unsupported dataset(s): {unknown}; expected one of {sorted(dataset_params)}"
        )

    # Training hyperparameters shared by every run. These were previously
    # carried in a TrainingArguments object although no Trainer is created.
    num_epochs = 50
    batch_size = 1024
    learning_rate = 5e-4
    weight_decay = 1e-2
    max_grad_norm = 1.0

    results_dict = {}

    for dataset_name in dataset_names:
        print(f"Processing dataset: {dataset_name}")

        params = dataset_params[dataset_name]
        num_labels = params["num_labels"]

        # Only the tokenizer is needed at this point; every method loads its
        # own fresh model below.
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        tokenized_dataset = prepare_dataset(dataset_name, tokenizer)

        # The loaders do not depend on the method, so build them once per dataset.
        train_dataloader = torch.utils.data.DataLoader(
            tokenized_dataset["train"],
            batch_size=batch_size,
            shuffle=True,
            collate_fn=default_data_collator,
        )
        eval_dataloader = torch.utils.data.DataLoader(
            get_validation_dataset(tokenized_dataset),
            batch_size=batch_size,
            collate_fn=default_data_collator,
        )

        results_dict[dataset_name] = {}

        for method in methods:
            print(f"  Method: {method}")
            model, _ = load_model_and_tokenizer(model_name, num_labels)
            model = model.to(device)
            peft_model = _apply_tuning_method(model, method, num_labels, device)

            if epsilon is not None:
                results = train_with_dp(
                    peft_model=peft_model,
                    train_dataloader=train_dataloader,
                    eval_dataloader=eval_dataloader,
                    device=device,
                    epsilon=epsilon,
                    delta=params["lambda"],
                    noise_multiplier=params["noise_multiplier"],
                    epochs=num_epochs,
                    batch_size=batch_size,
                    max_grad_norm=max_grad_norm,
                    learning_rate=learning_rate,
                    weight_decay=weight_decay,
                )
            else:
                # Train without differential privacy
                results = train_without_dp(
                    peft_model=peft_model,
                    train_dataloader=train_dataloader,
                    eval_dataloader=eval_dataloader,
                    device=device,
                    epochs=num_epochs,
                    learning_rate=learning_rate,
                    weight_decay=weight_decay,
                )

            results_dict[dataset_name][method] = results
            print(f"    Final Results: {results}")

            # Drop our references first so the collector can actually
            # reclaim the model before the next method is loaded.
            del model, peft_model
            gc.collect()
            torch.cuda.empty_cache()

    # Print final results
    print("="*50)
    for dataset_name in dataset_names:
        print(f"\nResults for {dataset_name}:")
        for method, result in results_dict[dataset_name].items():
            accuracy = _format_metric(result.get("accuracy"))
            loss = _format_metric(result.get("eval_loss"))
            print(f"  Method: {method}, Accuracy: {accuracy}, Loss: {loss}")

    # Save results to file
    save_results_to_file(results_dict, epsilon)

    return results_dict

def train_with_dp(peft_model, train_dataloader, eval_dataloader, device, epsilon, delta, noise_multiplier, epochs, batch_size, max_grad_norm, learning_rate, weight_decay):
    """Train with gradient clipping and Gaussian gradient noise, evaluating after each epoch.

    For every batch the gradient of the batch loss is clipped to
    ``max_grad_norm`` with ``clip_grad_norm_`` and noise with standard
    deviation ``noise_multiplier * max_grad_norm`` is added to each trainable
    parameter's gradient before the AdamW step.

    Args:
        peft_model: Model whose forward pass returns an object with ``.loss``.
        train_dataloader: Iterable of dict batches of tensors; must support ``len()``.
        eval_dataloader: Passed to ``evaluate`` after every epoch.
        device: Device every batch tensor is moved to.
        epsilon: Accepted but not used by this function.
        delta: Accepted but not used by this function.
        noise_multiplier: Noise standard deviation relative to ``max_grad_norm``.
        epochs: Number of passes over ``train_dataloader``; must be at least 1.
        batch_size: Accepted but not used by this function.
        max_grad_norm: Clipping threshold for the batch gradient norm.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        dict: ``eval_loss`` and ``accuracy`` from the last epoch's evaluation,
        plus ``best_accuracy`` over all epochs.

    Raises:
        ValueError: If ``epochs`` is less than 1.
    """
    # NOTE(review): clipping is applied to the aggregated batch gradient, not
    # per sample, and no privacy accountant is consulted, so this function
    # does not itself establish the (epsilon, delta) it is given -- confirm
    # that this matches the intended privacy analysis.

    # Without at least one epoch there is no evaluation result to return.
    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    optimizer = torch.optim.AdamW(peft_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    best_accuracy = 0
    for epoch in range(epochs):
        peft_model.train()
        total_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = peft_model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()

            # Clip gradients
            torch.nn.utils.clip_grad_norm_(peft_model.parameters(), max_grad_norm)

            # Add noise to gradients
            for param in peft_model.parameters():
                if param.requires_grad and param.grad is not None:
                    noise = torch.randn_like(param.grad) * noise_multiplier * max_grad_norm
                    param.grad.add_(noise)

            optimizer.step()
            optimizer.zero_grad()

        avg_train_loss = total_loss / len(train_dataloader)

        # Evaluation
        eval_results = evaluate(peft_model, eval_dataloader, device)

        print(f"Epoch {epoch + 1}:  Train Loss: {avg_train_loss:.4f}  Eval Loss: {eval_results['eval_loss']:.4f}   Accuracy: {eval_results['accuracy']:.4f}")

        if eval_results['accuracy'] > best_accuracy:
            best_accuracy = eval_results['accuracy']

    return {"eval_loss": eval_results['eval_loss'], "accuracy": eval_results['accuracy'], "best_accuracy": best_accuracy}

def train_without_dp(peft_model, train_dataloader, eval_dataloader, device, epochs, learning_rate, weight_decay):
    """Train with plain AdamW, evaluating after each epoch.

    Args:
        peft_model: Model whose forward pass returns an object with ``.loss``.
        train_dataloader: Iterable of dict batches of tensors; must support ``len()``.
        eval_dataloader: Passed to ``evaluate`` after every epoch.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over ``train_dataloader``; must be at least 1.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.

    Returns:
        dict: ``eval_loss`` and ``accuracy`` from the last epoch's evaluation,
        plus ``best_accuracy`` over all epochs.

    Raises:
        ValueError: If ``epochs`` is less than 1.
    """
    # Without at least one epoch there is no evaluation result to return.
    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    optimizer = torch.optim.AdamW(peft_model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    best_accuracy = 0
    for epoch in range(epochs):
        peft_model.train()
        total_loss = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = peft_model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        avg_train_loss = total_loss / len(train_dataloader)

        # Evaluation
        eval_results = evaluate(peft_model, eval_dataloader, device)

        print(f"Epoch {epoch + 1}:   Train Loss: {avg_train_loss:.4f}   Eval Loss: {eval_results['eval_loss']:.4f}   Accuracy: {eval_results['accuracy']:.4f} ")

        if eval_results['accuracy'] > best_accuracy:
            best_accuracy = eval_results['accuracy']

    return {"eval_loss": eval_results['eval_loss'], "accuracy": eval_results['accuracy'], "best_accuracy": best_accuracy}

def evaluate(model, eval_dataloader, device):
    """Evaluate ``model`` on ``eval_dataloader`` without tracking gradients.

    Args:
        model: Module put into eval mode; its forward pass must return an
            object exposing ``.loss`` and ``.logits``.
        eval_dataloader: Iterable of dict batches of tensors, each containing
            a ``"labels"`` entry.
        device: Device every batch tensor is moved to.

    Returns:
        dict: ``eval_loss`` (mean of the per-batch losses) and ``accuracy``
        (fraction of argmax predictions equal to the labels).
    """
    model.eval()
    batch_losses = []
    predicted = []
    expected = []

    with torch.no_grad():
        for raw_batch in eval_dataloader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            result = model(**inputs)
            batch_losses.append(result.loss.item())
            predicted.extend(result.logits.argmax(dim=-1).cpu().numpy())
            expected.extend(inputs["labels"].cpu().numpy())

    # Average over batches (each batch counts equally, whatever its size).
    mean_loss = sum(batch_losses) / len(batch_losses)
    hit_rate = np.mean(np.array(predicted) == np.array(expected))

    return {"eval_loss": mean_loss, "accuracy": hit_rate}

In [6]:
# Run every tuning method on QNLI; a non-None epsilon selects the DP training path.
results = run_peft_experiments(epsilon=8,dataset=['qnli'])

Using device: cuda
Processing dataset: qnli
  Method: soft_prompt
Epoch 1:  Train Loss: 0.6950  Eval Loss: 0.6933   Accuracy: 0.5017
Epoch 2:  Train Loss: 0.6947  Eval Loss: 0.6931   Accuracy: 0.5078
Epoch 3:  Train Loss: 0.6950  Eval Loss: 0.6937   Accuracy: 0.4972
Epoch 4:  Train Loss: 0.6945  Eval Loss: 0.6929   Accuracy: 0.5096
Epoch 5:  Train Loss: 0.6944  Eval Loss: 0.6931   Accuracy: 0.5017
Epoch 6:  Train Loss: 0.6942  Eval Loss: 0.6925   Accuracy: 0.5136
Epoch 7:  Train Loss: 0.6940  Eval Loss: 0.6921   Accuracy: 0.5147
Epoch 8:  Train Loss: 0.6940  Eval Loss: 0.6919   Accuracy: 0.5157
Epoch 9:  Train Loss: 0.6936  Eval Loss: 0.6910   Accuracy: 0.5200
Epoch 10:  Train Loss: 0.6928  Eval Loss: 0.6897   Accuracy: 0.5329
Epoch 11:  Train Loss: 0.6930  Eval Loss: 0.6894   Accuracy: 0.5332
Epoch 12:  Train Loss: 0.6928  Eval Loss: 0.6898   Accuracy: 0.5312
Epoch 13:  Train Loss: 0.6931  Eval Loss: 0.6897   Accuracy: 0.5354
Epoch 14:  Train Loss: 0.6932  Eval Loss: 0.6902   Accuracy

In [7]:
# NOTE(review): epsilon=0 is not None, so this run also takes the DP training
# path (noise is still added). If a non-private baseline was intended, pass
# epsilon=None instead -- confirm.
results = run_peft_experiments(epsilon=0,dataset=['qnli'])

Using device: cuda
Processing dataset: qnli
  Method: soft_prompt
Epoch 1:  Train Loss: 0.6957  Eval Loss: 0.6940   Accuracy: 0.4981
Epoch 2:  Train Loss: 0.6951  Eval Loss: 0.6933   Accuracy: 0.5023
Epoch 3:  Train Loss: 0.6947  Eval Loss: 0.6924   Accuracy: 0.5142
Epoch 4:  Train Loss: 0.6948  Eval Loss: 0.6924   Accuracy: 0.5173
Epoch 5:  Train Loss: 0.6950  Eval Loss: 0.6916   Accuracy: 0.5310
Epoch 6:  Train Loss: 0.6952  Eval Loss: 0.6916   Accuracy: 0.5219
Epoch 7:  Train Loss: 0.6943  Eval Loss: 0.6903   Accuracy: 0.5310
Epoch 8:  Train Loss: 0.6938  Eval Loss: 0.6913   Accuracy: 0.5239
Epoch 9:  Train Loss: 0.6943  Eval Loss: 0.6913   Accuracy: 0.5222
Epoch 10:  Train Loss: 0.6942  Eval Loss: 0.6912   Accuracy: 0.5191
Epoch 11:  Train Loss: 0.6943  Eval Loss: 0.6909   Accuracy: 0.5277
Epoch 12:  Train Loss: 0.6945  Eval Loss: 0.6908   Accuracy: 0.5340
Epoch 13:  Train Loss: 0.6945  Eval Loss: 0.6909   Accuracy: 0.5279
Epoch 14:  Train Loss: 0.6938  Eval Loss: 0.6905   Accuracy