# Fine-Tuning Llama-3.2-1B with LoRA and FHE using `LoraTrainer`

This notebook demonstrates how to fine-tune a Llama-3.2-1B model using LoRA (Low-Rank Adaptation) with Fully Homomorphic Encryption (FHE). We leverage the `LoraTrainer` API from the `concrete.ml.torch.lora` library to simplify the process.


In [1]:
import random
import shutil
from pathlib import Path

import numpy as np
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from utils_lora import generate_and_print

# Import LoraTrainer from the provided library
from concrete.ml.torch.lora import LoraTrainer

Concrete ML LoRA fine-tuning is implemented in a 'hybrid' setting: the client machine outsources all
computations that involve the original model weights, but runs gradient descent on LoRA layers locally. 

The client machine thus executes some layers of the LoRA training protocol and it can use CPU or dedicated
accelerators for this process. 

In [2]:
# Seed every random number generator used in this notebook for reproducibility
SEED = 0
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Pick the device for the client-side computations: CUDA first, then Apple MPS,
# and the CPU when neither accelerator is available
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

import concrete_ml_extensions as fhext

# The FHE backend is reported as CUDA only when the extension is both enabled and available
cuda_fhext = fhext.is_cuda_enabled() and fhext.is_cuda_available()  # pylint: disable=no-member
fhe_backend = "cuda" if cuda_fhext else "cpu"
print("Original model linear layers execute with FHE on: ", fhe_backend)
print("Non-FHE layers and the LoRA weight optimizer executed on: ", device)

Original model linear layers execute with FHE on:  cpu
Non-FHE layers and the LoRA weight optimizer executed on:  cpu


## Set-up

Load the Llama model, tokenize the dataset, and create the LoRA fine-tuning configuration.

In [3]:
# Fetch the pretrained checkpoint and its tokenizer
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Padding needs a pad token: reuse the end-of-sequence token when none is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Freeze the original weights so that no gradient is computed for them
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [4]:
# LoRA hyper-parameters, gathered in one place before building the configuration
lora_settings = {
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.01,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
}
peft_config = LoraConfig(**lora_settings)

# Wrap the frozen base model with the LoRA adapters described by the configuration
peft_model = get_peft_model(model, peft_config)

In [5]:
# Read the fine-tuning corpus from the local JSON-lines file
dataset = load_dataset(
    "json",
    data_files="data_finetune/dataset.jsonl",
    split="train",
)


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are padded to the longest one in the batch and truncated when
    they exceed the tokenizer's limit.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="longest", truncation=True)


# Tokenize in batches, then build the collator for causal (non-masked) language modeling
tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

In [6]:
# Training hyper-parameters shared by the HuggingFace Trainer and the LoraTrainer
EPOCHS = 10
PER_DEVICE_TRAIN_BATCH_SIZE = 4
training_args = TrainingArguments(
    # Checkpointing
    output_dir="./checkpoints",
    save_total_limit=1,
    # Batching and training length
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=1,
    # Optimizer and learning-rate schedule
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    weight_decay=0.01,
    # Reproducibility
    seed=SEED,
    data_seed=SEED,
    # Execution
    use_cpu=True,
    prediction_loss_only=True,
)

In [7]:
# Use HuggingFace's Trainer to build the data loader, the optimizer and the
# learning-rate scheduler that are handed to LoraTrainer later on
hf_trainer = Trainer(
    args=training_args,
    model=peft_model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)
train_dataloader = hf_trainer.get_train_dataloader()

# The scheduler is sized for one step per batch, repeated for every epoch
total_training_steps = len(train_dataloader) * EPOCHS
hf_trainer.create_optimizer_and_scheduler(num_training_steps=total_training_steps)

optimizer, lr_scheduler = hf_trainer.optimizer, hf_trainer.lr_scheduler


# Loss function for causal language modeling
def causal_lm_loss(logits, labels, ignore_index=-100):
    """Compute the next-token cross-entropy loss of a causal language model.

    The logits at position ``t`` are scored against the label at position
    ``t + 1``: the last logit and the first label are therefore dropped.

    Args:
        logits: tensor of scores; the last dimension indexes the classes and
            the second-to-last dimension indexes the sequence positions.
        labels: tensor of target ids; the last dimension indexes the sequence
            positions.
        ignore_index: label value that is excluded from the loss.

    Returns:
        The mean cross-entropy over the labels that are not ignored.
    """
    n_classes = logits.size(-1)
    # Align predictions and targets: position t predicts the token at t + 1
    predictions = logits[..., :-1, :].reshape(-1, n_classes)
    targets = labels[..., 1:].reshape(-1)
    return torch.nn.functional.cross_entropy(
        predictions, targets, ignore_index=ignore_index, reduction="mean"
    )

## Test the original model

In [8]:
# Generate a completion with the base model, before any fine-tuning
PROMPT = "from concrete.ml.sklearn import LogisticRegression\n\nmodel = LogisticRegression("
print("Initial generation with base model:")
# generate_and_print prints the completion itself and returns None, so it is not
# wrapped in print(): doing so only appends a spurious "None" line to the output
generate_and_print(PROMPT, model, tokenizer, seed=SEED)

Initial generation with base model:


from concrete.ml.sklearn import LogisticRegression

model = LogisticRegression( eta=0.1, n_iter=1000, random_state=42)
None


## Convert the model to use FHE

Similarly to all Concrete ML models, LoRA fine-tuning is set up by compiling the
model. For this, a representative set of data is required.

In [9]:
# Calibration data is built with a single sequence length, so every tokenized
# example is required to have the same number of tokens
lengths = [len(item["input_ids"]) for item in tokenized_dataset]
if any(length != lengths[0] for length in lengths):
    raise ValueError("All examples must have the same length for calibration.")
BLOCK_SIZE = lengths[0]

# Random token ids shaped like one training batch: (batch size, sequence length)
calibration_shape = (PER_DEVICE_TRAIN_BATCH_SIZE, BLOCK_SIZE)
input_tensor = torch.randint(
    low=0, high=tokenizer.vocab_size, size=calibration_shape, dtype=torch.long
)
label_tensor = torch.randint(
    low=0, high=tokenizer.vocab_size, size=calibration_shape, dtype=torch.long
)
# Every position is attended to
attention_mask = torch.ones(calibration_shape, dtype=torch.long)
inputset = dict(input_ids=input_tensor, attention_mask=attention_mask, labels=label_tensor)

# LoraTrainer receives the training arguments as a plain dictionary
training_args_dict = vars(training_args)

# Build the trainer from the LoRA model and the optimizer, scheduler and loss
# function defined above
lora_trainer = LoraTrainer(
    model=peft_model,
    loss_fn=causal_lm_loss,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    n_layers_to_skip_for_backprop=3,
    training_args=training_args_dict,
)

LoRA layers detected in the model.


Compile the model using quantization. 

In [10]:
# Compile the model for FHE on the calibration input set, with the bit-width
# used for quantization
N_BITS = 16
lora_trainer.compile(inputset, n_bits=N_BITS)

Compiling FHE layers:   0%|          | 0/221 [00:00<?, ?it/s]

## Test-run Concrete ML LoRA fine-tuning on clear data with quantization

To check that everything works properly, it's possible to dry-run the fine-tuning on clear data.

In [11]:
# Dry run of the fine-tuning on clear data: FHE execution is disabled
print("Starting training using LoraTrainer...")
lora_trainer.train(
    train_dataloader,
    num_epochs=EPOCHS,
    fhe="disable",
    device=device,
)

Starting training using LoraTrainer...


Training:   0%|          | 0/10 [00:00<?, ?epoch/s]

Training completed. Final Avg Loss: 0.0885, FHE Mode: disable


## Evaluation

We show code generation using the original model versus the fine-tuned model. To generate
with the original model, the LoRA layers are disabled; they are then re-enabled to generate
with the fine-tuned model.

In [12]:
# Generate with the LoRA adapter layers disabled, i.e. with the original model weights only
peft_model.disable_adapter_layers()
print("Original model generation:")
# generate_and_print prints the completion itself and returns None, so it is not
# wrapped in print(): doing so only appends a spurious "None" line to the output
generate_and_print(PROMPT, peft_model, tokenizer, seed=SEED)

Original model generation:


from concrete.ml.sklearn import LogisticRegression

model = LogisticRegression( eta=0.1, max_iter=1000, random_state=1)
None


In [13]:
# Generate with the LoRA adapter layers enabled again, i.e. with the fine-tuned model
peft_model.enable_adapter_layers()
print("Fine-tuned model generation:")
# generate_and_print prints the completion itself and returns None, so it is not
# wrapped in print(): doing so only appends a spurious "None" line to the output
generate_and_print(PROMPT, peft_model, tokenizer, seed=SEED)

Fine-tuned model generation:


from concrete.ml.sklearn import LogisticRegression

model = LogisticRegression( eta=0.01, n_bits=8)
None


## Fine-tuning on encrypted data

Next, we benchmark the time to train on a single encrypted batch: 4 code
snippets, each padded or truncated to 16 tokens.

In [14]:
# Number of tokens per example in the encrypted training run
FHE_SEQUENCE_LENGTH = 16


def tokenize_function_fhe(examples):
    """Tokenize the ``text`` field of a batch of examples to a fixed length.

    Every sequence is padded or truncated to ``FHE_SEQUENCE_LENGTH`` tokens.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=FHE_SEQUENCE_LENGTH,
        truncation=True,
        padding="max_length",
    )


# Tokenize to the short FHE sequence length under a dedicated name, so that the
# `tokenized_dataset` used for calibration is not overwritten and earlier cells
# remain re-runnable
tokenized_dataset_fhe = dataset.map(tokenize_function_fhe, batched=True)

# Create a small data loader holding a single batch of PER_DEVICE_TRAIN_BATCH_SIZE examples
hf_trainer_fhe = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset_fhe.select(list(range(PER_DEVICE_TRAIN_BATCH_SIZE))),
    data_collator=data_collator,
)

# `train_dataloader` is deliberately rebound: the encrypted training cell reads it
train_dataloader = hf_trainer_fhe.get_train_dataloader()

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

In [15]:
# Execute fine-tuning on encrypted data, using the GPU when it is available
import time

fhe_epochs = 1

# `device` is passed explicitly, as in the clear-data run, so that the device
# reported in the message below is the one actually requested for training
start = time.perf_counter()
lora_trainer.train(train_dataloader, num_epochs=fhe_epochs, fhe="execute", device=device)
duration = time.perf_counter() - start
print(
    (
        f"Trained on one encrypted batch of {PER_DEVICE_TRAIN_BATCH_SIZE} "
        f"examples of {FHE_SEQUENCE_LENGTH} tokens in {duration} seconds on {device}"
    )
)

Training:   0%|          | 0/1 [00:00<?, ?epoch/s]

Training completed. Final Avg Loss: 13.6442, FHE Mode: execute
Trained on one encrypted batch of 4 examples of 16 tokens in 5321.782980680466 seconds on cpu


## Save the fine-tuned LoRA weights

In [16]:
# Save the fine-tuned model, first removing the non-empty output directory of a previous run
save_path = Path("deployment/llama_lora_finetuned")
has_previous_export = save_path.is_dir() and any(save_path.iterdir())
if has_previous_export:
    shutil.rmtree(save_path)
lora_trainer.save_and_clear_private_info(save_path)

print("Model saved to:", save_path)

Model saved to: deployment/llama_lora_finetuned
