In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
# GLUE benchmark (10 different classification tasks); this notebook uses MRPC, the Microsoft Research Paraphrase Corpus

# Load the MRPC configuration of the GLUE benchmark.
data = load_dataset(path="glue", name="mrpc")

# Tokenizer for the bert-base-uncased checkpoint.
tok = AutoTokenizer.from_pretrained('bert-base-uncased')

## Tokenize the entire training split in one call.
# Each sentence1/sentence2 pair is encoded together; with padding=True every
# pair is padded to the longest sequence of this single, dataset-sized batch.
inputs = tok(
    text=data['train']['sentence1'][:],
    text_pair=data['train']['sentence2'][:],
    padding=True,
    truncation=True,
)

## Padding is left out here, because padding every sentence up front can be inefficient
def tokenize_fn(example):
    """Tokenize the sentence pair(s) held in ``example`` using ``tok``.

    Reads the 'sentence1' and 'sentence2' entries of ``example`` and passes
    them to ``tok`` as first and second sequence with truncation enabled.
    No padding is requested here.

    Returns:
        Whatever ``tok`` returns for the given pair(s).
    """
    first, second = example['sentence1'], example['sentence2']
    return tok(first, second, truncation=True)
    
# Apply tokenize_fn to `data` in batches; the mapped result is kept in tok_data.
tok_data = data.map(function=tokenize_fn, batched=True)

In [2]:
## Low-Rank Adaptation (LoRA)
### Adapters can be loaded onto a pretrained model with load_adapter()
### Set the active adapter weights with set_adapter()
### Return the base model with unload()

from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

# Hub id of the LoRA adapter; used for both its config and its weights.
adapter_id = "ybelkada/opt-350m-lora"

# The adapter config names the base checkpoint the adapter belongs to.
config = PeftConfig.from_pretrained(adapter_id)
base_checkpoint = config.base_model_name_or_path
print(base_checkpoint)

# Load that base checkpoint as a causal language model.
model = AutoModelForCausalLM.from_pretrained(base_checkpoint)

# Attach the adapter to the base model.
# NOTE: the repr of `model` printed later in this notebook shows lora.Linear
# layers inside it, so after this call `model` is no longer an untouched base
# model.
lora_model = PeftModel.from_pretrained(model, adapter_id)

facebook/opt-350m


In [9]:
# Display the module tree of the PEFT-wrapped model (the cell's last expression is its output).
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OPTForCausalLM(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): Embedding(50272, 512, padding_idx=1)
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
          (project_out): Linear(in_features=1024, out_features=512, bias=False)
          (project_in): Linear(in_features=512, out_features=1024, bias=False)
          (layers): ModuleList(
            (0-23): 24 x OPTDecoderLayer(
              (self_attn): OPTAttention(
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=16, bias=False

In [10]:
# Display the module tree of `model`, the object that was passed to PeftModel.from_pretrained.
# The repr below shows v_proj as lora.Linear, i.e. this object now contains LoRA layers as well.
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): lora.Linear(
              (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=16, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=16, out_features=1024, bias=Fa

In [3]:
import os

# Set TRL_GRADIO_ENABLED to "0" in this process's environment before trl is imported below.
# NOTE(review): presumably meant to disable an optional Gradio integration in TRL;
# confirm that the installed TRL version actually reads this variable.
os.environ.update({"TRL_GRADIO_ENABLED": "0"})

## trl provides integration with LoRA adapters through the PEFT library

# 1. Define the LoRA configuration (rank, alpha, dropout)
# 2. Create the SFTTrainer with the PEFT config
# 3. Train and save the adapter weights

# 1.
## Define LoRA configuration
from peft import LoraConfig
from transformers import TrainingArguments

# LoRA hyper-parameters, kept as separate names so they are easy to tune.

# rank: dimension of the LoRA update matrices. Author's rule of thumb: 4-32;
# a smaller rank means more compression.
rank = 5

# lora_alpha: scaling factor, i.e. how strongly the newly added low-rank
# updates modify the pretrained model's behaviour (higher = stronger
# adaptation). Author's rule of thumb is about 2 x rank, which would be 10
# for rank 5; 8 is used here.
lora_alpha = 8

# lora_dropout: dropout probability on the LoRA branch, to help prevent
# overfitting.
# NOTE(review): the original note gave 0.05-1 as the range; a value near 1
# would drop almost the whole branch, so a much smaller upper bound was
# presumably meant. TODO confirm.
lora_dropout = 0.05

peft_config = LoraConfig(
    task_type='CAUSAL_LM',        # task type for the model architecture
    target_modules='all-linear',  # which modules LoRA is applied to
    r=rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias='none',
)

# 2.
## Create Trainer
from trl import SFTTrainer
# Training hyper-parameters for a short demo run: one epoch, batch size 4 per
# device, outputs under misc/files/training_out/, no experiment tracker.
arguments = TrainingArguments(
    output_dir='misc/files/training_out/',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    report_to="none"
)

# The model being fine-tuned is the OPT checkpoint named by
# `config.base_model_name_or_path`, so the token ids it is fed must come from
# that checkpoint's own tokenizer. `tok` / `tok_data` were produced with
# 'bert-base-uncased', which has a different vocabulary and different
# special-token ids than the OPT model (embedding table of 50272 entries,
# padding_idx=1), so they are not reused for training.
sft_tok = AutoTokenizer.from_pretrained(config.base_model_name_or_path)


def sft_tokenize_fn(example):
    """Tokenize MRPC sentence pair(s) with the tokenizer that matches `model`.

    Reads the 'sentence1' and 'sentence2' entries of ``example`` and returns
    the tokenizer output for them, with truncation enabled and no padding.
    """
    return sft_tok(example['sentence1'], example['sentence2'], truncation=True)


# Re-tokenize the training split and drop the raw MRPC columns so that only
# tokenizer outputs reach the trainer.
sft_train_data = data['train'].map(
    sft_tokenize_fn,
    batched=True,
    remove_columns=data['train'].column_names,
)

# NOTE(review): `model` was earlier passed to PeftModel.from_pretrained, and
# its repr in this notebook shows lora.Linear layers. Applying `peft_config`
# on top of it therefore does not start from a clean base model. Consider
# passing a freshly loaded base model here instead; confirm the intended
# behaviour.
trainer = SFTTrainer(
    model=model,
    args=arguments,
    train_dataset=sft_train_data,
    peft_config=peft_config,
    # max_seq_length=max_seq_length,
    processing_class=sft_tok
)

# Training is not started in this cell; call trainer.train() to run it.

# 3.
## Save models
# model.save_pretrained("path/to/folder/")



# Merging Implementation

In [None]:
## After training a LoRA adapter, you can merge the adapter weights back into the base model:

import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

# 1. Load Base model

# Checkpoint id shared by the base model and its tokenizer, and the directory
# both artifacts are written to.
# NOTE(review): this directory differs from the TrainingArguments output_dir
# used earlier ('misc/files/training_out/'). Confirm that is intended.
base_model_id = 'openai/gpt-oss-20b'
merged_dir = 'files/training_out/'

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map='auto',
)

# 2. Wrap the base model with the trained adapter
# "path/to/adapter" is a placeholder for the saved adapter location.
# NOTE(review): torch_dtype is forwarded to PeftModel.from_pretrained as in
# the original; confirm the installed PEFT version makes use of it.
peft_model = PeftModel.from_pretrained(
    base_model,
    "path/to/adapter",
    torch_dtype=torch.float16,
)

# 3. Merge the adapter weights into the base model
merged_model = peft_model.merge_and_unload()

## Save both the merged model and the tokenizer into the same directory
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
for artifact in (merged_model, tokenizer):
    artifact.save_pretrained(merged_dir)