https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py

In [1]:
from torch import optim
import lightning as pl
from matplotlib import pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
import warnings
from peft import LoraConfig, get_peft_model, IA3Config

In [3]:
# Use matplotlib's 'ggplot' style for the figures drawn in this notebook.
plt.style.use('ggplot')
# Set PyTorch's float32 matmul precision mode to 'medium'.
torch.set_float32_matmul_precision('medium')
# Hide warnings whose message matches the "does not have many workers" pattern.
warnings.filterwarnings("ignore", ".*does not have many workers.*")

In [4]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Hugging Face model id used by the model/tokenizer loading code below.
model_name = "microsoft/phi-2"

# Disabled alternative: load the model directly with a 4-bit (nf4, double-quant)
# bitsandbytes quantization config.
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     # max_memory=max_memory,
#     quantization_config=BitsAndBytesConfig(
#         load_in_4bit=True,
#         llm_int8_threshold=6.0,
#         llm_int8_has_fp16_weight=False,
#         bnb_4bit_compute_dtype=torch.float16,
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_quant_type="nf4",
#     ),
#     torch_dtype=torch.float16,
#     trust_remote_code=True,
# )






In [5]:
model_name = "microsoft/phi-2"


def load_model():
    """Load the causal LM identified by the module-level ``model_name``.

    No dtype or quantization arguments are passed, so the library defaults
    apply. ``trust_remote_code=True`` is forwarded to ``from_pretrained``.

    Returns:
        The object returned by ``AutoModelForCausalLM.from_pretrained``.
    """
    # A `torch_dtype=torch.float16` override is left commented out here.
    return AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)


In [6]:
# Tokenizer for `model_name`, loaded with the same trust_remote_code flag as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
import json

# Cap used by str2xya when sizing each training context.
MAX_LEN = 2000

# Open the file in a context manager so the handle is closed right after
# parsing instead of lingering until garbage collection. JSON is UTF-8 by
# specification, so the encoding is stated explicitly rather than inherited
# from the platform locale.
with open("../samples.json", encoding="utf-8") as samples_file:
    samples = json.load(samples_file)


## Helpers

In [8]:
# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154

# from evaluate.measurements.perplexity import Perplexity
import evaluate
from evaluate import logging
from torch.nn import CrossEntropyLoss

# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
def perplexity_compute(
    data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
):
    """Compute the perplexity of each text in ``data`` under ``model``.

    Adapted from the Hugging Face ``evaluate`` perplexity measurement so that
    an already-instantiated model and tokenizer are passed in instead of a
    model id.

    Args:
        data: Text or list of texts, passed straight to ``tokenizer``.
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``. It is moved to ``device``.
        tokenizer: Tokenizer used to encode ``data``. If it has no pad token
            and ``batch_size > 1``, its first existing special token is
            registered as the pad token (this mutates the tokenizer).
        batch_size: Number of encoded texts scored per forward pass.
        add_start_token: When true, ``tokenizer.bos_token_id`` is prepended to
            every sequence so the first real token is scored as well.
        device: ``"gpu"``/``"cuda"``/``"cpu"``, or ``None`` to use CUDA when
            available and CPU otherwise.
        max_length: Optional total length limit (including the prepended BOS
            token, if any). Falsy values disable truncation.

    Returns:
        dict with ``"perplexities"`` (one float per text) and
        ``"mean_perplexity"`` (a 0-d tensor holding their mean).

    Raises:
        AssertionError: For an unsupported ``device``, a missing special token
            needed for padding, a missing BOS token when
            ``add_start_token=True``, or input texts that are too short.
    """
    if device is not None:
        assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
        if device == "gpu":
            device = "cuda"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model = model.to(device)

    # if batch_size > 1 (which generally leads to padding being required), and
    # if there is not an already assigned pad_token, assign an existing
    # special token to also be the padding token
    if tokenizer.pad_token is None and batch_size > 1:
        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
        # check that the model already has at least one special token defined
        assert (
            len(existing_special_tokens) > 0
        ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
        # assign one of the special tokens to also be the pad token
        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

    if add_start_token:
        # The BOS id is prepended to every batch further down, so it is needed
        # whether or not max_length is given. Checking it only together with
        # max_length let a BOS-less tokenizer fail later with an obscure error.
        assert (
            tokenizer.bos_token is not None
        ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"

    if add_start_token and max_length:
        # leave room for <BOS> token to be added:
        max_tokenized_len = max_length - 1
    else:
        max_tokenized_len = max_length

    encodings = tokenizer(
        data,
        add_special_tokens=False,
        padding=True,
        truncation=bool(max_tokenized_len),
        max_length=max_tokenized_len,
        return_tensors="pt",
        return_attention_mask=True,
    ).to(device)

    encoded_texts = encodings["input_ids"]
    attn_masks = encodings["attention_mask"]

    # check that each input is long enough:
    if add_start_token:
        assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
    else:
        assert torch.all(
            torch.ge(attn_masks.sum(1), 2)
        ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

    ppls = []
    # Per-token losses are needed so padded positions can be masked out below.
    loss_fct = CrossEntropyLoss(reduction="none")

    for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
        end_index = min(start_index + batch_size, len(encoded_texts))
        encoded_batch = encoded_texts[start_index:end_index]
        attn_mask = attn_masks[start_index:end_index]

        if add_start_token:
            bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
            encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
            attn_mask = torch.cat(
                [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
            )

        labels = encoded_batch

        with torch.no_grad():
            out_logits = model(encoded_batch, attention_mask=attn_mask).logits

        # Position t predicts token t+1: drop the last logit and the first label.
        shift_logits = out_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

        # exp(mean masked token loss) per sequence.
        perplexity_batch = torch.exp(
            (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
            / shift_attention_mask_batch.sum(1)
        )

        ppls += perplexity_batch.tolist()

    return {"perplexities": ppls, "mean_perplexity": torch.tensor(ppls).mean()}

## Training

In [9]:
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset

## Lightning helpers

In [10]:
# Take the first sample and cut its text at the character midpoint:
# the first half is used for training, the second half for evaluation.
sample = samples[0]
s = sample['text']
midpoint = len(s) // 2
first_half, second_half = s[:midpoint], s[midpoint:]



def str2xya(s, tokenizer, max_len=None):
    """Turn one text into left-padded next-token-prediction examples.

    The text is tokenized once; for every position ``i >= 1`` one example is
    produced whose input is the (at most ``width``) tokens preceding position
    ``i`` and whose target is the token at position ``i``.

    Args:
        s: Text to convert.
        tokenizer: Callable tokenizer returning ``{"input_ids": tensor}`` for
            ``return_tensors="pt"``; its ``bos_token_id`` is used as the
            left-padding value.
        max_len: Maximum number of context tokens per example. Defaults to the
            module-level ``MAX_LEN``.

    Returns:
        ``(Xs, Ys, attention_masks)``: integer tensors of shape
        ``(n, width)``, ``(n, 1)`` and ``(n, width)``, where ``n`` is the
        number of tokens minus one and ``width = min(max_len, n)``. The mask
        is 1 on real tokens and 0 on padding. Texts with fewer than two
        tokens yield empty tensors.

    Raises:
        ValueError: If the effective ``max_len`` is smaller than 1.
    """
    limit = MAX_LEN if max_len is None else max_len
    if limit < 1:
        raise ValueError("max_len must be at least 1")

    input_ids = tokenizer(s, return_tensors="pt")["input_ids"][0].tolist()
    pad = tokenizer.bos_token_id

    n_examples = len(input_ids) - 1
    if n_examples < 1:
        # Nothing to predict; return well-typed empty tensors instead of
        # failing while stacking an empty list.
        return (
            torch.empty((0, 0), dtype=torch.long),
            torch.empty((0, 1), dtype=torch.long),
            torch.empty((0, 0), dtype=torch.long),
        )

    # Size the window in tokens (the longest context has n_examples tokens),
    # not in characters of `s`, which over- or under-estimates the token count.
    width = min(limit, n_examples)

    Xs, Ys, masks = [], [], []
    for i in range(1, len(input_ids)):
        context = input_ids[:i][-width:]
        n_pad = width - len(context)
        Xs.append([pad] * n_pad + context)
        # Build the mask from the padding count rather than comparing against
        # the pad id: 1 marks real tokens, 0 marks padding, and a genuine
        # token equal to the pad id is still attended to.
        masks.append([0] * n_pad + [1] * len(context))
        Ys.append(input_ids[i:i + 1])

    return torch.tensor(Xs), torch.tensor(Ys), torch.tensor(masks)



In [11]:
def eval(model, tokenizer, second_half):
    """Score ``second_half`` with the adapter disabled and then enabled.

    Args:
        model: Adapter-wrapped model exposing ``eval()`` and the
            ``disable_adapter()`` context manager.
        tokenizer: Tokenizer forwarded to ``perplexity_compute``.
        second_half: Held-out text forwarded to ``perplexity_compute``.

    Returns:
        dict with ``"before"`` (mean perplexity with the adapter disabled) and
        ``"after"`` (mean perplexity with the adapter active), both floats.
    """

    def _mean_ppl():
        stats = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')
        return stats['mean_perplexity'].item()

    model.eval()
    with torch.no_grad():
        with model.disable_adapter():
            ppl_without_adapter = _mean_ppl()
        ppl_with_adapter = _mean_ppl()
    return {"before": ppl_without_adapter, "after": ppl_with_adapter}

def read_metrics_csv(metrics_file_path):
    """Load a metrics CSV and also aggregate it per epoch.

    Args:
        metrics_file_path: Path (or file-like object) accepted by
            ``pd.read_csv``; the data must contain an ``epoch`` column.

    Returns:
        ``(per_epoch, per_row)``: the column means grouped by epoch (indexed
        by ``epoch``), and the raw frame with gaps in ``epoch`` forward-filled.
    """
    per_row = pd.read_csv(metrics_file_path)
    # Rows without an epoch value inherit the epoch of the row above them.
    per_row["epoch"] = per_row["epoch"].ffill()
    indexed = per_row.set_index("epoch")
    per_epoch = indexed.groupby("epoch").mean()
    return per_epoch, per_row


def plot_hist(df_hist, allowlist=None, logy=False):
    """Plot metric columns grouped by the part of their name after the last '/'.

    Columns named like ``"train/loss"`` and ``"val/loss"`` share the suffix
    ``"loss"`` and are drawn together in one figure. Columns without a ``'/'``
    are ignored.

    Args:
        df_hist: DataFrame whose column names are strings.
        allowlist: Optional collection of suffixes; when given and non-empty,
            only those suffixes are plotted.
        logy: Forwarded to ``DataFrame.plot`` to select a log-scaled y axis.

    Returns:
        None. One figure per plotted suffix is shown via ``plt.show()``.
    """
    # Group by the exact suffix. A plain `endswith(suffix)` test would also
    # pull e.g. "train/aux_loss" into the "loss" figure.
    columns_by_suffix = {}
    for column in df_hist.columns:
        if '/' in column:
            columns_by_suffix.setdefault(column.split('/')[-1], []).append(column)

    # Sorted so figures appear in the same order on every run (set iteration
    # order of strings is not stable across interpreter sessions).
    for suffix in sorted(columns_by_suffix):
        if allowlist and suffix not in allowlist:
            continue
        df_hist[columns_by_suffix[suffix]].plot(title=suffix, style='.', logy=logy)
        plt.title(suffix)
        plt.show()

In [17]:
import bitsandbytes as bnb

class PL_MODEL(pl.LightningModule):
    """LightningModule that trains a LoRA adapter on next-token prediction.

    Batches are ``(input_ids, targets, attention_mask)`` triples; the logits
    at the last input position are trained to predict ``targets``.

    Args:
        num_iterations: Total number of scheduler steps for the OneCycle
            learning-rate schedule.
        lr: Optimizer learning rate and peak learning rate of the schedule.
        weight_decay: Weight decay passed to the optimizer.
    """

    def __init__(self, num_iterations, lr=3e-4, weight_decay=0,):
        super().__init__()
        self.save_hyperparameters()
        self.configure_model()

    def configure_model(self):
        """Load the base model and wrap it with a LoRA adapter (idempotent)."""
        # This method is called from __init__ and is also a Lightning hook
        # that the Trainer invokes itself; without this guard the base model
        # would be loaded and wrapped a second time.
        if hasattr(self, "model"):
            return
        peft_config = LoraConfig(
            # task_type=TaskType.TOKEN_CLS,
            target_modules=["fc2", "Wqkv"],
            inference_mode=False, r=16, lora_alpha=16,
            # lora_dropout=0.1,
            # bias="all"
        )
        self.model = load_model()
        self.model = get_peft_model(self.model, peft_config)
        self.model.config.use_cache = False

    def forward(self, **kwargs):
        """Forward all keyword arguments to the wrapped model."""
        return self.model(**kwargs)

    def _shared_step(self, batch, batch_idx, phase='train'):
        """Compute and log the next-token loss for one batch."""
        input_ids, targets, attention_mask = batch
        output = self.forward(input_ids=input_ids, attention_mask=attention_mask)
        # Logits at the last position are scores over the vocabulary and the
        # targets are token ids, so this is a classification problem:
        # cross-entropy, not a regression loss between logits and ids.
        next_token_logits = output.logits[:, -1]
        loss = F.cross_entropy(
            next_token_logits.view(-1, next_token_logits.size(-1)).float(), targets.view(-1)
        )
        self.log(f"{phase}/loss", loss, on_epoch=True, on_step=True, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, batch_idx, phase='train')

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, batch_idx, phase='val')

    def test_step(self, batch, batch_idx, dataloader_idx=0):
        return self._shared_step(batch, batch_idx, phase='test')

    def configure_optimizers(self):
        """Build the optimizer and a OneCycle schedule stepped per optimizer step."""
        # optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)

        optimizer = bnb.optim.AdamW4bit(
            self.parameters(),
            lr=self.hparams.lr,
            betas=(0.9, 0.995),
            # Honour the hyperparameter instead of silently using the
            # optimizer's own default.
            weight_decay=self.hparams.weight_decay,
        )
        lr_scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer, self.hparams.lr, total_steps=self.hparams.num_iterations
        )
        # OneCycleLR's total_steps counts optimizer steps; Lightning's default
        # scheduler interval is "epoch", which would barely advance it.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": lr_scheduler, "interval": "step"},
        }

## Train

In [18]:

    



import math

from lightning.pytorch.plugins import BitsandbytesPrecision

device = 'cuda'
lr = 4e-3
epochs = 3
accum_steps = 16
batch_size = 2

# Next-token datasets: train on the first half of the text, validate on the second.
Xs, Ys, attention_masks = str2xya(first_half, tokenizer)
dl_train = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=True)
Xs, Ys, attention_masks = str2xya(second_half, tokenizer)
dl_val = DataLoader(TensorDataset(Xs, Ys, attention_masks), batch_size=batch_size, shuffle=False)

# Optimizer steps per epoch: with gradient accumulation one step is taken every
# `accum_steps` batches (plus one for a trailing partial group), so the LR
# schedule is sized in optimizer steps rather than in batches.
epoch_steps = max(1, math.ceil(len(dl_train) / accum_steps))

pl_model = PL_MODEL(num_iterations=epoch_steps*epochs, lr=lr, weight_decay=0)

# NOTE(review): the recorded run of this cell failed with
# "TypeError: Linear4bit.__init__() got an unexpected keyword argument 'dtype'".
# This looks like a version mismatch between this precision plugin and the
# installed bitsandbytes - confirm before relying on it.
precision = BitsandbytesPrecision(mode="nf4-dq")
# precision = BitsandbytesPrecision(mode="int8-training", dtype=torch.float16, ignore_modules={"lm_head"})
trainer = pl.Trainer(
        accelerator='gpu',
        max_epochs=epochs,
        # precision='',
        # precision="bf16-mixed",
        log_every_n_steps=1,
        accumulate_grad_batches=accum_steps,
        plugins=precision
    )

# train
trainer.fit(pl_model, dl_train, dl_val)

model = pl_model.model

# read_metrics_csv returns a (per-epoch, per-row) tuple, so unpack first and
# only then fill the gaps; calling .bfill() on the tuple raises AttributeError.
df_histe, df_hist = read_metrics_csv(trainer.logger.experiment.metrics_file_path)
# Each logged row only carries some metrics; fill the gaps for display/plotting.
df_hist = df_hist.bfill().ffill()
display(df_hist)
plot_hist(df_hist)

eval(model, tokenizer, second_half)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.10it/s]
Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TypeError: Linear4bit.__init__() got an unexpected keyword argument 'dtype'

In [None]:
# Raises ZeroDivisionError on purpose - presumably to stop a "run all" before
# the old cells below are executed (TODO confirm intent).
1/0

# Old

In [None]:
from torch import optim


def lora_eval(model, tokenizer, sample):
    """Train a LoRA adapter on the first half of a text, then score the second.

    The sample's text is cut at its character midpoint. The adapter is trained
    with SGD on next-token prediction over the first half, one prefix at a
    time, accumulating gradients over groups of ``accum_steps`` prefixes.

    Args:
        model: Base model handed to ``get_peft_model``.
        tokenizer: Tokenizer matching ``model``.
        sample: Mapping with a ``'text'`` entry.

    Returns:
        The dict returned by ``eval`` for the second half of the text
        (``"before"``/``"after"`` mean perplexities).
    """
    # An IA3Config(target_modules=["fc2", "Wqkv"], feedforward_modules=["fc2"],
    # inference_mode=False) variant was tried here and is disabled.
    peft_config = LoraConfig(
        # task_type=TaskType.TOKEN_CLS,
        target_modules=["fc2", "Wqkv"],
        inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
    )
    # NOTE(review): get_peft_model appears to inject the adapter into the
    # passed model in place, so calling this repeatedly on the same `model`
    # may stack adapters instead of resetting them - confirm against PEFT.
    model = get_peft_model(model, peft_config)
    model.config.use_cache = False

    # train adapter
    s = sample['text']
    first_half = s[:len(s)//2]
    second_half = s[len(s)//2:]

    device = 'cuda'
    lr = 1.0e-2
    epochs = 3
    accum_steps = 64

    input_ids = tokenizer(first_half, return_tensors="pt")["input_ids"][0].to(device)

    # Target positions, cut into groups that each produce one optimizer step.
    positions = range(1, len(input_ids))
    groups = [positions[k:k + accum_steps] for k in range(0, len(positions), accum_steps)]

    # The schedule length equals the number of optimizer steps actually taken
    # (at least 1, because OneCycleLR rejects a non-positive total).
    total_steps = max(1, epochs * len(groups))
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer, lr, total_steps=total_steps
    )
    model.train()
    model = model.to(device)
    for epoch in range(epochs):
        # TODO: batch the prefixes instead of one forward pass per prefix
        for group in groups:
            # Clear gradients once per group. Clearing before every forward
            # pass discards everything accumulated so far.
            optimizer.zero_grad()
            for i in group:
                X = input_ids[:i][None, ]
                targets = input_ids[i:i+1][None, ]
                logits = model(input_ids=X)['logits'][:, -1]
                loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
                # Average over the group so the step size does not grow with
                # the number of accumulated samples.
                (loss / len(group)).backward()
            optimizer.step()
            scheduler.step()

    return eval(model, tokenizer, second_half)



In [None]:
# Run the adapter experiment once per sample and collect one result row each.
data = []
for sample in tqdm(samples):
    # r is the dict returned by lora_eval for this sample.
    r = lora_eval(model, tokenizer, sample)
    print(sample['name'], r)
    # Merge the sample's own fields into the row (sample values win on key clashes).
    r.update(sample)
    data.append(r)


In [None]:
print('perplexity (on 2nd half) before and after training adapter on first half of text')
# One row per collected result, indexed by the 'name' field.
df = pd.DataFrame(data).set_index('name')

# Relative perplexity reduction: positive when 'after' is lower than 'before'.
df['learning'] = (df['before']-df['after'])/df['before']
# Show rows in ascending order of 'learning', without the 'text' and 'url' columns.
df.sort_values('learning').drop(columns=['text', 'url'])

## Result