https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py

In [1]:
import os

import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import numpy as np
import pandas as pd
from peft import LoraConfig, get_peft_model, IA3Config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Hub checkpoint shared by the model and the tokenizer.
model_name = "microsoft/phi-2"

# Only `trust_remote_code` is passed; the following loading options are
# currently disabled and kept for reference:
#     max_memory=max_memory,
#     quantization_config=BitsAndBytesConfig(
#         load_in_4bit=True,
#         llm_int8_threshold=6.0,
#         llm_int8_has_fp16_weight=False,
#         bnb_4bit_compute_dtype=torch.float16,
#         bnb_4bit_use_double_quant=True,
#         bnb_4bit_quant_type="nf4",
#     ),
#     torch_dtype=torch.float16,
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)



In [None]:
# NOTE(review): MAX_LEN is not referenced by any of the code visible in this
# file — confirm whether it is still needed (e.g. as a `max_length` argument).
MAX_LEN = 2000
import json

# Use a context manager so the file handle is closed deterministically, and
# pin the encoding so the load does not depend on the platform locale.
with open("../samples.json", encoding="utf-8") as samples_file:
    samples = json.load(samples_file)

# sample = samples[0]
# sample

## Helpers

In [None]:
# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154

# from evaluate.measurements.perplexity import Perplexity
import evaluate
from evaluate import logging
from torch.nn import CrossEntropyLoss

# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
def perplexity_compute(
    data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
):
    """Compute per-text perplexity of ``data`` under an already-loaded causal LM.

    Unlike the upstream ``evaluate`` measurement this was adapted from, the
    model and tokenizer are passed in rather than loaded from a model id.

    Args:
        data: Text (or list of texts) passed unchanged to ``tokenizer``.
        model: Called as ``model(input_ids, attention_mask=...)``; the result
            must expose ``.logits``. The model is moved to ``device``.
        tokenizer: Tokenizer used to encode ``data``. If it has no pad token
            and ``batch_size > 1``, one of its existing special tokens is
            registered as the pad token (this mutates the tokenizer).
        batch_size: Number of encoded rows scored per forward pass.
        add_start_token: When True, the tokenizer's BOS token is prepended to
            every row before scoring.
        device: ``"gpu"``, ``"cuda"``, ``"cpu"``, or ``None`` to pick CUDA when
            available and CPU otherwise. ``"gpu"`` is treated as ``"cuda"``.
        max_length: Optional truncation length. When ``add_start_token`` is
            True one position is reserved for the BOS token.

    Returns:
        dict with ``"perplexities"`` (list with one float per encoded row) and
        ``"mean_perplexity"`` (their mean, as a 0-dim ``torch`` tensor).

    Raises:
        AssertionError: on an unsupported ``device``, when padding is needed
            but the tokenizer has no special token to pad with, when
            ``add_start_token=True`` but the tokenizer has no BOS token, or
            when an input is too short to score.
    """

    if device is not None:
        assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
        if device == "gpu":
            device = "cuda"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model = model.to(device)

    # if batch_size > 1 (which generally leads to padding being required), and
    # if there is not an already assigned pad_token, assign an existing
    # special token to also be the padding token
    if tokenizer.pad_token is None and batch_size > 1:
        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
        # check that the model already has at least one special token defined
        assert (
            len(existing_special_tokens) > 0
        ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
        # assign one of the special tokens to also be the pad token
        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

    if add_start_token:
        # Validate up front regardless of max_length: the BOS id is used in
        # the batch loop below, and a missing BOS token would otherwise only
        # surface there as an opaque tensor-construction error.
        assert (
            tokenizer.bos_token is not None
        ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"

    if add_start_token and max_length:
        # leave room for <BOS> token to be added:
        max_tokenized_len = max_length - 1
    else:
        max_tokenized_len = max_length

    encodings = tokenizer(
        data,
        add_special_tokens=False,
        padding=True,
        truncation=True if max_tokenized_len else False,
        max_length=max_tokenized_len,
        return_tensors="pt",
        return_attention_mask=True,
    ).to(device)

    encoded_texts = encodings["input_ids"]
    attn_masks = encodings["attention_mask"]

    # check that each input is long enough:
    if add_start_token:
        assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
    else:
        assert torch.all(
            torch.ge(attn_masks.sum(1), 2)
        ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

    ppls = []
    # reduction="none" keeps one loss per token so padding can be masked out.
    loss_fct = CrossEntropyLoss(reduction="none")

    for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
        end_index = min(start_index + batch_size, len(encoded_texts))
        encoded_batch = encoded_texts[start_index:end_index]
        attn_mask = attn_masks[start_index:end_index]

        if add_start_token:
            rows = encoded_batch.size(dim=0)
            # Build the BOS column (and its all-ones mask column) directly
            # with the batch's dtype and device.
            bos_tokens_tensor = encoded_batch.new_full((rows, 1), tokenizer.bos_token_id)
            encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
            attn_mask = torch.cat([attn_mask.new_ones((rows, 1)), attn_mask], dim=1)

        labels = encoded_batch

        with torch.no_grad():
            out_logits = model(encoded_batch, attention_mask=attn_mask).logits

        # Position t predicts token t+1: drop the last logit and the first label.
        shift_logits = out_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

        # exp(mean token loss over non-padded positions), one value per row.
        perplexity_batch = torch.exp(
            (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
            / shift_attention_mask_batch.sum(1)
        )

        ppls += perplexity_batch.tolist()

    return {"perplexities": ppls, "mean_perplexity": torch.tensor(ppls).mean()}

## Perplexity

In [None]:
# results = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')
# results['mean_perplexity']

## Learn

In [None]:
"""### Post-processing on the model

Finally, we need to apply some post-processing on the 8-bit model to enable training, let's freeze all our layers, and cast the layer-norm in `float32` for stability. We also cast the output of the last layer in `float32` for the same reasons.
"""

print(model)

# Freeze every existing weight; only adapters added later are meant to train.
for weight in model.parameters():
    weight.requires_grad = False
    if weight.ndim != 1:
        continue
    # 1-D parameters (e.g. layernorm) are kept in fp32 for stability.
    weight.data = weight.data.to(torch.float32)

# model.gradient_checkpointing_enable()  # reduce number of stored activations
# model.model.decoder.project_in = lambda x: x.requires_grad_(True)


class CastOutputToFloat(nn.Sequential):
    """``nn.Sequential`` variant whose forward result is returned as float32."""

    def forward(self, x):
        # Run the wrapped modules as usual, then cast the result to fp32.
        result = super().forward(x)
        return result.float()


# Wrap the existing output head so that whatever it produces is cast to
# float32 by CastOutputToFloat.forward.
model.lm_head = CastOutputToFloat(model.lm_head)


In [None]:
# # Verifying the datatypes.
# dtypes = {}
# for _, p in model.named_parameters():
#     dtype = p.dtype
#     if dtype not in dtypes:
#         dtypes[dtype] = 0
#     dtypes[dtype] += p.numel()
# total = 0
# for k, v in dtypes.items():
#     total += v
# for k, v in dtypes.items():
#     print(k, v, v / total)

In [None]:
# sample['text']

In [None]:
"""### Training"""
# from datasets import Dataset

# data = load_dataset("Abirate/english_quotes")
# data = Dataset.from_dict({"text": [sample['text'][:len(sample['text'])//2]]*100})
# data = data.map(lambda samples: tokenizer(samples["text"]), batched=True).with_format("torch")
# data

In [None]:
from torch.nn import functional as F

In [None]:
def lora_eval(model, sample, device='cuda'):
    """Train an IA3 adapter on the first half of a text and score the second half.

    The adapter is trained token by token: for every position ``i`` the model
    sees the prefix ``input_ids[:i]`` and takes one Adam step on the
    cross-entropy of predicting token ``i``. Perplexity of the second half is
    then measured with the adapter disabled ("before") and enabled ("after").

    Args:
        model: Base model to wrap with ``get_peft_model``.
        sample: Mapping with a ``'text'`` entry holding the string to split.
        device: Device used for training and evaluation. It is also forwarded
            to ``perplexity_compute``, which accepts ``"cuda"`` or ``"cpu"``
            (``"gpu"`` is not valid here because it is passed to ``.to()``).

    Returns:
        dict with ``before`` and ``after``: the ``mean_perplexity`` values
        returned by ``perplexity_compute`` without and with the adapter.

    Relies on the module-level ``tokenizer``.
    """
    # NOTE(review): get_peft_model is applied to the same base model on every
    # call. Whether this cleanly resets the adapter or layers a new one onto
    # already-injected modules depends on the installed peft version — confirm.
    peft_config = IA3Config(
        target_modules=["fc2", "Wqkv"],
        feedforward_modules=["fc2"],
        inference_mode=False,
    )
    model = get_peft_model(model, peft_config)
    model.config.use_cache = False

    # Split the raw string (by characters, not tokens) into train / eval halves.
    text = sample['text']
    midpoint = len(text) // 2
    first_half = text[:midpoint]
    second_half = text[midpoint:]

    # Move the model first: the optimizer should be constructed from the
    # parameters as they live on the target device.
    model = model.to(device)
    input_ids = tokenizer(first_half, return_tensors="pt")["input_ids"][0].to(device)

    # Only hand the optimizer parameters that can actually receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=1e-3)

    # train adapter
    model.train()
    for _epoch in range(1):
        for i in range(1, len(input_ids)):
            prefix = input_ids[:i][None, ]
            targets = input_ids[i:i+1][None, ]
            optimizer.zero_grad()
            out = model(input_ids=prefix)
            # Only the prediction at the last prefix position is trained on.
            logits = out['logits'][:, -1]
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            loss.backward()
            optimizer.step()

    # eval
    model.eval()
    with torch.no_grad():
        with model.disable_adapter():
            results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)
        results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device=device)

    return dict(before=results['mean_perplexity'], after=results2['mean_perplexity'])



In [None]:
# Run the adapter experiment on every sample and collect one record per sample.
data = []
for sample in samples:
    r = lora_eval(model, sample)
    # Merge the sample's own fields into the record; on a key clash the
    # sample's value overwrites 'before' / 'after'.
    r.update(sample)
    data.append(r)
    # The leftover `1/0` debugging stop was removed: it raised
    # ZeroDivisionError after the first sample, so this print never ran and
    # no further samples were processed.
    print(data[-1])

In [None]:
# Tabulate the collected per-sample records (one row per entry of `data`).
print('perplexity (on 2nd half) before and after training adapter on first half of text')
df = pd.DataFrame(data)
# Bare expression as the last statement of the cell, so the notebook renders the frame.
df

## Result