https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py

In [1]:
import os

import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import numpy as np
from tqdm.auto import tqdm
import pandas as pd
from peft import LoraConfig, get_peft_model, IA3Config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # max_memory=max_memory,
    # quantization_config=BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     llm_int8_threshold=6.0,
    #     llm_int8_has_fp16_weight=False,
    #     bnb_4bit_compute_dtype=torch.float16,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4",
    # ),
    # torch_dtype=torch.float16,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True,)



Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
MAX_LEN = 2000
import json
samples = json.load(open("../samples.json"))

# sample = samples[0]
# sample

## Helpers

In [4]:
# modified from https://github.dev/huggingface/evaluate/blob/8dfe05784099fb9af55b8e77793205a3b7c86465/measurements/perplexity/perplexity.py#L154

# from evaluate.measurements.perplexity import Perplexity
import evaluate
from evaluate import logging
from torch.nn import CrossEntropyLoss

# @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
def perplexity_compute(
    data, model, tokenizer, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
):

    if device is not None:
        assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
        if device == "gpu":
            device = "cuda"
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # model = AutoModelForCausalLM.from_pretrained(model_id)
    model = model.to(device)

    # tokenizer = AutoTokenizer.from_pretrained(model_id)

    # if batch_size > 1 (which generally leads to padding being required), and
    # if there is not an already assigned pad_token, assign an existing
    # special token to also be the padding token
    if tokenizer.pad_token is None and batch_size > 1:
        existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
        # check that the model already has at least one special token defined
        assert (
            len(existing_special_tokens) > 0
        ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
        # assign one of the special tokens to also be the pad token
        tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

    if add_start_token and max_length:
        # leave room for <BOS> token to be added:
        assert (
            tokenizer.bos_token is not None
        ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
        max_tokenized_len = max_length - 1
    else:
        max_tokenized_len = max_length

    encodings = tokenizer(
        data,
        add_special_tokens=False,
        padding=True,
        truncation=True if max_tokenized_len else False,
        max_length=max_tokenized_len,
        return_tensors="pt",
        return_attention_mask=True,
    ).to(device)

    encoded_texts = encodings["input_ids"]
    attn_masks = encodings["attention_mask"]

    # check that each input is long enough:
    if add_start_token:
        assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
    else:
        assert torch.all(
            torch.ge(attn_masks.sum(1), 2)
        ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

    ppls = []
    loss_fct = CrossEntropyLoss(reduction="none")

    for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
        end_index = min(start_index + batch_size, len(encoded_texts))
        encoded_batch = encoded_texts[start_index:end_index]
        attn_mask = attn_masks[start_index:end_index]

        if add_start_token:
            bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
            encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
            attn_mask = torch.cat(
                [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
            )

        labels = encoded_batch

        with torch.no_grad():
            out_logits = model(encoded_batch, attention_mask=attn_mask).logits

        shift_logits = out_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

        perplexity_batch = torch.exp(
            (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
            / shift_attention_mask_batch.sum(1)
        )

        ppls += perplexity_batch.tolist()

    return {"perplexities": ppls, "mean_perplexity": torch.tensor(ppls).mean()}

## Perplexity

In [5]:
# results = perplexity_compute(data=sample['text'], model=model, tokenizer=tokenizer, device='cuda')
# results['mean_perplexity']

## Learn

In [6]:
# """### Post-processing on the model

# Finally, we need to apply some post-processing on the 8-bit model to enable training, let's freeze all our layers, and cast the layer-norm in `float32` for stability. We also cast the output of the last layer in `float32` for the same reasons.
# """

# print(model)

# for param in model.parameters():
#     param.requires_grad = False  # freeze the model - train adapters later
#     if param.ndim == 1:
#         # cast the small parameters (e.g. layernorm) to fp32 for stability
#         param.data = param.data.to(torch.float32)

# # model.gradient_checkpointing_enable()  # reduce number of stored activations
# # model.model.decoder.project_in = lambda x: x.requires_grad_(True)


# class CastOutputToFloat(nn.Sequential):
#     def forward(self, x):
#         return super().forward(x).to(torch.float32)


# model.lm_head = CastOutputToFloat(model.lm_head)


## Training

In [7]:
from torch.nn import functional as F

In [32]:
from torch import optim


def lora_eval(model, tokenizer, sample):
    # reset/set adapter
    # peft_config = IA3Config(
    #     target_modules=[ "fc2",  "Wqkv",], 
    #         feedforward_modules=["fc2"],
    #         inference_mode=False,
    # )
    peft_config = LoraConfig(
        # task_type=TaskType.TOKEN_CLS, 
        target_modules=[ "fc2",  "Wqkv",],
        inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all"
    )
    model = get_peft_model(model, peft_config)
    model.config.use_cache = False

    # train adapter
    s = sample['text']
    first_half = s[:len(s)//2]
    second_half = s[len(s)//2:]
    input_ids = tokenizer(first_half, return_tensors="pt")["input_ids"][0].to('cuda')

    device = 'cuda'
    lr = 1.0e-2
    epochs = 3
    accum_steps = 64
    epoch_steps = (len(input_ids)-1)//accum_steps+1

    total_steps = epochs * epoch_steps
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer, lr, total_steps=total_steps
    )
    model.train()
    model = model.to(device)
    for epoch in range(epochs):
        # TODO: batch
        
        accum = 0
        for i in range(1, len(input_ids)):
            X = input_ids[:i][None, ]
            targets = input_ids[i:i+1][None, ]
            optimizer.zero_grad()
            out = model(input_ids=X, 
                        )
            logits = out['logits'][:, -1]
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            loss.backward()
            if accum > accum_steps:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                accum = 0
            else:
                accum += 1
        if accum > 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    # eval
    model.eval();
    with torch.no_grad():
        with model.disable_adapter():
            results = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')
            results['mean_perplexity']
        results2 = perplexity_compute(data=second_half, model=model, tokenizer=tokenizer, device='cuda')

    return dict(before=results['mean_perplexity'].item(), after=results2['mean_perplexity'].item())



In [33]:
data = []
for sample in tqdm(samples):
    r = lora_eval(model, tokenizer, sample)
    print(sample['name'], r)
    r.update(sample)
    data.append(r)


  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  6.42it/s]
100%|██████████| 1/1 [00:00<00:00,  6.36it/s]
  8%|▊         | 1/12 [01:21<14:54, 81.32s/it]

bad_ml {'before': 17.1319522857666, 'after': 17.076616287231445}


100%|██████████| 1/1 [00:00<00:00,  9.21it/s]
100%|██████████| 1/1 [00:00<00:00,  9.19it/s]
 17%|█▋        | 2/12 [01:43<07:45, 46.58s/it]

good_ml {'before': 48.654518127441406, 'after': 48.63978576660156}


100%|██████████| 1/1 [00:00<00:00,  6.00it/s]
100%|██████████| 1/1 [00:00<00:00,  6.20it/s]
 25%|██▌       | 3/12 [03:26<10:49, 72.18s/it]

sokal hoax {'before': 29.55867576599121, 'after': 29.561065673828125}


100%|██████████| 1/1 [00:00<00:00,  7.25it/s]
100%|██████████| 1/1 [00:00<00:00,  7.55it/s]
 33%|███▎      | 4/12 [04:10<08:10, 61.34s/it]

Theory o. general relativity {'before': 48.4825553894043, 'after': 48.46034622192383}


100%|██████████| 1/1 [00:00<00:00,  8.83it/s]
100%|██████████| 1/1 [00:00<00:00,  9.19it/s]
 42%|████▏     | 5/12 [04:29<05:21, 45.97s/it]

lorem ipsum  {'before': 243.0447540283203, 'after': 238.47674560546875}


100%|██████████| 1/1 [00:00<00:00,  8.78it/s]
100%|██████████| 1/1 [00:00<00:00,  8.88it/s]
 50%|█████     | 6/12 [05:05<04:14, 42.40s/it]

wikipedia on LK-99 {'before': 53.24197006225586, 'after': 53.270263671875}


100%|██████████| 1/1 [00:00<00:00,  8.77it/s]
100%|██████████| 1/1 [00:00<00:00,  8.92it/s]
 58%|█████▊    | 7/12 [05:30<03:03, 36.69s/it]

I have a dream {'before': 18.867136001586914, 'after': 18.801422119140625}


100%|██████████| 1/1 [00:00<00:00,  6.27it/s]
100%|██████████| 1/1 [00:00<00:00,  6.20it/s]
 67%|██████▋   | 8/12 [06:40<03:09, 47.34s/it]

AI gen fake paper {'before': 11.114971160888672, 'after': 11.109580039978027}


100%|██████████| 1/1 [00:00<00:00,  6.13it/s]
100%|██████████| 1/1 [00:00<00:00,  6.07it/s]
 75%|███████▌  | 9/12 [08:43<03:32, 70.94s/it]

Schmidhuber 2023 Subjective Novelty, Surprise {'before': 67.33682250976562, 'after': 67.20210266113281}


100%|██████████| 1/1 [00:00<00:00,  6.77it/s]
100%|██████████| 1/1 [00:00<00:00,  6.85it/s]
 83%|████████▎ | 10/12 [09:50<02:19, 69.77s/it]

email_to_fauci {'before': 55.9570198059082, 'after': 56.01524353027344}


100%|██████████| 1/1 [00:00<00:00, 27.48it/s]
100%|██████████| 1/1 [00:00<00:00, 24.11it/s]
 92%|█████████▏| 11/12 [10:02<00:52, 52.08s/it]

enron_email1 {'before': 59.76203536987305, 'after': 59.75802230834961}


100%|██████████| 1/1 [00:00<00:00,  8.55it/s]
100%|██████████| 1/1 [00:00<00:00,  8.91it/s]
100%|██████████| 12/12 [10:48<00:00, 54.00s/it]

openai_board_ann {'before': 30.923919677734375, 'after': 30.946474075317383}





In [34]:
print('perplexity (on 2nd half) before and after training adapter on first half of text')
df = pd.DataFrame(data).set_index('name')

df['learning'] = (df['before']-df['after'])/df['before']
df.sort_values('learning').drop(columns=['text', 'url'])

perplexity (on 2nd half) before and after training adapter on first half of text


Unnamed: 0_level_0,before,after,in_training,learning
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
email_to_fauci,55.95702,56.015244,False,-0.001041
openai_board_ann,30.92392,30.946474,False,-0.000729
wikipedia on LK-99,53.24197,53.270264,False,-0.000531
sokal hoax,29.558676,29.561066,True,-8.1e-05
enron_email1,59.762035,59.758022,True,6.7e-05
good_ml,48.654518,48.639786,False,0.000303
Theory o. general relativity,48.482555,48.460346,True,0.000458
AI gen fake paper,11.114971,11.10958,False,0.000485
"Schmidhuber 2023 Subjective Novelty, Surprise",67.336823,67.202103,False,0.002001
bad_ml,17.131952,17.076616,False,0.00323


## Result