Experiment: use LoRA to fine-tune a lying model. Here we think of the LoRA adapter as a probe, since it acts in a very similar way: by modifying the residual stream.

The hope is that it will then assist with lie detection and generalize to unseen datasets.

- https://github.dev/JD-P/minihf/blob/b54075c34ef88d9550e37fdf709e78e5a68787c4/lora_tune.py
- https://github.com/jonkrohn/NLP-with-LLMs

In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"]= "1"


In [2]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

plt.style.use("ggplot")

from typing import Optional, List, Dict, Union
from jaxtyping import Float
from torch import Tensor

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import Tensor
from torch import optim
from torch.utils.data import random_split, DataLoader, TensorDataset

from pathlib import Path
from einops import rearrange

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, LoftQConfig, IA3Config

import datasets
from datasets import Dataset

from loguru import logger

# Send loguru output to stderr at INFO level with a compact format.
# loguru pre-installs a default stderr handler; remove existing handlers first so
# each message is emitted once, and so re-running this cell does not stack
# another handler on top of the previous ones.
import sys  # use sys.stderr directly rather than the undocumented `os.sys` attribute

logger.remove()
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")


# # quiet please
torch.set_float32_matmul_precision("medium")
import warnings
warnings.filterwarnings("ignore", ".*does not have many workers.*")
# warnings.filterwarnings(
#     "ignore", ".*sampler has shuffling enabled, it is strongly recommended that.*"
# )
# warnings.filterwarnings("ignore", ".*has been removed as a dependency of.*")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load my code
%load_ext autoreload
%autoreload 2

from src.config import ExtractConfig
from src.prompts.prompt_loading import load_preproc_dataset
from src.models.load import load_model


## Parameters


In [4]:
# params
max_epochs = 1  # passed to pl.Trainer(max_epochs=...) and used to compute the finetuner's total_steps
device = "cuda:0"  # device handed to load_model

cfg = ExtractConfig(
    batch_size=3,  # batch size of the training data module (the imdb loaders use 3x this)
    max_examples=(400, 150),  # summed to get N, the number of examples requested from load_preproc_dataset
    intervention_fit_examples=60,  # used below as an offset when picking the row to generate from
)


## Load model

In [5]:
# Load the configured base model and its tokenizer onto `device`.
model, tokenizer = load_model(cfg.model, device=device)


A new version of the following files was downloaded from https://huggingface.co/wassname/phi-1_5-w_hidden_states:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A new version of the following files was downloaded from https://huggingface.co/wassname/phi-1_5-w_hidden_states:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [6]:
# TODO I would like to only have biases, but for now let's just try a very small intervention on the last parts of a layer...
# Rank-4 LoRA adapters, dropout disabled, applied only to the two projections listed below.
peft_config = LoraConfig(
    target_modules=[
        "out_proj",
        "mlp.fc2",
    ],  # only the layers that go directly to the residual
    # bias="lora_only",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,
    lora_alpha=1,
    lora_dropout=0.0,
)


# Alternative adapter type (IA3), kept for reference but not used:
# peft_config = IA3Config(
#     task_type=TaskType.SEQ_CLS, target_modules=[ "out_proj",
#         "mlp.fc2",], feedforward_modules=["out_proj", "mlp.fc2",]
# )
# Wrap the base model with the adapters; from here on `model` is the PEFT-wrapped model.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


trainable params: 1,376,256 || all params: 1,419,646,976 || trainable%: 0.09694353760240743


In [7]:
# Tokenised training prompts from amazon_polarity; N is the total of cfg.max_examples.
ds_name = "amazon_polarity"
N = sum(cfg.max_examples)
ds_tokens = load_preproc_dataset(
    ds_name,
    tokenizer,
    N=N,
    seed=cfg.seed,
    num_shots=cfg.num_shots,
    max_length=cfg.max_length,
    prompt_format=cfg.prompt_format,
)
# Serve rows in torch format.
ds_tokens = ds_tokens.with_format("torch")


format_prompt:   0%|          | 0/1652 [00:00<?, ? examples/s]Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_

## LoRA training

In [8]:
# from https://github.com/jonkrohn/NLP-with-LLMs/blob/main/code/Finetune-T5-on-GPU.ipynb
import lightning.pytorch as pl


In [9]:
from src.datasets.dm import DeceptionDataModule


In [10]:

from src.models.pl_lora_ft import AtapterFinetuner


## Train

In [11]:
# Wrap the tokenised dataset in the project's data module; it supplies the train/val dataloaders used below.
dm = DeceptionDataModule(ds_tokens, batch_size=cfg.batch_size)
dm


<src.datasets.dm.DeceptionDataModule at 0x7fa8f8e73990>

In [12]:
# Training and validation loaders from the data module.
dl_train, dl_val = dm.train_dataloader(), dm.val_dataloader()


In [13]:
# Pull one training batch to inspect its fields and the token sequence length.
b = next(iter(dl_train))
print(b.keys(), b["input_ids"].shape)
c_in = b["input_ids"].size(1)  # second dimension of input_ids
c_in


dict_keys(['ds_string', 'example_i', 'answer', 'messages', 'answer_choices', 'template_name', 'label_true', 'label_instructed', 'instructed_to_lie', 'sys_instr_name', 'question', 'input_ids', 'attention_mask', 'truncated', 'length', 'prompt_truncated', 'choice_ids']) torch.Size([3, 777])


777

In [14]:
# Lightning module that fine-tunes the adapter-wrapped model.
# total_steps = batches per epoch * number of epochs (presumably consumed by the LR schedule — confirm in pl_lora_ft).
net = AtapterFinetuner(
    model, tokenizer, lr=5e-5, weight_decay=0, total_steps=len(dl_train) * max_epochs
)

print(c_in)
# net.model.enable_adapters()


777


In [15]:
# # debug
# with torch.no_grad():
#     o = net.training_step(b, None)
# o


In [16]:
# # debug
# with torch.no_grad():
#     o = net.predict_step(b, None)
# # o


In [17]:
# we want to init lightning early, so it inits accelerate
trainer1 = pl.Trainer(
    # precision="16-true", # works?
    # precision="16-mixed",
    # precision="bf16-mixed",
    gradient_clip_val=20,  # gradient clipping threshold
    # accelerator="auto",
    # devices="1",
    accelerator="gpu",
    devices=[0],  # GPU index 0, matching device = "cuda:0" in the params cell
    # accumulate_grad_batches=2,
    max_epochs=max_epochs,
    log_every_n_steps=1,  # log metrics on every step
    # enable_progress_bar=False,
    enable_model_summary=False,
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/media/wassname/SGIronWolf/projects5/elk/sgd_probes_are_lie_detectors/.venv/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [18]:
# Fine-tune for max_epochs; the trailing semicolon suppresses the cell's return-value output.
trainer1.fit(model=net, train_dataloaders=dl_train, val_dataloaders=dl_val);


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Epoch 0: 100%|██████████| 92/92 [01:43<00:00,  0.89it/s, v_num=101]        

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 92/92 [01:44<00:00,  0.88it/s, v_num=101]


In [19]:
# Save the PEFT-wrapped model under <trainer log dir>/final; this path is later passed to load_model as adaptor_path.
checkpoint_path = Path(trainer1.log_dir)/'final'
model.save_pretrained(checkpoint_path)


In [20]:
from src.helpers.lightning import read_metrics_csv

# Show the logged metrics. Gaps in the CSV are filled backward, then forward, for display only;
# presumably each row carries just the metrics logged at that step, so values repeated down a
# column are fill artefacts rather than separate measurements.
pd.read_csv(trainer1.logger.experiment.metrics_file_path).bfill().ffill()


Unnamed: 0,step,train/loss_step,epoch,val/loss_step,val/loss_epoch,train/loss_epoch
0,0,3.659522,0.0,1.790487,3.465001,3.343624
1,1,4.253745,0.0,1.790487,3.465001,3.343624
2,2,8.194761,0.0,1.790487,3.465001,3.343624
3,3,2.753622,0.0,1.790487,3.465001,3.343624
4,4,3.210605,0.0,1.790487,3.465001,3.343624
...,...,...,...,...,...,...
135,43,3.415613,0.0,1.579672,3.465001,3.343624
136,44,3.415613,0.0,0.650827,3.465001,3.343624
137,45,3.415613,0.0,4.310611,3.465001,3.343624
138,91,3.415613,0.0,4.310611,3.465001,3.343624


In [21]:
# df_hist_epoch, df_hist_step = read_metrics_csv(trainer1.logger.experiment.metrics_file_path)
# # for key in ["loss"]:
# #     df_hist_epoch[[c for c in df_hist_epoch.columns if key in c]].plot(logy=True)

# df_hist_step = df_hist_step.dropna()
# df_hist_step.plot()
# df_hist_step


## Generate


In [22]:
# get a row
# Row index is 4 past cfg.intervention_fit_examples.
# NOTE(review): the reason for this offset is not visible here — confirm.
bi = cfg.intervention_fit_examples + 4
# `gen` (defined below) reads this module-level `inputs` directly.
inputs = ds_tokens.with_format("torch")[bi]


In [23]:
from IPython.display import display, HTML

# generate
# https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/text_generation#transformers.GenerationConfig


@torch.no_grad()
def gen(model):
    """Greedily generate 100 tokens from the notebook-level `inputs` row and display them.

    Reads the module-level `inputs` (one dataset row with `input_ids` and
    `attention_mask`) and `tokenizer`. The prompt is rendered in a plain
    `<pre>` block and the generated continuation in a bold one.

    Args:
        model: object exposing `.generate(...)` and `.device`.

    Returns:
        None. The result is shown with `IPython.display.display`.
    """
    import html  # local import keeps this cell self-contained

    prompt_ids = inputs["input_ids"]
    s = model.generate(
        input_ids=prompt_ids[None, :].to(model.device),  # add a batch dimension
        attention_mask=inputs["attention_mask"][None, :].to(model.device),
        use_cache=False,
        max_new_tokens=100,
        min_new_tokens=100,  # min == max: always exactly 100 new tokens
        do_sample=False,
        early_stopping=False,
    )
    # The output starts with the prompt; split at the prompt length.
    input_l = prompt_ids.shape[0]
    decode_kwargs = dict(clean_up_tokenization_spaces=False, skip_special_tokens=False)
    # Escape the decoded text: prompts and generations can contain `<`, `>` or `&`
    # (e.g. "<br />" in reviews), which would otherwise be parsed as markup and
    # silently vanish from the rendered output.
    old = html.escape(tokenizer.decode(s[0, :input_l], **decode_kwargs))
    new = html.escape(tokenizer.decode(s[0, input_l:], **decode_kwargs))
    display(HTML(f"<pre>{old}</pre><pre><b>{new}</b></pre>"))


In [24]:
# # with model.disable_adapters():
# with model.disable_adapter():
#     gen(model)

# gen(model)

# # RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'


In [25]:
# Reload the model, passing the adapter checkpoint saved above via `adaptor_path`.
# This rebinds `model` and `tokenizer`; `net` was constructed with the previous `model` object.
model, tokenizer = load_model(
    cfg.model,
    device=device,
    adaptor_path=checkpoint_path
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [26]:
# Generate twice from the same prompt for comparison: first inside the model's
# disable_adapter() context, then with the adapter active.
with model.disable_adapter():
    gen(model)

gen(model)


# Test

In [27]:
# Evaluation prompts from imdb, a quarter as many as were requested for training.
# Rebinds `ds_tokens`, which previously held the amazon_polarity data.
ds_name = "imdb"
N = sum(cfg.max_examples)
ds_tokens = load_preproc_dataset(
    ds_name,
    tokenizer,
    N=N // 4,
    seed=cfg.seed,
    num_shots=cfg.num_shots,
    max_length=cfg.max_length,
    prompt_format=cfg.prompt_format,
)
# Serve rows in torch format.
ds_tokens = ds_tokens.with_format("torch")


format_prompt:   0%|          | 0/412 [00:00<?, ? examples/s]Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_t

In [28]:
# Data module for the imdb set, using 3x the training batch size.
# Note: rebinds `dm`, which previously wrapped the amazon_polarity data.
dm = DeceptionDataModule(ds_tokens, batch_size=cfg.batch_size * 3)
dl_train2 = dm.train_dataloader()
dl_val2 = dm.val_dataloader()
dl_test2 = dm.test_dataloader()


In [29]:
# Loader over the whole imdb dataset, unshuffled and keeping the last partial batch
# ("oos": presumably out-of-sample relative to the amazon_polarity training data).
dl_oos2 = DataLoader(
    ds_tokens, batch_size=cfg.batch_size * 3, drop_last=False, shuffle=False
)


In [30]:
# def get_acc_subset(df, query, verbose=True):
#     if query:
#         df = df.query(query)
#     acc = (df["probe_pred"] == df["y"]).mean()
#     if verbose:
#         print(f"acc={acc:2.2%},\tn={len(df)},\t[{query}] ")
#     return acc


# def calc_metrics(dm, trainer, net, use_val=False, verbose=True):
#     dl_test = dm.test_dataloader()
#     rt = trainer.predict(net, dataloaders=dl_test)
#     y_test_pred = np.concatenate(rt)
#     splits = dm.splits["test"]
#     df_test = dm.df.iloc[splits[0] : splits[1]].copy()
#     df_test["probe_pred"] = y_test_pred > 0.0

#     if use_val:
#         dl_val = dm.val_dataloader()
#         rv = trainer.predict(net, dataloaders=dl_val)
#         y_val_pred = np.concatenate(rv)
#         splits = dm.splits["val"]
#         df_val = dm.df.iloc[splits[0] : splits[1]].copy()
#         df_val["probe_pred"] = y_val_pred > 0.0

#         df_test = pd.concat([df_val, df_test])

#     if verbose:
#         print("probe results on subsets of the data")
#     acc = get_acc_subset(df_test, "", verbose=verbose)
#     get_acc_subset(
#         df_test, "instructed_to_lie==True", verbose=verbose
#     )  # it was ph told to lie
#     get_acc_subset(
#         df_test, "instructed_to_lie==False", verbose=verbose
#     )  # it was told not to lie
#     get_acc_subset(
#         df_test, "llm_ans==label_true", verbose=verbose
#     )  # the llm gave the true ans
#     get_acc_subset(
#         df_test, "llm_ans==label_instructed", verbose=verbose
#     )  # the llm gave the desired ans
#     acc_lie_lie = get_acc_subset(
#         df_test, "instructed_to_lie==True & llm_ans==label_instructed", verbose=verbose
#     )  # it was told to lie, and it did lie
#     acc_lie_truth = get_acc_subset(
#         df_test, "instructed_to_lie==True & llm_ans!=label_instructed", verbose=verbose
#     )

#     a = get_acc_subset(
#         df_test, "instructed_to_lie==False & llm_ans==label_instructed", verbose=False
#     )
#     b = get_acc_subset(
#         df_test, "instructed_to_lie==False & llm_ans!=label_instructed", verbose=False
#     )
#     c = get_acc_subset(
#         df_test, "instructed_to_lie==True & llm_ans==label_instructed", verbose=False
#     )
#     d = get_acc_subset(
#         df_test, "instructed_to_lie==True & llm_ans!=label_instructed", verbose=False
#     )
#     d1 = pd.DataFrame(
#         [[a, b], [c, d]],
#         index=["instructed_to_lie==False", "instructed_to_lie==True"],
#         columns=["llm_ans==label_instructed", "llm_ans!=label_instructed"],
#     )
#     d1 = pd.DataFrame(
#         [[a, b], [c, d]],
#         index=["tell a truth", "tell a lie"],
#         columns=["did", "didn't"],
#     )
#     d1.index.name = "instructed to"
#     d1.columns.name = "llm gave"
#     print("probe accuracy for quadrants")
#     display(d1.round(2))

#     if verbose:
#         print(f"⭐PRIMARY METRIC⭐ acc={acc:2.2%} from probe")
#         print(f"⭐SECONDARY METRIC⭐ acc_lie_lie={acc_lie_lie:2.2%} from probe")
#     return dict(acc=acc, acc_lie_lie=acc_lie_lie, acc_lie_truth=acc_lie_truth)


In [31]:
# import re


# def transform_dl_k(k: str) -> str:
#     p = re.match(r"test\/(.+)\/dataloader_idx_\d", k)
#     return p.group(1) if p else k


# def rename(rs, ks=["train", "val", "test"]):
#     rs = {
#         ks[i]: {transform_dl_k(k): v for k, v in rs[i].items()} for i in range(len(ks))
#     }
#     return rs


In [32]:
# model.device


In [33]:
# rs = trainer1.test(
#     net,
#     dataloaders=[
#         # dl_train2, dl_val2,
#         dl_test2,
#         dl_oos2,
#     ],
# )
# rs = rename(rs, ["train", "val", "test", "oos"])
# rs[0]


# Predict

Here we want to see whether a probe on the hidden states can tell if the model is lying.


In [34]:
# Run `net` over the imdb validation loader, then join the per-batch predictions into one array.
rv = trainer1.predict(net, dataloaders=dl_val2)
y_val_pred = np.concatenate(rv)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# NOTE(review): this cell cannot run on a fresh kernel. `calc_metrics` is only defined
# in the commented-out cell In [30], and `rs` only in the commented-out cell In [33].
# Restore or replace those definitions before executing this cell.
testval_metrics = calc_metrics(dm, trainer1, net, use_val=True)

# rs['test'] = {**rs['test'], **test_metrics}
# Copy acc_lie_lie into the test results, then alias those results under "testval_metrics".
rs["test"]["acc_lie_lie"] = testval_metrics["acc_lie_lie"]
rs["testval_metrics"] = rs["test"]
