<a href="https://colab.research.google.com/github/yoyostudy/RL4LM_PI/blob/main/scripts/pi/train_decision_policy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TL;DR:

- This notebook covers both **training** and **inference**.
- It builds a high-level policy that decides which action (Attack or Attempt) to take based on the current observation (`llm_response`).
- Fine-tuned model: `DistilBertForSequenceClassification`
- Base model: `distilbert-base-uncased`
- Trainer: supervised fine-tuning

In [None]:
import warnings

import numpy as np
import pandas as pd
import torch as th
import torch.optim as optim
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, DistilBertForSequenceClassification

class FinetuneTagger:
    """Fine-tune a DistilBERT sequence classifier on a two-class JSONL dataset.

    The dataset is read with pandas and must provide the columns
    ``sample_id``, ``llm_output`` (the text to classify) and
    ``is_prompt_extraction`` (the binary label). It is shuffled with ``seed``
    and cut into consecutive train / validation / test slices.

    Args:
        data_path: Path or URL of the JSON-lines dataset.
        model_name: Hugging Face model id used for both tokenizer and model.
        epoch: Number of passes over the training split.
        batch_size: Batch size of all three data loaders.
        lr: Adam learning rate.
        device: Torch device string. A ``cuda`` device that is not available
            falls back to ``'cpu'`` with a warning.
        seed: Seed for the dataset shuffle and for torch's global RNG.
        train_ratio: Fraction of the samples used for training.
        val_ratio: Fraction of the samples used for validation; whatever
            remains after train + validation becomes the test split.
        max_length: Token length every text is padded / truncated to.

    Raises:
        ValueError: If the split ratios are negative or sum to more than 1.
    """

    def __init__(
        self,
        data_path: str = "https://github.com/HumanCompatibleAI/tensor-trust-data/raw/main/detecting-extractions/v1/prompt_extraction_detection.jsonl",
        model_name: str = 'distilbert-base-uncased',
        epoch: int = 3,
        batch_size: int = 64,
        lr: float = 5e-5,
        device: str = 'cuda',
        seed: int = 42,
        train_ratio: float = 0.8,
        val_ratio: float = 0.1,
        max_length: int = 64):

        # Degrade gracefully instead of crashing in `model.to('cuda')` on a
        # runtime that has no GPU attached.
        if str(device).startswith('cuda') and not th.cuda.is_available():
            warnings.warn("CUDA is not available; falling back to CPU.")
            device = 'cpu'

        self.seed = seed
        self.epoch = epoch
        self.batch_size = batch_size
        self.device = device
        self.max_length = max_length

        # The classification head is not part of the base checkpoint and is
        # randomly initialised, so seed torch to make runs repeatable.
        th.manual_seed(seed)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # "multi_label_classification" selects a BCE-with-logits loss, which
        # expects float multi-hot targets; load_data() therefore one-hot
        # encodes the binary label.
        self.model = DistilBertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            problem_type="multi_label_classification")
        self.model.to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr = lr, eps =1e-8)

        self.load_data(
            data_path,
            train_ratio,
            val_ratio
        )

    @staticmethod
    def _split_bounds(size, train_ratio, val_ratio):
        """Return ``(train_end, val_end)`` row offsets for a dataset of ``size`` rows.

        Rows ``[0, train_end)`` are train, ``[train_end, val_end)`` validation
        and ``[val_end, size)`` test.

        Raises:
            ValueError: If a ratio is negative or the two ratios exceed 1.
        """
        # Small tolerance so sums such as 0.7 + 0.3 are not rejected for
        # floating-point noise.
        if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
            raise ValueError(
                "train_ratio and val_ratio must be non-negative and sum to at most 1, "
                "got train_ratio=%r, val_ratio=%r" % (train_ratio, val_ratio))
        # Keep this exact arithmetic: changing the rounding would move samples
        # between splits relative to previously trained checkpoints.
        train_end = int(train_ratio*size)
        val_end = int(train_ratio*size+val_ratio*size)
        return train_end, val_end

    def load_data(self, data_path, train_ratio, val_ratio):
        """Read, shuffle and split the dataset and build the three data loaders.

        Sets ``self.train_loader``, ``self.val_loader`` and
        ``self.test_loader``. Each loader yields ``(encoding, label)`` pairs
        where ``encoding`` is a dict of tokenizer outputs and ``label`` is a
        one-hot float vector of length 2, both already on ``self.device``.
        """
        df = pd.read_json(data_path, lines = True).set_index('sample_id')
        # Deterministic shuffle so the three splits are stable for a given seed.
        df = df.sample(frac=1, random_state=self.seed).reset_index(drop=True)

        train_end, val_end = self._split_bounds(len(df), train_ratio, val_ratio)
        train_data = df.iloc[:train_end]
        val_data = df.iloc[train_end:val_end]
        test_data = df.iloc[val_end:]

        def create_data_loader(data):
            labels = data['is_prompt_extraction'].to_numpy().astype(int)
            # Row i of eye(2) is the one-hot vector for class i.
            one_hot_labels = th.eye(2)[labels].to(self.device)
            obs = data['llm_output'].tolist()

            # NOTE(review): add_special_tokens=False means no [CLS]/[SEP] are
            # inserted. Left unchanged so existing checkpoints keep seeing the
            # inputs they were trained on -- confirm this is intended.
            encode_obs = self.tokenizer(obs,
                                        truncation = True,
                                        padding = 'max_length',
                                        add_special_tokens = False,
                                        max_length = self.max_length,
                                        return_tensors = 'pt').to(self.device)
            # Turn the batched encoding into one dict per sample so the default
            # DataLoader collate can re-batch it.
            encode_obs_list = [{key: encode_obs[key][i] for key in encode_obs} for i in range(len(encode_obs['input_ids']))]

            paired_data = list(zip(encode_obs_list, one_hot_labels))
            return DataLoader(paired_data, batch_size=self.batch_size)

        self.train_loader = create_data_loader(train_data)
        self.val_loader = create_data_loader(val_data)
        self.test_loader = create_data_loader(test_data)

    def train(self):
        """Fine-tune the model on the training split for ``self.epoch`` epochs."""
        self.model.train()
        for epoch in range(self.epoch):
            with tqdm(self.train_loader) as train_iter:
                for batch_obs, batch_label in train_iter:
                    self.optimizer.zero_grad()
                    loss = self.model(
                        input_ids = batch_obs['input_ids'],
                        attention_mask = batch_obs['attention_mask'],
                        labels = batch_label
                    ).loss

                    loss.backward()
                    self.optimizer.step()
                    # .item() hands the progress bar a plain Python float
                    # rather than a tensor.
                    train_iter.set_description("loss: %f" % loss.item())

    def _accuracy(self, loader):
        """Return the accuracy of the model's argmax predictions over ``loader``.

        Switches the model to eval mode and leaves it there.
        """
        self.model.eval()
        true_labels = []
        predicted_labels = []

        with tqdm(loader) as batch_iter:
            for batch_obs, batch_label in batch_iter:
                with th.no_grad():
                    logits = self.model(**batch_obs).logits

                predicted_labels.extend(logits.argmax(dim=1).tolist())
                # Labels are one-hot, so argmax recovers the class index.
                true_labels.extend(th.argmax(batch_label, dim=1).tolist())

        return accuracy_score(true_labels, predicted_labels)

    def validate(self):
        """Return the classification accuracy on the validation split."""
        return self._accuracy(self.val_loader)

    def test(self):
        """Return the classification accuracy on the held-out test split."""
        return self._accuracy(self.test_loader)

In [None]:
from google.colab import drive

# Checkpoints are kept on the mounted Google Drive.
drive.mount('/content/drive')
ckp_path = '/content/drive/My Drive/finetuned_model.pth'

# True  -> load fine-tuned weights from ckp_path and only evaluate.
# False -> fine-tune from the base model, then write the weights to ckp_path.
INFERENCE = True

trainer = FinetuneTagger(epoch = 15, seed = 1032)

if INFERENCE:
    # map_location lets a checkpoint written on a GPU runtime load on the
    # device this trainer is actually using.
    trainer.model.load_state_dict(th.load(ckp_path, map_location=trainer.device))
else:
    trainer.train()
    # Save a plain state dict, the format the inference branch reads back.
    # save_pretrained() would create a directory at ckp_path instead, which
    # th.load() cannot open.
    th.save(trainer.model.state_dict(), ckp_path)

ac = trainer.test()
print('\naccuracy', ac)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:00<00:00, 66.24it/s]


accuracy 0.6956521739130435



