pool -> top k instances -> fine-tune LLM

In [1]:
'''
credit: https://github.com/prateekjoshi565/Fine-Tuning-BERT
'''
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, BertTokenizerFast
import os
# Expose only GPU index 1 to this process
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# specify GPU: the model and every input batch are moved to this device later on
device = torch.device("cuda")

from typing import List, Union
from datasets import load_dataset
import json



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Which filter head to build and which weight file to load:
# 'regression' or 'classification' (any other value makes get_model_arch raise)
filter_type = 'classification'

# Import BERT Model and BERT Tokenizer

In [3]:
# Load the pretrained 'bert-base-uncased' encoder; return_dict=False is passed
# because the model heads unpack the encoder output as a tuple
bert = AutoModel.from_pretrained('bert-base-uncased', return_dict=False)

# Load the fast BERT tokenizer for the same 'bert-base-uncased' checkpoint
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Alternative encoder (GTE-small), kept for reference:
# gte = AutoModel.from_pretrained('thenlper/gte-small', return_dict=False)
# tokenizer = AutoTokenizer.from_pretrained('thenlper/gte-small')



In [4]:
# Encoder that is handed to the model head; currently the BERT model
text_embedder = bert

# Define Model Architecture

In [5]:
class Regression(nn.Module):
    """Text encoder followed by a small head that emits one scalar per input.

    Head layout: Linear(768, 512) -> ReLU -> Dropout(0.1) -> Linear(512, 1).
    No activation is applied to the final output.
    """

    def __init__(self, text_embedder):
        """Store the encoder and create the head layers.

        Args:
            text_embedder: callable invoked as
                ``text_embedder(sent_id, attention_mask=mask, return_dict=False)``.
                It must return a 2-tuple whose second item has 768 features
                in its last dimension, since that item feeds ``fc1``.
        """
        super().__init__()

        self.text_embedder = text_embedder

        # regularisation and non-linearity used between the two dense layers
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.ReLU()

        # hidden projection, then a single-unit output layer
        # (a GTE-small encoder would need 384 input features instead of 768)
        self.fc1 = nn.Linear(in_features=768, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=1)

    def forward(self, sent_id, mask):
        """Encode the batch and return one score per row.

        Args:
            sent_id: token ids, passed to the encoder unchanged.
            mask: attention mask, passed to the encoder as ``attention_mask``.

        Returns:
            Output of ``fc2``: a tensor whose last dimension has size 1.
        """
        # only the second element of the encoder's tuple output is used
        _, pooled = self.text_embedder(sent_id, attention_mask=mask, return_dict=False)

        hidden = self.dropout(self.relu(self.fc1(pooled)))

        return self.fc2(hidden)

In [6]:
class CLF(nn.Module):
    """Text encoder followed by a small head that scores three classes.

    Head layout: Linear(768, 512) -> ReLU -> Dropout(0.1) -> Linear(512, 3)
    -> LogSoftmax(dim=1). The output is therefore log-probabilities, not
    probabilities.
    """

    def __init__(self, text_embedder):
        """Store the encoder and create the head layers.

        Args:
            text_embedder: callable invoked as
                ``text_embedder(sent_id, attention_mask=mask, return_dict=False)``.
                It must return a 2-tuple whose second item has 768 features
                in its last dimension, since that item feeds ``fc1``.
        """
        super().__init__()

        self.text_embedder = text_embedder

        # regularisation and non-linearity used between the two dense layers
        self.dropout = nn.Dropout(p=0.1)
        self.relu = nn.ReLU()

        # hidden projection, then a three-unit output layer
        self.fc1 = nn.Linear(in_features=768, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=3)

        # normalises the three logits into log-probabilities along dim 1
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Encode the batch and return per-class log-probabilities.

        Args:
            sent_id: token ids, passed to the encoder unchanged.
            mask: attention mask, passed to the encoder as ``attention_mask``.

        Returns:
            Log-softmax (over dim 1) of the ``fc2`` output; 3 values per row.
        """
        # only the second element of the encoder's tuple output is used
        _, pooled = self.text_embedder(sent_id, attention_mask=mask, return_dict=False)

        hidden = self.dropout(self.relu(self.fc1(pooled)))
        logits = self.fc2(hidden)

        return self.softmax(logits)

In [7]:
def get_model_arch(text_embedder, model_type: str):
    """Build the filter model that wraps ``text_embedder``.

    Args:
        text_embedder: encoder passed straight to the chosen model class.
        model_type: ``'regression'`` for ``Regression`` or
            ``'classification'`` for ``CLF``.

    Returns:
        A new ``Regression`` or ``CLF`` instance.

    Raises:
        ValueError: if ``model_type`` is neither of the two accepted strings.
    """
    if model_type == 'classification':
        return CLF(text_embedder)
    if model_type == 'regression':
        return Regression(text_embedder)
    raise ValueError('model_type should be either regression or classification')

In [8]:

# pass the pre-trained BERT to our defined architecture
model = get_model_arch(text_embedder, model_type = filter_type)

# push the model to the GPU
model = model.to(device)

# Load Saved Model

In [9]:
# Restore the fine-tuned weights that match the selected head.
if filter_type == 'regression':
    weights_path = 'saved_regression_weights.pt'
else:
    weights_path = 'saved_clf_weights.pt'

# map_location=device remaps the stored tensors onto the device used here, so a
# checkpoint saved from a different CUDA index can still be deserialized.
model.load_state_dict(torch.load(weights_path, map_location=device))

In [10]:
model.fc2

Linear(in_features=512, out_features=3, bias=True)

# Data Selection Pool

In [11]:

def load_raw_dataset(train_files: Union[List[str], str]):
    """Load one or more files with the ``"json"`` builder of ``load_dataset``.

    Args:
        train_files: a single path, or a list of paths. A single string is
            wrapped in a one-element list before loading.

    Returns:
        Whatever ``load_dataset("json", data_files=...)`` returns for the files.
    """
    data_files = [train_files] if isinstance(train_files, str) else train_files
    return load_dataset("json", data_files=data_files)

In [12]:
# Candidate pool to filter: the 'train' split loaded from the processed Dolly JSONL file
data_selection = load_raw_dataset(os.path.expanduser("~/data/llm/train/processed/dolly/train_dolly_data.jsonl"))['train']

In [13]:

def unfold_QA_short(data: List[List[dict]]) -> List[str]:
    """Flatten each conversation into a single ``"role: content "`` string.

    Args:
        data: batch of conversations; each conversation is a sequence of
            message dicts with string ``'role'`` and ``'content'`` values.

    Returns:
        One string per conversation, built by concatenating
        ``"<role>: <content> "`` for every message in order. The space after
        the last message is kept, and an empty conversation yields ``''``.
    """
    # join() assembles each string in one pass instead of repeated `+=`
    return [
        ''.join(
            QA_pair['role'] + ': ' + QA_pair['content'] + ' '
            for QA_pair in QA_entry
        )
        for QA_entry in data
    ]

In [14]:
def _tokenize_batch(batch):
    """Flatten each conversation in the batch to text and tokenize it.

    Every sequence is truncated or padded to exactly 512 tokens.
    """
    flattened = unfold_QA_short(batch['messages'])
    return tokenizer(flattened, padding='max_length', truncation=True, max_length=512)


# Tokenize the whole pool in batches; the tokenizer outputs are added as new columns
pool = data_selection.map(_tokenize_batch, batched=True)

In [15]:
# Expose only the two model inputs, as torch tensors, when the dataset is read
pool.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [16]:
# Model inputs for the whole selection pool.
# The columns are already torch tensors after set_format(type='torch'), so
# torch.as_tensor reuses them as-is; torch.tensor() would copy the data again
# and emit the "copy construct from a tensor" UserWarning.
pool_seq = torch.as_tensor(pool['input_ids'])
pool_mask = torch.as_tensor(pool['attention_mask'])

  pool_seq = torch.tensor(pool['input_ids'])
  pool_mask = torch.tensor(pool['attention_mask'])


In [17]:
pool_seq.shape

torch.Size([7505, 512])

# Filter Pool

In [18]:
# Score every instance in the selection pool with the filter model, batch by batch.
# eval() switches the head's dropout layer off so the scores are deterministic;
# a newly constructed nn.Module stays in training mode until eval() is called.
model.eval()
with torch.no_grad():
  batch_size = 32
  preds = []

  for i in range(0, len(pool_seq), batch_size):
    batch_seq = pool_seq[i:i+batch_size].to(device)
    batch_mask = pool_mask[i:i+batch_size].to(device)
    batch_preds = model(batch_seq, batch_mask)
    preds.append(batch_preds)
    # i advances in steps of 32, so this only fires when i is a multiple of 800
    if i % 100 == 0:
      print("Processed ", i)

  # stack the per-batch outputs into one tensor with a row per pool instance
  preds = torch.cat(preds, dim=0)


Processed  0
Processed  800
Processed  1600
Processed  2400
Processed  3200
Processed  4000
Processed  4800
Processed  5600
Processed  6400
Processed  7200


In [20]:
filter_type

'classification'

In [19]:
preds.shape

torch.Size([7505, 3])

In [None]:
if filter_type == 'regression':
    # regression head: drop size-1 dimensions and store the raw scores
    preds = preds.squeeze()
    torch.save(preds, "../selected_data/filtered/mmlu/dolly_influence_score.pt")
else:
    # classification head: take the highest-scoring class for every instance
    preds = np.argmax(preds.detach().cpu().numpy(), axis = 1)

    # keep the instances predicted as class 2
    # NOTE(review): what class 2 means is not visible here; confirm against the training labels
    keep_mask = preds == 2
    target_indices = np.arange(len(preds))[keep_mask]
    results = data_selection.select(target_indices)

    # write the kept instances back out as JSON lines
    output_path = os.path.expanduser('~/data/llm/train/processed/dolly/filtered_train_dolly_data.jsonl')
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(entry) + '\n' for entry in results)

In [26]:
preds.shape

(7505,)