In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModelForCausalLM
import torch.nn as nn
import pandas as pd
import numpy as np
import math
import pickle
from sklearn.metrics import roc_auc_score
import nlpaug.augmenter.word as naw
from functools import partial
from functions import *
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Training-loop settings.
batch_size = 1            # samples per training micro-batch (validation uses batch_size*2)
num_workers = 16          # worker processes for both DataLoaders
accumulation_steps = 64   # micro-batches accumulated between optimizer steps
verbose = 2048            # run validation every `verbose` training iterations
lr = 8e-5                 # learning rate handed to the optimizer
clip = 8e-3               # threshold passed to clip_grad_value_ before each step

# Output directory for this run's config and weights.
# NOTE(review): get_next_folder_name presumably returns a fresh, numbered
# folder under Model/ -- confirm in functions.py.
save_model_path = get_next_folder_name('Model/')
# exist_ok=True replaces the separate os.path.exists() check, which could race
# with another process creating the same folder between the check and the call.
os.makedirs(save_model_path, exist_ok=True)
device = 'cuda'

In [3]:
## Draw a random hyper-parameter configuration for this run ##
TARGET_MODEL = "mistralai/Mistral-7B-v0.1"
epochs = 2

# The draws below consume NumPy's global RNG; their order is kept fixed so a
# seeded run yields the same configuration.
pred_type = np.random.choice(['LM', 'classification'])
config_type = np.random.choice(
    ['prefix', 'prompt_encoder', 'prompt_txt', 'LoRA'],
    p=[0.3, 0.3, 0.1, 0.3],
)
alpha = 0.1 * np.random.rand()
aug_kwargs = {
    'aug_max': np.random.randint(5, 30),
    'aug_p': 0.3 * np.random.rand(),
}
config_class, config_kwargs = get_random_config(config_type, pred_type, TARGET_MODEL)

# Persist the sampled configuration in the run folder.
config = save_config(TARGET_MODEL, pred_type, config_type, epochs, alpha, aug_kwargs, config_kwargs)
with open(f"{save_model_path}/config.pkl", 'wb') as pickle_file:
    pickle.dump(config, pickle_file)

In [4]:
# Load previously saved hyper-parameters for inference.
# with open(save_model_path+'/config.pkl', 'rb') as pickle_file:
#     config = pickle.load(pickle_file)
# TARGET_MODEL, pred_type, config_type, epochs, alpha, aug_kwargs, config_class, config_kwargs = load_config(config)

Data

In [5]:
# Training and validation splits, read from CSV files under data/.
train = pd.read_csv('data/train_data.csv')
val = pd.read_csv('data/val_data.csv')

In [6]:
# Tokenizer for the target checkpoint.
tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)
# Reuse the end-of-sequence token as the padding token
# (presumably so the collate functions can pad batches -- confirm in functions.py).
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# Synonym-replacement augmenter using the 'ppdb' source.
#   PPDB download:   http://paraphrase.org/#/download
#   nlpaug examples: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb
# aug_min is pinned to 1; the remaining settings come from aug_kwargs.
ppdb_settings = dict(aug_src='ppdb', model_path='Model/ppdb-2.0-s-all', aug_min=1)
aug = naw.SynonymAug(**ppdb_settings, **aug_kwargs)
# print(aug.augment(train.text.iloc[3]))

In [8]:
# In 'LM' mode a fixed question is tokenised once (without special tokens) and
# handed to the collate functions; otherwise no prompt is used.
prompt, prompt_mask = None, None
if pred_type == 'LM':
    encoded_prompt = tokenizer.batch_encode_plus(
        ['Is this essay AI-generated, yes or no?'],
        add_special_tokens=False,
        return_tensors='pt',
    )
    prompt = encoded_prompt['input_ids']
    prompt_mask = encoded_prompt['attention_mask']

# Training set: shuffled, with text augmentation supplied via aug.augment.
train_data = TxtData(train, aug.augment)
train_collate = partial(collate_fn, tokenizer=tokenizer, prompt=prompt, prompt_mask=prompt_mask)
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=train_collate,
)

# Validation set: fixed order, twice the training batch size.
val_data = InfData(val)
val_collate = partial(collate_inf, tokenizer=tokenizer, prompt=prompt, prompt_mask=prompt_mask)
val_loader = DataLoader(
    val_data,
    batch_size=batch_size * 2,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=val_collate,
)
# input_ids,attention_mask, label, score = next(iter(train_loader))

Model

In [9]:
from torch.nn.utils import clip_grad_value_
from transformers import BitsAndBytesConfig
from peft import get_peft_model

# 4-bit NF4 quantisation with double quantisation and a bfloat16 compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Pick the model head that matches the prediction formulation.
if pred_type == 'LM':
    base_class = AutoModelForCausalLM
else:
    base_class = AutoModelForSequenceClassification
base_model = base_class.from_pretrained(
    TARGET_MODEL,
    quantization_config=nf4_config,
    device_map={"": 0},
    use_flash_attention_2=True,
)

# Wrap the quantised base model with the randomly selected PEFT method.
peft_config = config_class(**config_kwargs)
model = get_peft_model(base_model, peft_config)
model.config.pad_token_id = tokenizer.pad_token_id
# model.print_trainable_parameters()

# Collect the parameters that require gradients, together with their names
# (the names are used later to filter the state dict when checkpointing).
trainable = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
trainable_params = [param for _, param in trainable]
trainable_names = [name for name, _ in trainable]

# optimizer = torch.optim.AdamW(trainable_params,lr = lr,amsgrad=True,weight_decay=6e-3)
optimizer = torch.optim.SGD(trainable_params, lr=lr)

# Wrapper exposing get_loss / predict for the chosen formulation.
model_lm = (
    LM(model, tokenizer, config_kwargs.get('num_virtual_tokens', 0), alpha, config_type)
    if pred_type == 'LM'
    else Classification(model, alpha)
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Training loop with gradient accumulation and periodic validation.
#
# Every `accumulation_steps` micro-batches the accumulated gradients are
# value-clipped and applied. Every `verbose` iterations the model is scored on
# the validation set (ROC AUC) and the trainable weights are saved whenever the
# AUC improves on the best value seen so far.
#
# NOTE(review): loss_fn / loss_fct are not referenced in this cell -- the loss
# comes from model_lm.get_loss. They are kept in case other code relies on them.
loss_fn = nn.BCEWithLogitsLoss()
loss_fct = torch.nn.CrossEntropyLoss()
best_auc = 0

# Drop any gradients left over from an interrupted previous run of this cell.
optimizer.zero_grad()

for epoch in range(epochs):
    model.train()
    train_loss = 0
    skip = 0
    for i, (input_ids, attention_mask, label, score) in enumerate(train_loader):
        # train
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        label, score = label.to(device), score.to(device)
        loss = model_lm.get_loss(input_ids, attention_mask, label, score)

        # Read the scalar once; each .item() call forces a host/device sync.
        loss_value = loss.item()
        if math.isfinite(loss_value):
            # NOTE(review): the loss is not divided by accumulation_steps, so the
            # applied gradient is the sum over the window, not the mean. lr and
            # clip are presumably tuned for that -- confirm before changing.
            loss.backward()
            train_loss += loss_value
            # print(i,train_loss)
        else:
            # Inf/NaN loss: contribute no gradient, but do NOT skip the
            # bookkeeping below. A bare `continue` here would also skip the
            # optimizer step / validation when this is a boundary iteration.
            skip += 1

        if (i + 1) % accumulation_steps == 0:
            clip_grad_value_(trainable_params, clip)
            optimizer.step()
            optimizer.zero_grad()

        # eval
        if (i + 1) % verbose == 0:
            model.eval()
            # Average over the micro-batches that actually contributed; guard
            # against every batch in the window having been skipped.
            train_loss /= max(verbose - skip, 1)
            yhat = []
            # No autograd graph is needed for validation.
            with torch.no_grad():
                for val_input_ids, val_attention_mask in val_loader:
                    val_input_ids = val_input_ids.to(device)
                    val_attention_mask = val_attention_mask.to(device)
                    out = model_lm.predict(val_input_ids, val_attention_mask).detach().cpu().numpy()
                    yhat.append(out)
            yhat = np.concatenate(yhat)
            auc = roc_auc_score(val.label.to_numpy(), yhat)
            print(f"epoch {epoch} iter {i}: train loss {train_loss}, test AUC {auc}")
            if auc > best_auc:
                best_auc = auc
                # Checkpoint only the trainable parameters.
                torch.save({k: v for k, v in model.state_dict().items() if k in trainable_names}, save_model_path+'/weights.pth')
            train_loss = 0
            skip = 0
            model.train()

epoch 0 iter 2047: train loss 2.0507650351898974, test AUC 0.539869132290185
epoch 0 iter 4095: train loss 1.9345119767074266, test AUC 0.5453557480925901


KeyboardInterrupt: 

Save/Load model

In [None]:
# Only the trainable parameters are saved/loaded.
# model.load_state_dict(torch.load(save_model_path+'/weights2.pth'),strict=False)

In [None]:
# model.base_model.save_pretrained(save_model_path,safe_serialization=False)
# model.config.save_pretrained(save_model_path)
# model.save_pretrained(save_model_path)
# tokenizer.save_pretrained(save_model_path)


# from peft import PeftModel
# baseModel = MistralForSequenceClassification.from_pretrained(TARGET_MODEL,num_labels=2,quantization_config=nf4_config, \
#                                                           device_map={"":0},use_flash_attention_2=True)
# peft_type = PeftType.P_TUNING
# peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=8, \
#                                   encoder_hidden_size=4096,encoder_dropout=0.1,\
#                                   encoder_num_layers=2,encoder_reparameterization_type='MLP')#'LSTM')
# model2 = PeftModel.from_pretrained(baseModel, 'Model/p_tune/adapter',config=peft_config)


# import json
# with open('Model/p_tune/config.json', 'r') as file:
#     config = json.load(file)
# from transformers import PretrainedConfig
# model = MistralForSequenceClassification(PretrainedConfig(**config))
# peft_type = PeftType.P_TUNING
# peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=8, \
#                                   encoder_hidden_size=4096,encoder_dropout=0.1,\
#                                   encoder_num_layers=2,encoder_reparameterization_type='MLP')#'LSTM')
# model = get_peft_model(model, peft_config)
# model.config.pad_token_id = tokenizer.pad_token_id
# model.load_state_dict(torch.load('Model/p_tune.pth'))
# model = model.to('cuda')

In [None]:
# model.eval()
# yhat,y = [],[]
# for input_ids,attention_mask, label, score in val_loader:
#     input_ids,attention_mask = input_ids.to('cuda'),attention_mask.to('cuda')
#     with torch.no_grad():
#         out = model(input_ids=input_ids,attention_mask=attention_mask).logits[:,0].detach().cpu().numpy()
#     if np.any(np.isnan(out)):
#         continue
#     yhat.append(out)
#     y.append(label)
# yhat = np.concatenate(yhat)
# y = np.concatenate(y)
# auc = roc_auc_score(y, yhat)
# auc