In [1]:
from transformers import AutoTokenizer, MistralForCausalLM
import pandas as pd
import math
import numpy as np
from sklearn.metrics import roc_auc_score
import nlpaug.augmenter.word as naw
from functools import partial
from functions import *
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Run-wide settings: loader sizes, base checkpoint, output directory and device.
batch_size = 1          # training batch size (validation uses twice this)
num_workers = 16        # worker processes handed to each DataLoader
TARGET_MODEL = "mistralai/Mistral-7B-v0.1"
save_model_path = 'Model/prompt02'
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(save_model_path, exist_ok=True)
device = 'cuda'

Data

In [3]:
# Locations of the two pre-split CSV files; validation is read first, then training.
val_csv, train_csv = 'data/val_data.csv', 'data/train_data.csv'
val = pd.read_csv(val_csv)
train = pd.read_csv(train_csv)

In [4]:
# Tokenizer belonging to the base checkpoint named by TARGET_MODEL.
tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the checkpoint's tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Synonym-replacement augmenter backed by the PPDB paraphrase database.
# PPDB download: http://paraphrase.org/#/download
# nlpaug usage examples: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb
ppdb_model_path = 'Model/ppdb-2.0-s-all'
aug = naw.SynonymAug(
    aug_src='ppdb',
    model_path=ppdb_model_path,
    aug_min=1,
    aug_max=10,
    aug_p=0.3,
)
# print(aug.augment(train.text.iloc[3]))

In [6]:
# Encode the fixed question once (no BOS/EOS added); its ids and mask are
# handed to the collate function when the loaders are built.
prompt_encoding = tokenizer.batch_encode_plus(
    ['Is this essay AI-generated, yes or no?'],
    add_special_tokens=False,
    return_tensors='pt',
)
prompt = prompt_encoding['input_ids']
prompt_mask = prompt_encoding['attention_mask']

In [7]:
# One collate callable, with the tokenizer and encoded prompt bound in, serves both loaders.
collate_with_prompt = partial(collate_fn, tokenizer=tokenizer, prompt=prompt, prompt_mask=prompt_mask)

# Training split: receives the augmenter's `augment` callable and is shuffled.
# NOTE(review): how TxtData applies the callable is defined in `functions` — confirm there.
train_data = TxtData(train, aug.augment)
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=collate_with_prompt,
)

# Validation split: no augmenter, fixed order, twice the training batch size.
val_data = TxtData(val)
val_loader = DataLoader(
    val_data,
    batch_size=batch_size * 2,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=collate_with_prompt,
)
# input_ids,attention_mask, label, score = next(iter(train_loader))

Model

In [8]:
# Training hyperparameters consumed by the model-setup and training-loop cells.
epochs = 1                 # number of passes over train_loader
accumulation_steps = 64    # optimizer.step() is taken every this many iterations
verbose = 2048             # validation + logging interval, in training iterations
lr = 6e-5                  # learning rate passed to the SGD optimizer
clip = 6e-3                # bound passed to clip_grad_value_ before each optimizer step
alpha = 0.15               # weight of the language-modelling loss added to the BCE loss
num_virtual_tokens = 32    # P-tuning prompt length; also the offset used to slice the LM logits

In [9]:
from torch.nn.utils import clip_grad_value_
from transformers import BitsAndBytesConfig
from peft import (
    get_peft_model,
    PeftType,
    PromptEncoderConfig)

# Quantisation settings handed to from_pretrained: 4-bit NF4 weights with
# double quantisation, computing in bfloat16.
quantization_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
nf4_config = BitsAndBytesConfig(**quantization_kwargs)

In [10]:
# Quantised base causal LM, mapped as a whole to device 0.
model = MistralForCausalLM.from_pretrained(
    TARGET_MODEL,
    quantization_config=nf4_config,
    device_map={"": 0},
    use_flash_attention_2=True,
)

# P-tuning: a small MLP prompt encoder produces `num_virtual_tokens` virtual tokens.
peft_type = PeftType.P_TUNING
peft_config = PromptEncoderConfig(
    task_type="SEQ_CLS",
    num_virtual_tokens=num_virtual_tokens,
    encoder_hidden_size=4096,
    encoder_dropout=0.1,
    encoder_num_layers=2,
    encoder_reparameterization_type='MLP',  # 'LSTM'
)
model = get_peft_model(model, peft_config)
model.config.pad_token_id = tokenizer.pad_token_id
# model.print_trainable_parameters()

# Collect the parameters left trainable by PEFT, and their names, in one pass;
# the names are later used to filter the state dict when checkpointing.
trainable_params, trainable_names = [], []
for param_name, param in model.named_parameters():
    if param.requires_grad:
        trainable_params.append(param)
        trainable_names.append(param_name)

# optimizer = torch.optim.AdamW(trainable_params,lr = lr,amsgrad=True,weight_decay=6e-3)
optimizer = torch.optim.SGD(trainable_params, lr=lr)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Vocabulary ids whose final-position logits make up the classification score.
# NOTE(review): presumably the "yes"-like and "no"-like tokens of the Mistral
# tokenizer — confirm with tokenizer.convert_ids_to_tokens before changing models.
YES_TOKEN_IDS = [5081, 5592]
NO_TOKEN_IDS = [708, 1770]


def yes_no_margin(lm_logits):
    """Collapse language-model logits into one score per sequence.

    Takes the logits at the last position of each sequence and returns the sum
    over YES_TOKEN_IDS minus the sum over NO_TOKEN_IDS.
    """
    final_step = lm_logits[:, -1]
    return final_step[:, YES_TOKEN_IDS].sum(-1) - final_step[:, NO_TOKEN_IDS].sum(-1)


def evaluate_auc(model, loader):
    """Score every batch of ``loader`` without gradients and return the ROC AUC.

    Batches whose scores contain NaN are dropped. Returns NaN when no batch
    survives, so a degenerate evaluation pass does not abort a long training run.
    The caller is responsible for switching the model between eval and train mode.
    """
    scores, targets = [], []
    with torch.no_grad():
        for val_ids, val_mask, val_label, _ in loader:
            out = model(input_ids=val_ids.to(device), attention_mask=val_mask.to(device))
            batch_scores = yes_no_margin(out.logits).float().cpu().numpy()
            if np.any(np.isnan(batch_scores)):
                continue
            scores.append(batch_scores)
            # Convert explicitly instead of relying on numpy to coerce tensors.
            targets.append(val_label.cpu().numpy())
    if not scores:
        return float('nan')
    return roc_auc_score(np.concatenate(targets), np.concatenate(scores))


loss_fn = nn.BCEWithLogitsLoss()
loss_fct = torch.nn.CrossEntropyLoss()
trainable_name_set = set(trainable_names)  # O(1) membership test when checkpointing
best_auc = 0
pending = 0  # backward passes accumulated since the last optimizer step
for epoch in range(epochs):
    model.train()
    train_loss = 0
    skip = 0
    for i, (input_ids, attention_mask, label, score) in enumerate(train_loader):
        # ---- train ----
        input_ids, attention_mask, label = input_ids.to(device), attention_mask.to(device), label.to(device)
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = yes_no_margin(out.logits)

        loss = None
        if not torch.any(torch.isnan(logits)):
            # Auxiliary LM objective: the output carries `num_virtual_tokens`
            # extra leading positions, so drop them (and the final position)
            # to line the logits up with the next-token targets.
            shift_logits = out.logits[..., num_virtual_tokens:-1, :].contiguous()
            shift_labels = input_ids[..., 1:].contiguous()
            # Flatten using the logits' own vocabulary dimension, which stays
            # correct even if tokenizer.vocab_size differs from the model head.
            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
            shift_labels = shift_labels.view(-1).to(shift_logits.device)
            loss_lm = loss_fct(shift_logits, shift_labels)

            loss = loss_fn(logits, label) + loss_lm * alpha
            loss_value = loss.item()  # single host sync per step
            if not math.isfinite(loss_value):
                loss = None

        if loss is None:
            # Non-finite batch: contribute nothing, but do NOT `continue` —
            # the accumulation and evaluation boundaries below must still fire
            # on this iteration.
            skip += 1
        else:
            # NOTE(review): the loss is not divided by accumulation_steps, so
            # gradients are summed over the window; presumably lr/clip were
            # tuned against that, hence left unchanged.
            loss.backward()
            train_loss += loss_value
            pending += 1
        # print(i,train_loss)

        if (i + 1) % accumulation_steps == 0 and pending:
            clip_grad_value_(trainable_params, clip)
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

        # ---- eval ----
        if (i + 1) % verbose == 0:
            model.eval()
            # Guard against a window in which every batch was skipped.
            train_loss /= max(verbose - skip, 1)
            auc = evaluate_auc(model, val_loader)
            print(f"epoch {epoch} iter {i}: train loss {train_loss}, test AUC {auc}")
            if auc > best_auc:
                best_auc = auc
                # Persist only the trainable (prompt-encoder) weights.
                torch.save({k: v for k, v in model.state_dict().items() if k in trainable_name_set}, save_model_path+'/weights.pth')
            train_loss = 0
            skip = 0
            model.train()

epoch 0 iter 2047: train loss 1.3375108776963316, test AUC 0.46832432432432425
epoch 0 iter 4095: train loss 1.175271304018679, test AUC 0.6407986551144446
epoch 0 iter 6143: train loss 1.0663184128206922, test AUC 0.6800548299495668
epoch 0 iter 8191: train loss 1.0003767281741602, test AUC 0.71911858269753
epoch 0 iter 10239: train loss 0.9397046865778975, test AUC 0.7576795551532394
epoch 0 iter 12287: train loss 0.9210830043448368, test AUC 0.7855471356523989
epoch 0 iter 14335: train loss 0.8695427044440294, test AUC 0.8109432303116515
epoch 0 iter 16383: train loss 0.858693098751246, test AUC 0.797028320186215
epoch 0 iter 18431: train loss 0.8557329258619575, test AUC 0.811203413940256
epoch 0 iter 20479: train loss 0.8372019648668356, test AUC 0.8101497478339583
epoch 0 iter 22527: train loss 0.8191808817791753, test AUC 0.8308992628992629
epoch 0 iter 24575: train loss 0.8007474370970158, test AUC 0.8214048881417302
epoch 0 iter 26623: train loss 0.7928132977540372, test AUC 0

KeyboardInterrupt: 

Save/Load model

In [None]:
# Only the trainable parameters are saved and loaded.
# model.load_state_dict(torch.load(save_model_path+'/weights2.pth'),strict=False)

In [None]:
# model.base_model.save_pretrained(save_model_path,safe_serialization=False)
# model.config.save_pretrained(save_model_path)
# model.save_pretrained(save_model_path)
# tokenizer.save_pretrained(save_model_path)


# from peft import PeftModel
# baseModel = MistralForSequenceClassification.from_pretrained(TARGET_MODEL,num_labels=2,quantization_config=nf4_config, \
#                                                           device_map={"":0},use_flash_attention_2=True)
# peft_type = PeftType.P_TUNING
# peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=8, \
#                                   encoder_hidden_size=4096,encoder_dropout=0.1,\
#                                   encoder_num_layers=2,encoder_reparameterization_type='MLP')#'LSTM')
# model2 = PeftModel.from_pretrained(baseModel, 'Model/p_tune/adapter',config=peft_config)


# import json
# with open('Model/p_tune/config.json', 'r') as file:
#     config = json.load(file)
# from transformers import PretrainedConfig
# model = MistralForSequenceClassification(PretrainedConfig(**config))
# peft_type = PeftType.P_TUNING
# peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=8, \
#                                   encoder_hidden_size=4096,encoder_dropout=0.1,\
#                                   encoder_num_layers=2,encoder_reparameterization_type='MLP')#'LSTM')
# model = get_peft_model(model, peft_config)
# model.config.pad_token_id = tokenizer.pad_token_id
# model.load_state_dict(torch.load('Model/p_tune.pth'))
# model = model.to('cuda')

In [None]:
# model.eval()
# yhat,y = [],[]
# for input_ids,attention_mask, label, score in val_loader:
#     input_ids,attention_mask = input_ids.to('cuda'),attention_mask.to('cuda')
#     with torch.no_grad():
#         out = model(input_ids=input_ids,attention_mask=attention_mask).logits[:,0].detach().cpu().numpy()
#     if np.any(np.isnan(out)):
#         continue
#     yhat.append(out)
#     y.append(label)
# yhat = np.concatenate(yhat)
# y = np.concatenate(y)
# auc = roc_auc_score(y, yhat)
# auc