In [1]:
from transformers import AutoTokenizer, MistralForSequenceClassification
import pandas as pd
import math
import numpy as np
from sklearn.metrics import roc_auc_score
import nlpaug.augmenter.word as naw
from functools import partial
from functions import *
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
batch_size = 1
num_workers = 16
TARGET_MODEL = "mistralai/Mistral-7B-v0.1"
save_model_path = 'Model/p_tune'
device = 'cuda'

Data

In [3]:
val = pd.read_csv('data/val_data.csv')
train = pd.read_csv('data/train_data.csv')

In [4]:
tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# http://paraphrase.org/#/download
# https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb
aug = naw.SynonymAug(aug_src='ppdb',model_path='Model/ppdb-2.0-s-all',
                     aug_min=1,
                     aug_max=10,
                     aug_p=0.3)
# print(aug.augment(train.text.iloc[3]))

In [6]:
train_data = TxtData(train,aug.augment)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=num_workers, \
                          collate_fn=partial(collate_fn,tokenizer=tokenizer))
val_data = TxtData(val)
val_loader = DataLoader(val_data, batch_size=batch_size*2, shuffle=False, num_workers=num_workers, \
                          collate_fn=partial(collate_fn,tokenizer=tokenizer))
# input_ids,attention_mask, label, score = next(iter(train_loader))

Model

In [7]:
epochs = 1
accumulation_steps = 64
verbose = 2048
lr = 6e-5
clip = 6e-3
alpha = 0.05

In [9]:
from torch.nn.utils import clip_grad_value_
from transformers import BitsAndBytesConfig
from peft import (
    get_peft_model,
    PeftType,
    PromptEncoderConfig)

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

In [8]:
model = MistralForSequenceClassification.from_pretrained(TARGET_MODEL,num_labels=2,quantization_config=nf4_config, \
                                                          device_map={"":0},use_flash_attention_2=True)
peft_type = PeftType.P_TUNING
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=8, \
                                  encoder_hidden_size=4096,encoder_dropout=0.1,\
                                  encoder_num_layers=2,encoder_reparameterization_type='MLP')#'LSTM')
model = get_peft_model(model, peft_config)
model.config.pad_token_id = tokenizer.pad_token_id
# model.print_trainable_parameters()
trainable_params = [param for param in model.parameters() if param.requires_grad]
# optimizer = torch.optim.AdamW(trainable_params,lr = lr,amsgrad=True,weight_decay=6e-3)
optimizer = torch.optim.SGD(trainable_params,lr=lr)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
loss_fn = nn.BCEWithLogitsLoss()
best_auc = 0
for epoch in range(epochs):
    model.train()
    train_loss = 0
    skip = 0
    for i, (input_ids,attention_mask, label, score) in enumerate(train_loader):
        # train
        input_ids,attention_mask, label, score = input_ids.to('cuda'),attention_mask.to('cuda'), label.to('cuda'), score.to('cuda')
        out = model(input_ids=input_ids,attention_mask=attention_mask).logits
        if torch.any(torch.isnan(out)):
            skip += 1
            continue
        loss_label = loss_fn(out[:,0], label)
        loss_score = torch.sum((score!=-1.)*torch.abs(out[:,1]-score))/torch.sum((score!=-1.)+0.01)
        loss = loss_label + loss_score * alpha
        if math.isinf(loss.item()) or math.isnan(loss.item()):
            skip += 1
            continue

        loss.backward()
        train_loss += loss.item()
        # print(i,train_loss)
        if (i + 1) % accumulation_steps == 0:
            clip_grad_value_(trainable_params,clip)
            optimizer.step()
            optimizer.zero_grad()

        # eval    
        if (i + 1) % verbose == 0:
            model.eval()
            train_loss /= (verbose-skip)
            yhat,y = [],[]
            for input_ids,attention_mask, label, score in val_loader:
                input_ids,attention_mask = input_ids.to('cuda'),attention_mask.to('cuda')
                with torch.no_grad():
                    out = model(input_ids=input_ids,attention_mask=attention_mask).logits[:,0].detach().cpu().numpy()
                if np.any(np.isnan(out)):
                    continue
                yhat.append(out)
                y.append(label)
            yhat = np.concatenate(yhat)
            y = np.concatenate(y)
            auc = roc_auc_score(y, yhat)
            print(f"epoch {epoch} iter {i}: train loss {train_loss}, test AUC {auc}")
            if auc > best_auc:
                best_auc = auc
                torch.save(model.state_dict(), save_model_path+'/weights.pth')
            train_loss = 0
            skip = 0
            model.train()

epoch 0 iter 2047: train loss 0.30687638403996687, test AUC 0.93735471356524
epoch 0 iter 4095: train loss 0.29798026692151325, test AUC 0.9283357041251777
epoch 0 iter 6143: train loss 0.2599989454606515, test AUC 0.9179283589809906
epoch 0 iter 8191: train loss 0.2405460048556023, test AUC 0.9446688219319799
epoch 0 iter 10239: train loss 0.22331751200012206, test AUC 0.9139868097762835
epoch 0 iter 12287: train loss 0.21671514115720925, test AUC 0.9193089357299884
epoch 0 iter 14335: train loss 0.22223180926209807, test AUC 0.9205498512866934
epoch 0 iter 16383: train loss 0.20717017828786766, test AUC 0.9491234967024442


KeyboardInterrupt: 

Save/Load model

In [None]:
model.load_state_dict(torch.load('Model/p_tune.pth'))

<All keys matched successfully>

In [13]:
# model.base_model.save_pretrained(save_model_path,safe_serialization=False)
# model.config.save_pretrained(save_model_path)
# model.save_pretrained(save_model_path)
# tokenizer.save_pretrained(save_model_path)


# from peft import PeftModel
# baseModel = MistralForSequenceClassification.from_pretrained(TARGET_MODEL,num_labels=2,quantization_config=nf4_config, \
#                                                           device_map={"":0},use_flash_attention_2=True)
# peft_type = PeftType.P_TUNING
# peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=8, \
#                                   encoder_hidden_size=4096,encoder_dropout=0.1,\
#                                   encoder_num_layers=2,encoder_reparameterization_type='MLP')#'LSTM')
# model2 = PeftModel.from_pretrained(baseModel, 'Model/p_tune/adapter',config=peft_config)


# import json
# with open('Model/p_tune/config.json', 'r') as file:
#     config = json.load(file)
# from transformers import PretrainedConfig
# model = MistralForSequenceClassification(PretrainedConfig(**config))
# peft_type = PeftType.P_TUNING
# peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=8, \
#                                   encoder_hidden_size=4096,encoder_dropout=0.1,\
#                                   encoder_num_layers=2,encoder_reparameterization_type='MLP')#'LSTM')
# model = get_peft_model(model, peft_config)
# model.config.pad_token_id = tokenizer.pad_token_id
# model.load_state_dict(torch.load('Model/p_tune.pth'))
# model = model.to('cuda')

In [None]:
model.eval()
yhat,y = [],[]
for input_ids,attention_mask, label, score in val_loader:
    input_ids,attention_mask = input_ids.to('cuda'),attention_mask.to('cuda')
    with torch.no_grad():
        out = model(input_ids=input_ids,attention_mask=attention_mask).logits[:,0].detach().cpu().numpy()
    if np.any(np.isnan(out)):
        continue
    yhat.append(out)
    y.append(label)
yhat = np.concatenate(yhat)
y = np.concatenate(y)
auc = roc_auc_score(y, yhat)

In [None]:
auc

0.9491602224233804