In [1]:
import math
import os
import pickle
from functools import partial

import nlpaug.augmenter.word as naw
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from peft import get_peft_model
from sklearn.metrics import roc_auc_score
from torch.nn.utils import clip_grad_value_
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModelForCausalLM
from transformers import BitsAndBytesConfig

# Kept last so any helper of the same name exported by functions.py still wins,
# exactly as it did before torch / DataLoader were imported explicitly.
from functions import *

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Training knobs shared by the cells below.
batch_size = 1            # samples per training batch (validation uses batch_size*2)
num_workers = 16          # worker processes for both DataLoaders
accumulation_steps = 64   # training batches accumulated before each optimizer step
verbose = 2048            # validation runs every `verbose` training batches
lr = 8e-5                 # learning rate handed to the optimizer
clip = 8e-3               # bound passed to clip_grad_value_ before each optimizer step

# Output folder for this run's weights and config.
save_model_path = get_next_folder_name('Model/')
# exist_ok=True replaces the separate exists() check, so there is no window
# between checking and creating the directory.
os.makedirs(save_model_path, exist_ok=True)
device = 'cuda'

In [5]:
## get random hyper-parameters ##
# One random-search draw. The np.random calls stay in their original order so a
# seeded run samples the same values.
TARGET_MODEL = "mistralai/Mistral-7B-v0.1"
epochs = 2

pred_type = np.random.choice(['LM', 'classification'])
config_type = np.random.choice(
    ['prefix', 'prompt_encoder', 'prompt_txt', 'LoRA'],
    p=[0.3, 0.3, 0.1, 0.3],
)
alpha = 0.1 * np.random.rand()
# Keyword arguments forwarded to the nlpaug SynonymAug constructor.
aug_kwargs = {
    'aug_max': np.random.randint(5, 30),
    'aug_p': 0.3 * np.random.rand(),
}
# PEFT config class plus its constructor kwargs for the sampled method.
config_class, config_kwargs = get_random_config(config_type, pred_type, TARGET_MODEL)

In [9]:
# Echo the sampled settings so they are recorded in the cell output.
TARGET_MODEL, pred_type, config_type, epochs, alpha, aug_kwargs, config_class, config_kwargs

('mistralai/Mistral-7B-v0.1',
 'LM',
 'LoRA',
 2,
 0.08,
 {'aug_max': 24, 'aug_p': 0.017483054157222764},
 peft.tuners.lora.config.LoraConfig,
 {'r': 64,
  'lora_alpha': 16,
  'lora_dropout': 0.19961801556694317,
  'bias': 'none',
  'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj']})

In [7]:
# Load saved hyper-parameters for inference (uncomment to use)
# with open(save_model_path+'/config.pkl', 'rb') as pickle_file:
#     config = pickle.load(pickle_file)
# TARGET_MODEL, pred_type, config_type, epochs, alpha, aug_kwargs, config_class, config_kwargs = load_config(config)

Data

In [10]:
# Training and validation splits, read from CSV.
train = pd.read_csv('data/train_data2.csv')
val = pd.read_csv('data/val_data2.csv')

In [11]:
# Load the tokenizer that matches TARGET_MODEL.
tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably the checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [12]:
# Word-level synonym augmenter using the PPDB source.
# PPDB download: http://paraphrase.org/#/download
# nlpaug examples: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb
aug = naw.SynonymAug(
    aug_src='ppdb',
    model_path='Model/ppdb-2.0-s-all',
    aug_min=1,
    **aug_kwargs,
)
# print(aug.augment(train.text.iloc[3]))

In [13]:
# Only the LM variant gets a tokenized question; classification passes None
# for both the prompt ids and its mask.
if pred_type != 'LM':
    prompt, prompt_mask = None, None
else:
    encoded_prompt = tokenizer.batch_encode_plus(
        ['Is this essay AI-generated, yes or no?'],
        add_special_tokens=False,
        return_tensors='pt',
    )
    prompt, prompt_mask = encoded_prompt['input_ids'], encoded_prompt['attention_mask']

# Bind the tokenizer and prompt once so each DataLoader receives a one-argument collate function.
train_collate = partial(collate_fn, tokenizer=tokenizer, prompt=prompt, prompt_mask=prompt_mask)
val_collate = partial(collate_inf, tokenizer=tokenizer, prompt=prompt, prompt_mask=prompt_mask)

# Training set: shuffled, with the synonym augmenter handed to the dataset.
train_data = TxtData(train, aug.augment)
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=train_collate,
)

# Validation set: fixed order, twice the training batch size.
val_data = InfData(val)
val_loader = DataLoader(
    val_data,
    batch_size=batch_size*2,
    shuffle=False,
    num_workers=num_workers,
    collate_fn=val_collate,
)
# input_ids,attention_mask, label, score, topic = next(iter(train_loader))

Model

In [14]:
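# Manual patch to modeling_mistral.py: change the `hidden_states` field of the
# returned output so it carries the local `hidden_states` variable.
# return CausalLMOutputWithPast(
#     loss=loss,
#     logits=logits,
#     past_key_values=outputs.past_key_values,
#     hidden_states=outputs.hidden_states,   <- presumably the original line; replace it with the next line
#     hidden_states=hidden_states,
#     attentions=outputs.attentions,
# )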

In [15]:
# Passed as the final argument to LM(...) / Classification(...) when the model wrapper is built.
# NOTE(review): presumably the weight of the topic-model loss term — confirm in functions.py.
beta_ = 0.25

In [16]:
# Build the quantized base model, wrap it with the sampled PEFT method, and
# create the optimizer and the loss/prediction wrapper used by the training loop.
# (clip_grad_value_, BitsAndBytesConfig and get_peft_model are imported in the
# notebook's first cell together with torch.)

# 4-bit NF4 weights with double quantization; bfloat16 is the compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Causal-LM head for the 'LM' variant, sequence-classification head otherwise.
base_class = AutoModelForCausalLM if pred_type == 'LM' else AutoModelForSequenceClassification
base_model = base_class.from_pretrained(
    TARGET_MODEL,
    quantization_config=nf4_config,
    device_map={"": 0},          # place the whole model on GPU 0
    use_flash_attention_2=True,
)
# base_model.config.output_hidden_states = True
peft_config = config_class(**config_kwargs)
model = get_peft_model(base_model, peft_config)
model.config.pad_token_id = tokenizer.pad_token_id
# model.print_trainable_parameters()

# Auxiliary MLP whose input width is the backbone's hidden size.
# NOTE(review): the remaining MLP arguments (14, 2, 2048, 0.1) are defined in functions.py — confirm their meaning there.
topicModel = MLP(model.config.hidden_size, 14, 2, 2048, 0.1).to(device)

# Optimize only the PEFT parameters that require gradients, plus every topicModel parameter.
trainable_params = [param for param in model.parameters() if param.requires_grad] + list(topicModel.parameters())
# Names of the trainable PEFT parameters; used later to filter the checkpoint.
trainable_names = [name for name, param in model.named_parameters() if param.requires_grad]
# optimizer = torch.optim.AdamW(trainable_params,lr = lr,amsgrad=True,weight_decay=6e-3)
optimizer = torch.optim.SGD(trainable_params, lr=lr)

if pred_type == 'LM':
    # num_virtual_tokens is absent from some config kwargs (e.g. the LoRA ones shown above), hence the default 0.
    model_lm = LM(model, tokenizer, config_kwargs.get('num_virtual_tokens', 0), alpha, config_type, topicModel, beta_)
else:
    model_lm = Classification(model, alpha, topicModel, beta_)
Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
def _predict_all(wrapper, loader):
    """Run `wrapper.predict` over every batch of `loader` and return the concatenated NumPy scores."""
    batch_scores = []
    # Inference only: no_grad avoids building autograd graphs for the validation passes.
    with torch.no_grad():
        for val_ids, val_mask in loader:
            val_ids, val_mask = val_ids.to(device), val_mask.to(device)
            batch_scores.append(wrapper.predict(val_ids, val_mask).detach().cpu().numpy())
    return np.concatenate(batch_scores)


# Training with gradient accumulation and periodic validation.
# loss_fn / loss_fct are kept for compatibility but are not referenced in this cell.
loss_fn = nn.BCEWithLogitsLoss()
loss_fct = torch.nn.CrossEntropyLoss()
best_auc = 0
# Set lookup keeps the checkpoint filter cheap on a large state_dict.
trainable_name_set = set(trainable_names)
# NOTE(review): topicModel stays in train mode during validation — confirm predict() does not use it.
topicModel.train()
for epoch in range(epochs):
    model.train()
    train_loss = 0   # sum of finite losses since the last validation
    skip = 0         # batches with a non-finite loss since the last validation
    for i, (input_ids, attention_mask, label, score, topic) in enumerate(train_loader):
        # train
        input_ids, attention_mask, label, score, topic = (
            tensor.to(device) for tensor in (input_ids, attention_mask, label, score, topic)
        )
        loss = model_lm.get_loss(input_ids, attention_mask, label, score, topic)

        # Read the scalar once; each .item() call copies from the GPU.
        loss_value = loss.item()
        if math.isfinite(loss_value):
            loss.backward()
            train_loss += loss_value
        else:
            # Drop this batch's gradient but still fall through to the boundary
            # checks below. The previous `continue` skipped them, so a bad batch
            # landing on a boundary lost the optimizer step or the validation run
            # and corrupted the running-loss average.
            skip += 1
        # print(i,train_loss)

        if (i + 1) % accumulation_steps == 0:
            clip_grad_value_(trainable_params, clip)
            optimizer.step()
            optimizer.zero_grad()

        # eval
        if (i + 1) % verbose == 0:
            model.eval()
            # Average over the batches that contributed; guard against all being skipped.
            train_loss /= max(verbose - skip, 1)
            yhat = _predict_all(model_lm, val_loader)
            auc = roc_auc_score(val.label.to_numpy(), yhat)
            print(f"epoch {epoch} iter {i}: train loss {train_loss}, test AUC {auc}")
            if auc > best_auc:
                best_auc = auc
                # Save only the trainable PEFT weights of the best checkpoint.
                torch.save(
                    {k: v for k, v in model.state_dict().items() if k in trainable_name_set},
                    save_model_path+'/weights.pth',
                )
            train_loss = 0
            skip = 0
            model.train()

In [None]:
# Persist this run's hyper-parameters together with its best validation AUC.
config = save_config(TARGET_MODEL, pred_type, config_type, epochs, alpha, aug_kwargs, config_kwargs)
config['local auc'] = best_auc

config_path = save_model_path+'/config.pkl'
with open(config_path, 'wb') as config_file:
    pickle.dump(config, config_file)

Inference

In [None]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModelForCausalLM
# import torch.nn as nn
# import pandas as pd
# import numpy as np
# import math
# import pickle
# from sklearn.metrics import roc_auc_score
# import nlpaug.augmenter.word as naw
# from functools import partial
# from functions import *
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# batch_size = 1
# num_workers = 16
# accumulation_steps = 64
# verbose = 2048
# lr = 8e-5
# clip = 8e-3

# save_model_path = 'Model/model03'
# device = 'cuda'
# # Load saved hyper-parameters for inference
# with open(save_model_path+'/config.pkl', 'rb') as pickle_file:
#     config = pickle.load(pickle_file)
# TARGET_MODEL, pred_type, config_type, epochs, alpha, aug_kwargs, config_class, config_kwargs = load_config(config)

# val = pd.read_csv('data/val_data.csv')
# tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL)
# tokenizer.pad_token = tokenizer.eos_token
# if pred_type == 'LM':
#     prompt = tokenizer.batch_encode_plus(['Is this essay AI-generated, yes or no?'],add_special_tokens=False,return_tensors='pt')
#     prompt,prompt_mask = prompt['input_ids'],prompt['attention_mask']
# else:
#     prompt,prompt_mask = None, None
# val_data = InfData(val)
# val_loader = DataLoader(val_data, batch_size=batch_size*4, shuffle=False, num_workers=num_workers, \
#                         collate_fn=partial(collate_inf,tokenizer=tokenizer,prompt=prompt,prompt_mask=prompt_mask))
# # input_ids,attention_mask, label, score = next(iter(train_loader))

# from torch.nn.utils import clip_grad_value_
# from transformers import BitsAndBytesConfig
# from peft import get_peft_model

# nf4_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_quant_type="nf4",
#    bnb_4bit_use_double_quant=True,
#    bnb_4bit_compute_dtype=torch.bfloat16
# )
# base_class = AutoModelForCausalLM if pred_type == 'LM' else AutoModelForSequenceClassification
# base_model = base_class.from_pretrained(TARGET_MODEL,quantization_config=nf4_config, \
#                                                           device_map={"":0},use_flash_attention_2=True)
# peft_config = config_class(**config_kwargs)    
# model = get_peft_model(base_model, peft_config)
# model.config.pad_token_id = tokenizer.pad_token_id

# model.load_state_dict(torch.load(save_model_path+'/weights.pth'),strict=False)
# model = model.half()
# if pred_type == 'LM':
#     model_lm = LM(model,tokenizer,config_kwargs.get('num_virtual_tokens', 0),alpha,config_type)
# else:
#     model_lm = Classification(model,alpha)

# yhat = []
# for input_ids,attention_mask in val_loader:
#     input_ids,attention_mask = input_ids.to('cuda'),attention_mask.to('cuda')
#     out = model_lm.predict(input_ids,attention_mask).detach().cpu().numpy()
#     yhat.append(out)
# yhat = np.concatenate(yhat)
# auc = roc_auc_score(val.label.to_numpy(), yhat)
# print(auc)