In [10]:
import os

train_data_path = "./data/train_essays.csv"
train_prompts_path = "./data/train_prompts.csv"
supplement_data_dir = "./data/archive/"
supplement_data_files = [  os.path.join(supplement_data_dir,f)  
                          for f in os.listdir(supplement_data_dir)
                          if(f.endswith('.csv'))]

In [11]:
def set_proxy():
    import os
    cache_dir = "/home/tx/workspace/cache"  # 替换为你想要的缓存目录的路径
    os.environ['TOKENIZERS_PARALLELISM'] = 'true'
    # 代理
    os.environ['http_proxy'] = 'http://127.0.0.1:7890'
    os.environ['https_proxy'] = 'http://127.0.0.1:7890'
    os.environ['no_proxy'] = '127.0.0.1,localhost'
    os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
    os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'
    os.environ['NO_PROXY'] = '127.0.0.1,localhost'
    return
set_proxy()

In [12]:
#!curl huggingface.co

In [13]:
import pandas as pd
train_data=pd.read_csv(train_data_path)
train_prompts = pd.read_csv(train_prompts_path)
instructions = {
    0:train_prompts['instructions'][0],
    1:train_prompts['instructions'][1],
}

train_data['prompt'] = train_data.apply(
    lambda r: instructions[r['prompt_id']] if  r['prompt_id'] in instructions else -1,axis=1
)


In [14]:
train_data

Unnamed: 0,id,prompt_id,text,generated,prompt
0,0059830c,0,Cars. Cars have been around since they became ...,0,Write an explanatory essay to inform fellow ci...
1,005db917,0,Transportation is a large necessity in most co...,0,Write an explanatory essay to inform fellow ci...
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0,Write an explanatory essay to inform fellow ci...
3,00940276,0,How often do you ride in a car? Do you drive a...,0,Write an explanatory essay to inform fellow ci...
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0,Write an explanatory essay to inform fellow ci...
...,...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0,Write a letter to your state senator in which ...
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0,Write an explanatory essay to inform fellow ci...
1375,ffa247e0,0,There's a new trend that has been developing f...,0,Write an explanatory essay to inform fellow ci...
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0,Write an explanatory essay to inform fellow ci...


In [15]:


supplement_train_data = pd.concat([
    pd.read_csv(f)
    for f in supplement_data_files
])


In [16]:
supplement_train_data.label.value_counts()

label
0    115372
1     44084
Name: count, dtype: int64

In [17]:
supplement_train_data['generated'] = supplement_train_data['label']

supplement_train_data = supplement_train_data[['text','generated','prompt']]

In [18]:
supplement_train_data[supplement_train_data.generated == 1]

Unnamed: 0,text,generated,prompt
7,"As an eighth-grade student, I have noticed tha...",1,
10,Studying Science and History at Generic_School...,1,
17,"As an artist, I am constantly developing and h...",1,
22,After conducting extensive research on the eff...,1,
24,The COVID-19 pandemic has brought about a sign...,1,
...,...,...,...
39767,Sure thing! Here's my attempt at writing an es...,1,Task: Research why it is beneficial for people...
39770,I think our principal's idea of making us do e...,1,Your principal has decided that all students m...
39771,I think it's a good idea for schools to have o...,1,Some schools have implemented policies that al...
39778,Advantages of Limiting Car Usage\n\nLimiting c...,1,


In [19]:
import gc
#del train_data_re
gc.collect()
train_data_re = pd.DataFrame(train_data[['text','generated','prompt']])


In [20]:
train_data_all = pd.concat([train_data,supplement_train_data])

In [21]:
train_data_all.generated.value_counts()

generated
0    116747
1     44087
Name: count, dtype: int64

## build base_model

In [23]:
cache_dir = "/home/tx/workspace/cache"  # 替换为你想要的缓存目录的路径
from transformers import AutoModelForCausalLM,AutoTokenizer
from transformers import AutoConfig, LlamaConfig 
# from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
# class LlavaConfig(LlamaConfig):
#     model_type = "llava"
# AutoConfig.register("llava", LlavaConfig)
model_name = "bigscience/bloom-3b"
#model_name = "ChocoWu/nextgpt_7b_tiva_v0"#"liuhaotian/llava-v1.5-7b"
original_model = AutoModelForCausalLM.from_pretrained(model_name,cache_dir=cache_dir,trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name,cache_dir=cache_dir)
# tokenizer.add_tokens(["[PAD]"])

## dataset

In [24]:
#train_data_re

In [25]:
def generate_prompt(prompt,feature_text):
    """
    
    """
    text= None 
    if(prompt is not None):
        template_with_prompt = f"""
        a prompt is shown as follows: 
        {prompt}.
        text generated by this this prompt is shown as below:
        {feature_text}.
        is this text generated by AI?
        """
        text = template_with_prompt
    else:
        template_without_prompt = f"""
        text generated without any prompt is shown as below:
        {feature_text}.
        is this text generated by AI?
        """
        text = template_without_prompt

    return text

In [26]:

from torch.utils.data import TensorDataset,DataLoader,RandomSampler
import torch
from tqdm.auto import tqdm

class myDataset(TensorDataset):

    def __init__(self, datalist,max_length=256,tokenizer=None,preprocess_func= None) -> None:
        super(myDataset,self).__init__()

        if(isinstance(datalist,pd.DataFrame)):
            self.datalist = datalist.to_dict(orient='list')
        elif(isinstance(datalist,dict)):
            self.datalist = datalist
        else:
            raise Exception("错误输入类型")
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.preprocess_func = preprocess_func
    
    def preprocess(self):
        datalist_tmp = {
            "text":[],
            "prompt":[],
            "generated":[]
        }
        for idx in tqdm(range(len(self))):
           _, act_len = self[idx]
           if(act_len > self.max_length):
               continue
           
           datalist_tmp['text'].append(self.datalist['text'][idx])
           datalist_tmp['prompt'].append(self.datalist['prompt'][idx])
           datalist_tmp['generated'].append(self.datalist['generated'][idx])
        
        self.datalist = datalist_tmp
        return



    def __len__(self):
        return len(self.datalist['text'])
    
    def __getitem__(self, index):
        text = self.datalist['text'][index]  
        prompt = self.datalist['prompt'][index]  

        final_text = self.preprocess_func(prompt,text)
        input_ids = self.tokenizer.encode(final_text)
        att_mask = [1] * len(input_ids)


        labels = None
        if('generated' in self.datalist):
            generated = self.datalist['generated'][index]
            label_text = " the text is generated by AI" if generated > 0 else "the text is written by students"
            label_ids = self.tokenizer.encode(label_text)

            labels = [self.tokenizer.eos_token_id]  * len(input_ids)
            labels = labels + label_ids
            input_ids = input_ids + label_ids   
            att_mask = [1] * len(input_ids)
        act_len = len(input_ids)
        while(len(input_ids) < self.max_length):
            input_ids.append(self.tokenizer.eos_token_id)
            if(labels is not None):
                labels.append(self.tokenizer.eos_token_id)
            att_mask.append(0)
        input_ids = input_ids[:self.max_length]
        labels = labels[:self.max_length]
        att_mask = att_mask[:self.max_length]
        if(labels is not None):
            return {'input_ids':torch.LongTensor(input_ids),'labels':torch.LongTensor(labels),'att_mask':torch.LongTensor(att_mask)},act_len
        else:
            return {'input_ids':torch.LongTensor(input_ids),'att_mask':torch.LongTensor(att_mask)}

        



In [27]:
train_data_all_sample = train_data_all.groupby(['generated']).sample(n=4000,replace=False)

In [28]:
dataset = myDataset(datalist=train_data_all_sample,max_length=512,tokenizer=tokenizer,preprocess_func=generate_prompt) 

dataset.preprocess()

  0%|          | 0/8000 [00:00<?, ?it/s]

In [33]:
len(dataset)

5247

In [34]:
random_sampler = RandomSampler(dataset)

In [35]:

dataloader_random = DataLoader(dataset, batch_size=2, sampler=random_sampler)


In [36]:
import torch

In [37]:
#model(input_ids = torch.LongTensor([dataset[0]['input_ids']]),attention_mask=torch.LongTensor([dataset[0]['att_mask']]))

In [38]:
# for x in original_model.named_parameters():
#      print(x)
# original_model

## model definition

In [39]:
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from peft import LoraConfig, TaskType
#lora_target_modules = ["query_key_value"]
lora_target_modules = [ f"transformer.h.{ly}.self_attention.query_key_value" for ly in range(25,29) ]

peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, 
                         inference_mode=False, target_modules=lora_target_modules,
                         r=4, lora_alpha=16, lora_dropout=0.1)
model = get_peft_model(original_model,peft_config)
print(model.print_trainable_parameters())


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.7/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 5.2
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/tx/.conda/envs/llava/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so...


  warn(


trainable params: 163,840 || all params: 3,002,721,280 || trainable%: 0.005456383883888151
None


In [40]:
model = model.cuda()

In [41]:
import torch 
loss_fn = torch.nn.CrossEntropyLoss(reduction='mean',ignore_index=tokenizer.pad_token_id)



In [42]:
for x in model.parameters():
    print(x)

Parameter containing:
tensor([[ 0.0011,  0.0049,  0.0007,  ..., -0.0077,  0.0007,  0.0030],
        [ 0.0023, -0.0144,  0.0066,  ..., -0.0262,  0.0102,  0.0046],
        [ 0.0016, -0.0141,  0.0037,  ..., -0.0114, -0.0110, -0.0043],
        ...,
        [ 0.0008, -0.0010, -0.0003,  ...,  0.0008, -0.0002, -0.0005],
        [ 0.0008, -0.0010, -0.0003,  ...,  0.0008, -0.0002, -0.0005],
        [ 0.0008, -0.0010, -0.0003,  ...,  0.0008, -0.0002, -0.0005]],
       device='cuda:0')
Parameter containing:
tensor([0.7578, 0.5562, 0.6216,  ..., 0.6353, 0.5747, 0.9253], device='cuda:0')
Parameter containing:
tensor([-0.0575,  0.0819, -0.0065,  ...,  0.0696, -0.0514,  0.1059],
       device='cuda:0')
Parameter containing:
tensor([0.4917, 0.6392, 0.6440,  ..., 0.5195, 0.6235, 0.4048], device='cuda:0')
Parameter containing:
tensor([0.0128, 0.0041, 0.0119,  ..., 0.0114, 0.0082, 0.0043], device='cuda:0')
Parameter containing:
tensor([[ 0.0198, -0.0190,  0.0406,  ..., -0.0245, -0.0222, -0.0328],
       

In [43]:
import torch
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup
optimizer = torch.optim.AdamW(
    [
        {'params': [p for p in model.parameters() if p.requires_grad],'lr': 5e-5},
    ]
)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(dataloader_random) ),
)

In [49]:
def train():
    num_epochs = 1
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        pbar = tqdm(dataloader_random)
        for step, batch_a in enumerate(pbar):
            batch,_ = batch_a
            batch = {k: v.cuda() for k, v in batch.items()}
            #print(batch)
            #outputs = model(batch['input_ids'],labels=batch['labels'])
            labels_tensor = batch['labels']
            outputs = model(batch['input_ids'])
            
            logits = outputs.logits
            logits = logits[...,:-1,:].contiguous()
            labels_tensor = labels_tensor[...,1:].contiguous()
            
            
            #loss = outputs.loss
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels_tensor.view(-1))
        
            total_loss += loss.detach().float()
            #print(loss)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            if(step % 10 == 0):
                pbar.set_description(f"step {step} loss {loss}")
            
#train()           

# Test

In [48]:
test_data_path = "./data/test_essays.csv"
test_data = pd.read_csv(test_data_path)

In [47]:
test_data

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [50]:
import os
save_dir =  "/home/tx/workspace/saved_model"
if(not os.path.exists(save_dir)):
    os.makedirs(save_dir)

In [None]:
model_save_name = 