In [2]:
import datasets
import random
from transformers import AutoTokenizer
from utils import ChatMlSpecialTokens
import torch
# Raise the element count at which torch starts abbreviating printed tensors.
torch.set_printoptions(threshold=10_000)
# Seed the stdlib RNG; filter_data() below draws from random.random().
random.seed(100)

In [3]:
# Load the dataset previously saved to the local "merge" directory.
data = datasets.load_from_disk("merge")
# Dataset.shuffle does not read the stdlib `random` state, so random.seed(100)
# alone leaves the row order different on every run. Pass the seed explicitly so
# the shuffle, and everything derived from it, is reproducible.
shuffle_data = data.shuffle(seed=100)
shuffle_data

Dataset({
    features: ['data'],
    num_rows: 457887
})

In [4]:
# Peek at the first conversation. Index the row first: selecting the whole 'data'
# column just to read one element can pull every row into memory.
shuffle_data[0]['data']

['i want to design a special  prompts and test it interactively to  predict complications in diabetes from doctor free notes, what dataset quality will be suitable',
 'I would recommend using a high-quality dataset that is well-curated and has been thoroughly cleaned. The dataset should be large enough to provide enough training data for your model, and it should be representative of the real-world data that you will be using.']

In [5]:
def filter_data(sample, min_chars=800, max_chars=3600, long_conv_len=6, short_keep_prob=0.7):
    """Decide whether a conversation is kept, based on its size.

    The size is the total number of *characters* across all messages (not tokens).
    Conversations outside the open interval (min_chars, max_chars) are rejected.
    Of the rest, those with at least ``long_conv_len`` messages are always kept,
    while shorter ones are kept at random with probability ``short_keep_prob``.

    Args:
        sample: Mapping whose 'data' entry is the list of message strings.
        min_chars: Exclusive lower bound on the total character count.
        max_chars: Exclusive upper bound on the total character count.
        long_conv_len: Message count from which a conversation is always kept.
        short_keep_prob: Keep probability for conversations with fewer messages.

    Returns:
        True if the sample should be kept, False otherwise.
    """
    messages = sample['data']
    total_chars = sum(len(text) for text in messages)
    if not (min_chars < total_chars < max_chars):
        return False
    if len(messages) >= long_conv_len:
        return True
    # Only short conversations consume a random draw, so the RNG sequence is
    # the same as with the original nested branches.
    return random.random() < short_keep_prob
# Run filter_data over every row of the shuffled dataset and show what survives.
filtered_data = shuffle_data.filter(function=filter_data)
filtered_data

Filter:   0%|          | 0/457887 [00:00<?, ? examples/s]

Dataset({
    features: ['data'],
    num_rows: 187864
})

In [6]:
# Sanity check: count conversations with an odd number of messages, i.e. ones
# that cannot be split cleanly into (user, assistant) pairs.
invaild = sum(len(messages) % 2 != 0 for messages in filtered_data['data'])
print(invaild)

0


In [7]:
# System prompt that is prepended to every conversation when the chat list is built.
# Note that the literal starts with a newline (right after the opening quotes) and
# ends with an explicit "\n".
system_msg = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"""
# Load the Llama-3 8B tokenizer and replace its chat template with the one
# provided by the project-local ChatMlSpecialTokens helper.
# NOTE(review): the decoded sample further down shows <|start_header_id|> /
# <|eot_id|> markers, so the template in utils appears to be Llama-3 style
# despite the "ChatMl" name; confirm against utils.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
chat_format = ChatMlSpecialTokens()
tokenizer.chat_template = chat_format.chat_template

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# A message ending in a dangling speaker label is treated as unclean data.
dirty_suffixes = ("assistant:", "user:", "Assistant:", "User:")
# Even positions are user messages, odd positions are assistant replies.
speaker_roles = ("user", "assistant")

chat = []
for item in filtered_data:
    messages = item["data"]
    # Skip empty conversations and those with an odd number of messages.
    if not messages or len(messages) % 2 != 0:
        continue
    # One unclean message discards the whole conversation.
    if any(text.strip().endswith(dirty_suffixes) for text in messages):
        continue
    # System prompt first, then the alternating user / assistant messages.
    conv = [{"role": "system", "content": system_msg}]
    conv.extend(
        {"role": speaker_roles[position % 2], "content": text}
        for position, text in enumerate(messages)
    )
    chat.append(conv)

In [9]:
# Each conversation is one system message plus (user, assistant) pairs, so its
# length must be odd; print any conversation that breaks this.
for conversation in chat:
    if len(conversation) % 2 == 0:
        print(conversation)

In [10]:
# Number of conversations left after dropping odd-length and unclean ones.
len(chat)

176306

In [11]:
# Render every conversation through the chat template and tokenize it.
# The result has one entry per conversation, so its length should equal len(chat).
chat_tokens = tokenizer.apply_chat_template(chat, tokenize=True)
len(chat_tokens)

176306

In [12]:
# Decode the first tokenized conversation back to text to inspect the template output.
tokenizer.decode(chat_tokens[0])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.\nYour answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\'t know the answer to a question, please don\'t share false information.\n<|eot_id|><|start_header_id|>user<|end_header_id|>1. Create a Google account if you don\'t already have one.<|eot_id|><|start_header_id|>assistant<|end_header_id|>I don\'t have the ability to create an account. however, you can create a google account by following these steps:\n\n1. go to the google sign-up page: https://accounts.google.com/signup.\n2. enter your first and last name in the appropriate fields.\n3. choose 

In [13]:
# The same conversation as a list of role/content messages, for comparison.
chat[0]

[{'role': 'system',
  'content': "\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.\nYour answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"},
 {'role': 'user',
  'content': "1. Create a Google account if you don't already have one."},
 {'role': 'assistant',
  'content': 'I don\'t have the ability to create an account. however, you can create a google account by following these steps:\n\n1. go to the google sign-up page: https://accounts.google.com/signup.\n2. enter your first and last name in the appropriate fields.\n3. choose a username for your account. this will become your gmail 

In [14]:
# Positions of the conversations whose templated form is shorter than 1024 tokens.
filter_length_idx = [
    position
    for position, token_ids in enumerate(chat_tokens)
    if len(token_ids) < 1024
]
len(filter_length_idx)

176220

In [15]:
# Keep only the conversations at the positions that passed the length filter.
select_chat = [chat[position] for position in filter_length_idx]

In [16]:
# Tokenize the selected conversations again, independently of chat_tokens, so the
# length check that follows validates select_chat itself.
select_chat_format = tokenizer.apply_chat_template(select_chat, tokenize=True)

In [17]:
# Check whether any selected conversation still has 1024 tokens or more.
exceed = [
    position
    for position, token_ids in enumerate(select_chat_format)
    if len(token_ids) >= 1024
]
len(exceed)

0

In [18]:
# Wrap each conversation in a {"messages": ...} record and build a Dataset from them.
records = [{"messages": conversation} for conversation in select_chat]
chat_dataset = datasets.Dataset.from_list(records)

In [19]:
# Hold out 0.7% of the conversations as the test split. The seed is passed
# explicitly because the split's internal shuffle does not read the stdlib
# `random` state; without it the train/test membership changes on every run.
train_test_data = chat_dataset.train_test_split(test_size=0.007, seed=100)
train_test_data

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 174986
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 1234
    })
})

In [20]:
# Write both splits to the local "1024_train_and_test" directory.
train_test_data.save_to_disk("1024_train_and_test")

Saving the dataset (0/1 shards):   0%|          | 0/174986 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1234 [00:00<?, ? examples/s]

In [21]:
# Final check on the train split: with the system message included, a well-formed
# conversation has an odd number of messages. Collect and print any with an even count.
invaild = [
    sample
    for sample in train_test_data['train']
    if len(sample["messages"]) % 2 == 0
]
for sample in invaild:
    print(sample)

In [22]:
# invaild

[]