数据集与模型加载

In [None]:
!pip install datasets
!pip install transformers
!pip install peft

In [None]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, TrainerCallback

In [None]:
# Load the first 8,000 rows of the `train` split of the Alpaca-cleaned
# instruction dataset; each row has `instruction`, `input` and `output` fields.
ds = load_dataset("yahma/alpaca-cleaned", split="train[:8000]")
# Display the dataset summary (feature names and row count).
ds

Dataset({
    features: ['input', 'instruction', 'output'],
    num_rows: 8000
})

In [None]:
# Peek at the first three examples; slicing the dataset yields a dict that maps
# each column name to a list of values.
ds[:3]

{'input': ['', '', ''],
 'instruction': ['Give three tips for staying healthy.',
  'What are the three primary colors?',
  'Describe the structure of an atom.'],
 'output': ['1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.',
  'The three primary colors are red, blue, and yellow. These col

In [None]:
# Load the tokenizer belonging to the same checkpoint as the base model
# ("bigscience/bloom-1b1") that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")
# Display the tokenizer repr (vocab size, padding side, special tokens).
tokenizer

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

BloomTokenizerFast(name_or_path='bigscience/bloom-1b1', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False)

In [None]:
def process_func(example, max_length=256):
    r"""Tokenize one instruction example into causal-LM training features.

    The prompt is ``"Human: <instruction>\n<input>"`` (whitespace-stripped)
    followed by ``"\n\nAssistant: "``; the response is ``<output>`` plus the
    tokenizer's EOS token. Prompt and response are tokenized separately with
    the module-level ``tokenizer`` and concatenated.

    Args:
        example: Mapping with the string fields ``"instruction"``,
            ``"input"`` and ``"output"``.
        max_length: Maximum number of tokens kept per example. Longer
            sequences are truncated on the right, which can cut off the
            trailing EOS token. Defaults to 256.

    Returns:
        A dict with three equally long lists: ``input_ids``,
        ``attention_mask`` and ``labels``. In ``labels`` every prompt position
        holds ``-100`` (the value ignored by the loss) so that only the
        response tokens are supervised.
    """
    prompt_text = "\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    instruction = tokenizer(prompt_text)
    response = tokenizer(example["output"] + tokenizer.eos_token)

    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    # Mask the prompt so the model is only trained to produce the response.
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"]

    # Slicing is a no-op for sequences that are already short enough.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

In [None]:
# Tokenize every example with process_func and drop the original text columns,
# leaving only input_ids / attention_mask / labels.
tokenized_ds = ds.map(process_func, remove_columns=ds.column_names)
# Display the resulting dataset summary.
tokenized_ds

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8000
})

In [None]:
# Sanity check: decode one tokenized example back to text to verify the
# "Human: ... Assistant: ..." template and the trailing EOS token.
tokenizer.decode(tokenized_ds[1]["input_ids"])

'Human: What are the three primary colors?\n\nAssistant: The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the additive color system, used for light, the primary colors are red, green, and blue (RGB).</s>'

In [None]:
# Sanity check: decode only the supervised label tokens, skipping the -100
# placeholders that mask the prompt positions.
tokenizer.decode([token_id for token_id in tokenized_ds[1]["labels"] if token_id != -100])

'The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the additive color system, used for light, the primary colors are red, green, and blue (RGB).</s>'

In [None]:
# Load the BLOOM-1b1 base model that will be adapted below.
# NOTE(review): low_cpu_mem_usage=True is presumably meant to lower peak host
# memory while the weights are loaded; confirm against the transformers docs.
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b1", low_cpu_mem_usage=True)

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.13G [00:00<?, ?B/s]

In [None]:
from peft import PrefixTuningConfig, get_peft_model, TaskType

# Prefix-tuning configuration for a causal LM with 10 virtual tokens.
# With prefix_projection=True, the prompt encoder printed later in this notebook
# consists of an embedding followed by a Linear -> Tanh -> Linear transform.
config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=10, prefix_projection=True)
# Display the resolved configuration.
config

PrefixTuningConfig(peft_type=<PeftType.PREFIX_TUNING: 'PREFIX_TUNING'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, num_virtual_tokens=10, token_dim=None, num_transformer_submodules=None, num_attention_heads=None, num_layers=None, encoder_hidden_size=None, prefix_projection=True)

In [None]:
# Wrap the base model with the prefix-tuning adapter described by `config`.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# without reloading the base model would wrap an already wrapped model.
model = get_peft_model(model, config)

In [None]:
# Inspect the prompt (prefix) encoder module(s) attached to the PEFT model.
model.prompt_encoder

ModuleDict(
  (default): PrefixEncoder(
    (embedding): Embedding(10, 1536)
    (transform): Sequential(
      (0): Linear(in_features=1536, out_features=1536, bias=True)
      (1): Tanh()
      (2): Linear(in_features=1536, out_features=73728, bias=True)
    )
  )
)

In [None]:
# Report the number of trainable parameters versus all parameters.
model.print_trainable_parameters()

trainable params: 115,696,128 || all params: 1,181,010,432 || trainable%: 9.796367996857796


In [None]:
# Training hyper-parameters: one sample per device per forward pass, with
# gradients accumulated over 8 passes. The run recorded in this notebook shows
# 1000 optimizer steps for the 8000 examples.
args = TrainingArguments(
    output_dir="./chatbot",  # directory for the Trainer's outputs
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # accumulate 8 batches per optimizer step
    logging_steps=10,  # log the training loss every 10 steps
    num_train_epochs=1
)

In [None]:
# Build the Trainer around the PEFT model and the tokenized dataset.
# NOTE(review): DataCollatorForSeq2Seq with padding=True is presumably used so
# that `labels` get padded together with the inputs in each batch; confirm
# against the transformers data collator docs.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

In [None]:
class LossLoggingCallback(TrainerCallback):
    """Trainer callback that records every logged training loss.

    Each loss is kept in memory in ``self.losses`` and appended as a
    ``"<global_step>: <loss>"`` line to ``<output_dir>/losses.txt``.
    """

    def __init__(self, output_dir):
        """Remember the target directory and start with an empty loss history.

        Args:
            output_dir: Directory in which ``losses.txt`` is written.
        """
        self.output_dir = output_dir
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store and persist the loss carried by a log event, if there is one.

        Args:
            args: Unused; part of the callback signature.
            state: Object whose ``global_step`` is written next to the loss.
            control: Unused; part of the callback signature.
            logs: Mapping of logged values, or ``None``.
            **kwargs: Ignored.
        """
        # `logs` defaults to None; guard so the membership test cannot raise
        # a TypeError, and skip log events that carry no training loss.
        if not logs or 'loss' not in logs:
            return
        loss = logs['loss']
        self.losses.append(loss)
        # Append mode keeps the losses of earlier log events (and earlier runs).
        with open(f"{self.output_dir}/losses.txt", "a") as f:
            f.write(f"{state.global_step}: {loss}\n")

# Register the loss logger (it appends to ./losses.txt in the current working
# directory) before starting the training run.
trainer.add_callback(LossLoggingCallback(output_dir="./"))
trainer.train()

  0%|          | 0/1000 [00:00<?, ?it/s]

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 3.484, 'learning_rate': 4.9500000000000004e-05, 'epoch': 0.01}
{'loss': 2.2931, 'learning_rate': 4.9e-05, 'epoch': 0.02}
{'loss': 2.136, 'learning_rate': 4.85e-05, 'epoch': 0.03}
{'loss': 2.1706, 'learning_rate': 4.8e-05, 'epoch': 0.04}
{'loss': 2.0298, 'learning_rate': 4.75e-05, 'epoch': 0.05}
{'loss': 1.9578, 'learning_rate': 4.7e-05, 'epoch': 0.06}
{'loss': 2.1358, 'learning_rate': 4.6500000000000005e-05, 'epoch': 0.07}
{'loss': 1.8626, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.08}
{'loss': 1.8911, 'learning_rate': 4.55e-05, 'epoch': 0.09}
{'loss': 1.9588, 'learning_rate': 4.5e-05, 'epoch': 0.1}
{'loss': 1.7925, 'learning_rate': 4.4500000000000004e-05, 'epoch': 0.11}
{'loss': 1.8493, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.12}
{'loss': 2.0942, 'learning_rate': 4.35e-05, 'epoch': 0.13}
{'loss': 1.9398, 'learning_rate': 4.3e-05, 'epoch': 0.14}
{'loss': 1.8094, 'learning_rate': 4.25e-05, 'epoch': 0.15}
{'loss': 1.8911, 'learning_rate': 4.2e-05, 'epoch': 0.

TrainOutput(global_step=1000, training_loss=1.8318476066589355, metrics={'train_runtime': 1903.1187, 'train_samples_per_second': 4.204, 'train_steps_per_second': 0.525, 'train_loss': 1.8318476066589355, 'epoch': 1.0})

In [None]:
from peft import PeftModel, PeftConfig
from transformers.trainer_utils import get_last_checkpoint

# Resolve the newest checkpoint written by the training run instead of relying
# on a hard-coded absolute path: the run in this notebook writes to "./chatbot"
# and stops at global step 1000, so a fixed ".../checkpoint-4000" does not exist.
checkpoint_dir = get_last_checkpoint("./chatbot")
if checkpoint_dir is None:
    raise FileNotFoundError("No checkpoint-* directory found under ./chatbot; run the training cell first.")

# Reload a fresh base model and attach the trained prefix-tuning weights.
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b1")
model = PeftModel.from_pretrained(model, checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-1b1")
model = model.cuda()

# Build the prompt with the same "Human: ... Assistant: " template used for
# training, then sample a completion of at most 128 tokens (prompt included).
ipt = tokenizer("Human: {}\n{}".format("How to prepare an exam?", "").strip() + "\n\nAssistant: ", return_tensors="pt").to(model.device)
tokenizer.decode(model.generate(**ipt, max_length=128, do_sample=True)[0], skip_special_tokens=True)

"Human: How to prepare an exam?\n\nAssistant: Preparing for the upcoming exam can be challenging, especially if you haven't taken a class before and don't feel comfortable with the materials being presented. Here are a few helpful tips to help you with the exam preparation process:\n\n1. Know the Test Pre-Tests (TPT):\nAs recommended by some of the test authorities, take time to familiarize yourself with the test prior to taking the exam. This will help you avoid problems during the exam and ensure that you do not find yourself feeling confused or overwhelmed at the time the test is coming up.\n\n2. Practice"