In [1]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, TrainerCallback

In [2]:
# Training data: the first 8,000 rows of the cleaned Alpaca instruction set.
ds = load_dataset("yahma/alpaca-cleaned", split='train[:8000]')

# Tokenizer and weights are read from the same local checkpoint directory.
MODEL_PATH = "../bloom-1b1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# Bare expression so the notebook renders the tokenizer's repr as cell output.
tokenizer

BloomTokenizerFast(name_or_path='../bloom-1b1', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [3]:
def process_func(example, max_length=256):
    """Convert one instruction record into causal-LM training features.

    The prompt is "Human: <instruction>", a newline and the optional input
    (stripped, so an empty input leaves no dangling newline), followed by a
    blank line and "Assistant: ". The target is the record's output with the
    tokenizer's EOS token appended. Prompt and target are tokenized
    separately with the module-level ``tokenizer`` and then concatenated.

    Args:
        example: Mapping with string fields "instruction", "input" and
            "output".
        max_length: Maximum number of tokens to keep. Longer sequences are
            cut on the right, which can drop the trailing EOS token. Pass
            ``None`` to disable truncation.

    Returns:
        A dict with equally long lists "input_ids", "attention_mask" and
        "labels". Prompt positions in "labels" are set to -100 so that only
        the response tokens act as training targets (-100 is the value the
        loss ignores by convention).

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length is not None and max_length < 0:
        # A negative slice bound would silently chop tokens off the tail.
        raise ValueError(f"max_length must be non-negative or None, got {max_length}")

    prompt_text = "\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    prompt = tokenizer(prompt_text)
    answer = tokenizer(example["output"] + tokenizer.eos_token)

    input_ids = prompt["input_ids"] + answer["input_ids"]
    attention_mask = prompt["attention_mask"] + answer["attention_mask"]
    # Mask the prompt so the loss is computed on the response only.
    labels = [-100] * len(prompt["input_ids"]) + answer["input_ids"]

    # Slicing is a no-op for sequences already within the limit (or when
    # max_length is None), so no explicit length check is needed.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

# Tokenize every record; the original text columns are dropped so only the
# features returned by process_func remain.
tokenized_ds = ds.map(
    process_func,
    remove_columns=ds.column_names,
)

In [4]:
# BitFit: freeze every parameter whose name does not contain "bias", and
# tally the bias parameters against the full parameter count in one pass.

num_param = 0
total_param = 0
for name, param in model.named_parameters():
    size = param.numel()
    total_param += size
    if "bias" in name:
        num_param += size
    else:
        param.requires_grad = False

# Bias parameter count, total parameter count, and the bias fraction.
print(num_param)
print(total_param)
print(num_param / total_param)

408576
1065314304
0.00038352624992069945


In [5]:
# Single-example micro-batches accumulated over 8 steps per optimizer update;
# the running loss is logged every 10 optimizer steps.
args = TrainingArguments(
    output_dir="./chatbot",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    logging_steps=10,
)

# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    data_collator=data_collator,
)

class LossLoggingCallback(TrainerCallback):
    """Trainer callback that records every logged training loss.

    Each time a log dict containing a "loss" entry is emitted, the value is
    appended to ``self.losses`` and a "<global_step>: <loss>" line is
    appended to ``<output_dir>/losses.txt``.
    """

    def __init__(self, output_dir):
        """Store the directory for losses.txt and start an empty loss history."""
        self.output_dir = output_dir
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the loss carried by ``logs``; do nothing if there is none."""
        # ``logs`` defaults to None, and a membership test on None raises
        # TypeError, so bail out early when there is nothing to record.
        if not logs or 'loss' not in logs:
            return

        loss = logs['loss']
        self.losses.append(loss)
        # Append mode keeps the history from earlier log events.
        with open(f"{self.output_dir}/losses.txt", "a") as f:
            f.write(f"{state.global_step}: {loss}\n")

# Losses are appended to ./losses.txt while training runs.
loss_logger = LossLoggingCallback(output_dir="./")
trainer.add_callback(loss_logger)

# Last expression of the cell, so the notebook displays the returned TrainOutput.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,2.4116
20,2.4619
30,2.3903
40,2.4312
50,2.487
60,2.2426
70,2.2196
80,2.3262
90,2.2638
100,2.0452


TrainOutput(global_step=1000, training_loss=1.9403210582733155, metrics={'train_runtime': 1348.5063, 'train_samples_per_second': 5.932, 'train_steps_per_second': 0.742, 'total_flos': 4491070176079872.0, 'train_loss': 1.9403210582733155, 'epoch': 1.0})

In [6]:
model = model.cuda()

# Build the prompt in the same "Human: ... / Assistant: " layout that
# process_func uses for the training examples.
prompt = "Human: {}\n{}".format("How to prepare an exam?", "").strip() + "\n\nAssistant: "
ipt = tokenizer(prompt, return_tensors="pt").to(model.device)

# Sample a continuation; max_length caps the total sequence length passed to
# generate. The decoded text is the cell's displayed output.
generated = model.generate(**ipt, max_length=128, do_sample=True)
tokenizer.decode(generated[0], skip_special_tokens=True)

"Human: How to prepare an exam?\n\nAssistant: As a student, it's important to create a study plan for the exam, and study it in preparation for it. You will want to:\n\nStudy the exam itself: To understand the requirements for each section, and take notes. This will be especially important when studying for a real exam, as it will help you with the exam preparation procedure and help build an exam plan that is specifically tailored to you. Also, when you use the mock test to prepare for a real exam, remember to ask questions related to the material to be studied so you can answer questions using the material at"