In [48]:
import random
import torch
import pickle as pkl

from transformers import AutoTokenizer, BloomForCausalLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

In [35]:
# Load model and tokenizer
# Single source of truth for the checkpoint id, so the tokenizer and the model
# can never be loaded from two different checkpoints by accident.
MODEL_NAME = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = BloomForCausalLM.from_pretrained(MODEL_NAME)
# Maximum number of tokens per example handed to the tokenization step.
context_length = 2048

In [52]:
# Load data
split_percent = 0.95  # fraction of the examples that goes to the training split
split_seed = 42  # fixed seed so the train/validation split is identical on every run

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only point this at a trusted, locally produced file.
# The context manager closes the file handle, which the bare open() never did.
with open("conv_dicts/530_human_filtered_conv_pairs.pkl", "rb") as data_file:
    data = pkl.load(data_file)

# Shuffle in place with a dedicated, seeded RNG so the split is reproducible
# and the global `random` state is left untouched.
random.Random(split_seed).shuffle(data)
split = int(split_percent * len(data))

train_data = data[:split]
val_data = data[split:]

In [64]:
# tokenize data
# Fall back to EOS for padding only when the tokenizer has no pad token of its
# own. The causal-LM data collator replaces every label equal to pad_token_id
# with the ignore index, so unconditionally aliasing pad to EOS would also
# hide any real EOS tokens from the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize(data, tokenizer, context_length):
    """Tokenize ``data`` and keep only sequences shorter than ``context_length``.

    The tokenizer is invoked with truncation at ``context_length`` and is asked
    to return overflowing tokens and per-sequence lengths. Every returned
    sequence whose reported length is not strictly below ``context_length`` is
    discarded.

    Args:
        data: The text input forwarded to ``tokenizer`` as its first argument.
        tokenizer: Callable returning a mapping with ``"length"`` and
            ``"input_ids"`` entries of equal size.
        context_length: Truncation limit and (exclusive) upper bound on the
            length of a kept sequence.

    Returns:
        A list with the ``input_ids`` of every kept sequence, in the order the
        tokenizer produced them.
    """
    encoded = tokenizer(
        data,
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    lengths = encoded["length"]
    sequences = encoded["input_ids"]

    # Strict comparison: a sequence that exactly fills the context is dropped too.
    return [
        token_ids
        for n_tokens, token_ids in zip(lengths, sequences)
        if n_tokens < context_length
    ]

In [65]:
class DialogueDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized dialogue strings.

    Attributes:
        data_strings: The raw input passed to the constructor, kept as-is.
        tokenized_data: The result of ``tokenize(data_list, tokenizer,
            context_length)``; one token-id sequence per entry.
    """

    def __init__(self, data_list, tokenizer, context_length):
        """Store the raw strings and tokenize them once up front.

        Args:
            data_list: The texts to tokenize.
            tokenizer: Tokenizer forwarded to ``tokenize``.
            context_length: Length limit forwarded to ``tokenize``.
        """
        self.data_strings = data_list
        self.tokenized_data = tokenize(data_list, tokenizer, context_length)

    def __len__(self):
        """Return the number of tokenized sequences that can be indexed."""
        # Must count the tokenized sequences, not the raw strings: tokenize()
        # can drop sequences or emit extra overflow chunks, so the two sizes
        # can differ. Reporting the raw count led to out-of-range indices (or
        # unreachable items) in __getitem__.
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return a copy of the ``idx``-th token-id sequence."""
        # Copy so that callers mutating the item cannot corrupt the stored data.
        return self.tokenized_data[idx].copy()

In [66]:
# Build the training and validation datasets (in that order) with the same
# tokenizer and context length.
train_dataset, val_dataset = (
    DialogueDataset(split_texts, tokenizer, context_length)
    for split_texts in (train_data, val_data)
)

In [67]:
# Collator for language modeling with masked-language-modeling switched off.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [69]:
# Setup trainer args
training_args = TrainingArguments(
    # Where results are written.
    output_dir="./results",
    # Optimization settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate once per epoch; log every 5 steps.
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=5,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [70]:
# init trainer
# Wire the model, its training configuration, both dataset splits and the
# batch collator into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [71]:
# Run fine-tuning. As the last expression of the cell, the returned training
# summary is displayed as the cell output.
trainer.train()

***** Running training *****
  Num examples = 503
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 189
  Number of trainable parameters = 559214592


Epoch,Training Loss,Validation Loss
1,0.7902,0.828489
2,0.3365,0.391211
3,0.1804,0.322937


***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=189, training_loss=0.6709897691610629, metrics={'train_runtime': 937.9351, 'train_samples_per_second': 1.609, 'train_steps_per_second': 0.202, 'total_flos': 335937985609728.0, 'train_loss': 0.6709897691610629, 'epoch': 3.0})

In [73]:
# Save the fine-tuned model; with no argument it goes to the Trainer's
# output_dir ("./results" in the training arguments).
# NOTE(review): the tokenizer was not passed to the Trainer, so it is presumably
# not written alongside the model — confirm before loading from this directory.
trainer.save_model()

Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Configuration saved in ./results/generation_config.json
Model weights saved in ./results/pytorch_model.bin


In [74]:
# Alias the model held by the trainer for the generation cell below.
finetuned_model = trainer.model

In [85]:
# Prompt in the same "user: ...\nchatbot:" format, left open for the model to complete.
line = "user: Do you like watching movies?\nchatbot:"
# Move the encoded prompt to the model's device; otherwise generation fails
# with a device mismatch whenever the model lives on an accelerator.
inputs = tokenizer(line, return_tensors="pt").to(finetuned_model.device)
pruned_times = []  # not used in this cell; kept in case another cell reads it


# Sample up to 50 new tokens with top-k / nucleus sampling. The attention mask
# is passed explicitly so generate() does not have to infer it from the ids.
outputs = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



['user: Do you like watching movies?\nchatbot: Yes, I really enjoy watching movies! My favorite so far is "Juliette Houston," which I think is really gripping and well-done. I think the acting is spot on and the special effects really work. I haven\'t seen any other']
