In [1]:
import random
import torch
import pickle as pkl

from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

from transformers.models.bloom.configuration_bloom import BloomConfig
from pruning.pruned_bloom import PrunedBloomForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained tokenizer published as "bigscience/bloom-560m".
# (Only the tokenizer is loaded here; the model is built in a later cell.)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
# Maximum token length handed to the tokenizer when the datasets are built.
context_length = 2048

In [3]:
# Locations of the pruned checkpoint and of the state-dict shapes pickle that
# is handed to PrunedBloomForCausalLM alongside the config.
state_dict_shapes_path = "pruned_30percent_560m_bloom_state_dict_shapes.pkl"
weights_path = "pruned_30percent_560m_bloom.pt"

# Model configuration, grouped by concern. All values are passed by keyword,
# so the grouping below is purely for readability.
bloom_config = BloomConfig(
    # --- size / architecture ---
    vocab_size=250880,
    hidden_size=1024,
    n_layer=24,
    n_head=16,
    seq_length=2048,
    offset_alibi=100,
    # --- special token ids ---
    unk_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    pad_token_id=3,
    # --- initialisation / normalisation / dropout ---
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    apply_residual_connection_post_layernorm=False,
    # --- runtime and kernel switches ---
    use_cache=True,
    pretraining_tp=1,  # TP rank used when training with megatron
    slow_but_exact=False,
    attention_softmax_in_fp32=True,
    bias_dropout_fusion=True,
    masked_softmax_fusion=True,
    skip_bias_add=True,
    skip_bias_add_qkv=False,
)

In [4]:
# Build the pruned model from the config and the state-dict shapes file, then
# restore the saved weights into it.
pruned_model = PrunedBloomForCausalLM(bloom_config, state_dict_shapes_path)
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on
# a machine without one; without it torch.load tries to restore tensors onto
# the device they were saved from.
# NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
pruned_model.load_state_dict(torch.load(weights_path, map_location="cpu"))

<All keys matched successfully>

In [5]:
# Load data
split_percent = 0.95  # fraction of the examples that goes to the training set
split_seed = 42  # fixed seed so the train/val split is identical on every run

# Use a context manager so the file handle is closed once the data is read.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# use it on trusted data.
with open("conv_dicts/530_human_filtered_conv_pairs.pkl", "rb") as data_file:
    data = pkl.load(data_file)

# Shuffle with a private, seeded RNG: the split is reproducible and the global
# `random` state is left untouched.
random.Random(split_seed).shuffle(data)
split = int(split_percent * len(data))

train_data = data[:split]
val_data = data[split:]

In [6]:
# tokenize data
# Fall back to EOS for padding only when the tokenizer has no pad token of its
# own. Overriding an existing pad token would make the tokenizer disagree with
# the distinct pad_token_id=3 / eos_token_id=2 declared in bloom_config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize(data, tokenizer, context_length):
    """Tokenize ``data`` and keep the id sequences shorter than ``context_length``.

    Args:
        data: Text input forwarded unchanged to ``tokenizer``.
        tokenizer: Callable invoked with truncation enabled,
            ``max_length=context_length``, overflowing tokens returned and
            lengths returned; its result must expose ``"length"`` and
            ``"input_ids"``.
        context_length: Exclusive upper bound on the kept sequence length.

    Returns:
        A list of ``input_ids`` entries whose reported length is strictly less
        than ``context_length``. Entries that reach ``context_length`` are
        dropped, so the result can have a different number of items than
        ``data``.
    """
    encoded = tokenizer(
        data,
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    lengths = encoded["length"]
    sequences = encoded["input_ids"]
    # Strict comparison: anything that fills the whole context is discarded.
    return [ids for n_tokens, ids in zip(lengths, sequences) if n_tokens < context_length]

In [7]:
class DialogueDataset(torch.utils.data.Dataset):
    """Map-style dataset serving the tokenized form of a list of dialogue strings."""

    def __init__(self, data_list, tokenizer, context_length):
        """Tokenize ``data_list`` once and keep both the raw and tokenized forms.

        Args:
            data_list: Raw dialogue strings.
            tokenizer: Tokenizer forwarded to ``tokenize``.
            context_length: Length limit forwarded to ``tokenize``.
        """
        self.data_strings = data_list
        self.tokenized_data = tokenize(data_list, tokenizer, context_length)

    def __len__(self):
        """Return the number of tokenized examples that can actually be indexed."""
        # Items are served from tokenized_data, and tokenize() filters entries
        # by length, so that list may not match data_strings one-to-one.
        # Reporting len(data_strings) could make a sampler request an index
        # that does not exist (or never visit some examples).
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return a copy of the token ids at ``idx`` so callers cannot mutate the cache."""
        return self.tokenized_data[idx].copy()

In [8]:
# Build one dataset per split; each one tokenizes its strings in __init__,
# using the shared tokenizer and context_length.
train_dataset = DialogueDataset(train_data, tokenizer, context_length)
val_dataset = DialogueDataset(val_data, tokenizer, context_length)

In [9]:
# Batch collator for language modeling; mlm=False turns masked-LM off, which
# is the setting used for causal (next-token) training.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [10]:
# Setup trainer args
training_args = TrainingArguments(
    # --- output location ---
    output_dir="./results",
    # --- optimisation ---
    num_train_epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- evaluation once per epoch, logging every 5 steps ---
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=5,
)

In [11]:
# init trainer
# Wire the pruned model, the training arguments, both dataset splits and the
# language-modeling collator into a single Trainer.
trainer = Trainer(
    model=pruned_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [12]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell's value.
trainer.train()

***** Running training *****
  Num examples = 503
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 378
  Number of trainable parameters = 391362561


Epoch,Training Loss,Validation Loss
1,3.2271,2.992978
2,1.7673,1.630269
3,0.919,0.753202
4,0.2719,0.448844
5,0.1921,0.282054
6,0.1303,0.21809


***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=378, training_loss=1.406246720877274, metrics={'train_runtime': 1397.8396, 'train_samples_per_second': 2.159, 'train_steps_per_second': 0.27, 'total_flos': 301016676583644.0, 'train_loss': 1.406246720877274, 'epoch': 6.0})

In [13]:
# Save the trained model; called without a path, the logged output shows it
# being written under ./results (the configured output_dir).
trainer.save_model()

Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Configuration saved in ./results/generation_config.json
Model weights saved in ./results/pytorch_model.bin


In [14]:
# Handle on the model held by the trainer, used by the generation cells.
finetuned_model = trainer.model

In [15]:
line = "user: Do you like watching movies?\nchatbot:"
# Switch to inference mode and build the prompt tensors on the same device as
# the model weights; after training the model may no longer be on the CPU.
finetuned_model.eval()
inputs = tokenizer(line, return_tensors="pt").to(next(finetuned_model.parameters()).device)
# Not read anywhere in the visible cells; kept in case a later cell uses it.
pruned_times = []


# Sample up to 50 new tokens with top-k / nucleus sampling.
outputs = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    # Pass the mask explicitly instead of leaving generate() to infer it.
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



["user: Do you like watching movies?\nchatbot:  Of course! Strickland's's story is a great enteriveive and comedy-drama. It follows two unexpected unexpected members in the community and their lives. It takes uncovercover different different cultures and unjoys. It's a great"]


In [16]:
# Serialize the whole model object (not only its state_dict) to disk.
# NOTE(review): the filename says "20percent", but the weights loaded earlier
# come from "pruned_30percent_560m_bloom.pt" -- confirm which label is correct.
# NOTE(review): a whole-object save is loaded back by unpickling, so the model
# class must be importable at load time -- verify this is intended.
torch.save(finetuned_model, "finetuned_20percent_pruned_bloom560m.pt")

In [17]:
line = "user: Can you tell me about yourself?\nchatbot:"
# Switch to inference mode and build the prompt tensors on the same device as
# the model weights; after training the model may no longer be on the CPU.
finetuned_model.eval()
inputs = tokenizer(line, return_tensors="pt").to(next(finetuned_model.parameters()).device)
# Not read anywhere in the visible cells; kept in case a later cell uses it.
pruned_times = []


# Sample up to 50 new tokens with top-k / nucleus sampling.
outputs = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    # Pass the mask explicitly instead of leaving generate() to infer it.
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



['user: Can you tell me about yourself?\nchatbot: Sure thing! Alpha and Omega: Dino Digs is set in a prehistoric world and follows two wolves, Stinky and Claudette, as they embark on a fossil-digging adventure. Along the way, they discover']


In [18]:
line = "user: you misunderstood me\nchatbot:"
# Switch to inference mode and build the prompt tensors on the same device as
# the model weights; after training the model may no longer be on the CPU.
finetuned_model.eval()
inputs = tokenizer(line, return_tensors="pt").to(next(finetuned_model.parameters()).device)
# Not read anywhere in the visible cells; kept in case a later cell uses it.
pruned_times = []


# Sample up to 50 new tokens with top-k / nucleus sampling.
outputs = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    # Pass the mask explicitly instead of leaving generate() to infer it.
    attention_mask=inputs["attention_mask"],
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



["user: you misunderstood me\nchatbot:  I am amazed at how\x07\x07\x06\x06\x07\x05\x05\x06\x07\x08\x08\x06\x05\x08\x06\x07\x06\x06\x07\x06\x05\x08downdowns and I haven't tried a Snickerdoodle yet. What does you improve? I'm sure"]
