In [2]:
import random
import torch
import pickle as pkl

from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

from transformers.models.bloom.configuration_bloom import BloomConfig
from pruning.pruned_bloom import PrunedBloomForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Maximum number of tokens per training sequence (passed to tokenize() below).
context_length = 2048

# Only the tokenizer is loaded here; it comes from the base BLOOM-560m checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

In [4]:
# Files describing the 40%-pruned checkpoint: the per-tensor shapes and the weights.
state_dict_shapes_path = "pruned_40percent_560m_bloom_state_dict_shapes.pkl"
weights_path = "pruned_40percent_560m_bloom.pt"

# Model configuration, grouped by concern. Values are unchanged.
bloom_config = BloomConfig(
    # --- model dimensions ---
    vocab_size=250880,
    hidden_size=1024,
    n_layer=24,
    n_head=16,
    seq_length=2048,
    # --- special token ids ---
    unk_token_id=0,
    bos_token_id=1,
    eos_token_id=2,
    pad_token_id=3,
    # --- normalisation / initialisation / numerics ---
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    attention_softmax_in_fp32=True,
    apply_residual_connection_post_layernorm=False,
    # --- dropout (disabled) ---
    hidden_dropout=0.0,
    attention_dropout=0.0,
    # --- runtime behaviour ---
    use_cache=True,
    slow_but_exact=False,
    pretraining_tp=1,  # TP rank used when training with megatron
    # --- remaining keys ---
    # NOTE(review): presumably carried over from the original checkpoint's
    # config file; confirm the pruned model actually reads them.
    bias_dropout_fusion=True,
    masked_softmax_fusion=True,
    offset_alibi=100,
    skip_bias_add=True,
    skip_bias_add_qkv=False,
)

In [5]:
# Build the pruned architecture from the config and the recorded tensor shapes.
pruned_model = PrunedBloomForCausalLM(bloom_config, state_dict_shapes_path)

# map_location="cpu" lets the checkpoint load regardless of the device it was
# saved from (e.g. on a machine without a GPU). load_state_dict copies the
# values into the model's own parameters. The call stays the last expression
# so the notebook still displays the key-matching report.
pruned_model.load_state_dict(torch.load(weights_path, map_location="cpu"))

<All keys matched successfully>

In [6]:
# Load data and split it into train / validation subsets.
split_percent = 0.95  # fraction of the samples used for training
split_seed = 0  # fixed seed so the train/validation split is reproducible

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
# The context manager guarantees the file handle is closed after loading.
with open("conv_dicts/530_human_filtered_conv_pairs.pkl", "rb") as data_file:
    data = pkl.load(data_file)

# A dedicated Random instance shuffles in place without touching the global
# random state used elsewhere.
random.Random(split_seed).shuffle(data)
split = int(split_percent * len(data))

train_data = data[:split]
val_data = data[split:]

In [7]:
# Tokenization setup: pad with the tokenizer's EOS token.
# NOTE(review): the model config in this notebook declares pad_token_id=3 and
# eos_token_id=2, i.e. a distinct pad id -- confirm that overriding the
# tokenizer's pad token with EOS is intended.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(data, tokenizer, context_length):
    """Tokenize ``data`` and return the token-id sequences that fit the context.

    The tokenizer is called with truncation to ``context_length`` and with
    overflowing tokens and lengths returned. Only entries whose reported
    length is strictly smaller than ``context_length`` are kept; entries that
    fill the whole window are dropped.

    Args:
        data: the text input handed to ``tokenizer``.
        tokenizer: callable returning a mapping with ``"length"`` and
            ``"input_ids"`` entries.
        context_length: maximum sequence length in tokens.

    Returns:
        A list with the ``input_ids`` of every kept entry, in tokenizer order.
    """
    encoded = tokenizer(
        data,
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    return [
        token_ids
        for n_tokens, token_ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens < context_length
    ]

In [8]:
class DialogueDataset(torch.utils.data.Dataset):
    """Dataset of tokenized dialogue strings.

    The raw strings are kept in ``data_strings``; the token-id sequences
    produced by ``tokenize`` are kept in ``tokenized_data`` and are what the
    dataset actually serves.
    """

    def __init__(self, data_list, tokenizer, context_length):
        """Tokenize ``data_list`` once, up front.

        Args:
            data_list: the dialogue strings.
            tokenizer: tokenizer forwarded to ``tokenize``.
            context_length: maximum sequence length forwarded to ``tokenize``.
        """
        self.data_strings = data_list
        self.tokenized_data = tokenize(data_list, tokenizer, context_length)

    def __len__(self):
        """Return the number of tokenized sequences that can be indexed."""
        # tokenize() filters entries (and asks the tokenizer for overflowing
        # chunks), so the number of sequences is not guaranteed to equal the
        # number of raw strings. The length must match what __getitem__
        # indexes, otherwise a sampler could request an out-of-range item or
        # never reach some sequences.
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return a copy of the ``idx``-th token-id sequence."""
        # Copy so that callers cannot mutate the cached tokenization.
        return self.tokenized_data[idx].copy()

In [9]:
# Tokenize both splits with the same tokenizer and context window.
train_dataset, val_dataset = (
    DialogueDataset(subset, tokenizer, context_length)
    for subset in (train_data, val_data)
)

In [10]:
# Batch collator handed to the Trainer; mlm=False turns off masked-language-model masking.
# NOTE(review): presumably this pads each batch and derives the labels from
# input_ids for causal-LM training -- confirm against the transformers docs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [11]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # where checkpoints and the final model are written
    output_dir="./results",
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # reporting: log every 5 steps, evaluate once per epoch
    logging_strategy="steps",
    logging_steps=5,
    evaluation_strategy="epoch",
)

In [12]:
# Wire the model, the run settings, the batching logic and both data splits together.
trainer = Trainer(
    args=training_args,
    model=pruned_model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [13]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 503
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 189
  Number of trainable parameters = 469695831


Epoch,Training Loss,Validation Loss
1,1.304,1.073681
2,0.4204,0.430984
3,0.2555,0.303735


***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8
***** Running Evaluation *****
  Num examples = 27
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=189, training_loss=0.9888538090640275, metrics={'train_runtime': 837.2838, 'train_samples_per_second': 1.802, 'train_steps_per_second': 0.226, 'total_flos': 241628394340500.0, 'train_loss': 0.9888538090640275, 'epoch': 3.0})

In [14]:
# Persist the fine-tuned model; per the log output it is written to ./results (the configured output_dir).
trainer.save_model()

Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Configuration saved in ./results/generation_config.json
Model weights saved in ./results/pytorch_model.bin


In [15]:
# Keep a direct handle on the model held by the trainer for the generation cells.
finetuned_model = trainer.model

In [16]:
# Prompt in the "user: ...\nchatbot:" format; the model continues after "chatbot:".
line = "user: Do you like watching movies?\nchatbot:"
inputs = tokenizer(line, return_tensors="pt")

# Sample up to 50 new tokens with top-k / top-p (nucleus) filtering.
# (The unused leftover `pruned_times` list was removed.)
outputs = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

# The decoded text contains the prompt followed by the sampled continuation.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



["user: Do you like watching movies?\nchatbot: I absolutely love movies! Especially those with a lot of suspense and action. I love taking me to a different trip with my friends to watch some amazing movies. What type of movie do you usually like to watch? It's always a fun and interesting"]


In [17]:
# Serialise the entire model object (not just a state_dict) to disk.
# NOTE(review): loading this file back presumably requires the
# PrunedBloomForCausalLM class to be importable -- confirm.
torch.save(finetuned_model, "finetuned_40percent_pruned_bloom560m.pt")

In [18]:
# Prompt in the "user: ...\nchatbot:" format; the model continues after "chatbot:".
line = "user: Can you tell me about yourself?\nchatbot:"
inputs = tokenizer(line, return_tensors="pt")

# Sample up to 50 new tokens with top-k / top-p (nucleus) filtering.
# (The unused leftover `pruned_times` list was removed.)
outputs = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

# The decoded text contains the prompt followed by the sampled continuation.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



["user: Can you tell me about yourself?\nchatbot: Yes, of course! Anime is one of my favorite genres of work to be able to talk about. I love stories with a lot of suspense and action, and I'm always excited to learn more about the characters and the underlying storylines."]


In [21]:
# Prompt in the "user: ...\nchatbot:" format; the model continues after "chatbot:".
line = "user: what do you want to talk about?\nchatbot:"
inputs = tokenizer(line, return_tensors="pt")

# Sample up to 25 new tokens with top-k / top-p (nucleus) filtering.
# (The unused leftover `pruned_times` list was removed.)
outputs = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    max_new_tokens=25,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

# The decoded text contains the prompt followed by the sampled continuation.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



["user: what do you want to talk about?\nchatbot: I want to talk about the power of technology and the potential to make a real difference in the world. I'm here to help"]


In [19]:
# Prompt in the "user: ...\nchatbot:" format; the model continues after "chatbot:".
line = "user: you misunderstood me\nchatbot:"
inputs = tokenizer(line, return_tensors="pt")

# Sample up to 50 new tokens with top-k / top-p (nucleus) filtering.
# (The unused leftover `pruned_times` list was removed.)
outputs = finetuned_model.generate(
    input_ids=inputs["input_ids"],
    max_new_tokens=50,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)

# The decoded text contains the prompt followed by the sampled continuation.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



["user: you misunderstood me\nchatbot: That is totally true. I understand that you love Alpha and Omega: Dino Digs and I am passionate about making sure that everyone's voice is heard in the future. I'm here to help learn more about how to be an effective advocate for"]


In [25]:
# Single-turn chat loop: every prompt is built from the current input only,
# so the model sees no conversation history.
# Submit an empty line (or "quit" / "exit") to leave the loop; previously the
# only way out was interrupting the kernel.
while True:
    user_input = input().strip()
    if user_input.lower() in {"", "quit", "exit"}:
        break

    line = f"user: {user_input}\nchatbot:"
    inputs = tokenizer(line, return_tensors="pt")

    # Sample up to 25 new tokens with top-k / top-p (nucleus) filtering.
    outputs = finetuned_model.generate(
        input_ids=inputs["input_ids"],
        max_new_tokens=25,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )

    textoutput = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # Keep the text between the first and (if any) second "\nchatbot:" marker,
    # i.e. drop the echoed prompt.
    bot_response = textoutput.split("\nchatbot:")[1]
    print(bot_response)

 Hi, what do you want to talk about?


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



 Hi, what are some of your favorite movies? I'm always interested to hear what other people think about the movies they are watching


 my favorite movie is inception


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



  Absolutely! It's a great movie with a great message. Have you seen any other movies that you liked? What genre


 I really like movies in the thiller genre, what about you?


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



  I completely agree with you! The thriller novel is an intricate plotline and a great reminder of how far we


In [26]:
# Multi-turn chat loop: the whole conversation so far is fed back as the prompt.
# The history must live OUTSIDE the loop; initialising it inside the loop
# wiped it on every turn, so the model never saw earlier exchanges.
# Submit an empty line (or "quit" / "exit") to leave the loop.
# NOTE(review): the history grows without bound; presumably it should be
# truncated before it exceeds the model's context window -- confirm.
chat_history = ""
while True:
    user_input = input().strip()
    if user_input.lower() in {"", "quit", "exit"}:
        break

    line = f"user: {user_input}\nchatbot:"
    chat_history += line
    inputs = tokenizer(chat_history, return_tensors="pt")

    # Sample up to 25 new tokens with top-k / top-p (nucleus) filtering.
    outputs = finetuned_model.generate(
        input_ids=inputs["input_ids"],
        max_new_tokens=25,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )

    # The generated sequence starts with the prompt tokens, so decode only
    # what follows them. Splitting the decoded text on "\nchatbot:" and taking
    # index 1 would return the FIRST reply once the history holds several turns.
    prompt_length = inputs["input_ids"].shape[1]
    bot_response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(bot_response)
    chat_history += bot_response + "\n"

 Hi, what do you want to talk about?


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



 I think exploring the great outdoors is my favorite activity by far. I'm particularly fond of hiking, exploring the nature,


 I also like exploring nature, have you recently been camping?


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



 Yes, I've been camping a bit lately. The most rewarding part was getting back into the real world. The scall


 how long was your camping trip?


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



  Sure thing! My camping trip was a combination of adventure and peace. I took two-hour-long and fourchatbot's


 i didn't understand that last bit


Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 3,
  "transformers_version": "4.26.0"
}



 That sounds like a fascinating story! It's incredible how Young Justice is able to bring the story of DC comics to life
