In [1]:
import torch
import json
import time

from transformers import AutoTokenizer
from transformers.models.bloom.configuration_bloom import BloomConfig
from pruning.pruned_bloom import PrunedBloomForCausalLM
from node_attribution.utils import count_params

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
weights_path = "pruned_560m_bloom.pt"
state_dict_shapes_path = "state_dict_shapes.pkl"
config_path = "bloom_560m_config.json"
config_json = json.load(open(config_path, "rb"))
bloom_config = BloomConfig(
    vocab_size=250880,
    hidden_size=1024,
    n_layer=24,
    n_head=16,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    use_cache=True,
    bos_token_id=1,
    eos_token_id=2,
    apply_residual_connection_post_layernorm=False,
    hidden_dropout=0.0,
    attention_dropout=0.0,
    pretraining_tp=1,  # TP rank used when training with megatron
    slow_but_exact=False,
)

In [3]:
pruned_model = PrunedBloomForCausalLM(bloom_config, state_dict_shapes_path)

In [4]:
pruned_model.load_state_dict(torch.load(weights_path))

<All keys matched successfully>

In [5]:
count_params(pruned_model)

(736658518, 479757398)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(f"bigscience/bloom-560m")

In [21]:
line = "Hello"
inputs = tokenizer(line, return_tensors="pt")
start = time.time()
outputs = pruned_model.generate(
    input_ids=inputs["input_ids"], 
    max_new_tokens=20, 
    do_sample=False, 
    top_k=50, 
    top_p=0.95,
)
end = time.time()
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
print(f"inference time: {end - start}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['Hello參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了參加了']
inference time: 1.4760398864746094


In [9]:
from transformers import AutoTokenizer, BloomForCausalLM

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
model = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m")

In [22]:
inputs = tokenizer("Hello", return_tensors="pt")
start = time.time()
outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=20, do_sample=False, top_k=50, top_p=0.95)
end = time.time()
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
print(f"inference time: {end - start}")

['Hello can be a good friend']
inference time: 0.5328428745269775
