In [1]:
# Import all the dependencies

import nvidia
import os
import time
import torch
import transformers

from datasets import load_dataset
from random import randint
from transformers import GenerationConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [2]:
# Locate the CUDA runtime libraries bundled inside the pip-installed `nvidia`
# package and expose that directory through LD_LIBRARY_PATH.
# `__file__` can be None when `nvidia` is a namespace package, so fall back to
# the first entry of `__path__` in that case.
nvidia_pkg_dir = (
    os.path.dirname(nvidia.__file__)
    if getattr(nvidia, '__file__', None)
    else list(nvidia.__path__)[0]
)
# The trailing '' keeps the trailing path separator of the original value.
cuda_install_dir = os.path.join(nvidia_pkg_dir, 'cuda_runtime', 'lib', '')

# Prepend rather than overwrite, so library directories that were already
# configured stay reachable; skip the update when the cell is re-run.
existing_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
if cuda_install_dir not in existing_ld_path.split(os.pathsep):
    os.environ['LD_LIBRARY_PATH'] = os.pathsep.join(
        entry for entry in (cuda_install_dir, existing_ld_path) if entry
    )


In [3]:
# Fetch only the "test" split of the "samsum" dataset from the hub
test_dataset = load_dataset(
    path="samsum",
    split="test",
)


In [None]:
# Load the fine-tuned falcon-7b model from local storage with 8-bit weights
model_id = "/mnt/falcon_7b_model_adapter"

# bitsandbytes quantization settings: 8-bit loading only
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Keyword arguments forwarded unchanged to `from_pretrained`
model_load_kwargs = dict(
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config,
    cache_dir='/mnt',
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)

In [5]:
# Pick one example from the test split; edit the index to try another one
sample = test_dataset[5]

# Prompt skeleton: the doubled braces survive the f-string as a literal
# `{dialogue}` placeholder, which `.format` fills in below
prompt_template = f"Summarize the chat dialogue:\n{{dialogue}}\n---\nSummary:\n"

# Substitute the sample's dialogue into the placeholder
dialogue_text = sample["dialogue"]
test_sample = prompt_template.format(dialogue=dialogue_text)

print(test_sample)

Summarize the chat dialogue:
Benjamin: Hey guys, what are we doing with the keys today?
Hilary: I've got them. Whoever wants them can meet me at lunchtime or after
Elliot: I'm ok. We're meeting for the drinks in the evening anyway and I guess we'll be going back to the apartment together?
Hilary: Yeah, I guess so
Daniel: I'm with Hilary atm and won't let go of her for the rest of the day, so any option you guys choose is good for me
Benjamin: Hmm I might actually pass by at lunchtime, take the keys and go take a nap. I'm sooo tired after yesterday
Hilary: Sounds good. We'll be having lunch with some French people (the ones who work on the history of food in colonial Mexico - I already see you yawning your head off)
Benjamin: YAAAAWN 🙊 Where and where are you meeting?
Hilary: So I'm meeting them at the entrance to the conference hall at 2 pm and then we'll head to this place called La Cantina. Italian cuisine, which is quite funny, but that's what they've chosen
Benjamin: Interesting 😱 

In [9]:
# Generate output from the fine-tuned falcon-7b model and time the whole
# tokenize -> generate -> decode round trip.
tokenizer = transformers.AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

start_time = time.perf_counter()
# Move the encoded prompt to wherever the model was placed (it is loaded with
# device_map="auto"), instead of assuming a fixed 'cuda' device.
encoded_prompt = tokenizer(test_sample, return_tensors="pt").to(model.device)
input_ids = encoded_prompt.input_ids
# Keep the attention mask: because pad_token == eos_token, generate() cannot
# reliably infer the mask from the token ids alone.
attention_mask = encoded_prompt.attention_mask
max_new_tokens = 50

generation_config = GenerationConfig(
    pad_token_id=tokenizer.pad_token_id,
    max_new_tokens=max_new_tokens,
)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

# Decode the first (and only) sequence in the batch back to text.
gen_text = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]
end_time = time.perf_counter()
print(gen_text)

Summarize the chat dialogue:
Benjamin: Hey guys, what are we doing with the keys today?
Hilary: I've got them. Whoever wants them can meet me at lunchtime or after
Elliot: I'm ok. We're meeting for the drinks in the evening anyway and I guess we'll be going back to the apartment together?
Hilary: Yeah, I guess so
Daniel: I'm with Hilary atm and won't let go of her for the rest of the day, so any option you guys choose is good for me
Benjamin: Hmm I might actually pass by at lunchtime, take the keys and go take a nap. I'm sooo tired after yesterday
Hilary: Sounds good. We'll be having lunch with some French people (the ones who work on the history of food in colonial Mexico - I already see you yawning your head off)
Benjamin: YAAAAWN 🙊 Where and where are you meeting?
Hilary: So I'm meeting them at the entrance to the conference hall at 2 pm and then we'll head to this place called La Cantina. Italian cuisine, which is quite funny, but that's what they've chosen
Benjamin: Interesting 😱 

In [10]:
# Report wall-clock latency and throughput of the generation cell above.
elapsed_sec = end_time - start_time
# `generated_ids` holds the prompt followed by the continuation (the decoded
# text echoes the prompt), so subtract the prompt length to count only the
# tokens that were actually generated.
num_generated_tokens = generated_ids.shape[1] - input_ids.shape[1]
print(f'\n Huggingface finetuned model took {round(elapsed_sec, 3)} sec and generated {round(num_generated_tokens / elapsed_sec, 3)} tokens/sec')


 Huggingface finetuned model took 9.277 sec and generated 5.39 tokens/sec


In [None]:
%%timeit -n 1 -r 30
with torch.no_grad():
      outputs = model.generate(inputs=input_ids, generation_config=generation_config)  
gen_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]