In [2]:
# Import all the dependencies

import ctranslate2
import nvidia
import os
import time
import torch
import transformers

from datasets import load_dataset
from random import randint

In [3]:
# Expose the CUDA runtime libraries that ship inside the `nvidia` pip package
# through LD_LIBRARY_PATH.
# `nvidia` may be a namespace package, in which case __file__ is None; fall back
# to the first entry of __path__ so the lookup still works.
_nvidia_file = getattr(nvidia, '__file__', None)
_nvidia_root = os.path.dirname(_nvidia_file) if _nvidia_file else list(nvidia.__path__)[0]

# The trailing '' keeps the trailing separator the path has always had.
cuda_install_dir = os.path.join(_nvidia_root, 'cuda_runtime', 'lib', '')

# Prepend instead of overwriting so directories already on the search path are
# kept, and skip the update if this cell is re-run (no duplicate entries).
# NOTE(review): changing LD_LIBRARY_PATH here presumably only affects libraries
# loaded after this point / child processes -- confirm it is picked up as intended.
_existing_paths = [p for p in os.environ.get('LD_LIBRARY_PATH', '').split(os.pathsep) if p]
if cuda_install_dir not in _existing_paths:
    os.environ['LD_LIBRARY_PATH'] = os.pathsep.join([cuda_install_dir] + _existing_paths)

In [4]:
# Fetch the "test" split of the "samsum" dataset from the Hugging Face Hub;
# the samples below are drawn from it.
test_dataset = load_dataset(
    path="samsum",
    split="test",
)

In [None]:
# Load the CTranslate2 model onto the GPU, together with the tokenizer used to
# encode prompts and decode generations.
# Please change model_dir below according to your project.
model_dir = "/mnt/artifacts/ct2_int8"
generator = ctranslate2.Generator(model_dir, device="cuda")

tokenizer_name = "tiiuae/falcon-7b"
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Select the test sample to summarize.
# To pick a random sample instead, use:
#   sample = test_dataset[randint(0, len(test_dataset) - 1)]
# randint includes BOTH endpoints, so the upper bound must be len - 1;
# using len(test_dataset) can raise an IndexError.

# Change the index to select a different sample
sample = test_dataset[5]

# Prompt layout. This is a plain string (not an f-string): the {dialogue}
# placeholder is filled in by str.format() below.
prompt_template = "Summarize the chat dialogue:\n{dialogue}\n---\nSummary:\n"

test_sample = prompt_template.format(dialogue=sample["dialogue"])

print(test_sample)

Summarize the chat dialogue:
Benjamin: Hey guys, what are we doing with the keys today?
Hilary: I've got them. Whoever wants them can meet me at lunchtime or after
Elliot: I'm ok. We're meeting for the drinks in the evening anyway and I guess we'll be going back to the apartment together?
Hilary: Yeah, I guess so
Daniel: I'm with Hilary atm and won't let go of her for the rest of the day, so any option you guys choose is good for me
Benjamin: Hmm I might actually pass by at lunchtime, take the keys and go take a nap. I'm sooo tired after yesterday
Hilary: Sounds good. We'll be having lunch with some French people (the ones who work on the history of food in colonial Mexico - I already see you yawning your head off)
Benjamin: YAAAAWN 🙊 Where and where are you meeting?
Hilary: So I'm meeting them at the entrance to the conference hall at 2 pm and then we'll head to this place called La Cantina. Italian cuisine, which is quite funny, but that's what they've chosen
Benjamin: Interesting 😱 

In [7]:
# Run one end-to-end pass through the CTranslate2 model and record wall-clock time.
# The timed region covers tokenization, generation and decoding.
start_time = time.perf_counter()

# The generator consumes token strings, so encode to ids first and map them back
prompt_ids = tokenizer.encode(test_sample)
tokens = tokenizer.convert_ids_to_tokens(prompt_ids)

max_length = 50
results = generator.generate_batch(
    [tokens],                        # batch containing a single prompt
    sampling_topk=1,
    max_length=max_length,
    include_prompt_in_result=False,  # keep the prompt out of the returned sequence
)

first_result = results[0]
output = tokenizer.decode(first_result.sequences_ids[0])
end_time = time.perf_counter()

print(output)

Benjamin is tired and will take a nap at 2. Hilary is meeting the French people at 2 and then will go to the Italian restaurant. Elliot will meet at 2.  They're all going to take the keys and have a


In [8]:
# Report latency and throughput of the timed generation run.
elapsed = end_time - start_time

# Count the tokens that were actually produced rather than assuming max_length:
# generation can stop before the limit, which would overstate tokens/sec.
# `results` was generated with include_prompt_in_result=False, so the sequence
# holds generated tokens only.
generated_tokens = len(results[0].sequences_ids[0])

print(f'\n ctranslate model took {round(elapsed, 3)} sec and generated {round(generated_tokens / elapsed,3)} tokens/sec')


 ctranslate model took 1.196 sec and generated 41.794 tokens/sec


In [None]:
%%timeit -n 1 -r 30
results = generator.generate_batch([tokens], sampling_topk=1, max_length=max_length, include_prompt_in_result=True)
output = tokenizer.decode(results[0].sequences_ids[0])