In [None]:
import json
import numpy as np
import re
import tritonclient.http as httpclient

from tritonclient.utils import np_to_triton_dtype

In [None]:
# Address (host:port) that the HTTP inference client connects to.
URL = "localhost:8000"
# Name of the served model targeted by every client.infer() call in this notebook.
MODEL = "ensemble"
# Forwarded to the server as the "is_return_log_probs" request tensor.
IS_RETURN_LOG_PROBS = True
# Token ids forwarded as the "start_id" / "end_id" request tensors.
# NOTE(review): presumably GPT-2 vocabulary ids (50256 being <|endoftext|>) —
# confirm against the tokenizer of the deployed model.
START_ID = 220
END_ID = 50256
# Seed forwarded as the "random_seed" request tensor; shared by all sample prompts.
RANDOM_SEED = 0
# Forwarded as the bad-words input (INPUT_2); a single empty string here.
BAD_WORDS_LIST = [""]
# One HTTP client instance, reused by every inference cell.
client = httpclient.InferenceServerClient(URL, concurrency=1, verbose=False)

In [None]:
def prepare_tensor(name, input):
    """Wrap a NumPy array in a Triton HTTP ``InferInput``.

    Args:
        name: Name of the model input the tensor is bound to.
        input: NumPy array holding the data; its shape and dtype determine the
            shape and Triton datatype declared for the tensor.

    Returns:
        The ``httpclient.InferInput`` populated with ``input``.
    """
    triton_dtype = np_to_triton_dtype(input.dtype)
    infer_input = httpclient.InferInput(name, input.shape, triton_dtype)
    infer_input.set_data_from_numpy(input)
    return infer_input

In [None]:
def prepare_inputs(data):
    """Build the list of Triton input tensors for one generation request.

    Args:
        data: Dict describing the request. Keys read here: "prompt",
            "bad_words_list", "stop_words_list", "tokens_to_generate",
            "runtime_top_k", "runtime_top_p", "beam_search_diversity_rate",
            "temperature", "len_penalty", "repetition_penalty", "random_seed",
            "is_return_log_probs", "beam_width", "start_id" and "end_id".

    Returns:
        List of ``InferInput`` tensors (built with ``prepare_tensor``), in the
        order INPUT_0..INPUT_3 followed by the sampling parameters.

    Raises:
        KeyError: If ``data`` lacks one of the keys listed above.
    """
    # Word lists are wrapped in an extra list so the arrays get a leading axis of length 1.
    bad_words_list = np.array([data["bad_words_list"]], dtype=object)
    stop_words_list = np.array([data["stop_words_list"]], dtype=object)
    input0_data = np.array(data["prompt"]).astype(object)
    batch_size = input0_data.shape[0]
    # Requested output length, one entry per prompt element.
    output0_len = np.full(input0_data.shape, data["tokens_to_generate"], dtype=np.uint32)

    def per_request(key, dtype):
        # np.full pins the dtype regardless of whether data[key] is a Python or
        # NumPy scalar; `scalar * ones.astype(dtype)` lets the scalar's type take
        # part in promotion and can silently widen the tensor.
        return np.full((batch_size, 1), data[key], dtype=dtype)

    named_arrays = [
        ("INPUT_0", input0_data),
        ("INPUT_1", output0_len),
        ("INPUT_2", bad_words_list),
        ("INPUT_3", stop_words_list),
        ("runtime_top_k", per_request("runtime_top_k", np.uint32)),
        ("runtime_top_p", per_request("runtime_top_p", np.float32)),
        ("beam_search_diversity_rate", per_request("beam_search_diversity_rate", np.float32)),
        ("temperature", per_request("temperature", np.float32)),
        ("len_penalty", per_request("len_penalty", np.float32)),
        ("repetition_penalty", per_request("repetition_penalty", np.float32)),
        ("random_seed", per_request("random_seed", np.uint64)),
        ("is_return_log_probs", per_request("is_return_log_probs", bool)),
        ("beam_width", per_request("beam_width", np.uint32)),
        ("start_id", per_request("start_id", np.uint32)),
        ("end_id", per_request("end_id", np.uint32)),
    ]
    return [prepare_tensor(name, array) for name, array in named_arrays]

In [None]:
def prepare_outputs(result):
    """Decode the generated completions and strip end-of-text markers.

    Args:
        result: Inference result exposing ``as_numpy``; the elements of its
            "OUTPUT_0" tensor are decoded as UTF-8 bytes.

    Returns:
        List of str, one per element of "OUTPUT_0", with every occurrence of
        the "<|endoftext|>" marker removed.
    """
    completions = result.as_numpy("OUTPUT_0")
    # The marker is a fixed literal, so str.replace is enough. The previous
    # non-raw regex pattern relied on the invalid "\|" string escape, which
    # Python reports as a DeprecationWarning/SyntaxWarning.
    return [
        completion.decode("utf-8").replace("<|endoftext|>", "")
        for completion in completions
    ]

## Sample Prompts

### Tuning Parameters
| **Parameter** | **Description** |
|---|---|
| Number of Tokens | _Specifies how much text to generate. Tokens can be either an entire word, or parts of words. For English, 100 tokens form approximately 75 words._ |
| Temperature | _Controls the randomness of selecting the next token during text generation. Lower values reduce randomness, suitable for tasks with a correct answer such as question answering or summarization. Higher values increase randomness, suitable for tasks that require creativity. The [0.5, 0.8] range is a good starting point for experimentation._ |
| Top K | _Controls the randomness of selecting the next token during text generation. The number of highest-probability tokens to keep, from which the next token will be selected at random. Lower values reduce randomness, suitable for tasks with a correct answer such as question answering or summarization. Higher values increase randomness, suitable for tasks that require creativity. 0 means Top K is not used. 1 means greedy decoding, that is, always selecting the most probable token next._ |
| Top P | _Controls the randomness of selecting the next token during text generation. This determines the minimum number of highest-probability tokens whose probabilities sum to or exceed the Top P value, from which the next token will be selected at random. Lower values reduce randomness, suitable for tasks with a correct answer such as question answering or summarization. Higher values increase randomness, suitable for tasks that require creativity._ |
| Repetition Penalty | _How much to penalize tokens based on how frequently they occur in the text. A value of 1 means no penalty, while values larger than 1 discourage repeated tokens._ |
| Length Penalty | _Only applies to beam search, that is, when the beam width is >1. Larger values penalize long candidates more heavily thus preferring shorter candidates._ |
| Beam Search Diversity Rate | _Only applies to beam search, that is, when the beam width is >1. A higher value encourages beam search to return a more diverse set of candidates._ |
| Beam Width | _The number of concurrent candidates to keep track of during beam search. Higher values increase the chance of finding a good output but also require more computation. Streaming is supported only when the beam width is set to 1._ |
| Random Seed | _The model generates random results. Changing the random seed alone will produce a different response with similar characteristics. It is possible to reproduce results by fixing the random seed (assuming all other hyperparameters are also fixed)._ |
| Stop Words | _A set of character sequences; as soon as the model generates any one of them, the API stops generating further text, even if the output length has not yet reached the specified number of tokens. It is useful to design a stopping template in the examples given to the model so that it can learn to stop appropriately upon completing an intended task._ |

### AI Chatbot QA

In [None]:
# Dialogue-style prompt: a persona description followed by alternating
# "You:" / "Misty:" turns, ending on an open "Misty:" turn for the model to fill.
chatbot = {
    "prompt": [["Misty is a cheerful AI assistant and companion, created by NVIDIA engineers. Misty is clever and helpful, and will do everything it can to cheer you up:\n\nYou: How are you feeling?\nMisty: I'm feeling great, how may I help you today?\nYou: Can you please suggest a movie?\nMisty: How about \"The Martian\". It's a sci-fi movie about an astronaut getting stranded on Mars!\nYou: That's cool! But i'm in the mood for watching comedy today\nMisty:"]],
    # Stop once the model starts writing the user's next turn.
    "stop_words_list": ["You:"],
    "tokens_to_generate": 40,
    "temperature": 0.5,
    "runtime_top_k": 2,
    "runtime_top_p": 1.0,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    # The remaining values come from the shared notebook configuration.
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": BAD_WORDS_LIST,
    "random_seed": RANDOM_SEED,
}

# Build the request tensors, run inference, then print each cleaned completion.
inputs = prepare_inputs(chatbot)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for i in completions:
    print(i)

### Summarization

In [None]:
# One-shot summarization prompt: a worked article/summary pair, a "===" separator,
# then a second article whose "Summary:" is left open for the model.
# The prompt is kept on a single source line: a line break inside the quoted
# string would either be a syntax error or leak a stray separator into the text.
summarization = {
    "prompt": [["Summarize the following article:\nArticle: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.\n\nSummary: The Transformer architecture based solely on the attention mechanism deliver superior quality on several translation tasks while being more parallelizable and requiring significantly less time to train compared to recurrence and convolution alternatives.\n\n===\n\nSummarize the following article:\nArticle: In recent years, supervised learning with convolutional networks (CNNs) has seen huge adoption in computer vision applications. Comparatively, unsupervised learning with CNNs has received less attention. In this work we hope to help bridge the gap between the success of CNNs for supervised learning and unsupervised learning. We introduce a class of CNNs called deep convolutional generative adversarial networks (DCGANs), that have certain architectural constraints, and demonstrate that they are a strong candidate for unsupervised learning. Training on various image datasets, we show convincing evidence that our deep convolutional adversarial pair learns a hierarchy of representations from object parts to scenes in both the generator and discriminator. Additionally, we use the learned features for novel tasks - demonstrating their applicability as general image representations.\n\nSummary:"]],
    # Stop at the example separator or at the first blank line.
    "stop_words_list": ["===","\n\n"],
    "tokens_to_generate": 64,
    "temperature": 0.5,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    # The remaining values come from the shared notebook configuration.
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": BAD_WORDS_LIST,
    "random_seed": RANDOM_SEED,
}

# Build the request tensors, run inference, then print each cleaned completion.
inputs = prepare_inputs(summarization)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for i in completions:
    print(i)

### Open Domain Q&A

In [None]:
# Few-shot Q&A prompt: four answered "Q:"/"A:" pairs followed by a new question
# whose "A:" is left open for the model.
open_domain_qa = {
    "prompt": [["Q: What is the capital of Spain? \nA: Madrid\n\nQ: What is synthetic biology?\nA: Synthetic Biology is about designing biological systems at multiple levels from individual molecules up to whole cells and even multicellular assemblies like tissues and organs to perform specific functions.\n\nQ: What are the greatest threats of climate change?\nA: The greatest threats of climate change are rising sea levels, extreme weather events, and droughts.\n\nQ: What roles do proteins play in our cells?\nA: Proteins within a cell determine its health and function. Proteins are responsible for nearly every task of cellular life, including cell shape and inner organization, product manufacture and waste cleanup, and routine maintenance. Proteins also receive signals from outside the cell and mobilize intracellular response. They are the workhorse macro molecules of the cell and are as diverse as the functions they serve.\n\nQ: What is the largest source of uncertainty in climate sensitivity?\nA:"]],
    # Stop at the next question or at the end of the answer line. The newline
    # must be a real "\n": the escaped form "\\n" is a literal backslash
    # followed by "n", not a line break.
    "stop_words_list": ["Q:","\n"],
    "tokens_to_generate": 48,
    "temperature": 0.2,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    # The remaining values come from the shared notebook configuration.
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": BAD_WORDS_LIST,
    "random_seed": RANDOM_SEED,
}

# Build the request tensors, run inference, then print each cleaned completion.
inputs = prepare_inputs(open_domain_qa)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for i in completions:
    print(i)

### Structured Data Q&A

In [None]:
# Table Q&A prompt: a pipe-delimited company table, three answered questions as
# few-shot examples, and a final question whose "A:" is left open.
# The example answers must agree with the table (Grant Corporation's
# "Year Established" is 1890; 21st September 1920 is its IPO date).
structured_data_qa = {
    "prompt": [["| Company Name | Number of Employees | Year Established | IPO Date | Share Price |\n| Providence Inc. | 250 | 1990 | 25th August 1990 | $0.90 |\n| Grant Corporation | 2000 | 1890 | 21st September 1920 | $115.90 |\n| Rusty Metalworks | 12459 | 1946 | 12th September 1986 | $15.23 |\n| Dull Knives &  Blades | 3412 | 2008 | 1st December 2012 | $3.20 |\n\n\nQ: Which company has the most employees?\nA: Rusty Metalworks\n\nQ: When was Grant Corporation established?\nA: 1890\n\nQ: Which company had the most recent IPO date?\nA: Dull Knives & Blades\n\nQ: What is the share price and IPO date of Rusty Metalworks?\nA:"]],
    # Stop once the model starts writing a follow-up question.
    "stop_words_list": ["Q:"],
    "tokens_to_generate": 32,
    "temperature": 0.2,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    # The remaining values come from the shared notebook configuration.
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": BAD_WORDS_LIST,
    "random_seed": RANDOM_SEED,
}

# Build the request tensors, run inference, then print each cleaned completion.
inputs = prepare_inputs(structured_data_qa)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for i in completions:
    print(i)

### Unstructured Data Q&A

In [None]:
# Free-text Q&A prompt: two sets of meeting notes separated by dashed rules,
# followed by a single question whose "A: " is left open for the model.
unstructured_data_qa = {
    "prompt": [["Date: 6/06/22\nTime: 3 pm\n\nAttendees NVIDIA: Jane, Darby, Jerry\nAttendees Peach Corp: Ram, Eva, Harry\n\n* Explored Product X\n* Feature Request for adding backward compatibility with PyTorch 1.1\n* Liked Feature A\n* Willing to proceed with adoption if feature request is fulfilled.\n-----------------------------------\nDate: 5/30/22\nTime: 10 am\n\nAttendees NVIDIA: Jane, Jone, Jerry\nAttendees Peach Corp: Adam, Eva, Harry\n\n* Product X was introduced, initial response was lukewarm\n* Peach Corp agreed to evaluate product X. 2 Engineers allocated for Exploration\n* Looking for a cost analysis. Jane to send.\n* Mainly interested in feature A\n* Concerned about compatibility with PyTorch 1.1\n* Do not want to spend resources on updating stack to support streaming Feature B\n* Jane to set up follow-up call Friday next week for feedback on feature A\n-----------------------------------\nQ: What are the action items for NVIDIA attendees?\nA: "]],
    # Stop once the model starts writing a follow-up question.
    "stop_words_list": ["Q:"],
    "tokens_to_generate": 32,
    "temperature": 0.2,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    # NOTE(review): differs from the 1.0 used by the other samples; per the tuning
    # table the length penalty only applies when the beam width is > 1 — confirm
    # whether 1.8 is intentional with beam_width 1.
    "len_penalty": 1.8,
    # The remaining values come from the shared notebook configuration.
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": BAD_WORDS_LIST,
    "random_seed": RANDOM_SEED,
}

# Build the request tensors, run inference, then print each cleaned completion.
inputs = prepare_inputs(unstructured_data_qa)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for i in completions:
    print(i)

### Blog Post

In [None]:
# One-shot generation prompt: a worked blog intro for the topic "speechAI", a
# "===" separator, then the same instruction for the topic "TensorRT".
blog_post = {
    "prompt": [["Generate a blog intro for the following topic: speechAI\nSpeech AI is the ability of intelligent systems to communicate with users using a voice-based interface, which has become ubiquitous in everyday life. People regularly interact with smart home devices, telephone banking services, and phones via speech. Speech interface quality has improved leaps and bounds in recent years, making them a much more pleasant, practical, and natural experience than just a decade ago. In this blog, we will present the workflow, tools, and best practices that the NVIDIA engineering team employed to make new world-class Riva SpeechAI services. Let's start this journey!\n\n===\n\nGenerate a blog intro for the following topic: TensorRT\n"]],
    # Stop at the example separator or at the first blank line.
    "stop_words_list": ["===","\n\n"],
    "tokens_to_generate": 128,
    "temperature": 0.5,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    # The remaining values come from the shared notebook configuration.
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": BAD_WORDS_LIST,
    "random_seed": RANDOM_SEED,
}

# Build the request tensors, run inference, then print each cleaned completion.
inputs = prepare_inputs(blog_post)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for i in completions:
    print(i)

### Classification

In [None]:
# Few-shot classification prompt: three articles labelled Politics, Sport and
# Science, followed by a fourth article whose "Class:" is left open.
classification = {
    "prompt": [["Classify the following articles into one of three topics: Politics, Sport and Science\n\nArticle: Britain’s plan to become a 'science and technology superpower' is so lacking in focus and so full of new organisational structures that the country risks becoming a 'bureaucracy superpower' instead, an influential crossbench peer has said.\nClass: Politics\n\nArticle: It is the middle of the European winter, and sprinter Evan O'Hanlon is shovelling snow off a track in the Czech Republic, where he lives, so he can train in -8C weather. \nClass: Sport\n\nArticle: NASA is 'in the final stretch' of launching its Artemis I mission as it will roll out the worlds most powerful rocket, the Space Launch System (SLS), and the Orion capsule to the launch pad in just two weeks.\nNASA Administrator Bill Nelson said during a Wednesday briefing: 'This is now the Artemis generation,' Nelson said.\nClass: Science\n\nArticle: The Cartwheel Galaxy, also known as ESO 350-40 or PGC 2248, is a rare ring galaxy located about 500 million light-years away in the Sculptor constellation. The Cartwheel Galaxy, seen largest in the image below, resulted from an intense high-speed collision between a large spiral galaxy and a smaller galaxy that's not visible. The Webb team writes, 'Collisions of galactic proportions cause a cascade of different, smaller events between the galaxies involved; the Cartwheel is no exception.'\nClass:"]],
    # Stop at the end of the label line.
    "stop_words_list": ["\n"],
    # Small budget: only a short class label is expected.
    "tokens_to_generate": 5,
    "temperature": 0.2,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    # The remaining values come from the shared notebook configuration.
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": BAD_WORDS_LIST,
    "random_seed": RANDOM_SEED,
}

# Build the request tensors, run inference, then print each cleaned completion.
inputs = prepare_inputs(classification)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for i in completions:
    print(i)

### Write Python Code

In [None]:
# One-shot code-generation prompt: an instruction with its Python solution, a
# "===" separator, then a new instruction whose "Code:" is left open.
write_code = {
    "prompt": [["Generate python code that prints each item in a list.\n\nCode:\nitem_list = [1,2,3,4,5]\nfor i in item_list:\n\tprint(i)\n\n===\n\nGenerate python code that prints each item in a list if it's an even number.\nCode:"]],
    # Stop at the example separator or at the first blank line.
    "stop_words_list": ["===","\n\n"],
    "tokens_to_generate": 64,
    "temperature": 0.7,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    # Written as a float like in every other sample; the resulting tensor is unchanged.
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    # The remaining values come from the shared notebook configuration.
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": BAD_WORDS_LIST,
    "random_seed": RANDOM_SEED,
}

# Build the request tensors, run inference, then print each cleaned completion.
inputs = prepare_inputs(write_code)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for i in completions:
    print(i)

In [None]:
def prepare_perf_analyzer(data):
    """Convert one sample-prompt dict into a JSON-serializable request entry.

    Args:
        data: Dict describing the request. Keys read here: "prompt",
            "bad_words_list", "stop_words_list", "tokens_to_generate",
            "runtime_top_k", "runtime_top_p", "beam_search_diversity_rate",
            "temperature", "len_penalty", "repetition_penalty", "random_seed",
            "is_return_log_probs", "beam_width", "start_id" and "end_id".

    Returns:
        Dict keyed by input name. INPUT_0..INPUT_3 map to
        ``{"content": [...], "shape": [...]}``; every other key maps to a flat
        list holding one value per prompt row.

    Raises:
        KeyError: If ``data`` lacks one of the keys listed above.
    """
    # Word lists are wrapped in an extra list so axis 1 holds the word count.
    bad_words_list = np.array([data["bad_words_list"]], dtype=object)
    stop_words_list = np.array([data["stop_words_list"]], dtype=object)
    input0_data = np.array(data["prompt"]).astype(object)
    batch_size = input0_data.shape[0]
    # Requested output length, one entry per prompt element.
    output0_len = np.full(input0_data.shape, data["tokens_to_generate"], dtype=np.uint32)

    def per_request(key, dtype):
        # np.full pins the dtype regardless of whether data[key] is a Python or
        # NumPy scalar, so the emitted values are always rounded/cast the same
        # way (e.g. float32 for the sampling parameters).
        return np.full((batch_size, 1), data[key], dtype=dtype).reshape(-1).tolist()

    return {
        "INPUT_0": {"content": input0_data.reshape(-1).tolist(), "shape": [1]},
        "INPUT_1": {"content": output0_len.reshape(-1).tolist(), "shape": [1]},
        "INPUT_2": {"content": bad_words_list.reshape(-1).tolist(), "shape": [bad_words_list.shape[1]]},
        "INPUT_3": {"content": stop_words_list.reshape(-1).tolist(), "shape": [stop_words_list.shape[1]]},
        "runtime_top_k": per_request("runtime_top_k", np.uint32),
        "runtime_top_p": per_request("runtime_top_p", np.float32),
        "beam_search_diversity_rate": per_request("beam_search_diversity_rate", np.float32),
        "temperature": per_request("temperature", np.float32),
        "len_penalty": per_request("len_penalty", np.float32),
        "repetition_penalty": per_request("repetition_penalty", np.float32),
        "random_seed": per_request("random_seed", np.uint64),
        "is_return_log_probs": per_request("is_return_log_probs", bool),
        "beam_width": per_request("beam_width", np.uint32),
        "start_id": per_request("start_id", np.uint32),
        "end_id": per_request("end_id", np.uint32),
    }

In [None]:
# All sample prompts defined in this notebook, in the order they appear.
raw_data_input = [
    chatbot,
    summarization,
    open_domain_qa,
    structured_data_qa,
    unstructured_data_qa,
    blog_post,
    classification,
    write_code,
]

# One converted request entry per sample, wrapped under a top-level "data" key.
# NOTE(review): presumably the input-data layout expected by Triton's
# perf_analyzer tool — confirm against its documentation.
perf_analyzer_list = [prepare_perf_analyzer(sample) for sample in raw_data_input]
perf_analyzer_data = {"data": perf_analyzer_list}

# Persist the entries next to the notebook.
with open("perf_analyzer_data.json", "w") as f:
    json.dump(perf_analyzer_data, f)

# Last expression of the cell: display what was written.
perf_analyzer_data