In [1]:
import numpy as np
import re
import tritonclient.http as httpclient

from tritonclient.utils import np_to_triton_dtype

In [2]:
# Connection settings for the Triton HTTP client.
URL = "localhost:8000"
MODEL = "ensemble"

# Values shared by the sample requests further down in this notebook.
IS_RETURN_LOG_PROBS = True
START_ID = 220
END_ID = 50256
RANDOM_SEED = 1

# A single client instance is reused by every request cell.
client = httpclient.InferenceServerClient(url=URL, verbose=False, concurrency=1)

In [3]:
def prepare_tensor(name, input):
    """Wrap a NumPy array in a Triton HTTP ``InferInput``.

    Args:
        name: Name given to the input tensor.
        input: NumPy array holding the tensor data. Its ``shape`` and
            ``dtype`` are used to describe the tensor.

    Returns:
        The ``httpclient.InferInput`` with ``input`` attached as its data.
    """
    # `input` shadows the builtin, but the parameter name is kept so that
    # callers passing it by keyword keep working.
    triton_dtype = np_to_triton_dtype(input.dtype)
    infer_input = httpclient.InferInput(name, input.shape, triton_dtype)
    infer_input.set_data_from_numpy(input)
    return infer_input

In [4]:
def prepare_inputs(data):
    """Build the list of input tensors for one text-generation request.

    Args:
        data: Mapping that provides the keys ``prompt``,
            ``tokens_to_generate``, ``bad_words_list``, ``stop_words_list``,
            ``runtime_top_k``, ``runtime_top_p``,
            ``beam_search_diversity_rate``, ``temperature``, ``len_penalty``,
            ``repetition_penalty``, ``random_seed``, ``is_return_log_probs``,
            ``beam_width``, ``start_id`` and ``end_id``.

    Returns:
        A list with one ``prepare_tensor`` result per input, in the order
        ``INPUT_0`` .. ``INPUT_3`` followed by the sampling parameters.

    Raises:
        KeyError: If ``data`` lacks one of the keys listed above.
    """
    input0_data = np.array(data["prompt"]).astype(object)
    # Every scalar setting is repeated once per row of the prompt array.
    per_row_shape = (input0_data.shape[0], 1)

    def filled(key, dtype, shape=per_row_shape):
        # np.full pins the dtype explicitly. Building the array as
        # `scalar * ones.astype(dtype)` leaves the final dtype to NumPy's
        # promotion rules, so it could silently differ from `dtype`
        # depending on the type of the scalar that was supplied.
        return np.full(shape, data[key], dtype=dtype)

    # The word lists are wrapped in one outer list, so these arrays always
    # have a leading dimension of 1.
    # NOTE(review): confirm this is what the server expects when `prompt`
    # holds more than one row.
    bad_words_list = np.array([data["bad_words_list"]], dtype=object)
    stop_words_list = np.array([data["stop_words_list"]], dtype=object)

    # The requested output length has the same shape as the prompt array.
    output0_len = filled("tokens_to_generate", np.uint32, shape=input0_data.shape)

    named_arrays = [
        ("INPUT_0", input0_data),
        ("INPUT_1", output0_len),
        ("INPUT_2", bad_words_list),
        ("INPUT_3", stop_words_list),
        ("runtime_top_k", filled("runtime_top_k", np.uint32)),
        ("runtime_top_p", filled("runtime_top_p", np.float32)),
        ("beam_search_diversity_rate", filled("beam_search_diversity_rate", np.float32)),
        ("temperature", filled("temperature", np.float32)),
        ("len_penalty", filled("len_penalty", np.float32)),
        ("repetition_penalty", filled("repetition_penalty", np.float32)),
        ("random_seed", filled("random_seed", np.uint64)),
        ("is_return_log_probs", filled("is_return_log_probs", bool)),
        ("beam_width", filled("beam_width", np.uint32)),
        ("start_id", filled("start_id", np.uint32)),
        ("end_id", filled("end_id", np.uint32)),
    ]
    return [prepare_tensor(name, array) for name, array in named_arrays]

In [5]:
def prepare_outputs(result):
    """Decode the generated completions held in an inference result.

    Args:
        result: Object exposing ``as_numpy(name)``. The value returned for
            ``"OUTPUT_0"`` is iterated and each element is decoded from
            UTF-8 bytes.

    Returns:
        A list of ``str``, one per element of ``OUTPUT_0``, with every
        occurrence of the ``<|endoftext|>`` marker removed.
    """
    completions = result.as_numpy("OUTPUT_0")
    # The marker is a fixed literal, so plain substring replacement is
    # enough. The previous regex was written in a non-raw string with the
    # invalid escape sequence `\|`, which newer Python versions warn about.
    return [
        completion.decode("utf-8").replace("<|endoftext|>", "")
        for completion in completions
    ]

## Sample Prompts

### Tuning Parameters
| **Parameter** | **Description** |
|---|---|
| Number of Tokens | _Specifies how much text to generate. Tokens can be either an entire word, or parts of words. For English, 100 tokens form approximately 75 words._ |
| Temperature | _Controls the randomness of selecting the next token during text generation. Lower values reduce randomness, suitable for tasks with a correct answer such as question answering or summarization. Higher values increase randomness, suitable for tasks that require creativity. The [0.5, 0.8] range is a good starting point for experimentation._ |
| Top K | _Controls the randomness of selecting the next token during text generation by keeping only the K highest-probability tokens, from which the next token is selected at random. Lower values reduce randomness, suitable for tasks with a correct answer such as question answering or summarization. Higher values increase randomness, suitable for tasks that require creativity. 0 means Top K is not used. 1 means greedy decoding, that is, always selecting the most probable token next._ |
| Top P | _Controls the randomness of selecting the next token during text generation. This determines the minimum number of highest-probability tokens whose probabilities sum to or exceed the Top P value, from which the next token will be selected at random. Lower values reduce randomness, suitable for tasks with a correct answer such as question answering or summarization. Higher values increase randomness, suitable for tasks that require creativity._ |
| Repetition Penalty | _How much to penalize tokens based on how frequently they occur in the text. A value of 1 means no penalty, while values larger than 1 discourage repeated tokens._ |
| Length Penalty | _Only applies to beam search, that is, when the beam width is >1. Larger values penalize long candidates more heavily thus preferring shorter candidates._ |
| Beam Search Diversity Rate | _Only applies to beam search, that is, when the beam width is >1. A higher value encourages beam search to return a more diverse set of candidates._ |
| Beam Width | _The number of concurrent candidates to keep track of during beam search. Higher values increase the chance of finding a good output but also require more computation. Streaming is supported only when the beam width is set to 1._ |
| Random Seed | _The model generates random results. Changing the random seed alone will produce a different response with similar characteristics. It is possible to reproduce results by fixing the random seed (assuming all other hyperparameters are also fixed)._ |
| Stop Words | _A set of character sequences. As soon as the model generates any of them, the API stops generating further text, even if the output length has not yet reached the specified number of tokens. It is useful to design a stopping template in the examples given to the model so that it can learn to stop appropriately upon completing an intended task._ |

### Summarization

In [6]:
# Few-shot summarization request: one worked article/summary pair followed
# by a second article whose summary the model is asked to write.
# The prompt is kept as a single string literal on one line; a raw line
# break inside a "..." literal is a syntax error.
summarization = {
    "prompt": [["Summarize the following article:\nArticle: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.\n\nSummary: The Transformer architecture based solely on the attention mechanism deliver superior quality on several translation tasks while being more parallelizable and requiring significantly less time to train compared to recurrence and convolution alternatives.\n\n===\n\nSummarize the following article:\nArticle: In recent years, supervised learning with convolutional networks (CNNs) has seen huge adoption in computer vision applications. Comparatively, unsupervised learning with CNNs has received less attention. In this work we hope to help bridge the gap between the success of CNNs for supervised learning and unsupervised learning. We introduce a class of CNNs called deep convolutional generative adversarial networks (DCGANs), that have certain architectural constraints, and demonstrate that they are a strong candidate for unsupervised learning. Training on various image datasets, we show convincing evidence that our deep convolutional adversarial pair learns a hierarchy of representations from object parts to scenes in both the generator and discriminator. Additionally, we use the learned features for novel tasks - demonstrating their applicability as general image representations.\n\nSummary:"]],
    # "===" is the separator used between the examples in the prompt.
    "stop_words_list": ["===","\n\n"],
    "tokens_to_generate": 64,
    "temperature": 0.5,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    "random_seed": RANDOM_SEED,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": [""],
}

inputs = prepare_inputs(summarization)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for completion in completions:
    print(completion)

Summarize the following article:
Article: The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from t

### AI Chatbot QA

In [7]:
# Chat-style request: a persona description and a short dialogue that ends
# with an open "Misty:" turn for the model to complete.
chatbot = {
    "prompt": [["Misty is a cheerful AI assistant and companion, created by NVIDIA engineers. Misty is clever and helpful, and will do everything it can to cheer you up:\n\nYou: How are you feeling?\nMisty: I'm feeling great, how may I help you today?\nYou: Can you please suggest a movie?\nMisty: How about \"The Martian\". It's a sci-fi movie about an astronaut getting stranded on Mars!\nYou: That's cool! But i'm in the mood for watching comedy today\nMisty:"]],
    # "You:" is the prefix of the user's turns in the prompt.
    "stop_words_list": ["You:"],
    "tokens_to_generate": 40,
    "temperature": 0.5,
    "runtime_top_k": 2,
    "runtime_top_p": 1.0,
    "random_seed": RANDOM_SEED,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": [""],
}

# Send the request and print each decoded completion on its own line.
inputs = prepare_inputs(chatbot)
result = client.infer(model_name=MODEL, inputs=inputs)
completions = prepare_outputs(result)
for completion in completions:
    print(completion)

Misty is a cheerful AI assistant and companion, created by NVIDIA engineers. Misty is clever and helpful, and will do everything it can to cheer you up:

You: How are you feeling?
Misty: I'm feeling great, how may I help you today?
You: Can you please suggest a movie?
Misty: How about "The Martian". It's a sci-fi movie about an astronaut getting stranded on Mars!
You: That's cool! But i'm in the mood for watching comedy today
Misty: How about "The Hangover"?
You:


### Write Code

In [8]:
# Code-generation request: a one-line instruction followed by the "==="
# separator.
write_code = {
    "prompt": [["Write a python3 script that prints each item in a list.\n\n===\n\n"]],
    "stop_words_list": ["===","\n\n"],
    "tokens_to_generate": 128,
    "temperature": 0.3,
    "runtime_top_k": 32,
    "runtime_top_p": 0.9,
    # Use the shared seed constant and a float diversity rate, as the other
    # sample requests do; the values themselves are unchanged.
    "random_seed": RANDOM_SEED,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": [""],
}

inputs = prepare_inputs(write_code)
result = client.infer(MODEL, inputs)
completions = prepare_outputs(result)
for completion in completions:
    print(completion)

Write a python3 script that prints each item in a list.

===

    >>> my_list = [1, 2, 3]
    >>> for item in my_list:
    ...     print(item)
    ...
    1
    2
    3

===


### Unstructured Q & A

In [9]:
# Question answering over free text: several paragraphs of context followed
# by a "Q:" line and an open "A:" for the model to complete.
unstructured_qa = {
    "prompt": [["Built with 80 billion transistors using a cutting-edge TSMC 4N process custom tailored for NVIDIA’s accelerated compute needs, H100 is the world’s most advanced chip ever built. It features major advances to accelerate AI, HPC, memory bandwidth, interconnect, and communication at data center scale.\n\nThe Hopper architecture’s second-generation MIG supports multi-tenant, multi-user configurations in virtualized environments, securely partitioning the GPU into isolated, right-size instances to maximize quality of service (QoS) for 7X more secured tenants.\n\nThe Transformer Engine uses software and Hopper Tensor Core technology designed to accelerate training for models built from the world’s most important AI model building block, the transformer. Hopper Tensor Cores can apply mixed FP8 and FP16 precisions to dramatically accelerate AI calculations for transformers.\n\nThe NVLink Switch System enables the scaling of multi-GPU input/output (IO) across multiple servers at 900 gigabytes per second (GB/s) bidirectional per GPU, over 7X the bandwidth of PCIe Gen5. The system supports clusters of up to 256 H100s and delivers 9X higher bandwidth than InfiniBand HDR on the NVIDIA Ampere architecture.\n\nNVIDIA Confidential Computing is a built-in security feature of Hopper that makes NVIDIA H100 the world’s first accelerator with confidential computing capabilities. Users can protect the confidentiality and integrity of their data and applications in use while accessing the unsurpassed acceleration of H100 GPUs.\n\nHopper’s DPX instructions accelerate dynamic programming algorithms by 40X compared to CPUs and 7X compared to NVIDIA Ampere architecture GPUs. This leads to dramatically faster times in disease diagnosis, real-time routing optimizations, and graph analytics.\n\nQ: How many h100 GPUs can I connect with nvlink?\nA:"]],
    # "Q:" is the prefix of the question line in the prompt.
    "stop_words_list": ["Q:"],
    "tokens_to_generate": 32,
    "temperature": 0.2,
    "runtime_top_k": 0,
    "runtime_top_p": 1.0,
    "random_seed": RANDOM_SEED,
    "beam_search_diversity_rate": 0.0,
    "beam_width": 1,
    "repetition_penalty": 1.0,
    "len_penalty": 1.0,
    "is_return_log_probs": IS_RETURN_LOG_PROBS,
    "start_id": START_ID,
    "end_id": END_ID,
    "bad_words_list": [""],
}

# Send the request and print each decoded completion on its own line.
inputs = prepare_inputs(unstructured_qa)
result = client.infer(model_name=MODEL, inputs=inputs)
completions = prepare_outputs(result)
for completion in completions:
    print(completion)

Built with 80 billion transistors using a cutting-edge TSMC 4N process custom tailored for NVIDIA’s accelerated compute needs, H100 is the world’s most advanced chip ever built. It features major advances to accelerate AI, HPC, memory bandwidth, interconnect, and communication at data center scale.

The Hopper architecture’s second-generation MIG supports multi-tenant, multi-user configurations in virtualized environments, securely partitioning the GPU into isolated, right-size instances to maximize quality of service (QoS) for 7X more secured tenants.

The Transformer Engine uses software and Hopper Tensor Core technology designed to accelerate training for models built from the world’s most important AI model building block, the transformer. Hopper Tensor Cores can apply mixed FP8 and FP16 precisions to dramatically accelerate AI calculations for transformers.

The NVLink Switch System enables the scaling of multi-GPU input/output (IO) across multiple servers at 900 gigabytes per sec