In [1]:
from pathlib import Path
import asyncio
import time
from IPython.display import display
from llama_server import LlamaServer

Download a quantized Llama 2 model from [Hugging Face](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/tree/main) and save it to the folder "llms". The configuration below expects the file `llama-2-7b-chat.Q5_K_M.gguf`.

#### Start the server

In [2]:
# Settings passed to LlamaServer for the quantized LLaMA 2 7B chat model.
config = dict(
    alias="LLaMA2 7B",
    model="./llms/llama-2-7b-chat.Q5_K_M.gguf",
    server_exe="../llama_server",
    system_prompt="./llama2_prompt.json",
    # NOTE(review): presumably the instruction markers wrapped around each
    # prompt -- confirm against LlamaServer.
    prefix="[INST]",
    suffix="[/INST]",
)

# Number of slots: lets us run 5 inferences in parallel.
slots = 5

# Maximum context size.
context_size = 4096

In [3]:
llm = LlamaServer(config, context_size=context_size, slots=slots)
proc = await llm.start_server()

[32m2023-11-05 20:15:00.009 GMT[0m: [1mServer online.[0m


#### Try an example

In [4]:
prompt = "Tell me the value of $\pi$."
result, _ = await llm.query(prompt, slot_id=-1, n_predict=512, stop=["User:","[INST]"])
answer = result["content"].strip()
print(f"Q: {prompt}\nA: {answer}")

Q: Tell me the value of $\pi$.
A: The value of $\pi$ is an irrational number, which means it cannot be expressed as a finite decimal or fraction. It is approximately equal to:
$$\pi \approx 3.141592653589793238462643383279502884197179769008651328230664709384460955058223172535940812848111745028410270007656757863067528000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000


#### Tokenize the prompt

In [5]:
result = await llm.tokenize(prompt)
tokens = result["tokens"]
print(tokens)

[24948, 592, 278, 995, 310, 779, 1631, 1504]


#### Detokenization

In [6]:
print(await llm.detokenize(tokens))

{'content': ' Tell me the value of $\\pi$.'}


In [7]:
# Five prompts used to compare batched and sequential inference below.
prompts = [
    "How to build a website?",
    # Raw string: "\p" is an invalid escape sequence in a normal literal and
    # raises a DeprecationWarning/SyntaxWarning; the value is unchanged.
    r"Teach me how to calculate $\pi$?",
    "Which city is the capital of China?",
    "1+1=?",
    "What is the answer to life the universe and everything?",
]

In [8]:
start = time.time()
results, _ = await llm.batch_query(prompts, n_predict=200, stop=["User:","[INST]"])
end = time.time()
print(f"Time spent: {end-start:.3f} seconds")
for prompt, result in zip(prompts, results):
    answer = result["content"].strip()
    print(f"Q: {prompt}\nA: {answer}")
    print("-"*20)

Time spent: 5.401 seconds
Q: How to build a website?
A: Building a website can be a complex process, but it can be broken down into several manageable steps. Here's a general outline of the steps involved in building a website:
1. Define your website's purpose and goals: Before you start building your website, you need to determine its purpose and what you want to achieve with it. What is the main message you want to convey? Who is your target audience? What do you want visitors to do when they visit your site?
2. Choose a domain name and web hosting: Your domain name is the address of your website (e.g., [www.yoursite.com](http://www.yoursite.com)), while web hosting is the service that allows your website to be accessed on the internet. You can choose from various web hosting providers, such as Bluehost, HostGator, or SiteGround.
3. Plan your website's structure: Determine the pages
--------------------
Q: Teach me how to calculate $\pi$?
A: Sure! There are many ways to calculate the

In [9]:
start = time.time()
results = []
for prompt in prompts:
    result, _ = await llm.query(prompt, n_predict=200, stop=["User:","[INST]"])
    results.append(result)
end = time.time()
print(f"Time spent: {end-start:.3f} seconds")
for prompt, result in zip(prompts, results):
    answer = result["content"].strip()
    print(f"Q: {prompt}\nA: {answer}")
    print("-"*20)

Time spent: 10.815 seconds
Q: How to build a website?
A: Building a website can be a complex process, but it can be broken down into several manageable steps. Here's a general outline of the steps involved in building a website:
1. Define your website's purpose and goals: Before you start building your website, you need to determine its purpose and what you want to achieve with it. What is the main message you want to convey? Who is your target audience? What do you want visitors to do when they visit your site?
2. Choose a domain name and web hosting: Your domain name is the address of your website (e.g., [www.yoursite.com](http://www.yoursite.com)), while web hosting is the service that allows your website to be viewed on the internet. You can choose from various web hosting providers, such as Bluehost, HostGator, or SiteGround.
3. Plan your website's structure: Sketch out a rough
--------------------
Q: Teach me how to calculate $\pi$?
A: Sure! Calculating $\pi$ is a classic problem

We can see that running with 5 parallel slots roughly halves the total inference time (10.8 s → 5.4 s).
The quality of the generated text remains similar.

In [10]:
# Shut the server down. As the last expression in the cell, the return value
# of stop_server() is displayed as the cell output.
# NOTE(review): presumably this is the server process's exit status, where a
# negative value would mean termination by that signal number -- confirm.
llm.stop_server()

-15