In [4]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core.embeddings.utils import resolve_embed_model
from llama_index.llms.ollama import Ollama
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core import Settings
import json
import pandas as pd
import random
from llama_cpp import Llama

In [5]:
# Show the GPU state (driver/CUDA version, memory usage) before any model is loaded.
!nvidia-smi

Wed Mar  6 11:25:54 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
| 22%   22C    P8    14W / 215W |     15MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
# One-off index build, kept for reference (disabled): embeds the documents in
# data/docs with the bge-small-en-v1.5 model and persists the index to
# data/persist_dir, the directory the index-loading cell reads from.
# documents = SimpleDirectoryReader("data/docs").load_data()

# embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

# service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)

# index = VectorStoreIndex.from_documents(
#     documents, show_progress=True, service_context=service_context
# )

# index.storage_context.persist(persist_dir="data/persist_dir")

In [7]:
# Alternative backend (disabled): serve the model through Ollama instead.
# llm = Ollama(model="mistral", request_timeout=180.0) # , base_url="http://62.80.172.138:11434"

# Local GGUF model loaded through llama-cpp-python with a 2048-token context.
# Other quantisation tried: mistral-7b-instruct-v0.2.Q4_K_M.gguf
# NOTE(review): n_gpu_layers is not passed, so the library default applies —
# confirm whether GPU offload is intended.
llm = Llama(
    model_path='models/mistral-7b-instruct-v0.2.Q4_0.gguf',
    n_ctx=2048,
    verbose=True,
)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2080, compute capability 7.5, VMM: yes
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from models/mistral-7b-instruct-v0.2.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:        

In [8]:
# Smoke test: a short chat completion (capped at 60 tokens) to check the model responds.
llm.create_chat_completion(
    messages=[
        {'role': 'user', 'content': 'Why is sky blue?'},
    ],
    max_tokens=60,
)


llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =      23.68 ms /    60 runs   (    0.39 ms per token,  2534.10 tokens per second)
llama_print_timings: prompt eval time =    1671.87 ms /    14 tokens (  119.42 ms per token,     8.37 tokens per second)
llama_print_timings:        eval time =    9611.85 ms /    59 runs   (  162.91 ms per token,     6.14 tokens per second)
llama_print_timings:       total time =   11417.82 ms /    73 tokens


{'id': 'chatcmpl-31c12d74-cd69-4d46-9d32-5256a85685eb',
 'object': 'chat.completion',
 'created': 1709717165,
 'model': 'models/mistral-7b-instruct-v0.2.Q4_0.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': " The color of the sky appears blue due to a process called Rayleigh scattering. As sunlight reaches Earth's atmosphere, it interacts with different gases and particles in the air. Blue light has a shorter wavelength and gets scattered more easily than other colors because it travels in smaller, shorter"},
   'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 14, 'completion_tokens': 60, 'total_tokens': 74}}

In [9]:
# Use the same embedding model name as the (disabled) index-build cell so that
# query embeddings match the persisted document embeddings.
Settings.embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

# Re-create the storage context from the directory the index was persisted to.
storage_context = StorageContext.from_defaults(
    persist_dir="data/persist_dir",
)

# Load the persisted vector index.
index = load_index_from_storage(
    storage_context,
    show_progress=True,
)

In [10]:
# Retrieve only the single best-matching document: the generation loop reads
# retrieved_nodes[0] and nothing else.
retriever = VectorIndexRetriever(index=index, similarity_top_k=1)

In [38]:
# Shape of the expected answer as shown to the model: a method name plus a
# free-form params object.
json_scheme_prompt = {
    "method": {"type": "string"},
    "params": {"type": "object"},
}

# Expected answers for the two few-shot examples embedded in the prompt.
example_1_json = {
    "method": "Cover.Open",
    "params": {"id": 2},
}

example_2_json = {
    "method": "Cover.GetStatus",
    "params": {"id": 0},
}

# Read the instruction that opens every prompt. The encoding is given explicitly
# so the prompt text does not depend on the platform's default locale encoding.
with open('data/prompts/validation/instruction.md', encoding='utf-8') as f:
    instruction = f.read()

# Values substituted into base_prompt_template: the instruction text, the output
# scheme sentence and two worked examples (devices, method description, command
# and the expected JSON answer).
variables = {
    "instruction": instruction,
    "json_scheme": "The output JSON should follow the next scheme: " + json.dumps(json_scheme_prompt),
    # NOTE(review): "devices" has no matching placeholder in base_prompt_template,
    # so it does not appear in the rendered prompt — confirm whether it is still needed.
    "devices": """Cover id=1""",
    # Few-shot example 1: open a cover.
    "example_1": """Devices: Cover id=2
Methods:
API method 1: Cover.Open
Description: Preconditions: Cover will not accept the command if: An overvoltage error is set at the time of the request. An undervoltage error is set at the time of the request. An overtemp error is set at the time of the request. An engaged safety_switch prohibits movement in the requested direction. Cover calibration is running at the time of the request. Properties: [{'name': 'id', 'type': 'number', 'description': 'The numeric ID of the Cover component instance'}, {'name': 'duration', 'type': 'number', 'description': 'If duration is not provided, Cover will fully open, unless it times out because of maxtime_open first. If duration (seconds) is provided, Cover will move in the open direction for the specified time. duration must be in the range [0.1..maxtime_open]Optional'}] Response: null on success; error if the request can not be executed or failed 
Command: Open the cover.
JSON: """ + json.dumps(example_1_json),
    # Few-shot example 2: query a cover's status.
    "example_2": """Devices: Cover id=0
Methods: 
API method 1: Cover.GetStatus
Description: Properties: [{'name': 'id', 'type': 'number', 'description': 'The numeric ID of the Cover component instance'}] Find more about the status properties in status section 
Command: Get cover status.
JSON: """ + json.dumps(example_2_json),
}

In [39]:
# Shared preamble: instruction, output scheme and the two worked examples.
base_prompt_template = """
{instruction}
{json_scheme}

{example_1}

{example_2}
"""

# Per-request part. It follows the layout of the worked examples and stops at
# "JSON:", leaving the answer for the model to produce.
user_prompt_template = """Devices: {env}
Methods:
{methods_description}
Command: {user_cmd}
JSON:
"""

# The preamble is the same for every request, so it is rendered once here.
base_prompt = base_prompt_template.format_map(variables)

print(base_prompt)


You are a helpful AI Assistant that controls the devices in a house. For a given user command create a corresponding JSON object. Don't add properties with null value in output JSON object. Output must be strictly in JSON format.
The output JSON should follow the next scheme: {"method": {"type": "string"}, "params": {"type": "object"}}

Devices: Cover id=2
Methods:
API method 1: Cover.Open
Description: Preconditions: Cover will not accept the command if: An overvoltage error is set at the time of the request. An undervoltage error is set at the time of the request. An overtemp error is set at the time of the request. An engaged safety_switch prohibits movement in the requested direction. Cover calibration is running at the time of the request. Properties: [{'name': 'id', 'type': 'number', 'description': 'The numeric ID of the Cover component instance'}, {'name': 'duration', 'type': 'number', 'description': 'If duration is not provided, Cover will fully open, unless it times out becaus

In [40]:
# import logging
# import sys

# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logger = logging.getLogger()
# logger.addHandler(logging.StreamHandler(stream=sys.stdout))

In [41]:
# logger.disabled = True

The larger Q4_K_M model generates more complex output with unnecessary parameters. Possible fix: parameter tuning. \
The Q4_0 model, in contrast, tries to escape the `_` character with `\` inside object properties. Possible fix: a grammar.

In [42]:
from llama_cpp import LlamaGrammar

In [43]:
# Disabled experiment: build the grammar from a JSON schema with
# LlamaGrammar.from_json_schema instead of reading a .gbnf file.
# json_scheme_general = {
#     "type": "object",
#     "properties": {
#         "method": {
#             "type": "string"
#         },
#         "params": {
#             "type": "object",
#             # The conversion doesn't work unless 'properties' is defined here,
#             # even though a JSON schema is valid without a 'properties' field:
#             # https://stackoverflow.com/questions/42977208/json-schema-without-properties-keyword
#             # "properties": {
#             #     "config": {
#             #         "type": "string"
#             #     }
#             # }
#         }
#     }
# }
# s = str(json_scheme_general).replace("'", '"')
# print(s)
# gr = LlamaGrammar.from_json_schema(s)
# print(gr)

In [19]:
# Read the GBNF grammar that restricts generation to JSON objects. The encoding
# is given explicitly so the read does not depend on the platform default.
with open('data/grammars/json.gbnf', encoding='utf-8') as f:
    grammar_str = f.read()
print(grammar_str)

root   ::= object
value  ::= object | array | string | number | ("true" | "false" | "null") ws

object ::=
  "{" ws (
            string ":" ws value
    ("," ws string ":" ws value)*
  )? "}" ws

array  ::=
  "[" ws (
            value
    ("," ws value)*
  )? "]" ws

string ::=
  "\"" (
    [^"\\\x7F\x00-\x1F] |
    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
  )* "\"" ws

number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws

# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n])?



In [21]:
# Compile the GBNF text into a grammar object; the generation loop passes it to
# create_chat_completion via grammar= to constrain the model's output.
grammar = LlamaGrammar.from_string(grammar_str)
print(grammar)

<llama_cpp.llama_grammar.LlamaGrammar object at 0x7fc1d433a590>


from_string grammar:
root ::= object 
object ::= [{] ws object_11 [}] ws 
value ::= object | array | string | number | value_6 ws 
array ::= [[] ws array_15 []] ws 
string ::= ["] string_18 ["] ws 
number ::= number_19 number_25 number_29 ws 
value_6 ::= [t] [r] [u] [e] | [f] [a] [l] [s] [e] | [n] [u] [l] [l] 
ws ::= ws_31 
object_8 ::= string [:] ws value object_10 
object_9 ::= [,] ws string [:] ws value 
object_10 ::= object_9 object_10 | 
object_11 ::= object_8 | 
array_12 ::= value array_14 
array_13 ::= [,] ws value 
array_14 ::= array_13 array_14 | 
array_15 ::= array_12 | 
string_16 ::= [^"\<U+0000>-<U+001F>] | [\] string_17 
string_17 ::= ["\/bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
string_18 ::= string_16 string_18 | 
number_19 ::= number_20 number_21 
number_20 ::= [-] | 
number_21 ::= [0-9] | [1-9] number_22 
number_22 ::= [0-9] number_22 | 
number_23 ::= [.] number_24 
number_24 ::= [0-9] number_24 | [0-9] 
number_25 ::= number_23 | 
number_26 ::= [eE

In [44]:
# Generate a JSON command for every dataset row and save the results.
#
# For each row: build a device environment (two random distractor devices plus
# the target device), retrieve the best-matching API method description, ask the
# model for a grammar-constrained JSON command, and record it.
limit_rows = -1  # e.g. 20; values <= 0 process the whole dataset
df = pd.read_csv('data/datasets/dataset_v0.csv')
devices = list(df['device'].unique())

# Seeded, private RNG: the distractor devices placed in each prompt are the same
# on every run, so accuracy numbers from different runs are comparable.
rng = random.Random(0)

# With the default RangeIndex produced by read_csv, head(limit_rows) selects the
# same rows as stopping the loop at index limit_rows - 1.
rows = df.head(limit_rows) if limit_rows > 0 else df

# Collect plain dicts and build the frame once instead of growing it row by row.
records = []
for i, row in rows.iterrows():
    # if i < 8 or i > 16:
    #     continue
    user_cmd = row['user_cmd']
    device = row['device']

    # Environment: two distractor devices (ids 1 and 2) and the target (id 444).
    other_devices = [d for d in devices if d != device]
    distractors = rng.sample(other_devices, k=2)
    env = f'{distractors[0]} id=1, {distractors[1]} id=2, {device} id=444'

    # The retriever returns the single closest document; its file name (minus
    # the .md extension) is recorded as the retrieved method.
    retrieved_nodes = retriever.retrieve(user_cmd)
    top_node = retrieved_nodes[0]
    methods_description = f'API method 1:\n{top_node.text}'
    method = top_node.metadata['file_name'].replace('.md', '')

    user_prompt = user_prompt_template.format(
        env=env,
        methods_description=methods_description,
        user_cmd=user_cmd,
    )
    query = base_prompt + '\n\n' + user_prompt
    response = llm.create_chat_completion(
        messages=[
            {'role': 'user', 'content': query}
        ],
        grammar=grammar
        # response_format={"type": "json_object",
        #                 "schema": json_scheme_general} #
    )
    response_text = response['choices'][0]['message']['content']
    # Undo markdown-style escaping of underscores. The backslash is doubled so
    # the literal is a valid escape sequence; the replaced text is unchanged.
    response_text = response_text.replace('\\_', '_')
    print(response_text)

    try:
        # Round-trip through json to validate and normalise the formatting.
        json_cmd = json.dumps(json.loads(response_text))
    except json.JSONDecodeError as ex:
        # A reply that is not valid JSON (for example one cut off by the context
        # limit) must not abort the run and discard earlier results. The raw
        # text is stored so that output rows stay aligned with dataset rows.
        print(f'Row {i}: model output is not valid JSON ({ex}); storing raw text')
        json_cmd = response_text

    records.append({
        'device': device,
        'user_cmd': user_cmd,
        'mtd': method,
        'json_cmd': json_cmd,
    })

output_df = pd.DataFrame(records, columns=['device', 'user_cmd', 'mtd', 'json_cmd'])
output_df.to_csv('output/output.csv', index=False)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     155.13 ms /    22 runs   (    7.05 ms per token,   141.81 tokens per second)
llama_print_timings: prompt eval time =    3305.44 ms /   758 tokens (    4.36 ms per token,   229.32 tokens per second)
llama_print_timings:        eval time =    3592.75 ms /    21 runs   (  171.08 ms per token,     5.85 tokens per second)
llama_print_timings:       total time =    7117.97 ms /   779 tokens
Llama.generate: prefix-match hit


{"method": "Temperature.GetConfig", "params": {"id": 444}}



llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     155.11 ms /    22 runs   (    7.05 ms per token,   141.84 tokens per second)
llama_print_timings: prompt eval time =    1565.01 ms /   310 tokens (    5.05 ms per token,   198.08 tokens per second)
llama_print_timings:        eval time =    3576.11 ms /    21 runs   (  170.29 ms per token,     5.87 tokens per second)
llama_print_timings:       total time =    5359.23 ms /   331 tokens
Llama.generate: prefix-match hit


{"method": "Temperature.GetConfig", "params": {"id": 444}}



llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     195.13 ms /    28 runs   (    6.97 ms per token,   143.49 tokens per second)
llama_print_timings: prompt eval time =    1570.17 ms /   312 tokens (    5.03 ms per token,   198.70 tokens per second)
llama_print_timings:        eval time =    4602.49 ms /    27 runs   (  170.46 ms per token,     5.87 tokens per second)
llama_print_timings:       total time =    6447.82 ms /   339 tokens
Llama.generate: prefix-match hit


{"method": "Temperature.GetConfig", "params": {"id": 444, "name": null}}




llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     155.00 ms /    22 runs   (    7.05 ms per token,   141.94 tokens per second)
llama_print_timings: prompt eval time =    1564.55 ms /   310 tokens (    5.05 ms per token,   198.14 tokens per second)
llama_print_timings:        eval time =    3575.61 ms /    21 runs   (  170.27 ms per token,     5.87 tokens per second)
llama_print_timings:       total time =    5357.95 ms /   331 tokens
Llama.generate: prefix-match hit


{"method": "Temperature.GetConfig", "params": {"id": 444}}



llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     216.96 ms /    31 runs   (    7.00 ms per token,   142.89 tokens per second)
llama_print_timings: prompt eval time =    1574.84 ms /   313 tokens (    5.03 ms per token,   198.75 tokens per second)
llama_print_timings:        eval time =    5106.93 ms /    30 runs   (  170.23 ms per token,     5.87 tokens per second)
llama_print_timings:       total time =    6986.94 ms /   343 tokens
Llama.generate: prefix-match hit


{"method": "Temperature.GetConfig", "params": {"id": 444, "name": "bedroom"}}




llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     217.65 ms /    31 runs   (    7.02 ms per token,   142.43 tokens per second)
llama_print_timings: prompt eval time =    1540.43 ms /   304 tokens (    5.07 ms per token,   197.35 tokens per second)
llama_print_timings:        eval time =    5123.99 ms /    30 runs   (  170.80 ms per token,     5.85 tokens per second)
llama_print_timings:       total time =    6971.68 ms /   334 tokens
Llama.generate: prefix-match hit


{"method": "Temperature.GetConfig", "params": {"id": 444, "name": "kitchen"}}




llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     155.68 ms /    22 runs   (    7.08 ms per token,   141.32 tokens per second)
llama_print_timings: prompt eval time =    1624.65 ms /   328 tokens (    4.95 ms per token,   201.89 tokens per second)
llama_print_timings:        eval time =    3586.26 ms /    21 runs   (  170.77 ms per token,     5.86 tokens per second)
llama_print_timings:       total time =    5430.24 ms /   349 tokens
Llama.generate: prefix-match hit


{"method": "Temperature.GetConfig", "params": {"id": 444}}



llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     194.99 ms /    28 runs   (    6.96 ms per token,   143.60 tokens per second)
llama_print_timings: prompt eval time =    1555.11 ms /   309 tokens (    5.03 ms per token,   198.70 tokens per second)
llama_print_timings:        eval time =    4583.96 ms /    27 runs   (  169.78 ms per token,     5.89 tokens per second)
llama_print_timings:       total time =    6414.99 ms /   336 tokens
Llama.generate: prefix-match hit


{"method": "Temperature.GetConfig", "params": {"id": 444, "name": null}}




llama_print_timings:        load time =    1672.22 ms
llama_print_timings:      sample time =     153.73 ms /    22 runs   (    6.99 ms per token,   143.10 tokens per second)
llama_print_timings: prompt eval time =    3928.69 ms /   754 tokens (    5.21 ms per token,   191.92 tokens per second)
llama_print_timings:        eval time =    3662.15 ms /    21 runs   (  174.39 ms per token,     5.73 tokens per second)
llama_print_timings:       total time =    7809.75 ms /   775 tokens


{"method": "Input.GetStatus", "params": {"id": 444}}



In [30]:
# Display the generated commands for a quick visual check.
output_df

Unnamed: 0,device,user_cmd,mtd,json_cmd
0,Switch,Set the dining room switch to flip mode.,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
1,Switch,Set the living room switch to cycle mode.,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
2,Switch,Enable Automatic OFF for the kitchen switch wi...,Switch.GetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
3,Switch,Set the bedroom switch name to 'Bed Lamp'.,Light.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
4,Switch,Set the hallway switch power limit to 50 Watts.,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
5,Switch,Set the patio switch voltage limit to 220 Volts.,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
6,Switch,Enable autorecover voltage errors for the gara...,Switch.GetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
7,Switch,Set the office switch current limit to 8 Amperes.,Switch.SetConfig,"{""method"": ""Switch.SetConfig"", ""params"": {""id""..."
8,Temperature,Please provide me with the current temperature...,Temperature.SetConfig,"{""method"": ""Temperature.GetConfig"", ""params"": ..."
9,Temperature,Get the temperature report threshold value for...,Temperature.GetConfig,"{""method"": ""Temperature.GetConfig"", ""params"": ..."


In [26]:
import ast


def _parse_cmd(text):
    """Parse a stored ground-truth command into a Python object.

    Tries, in order: strict JSON, a Python literal (e.g. a single-quoted dict
    repr), and finally the quote-swapping approach this cell used originally.

    Raises ValueError or SyntaxError when none of the formats applies, and
    TypeError when `text` is not a string (e.g. a missing CSV value).
    """
    try:
        return json.loads(text)
    except ValueError:
        pass
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        pass
    return ast.literal_eval(text.replace("'", '"'))


# Score the saved outputs against the ground truth and write the mismatches.
output_df = pd.read_csv('output/output.csv')
incorrect_columns = ['device', 'user_cmd', 'gt_mtd', 'output_mtd', 'gt_json_cmd', 'output_json_cmd']

correct_methods = 0
correct_json_cmds = 0
incorrect_records = []
# Ground truth (`df`, loaded by the generation cell) and outputs are matched by
# row position, so both must list the commands in the same order.
for i in range(len(output_df)):
    gt = df.iloc[i]
    output = output_df.iloc[i]

    method_ok = gt['mtd'] == output['mtd']
    if method_ok:
        correct_methods += 1

    try:
        json_ok = _parse_cmd(gt['json_cmd']) == json.loads(output['json_cmd'])
    except (ValueError, SyntaxError, TypeError) as ex:
        print(ex)
        print(gt['json_cmd'])
        print(i, output['json_cmd'])
        # A command that cannot be parsed or compared is a wrong answer; it must
        # also show up in the incorrect-output report.
        json_ok = False
    if json_ok:
        correct_json_cmds += 1

    if not (method_ok and json_ok):
        incorrect_records.append({
            'device': gt['device'],
            'user_cmd': gt['user_cmd'],
            'gt_mtd': gt['mtd'],
            'output_mtd': output['mtd'],
            'gt_json_cmd': gt['json_cmd'],
            'output_json_cmd': output['json_cmd'],
        })

incorrect_output_df = pd.DataFrame(incorrect_records, columns=incorrect_columns)

# Guard against an empty output file, which would otherwise divide by zero.
total = len(output_df)
acc_methods = round(correct_methods / total, 2) if total else 0.0
acc_json_cmds = round(correct_json_cmds / total, 2) if total else 0.0

incorrect_output_df.to_csv('output/incorrect_output.csv', index=False)

# Results are appended so that successive runs can be compared.
with open('output/results', 'a') as f:
    f.write(f'Acc of methods: {acc_methods}\n'
            f'Acc of json cmds: {acc_json_cmds}\n\n'
            f'-----------------------------------\n\n')

In [1]:
from llama_cpp import Llama

In [2]:
# Load the larger Q4_K_M quantisation with the library's default settings.
llm = Llama(
    model_path='models/mistral-7b-instruct-v0.2.Q4_K_M.gguf',
)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 2080, compute capability 7.5, VMM: yes
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from models/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:      

In [3]:
# Plain text completion of the raw prompt (the response object is a
# 'text_completion'). No max_tokens is passed; the recorded run stopped after
# 16 completion tokens with finish_reason 'length'.
output = llm('Why is sky blue?')
print(output)


llama_print_timings:        load time =     572.01 ms
llama_print_timings:      sample time =       5.60 ms /    16 runs   (    0.35 ms per token,  2857.14 tokens per second)
llama_print_timings: prompt eval time =     571.62 ms /     6 tokens (   95.27 ms per token,    10.50 tokens per second)
llama_print_timings:        eval time =    2425.41 ms /    15 runs   (  161.69 ms per token,     6.18 tokens per second)
llama_print_timings:       total time =    3036.28 ms /    21 tokens


{'id': 'cmpl-3836340b-ae7d-4091-b36b-3cc92bd2af80', 'object': 'text_completion', 'created': 1709562983, 'model': 'models/mistral-7b-instruct-v0.2.Q4_K_M.gguf', 'choices': [{'text': ' Is it because of pollution, or the presence of certain gases in the atmosphere', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 6, 'completion_tokens': 16, 'total_tokens': 22}}


In [20]:
# for i in range(10):
# for i in range(10):
# Schema for the constrained answer: an object with a "numbers" array of numbers.
numbers_schema = {
    "type": "object",
    "properties": {
        "numbers": {"type": "array", "items": {"type": "number"}},
    },
}

# Ask for JSON output that conforms to the schema above.
output = llm.create_chat_completion(
    messages=[
        {'role': 'user', 'content': 'Count to 10. Respond in JSON.'},
    ],
    response_format={'type': 'json_object', "schema": numbers_schema},
)

from_string grammar:
space ::= space_1 
space_1 ::= [ ] | 
number ::= number_3 number_9 number_13 space 
number_3 ::= number_4 number_5 
number_4 ::= [-] | 
number_5 ::= [0-9] | [1-9] number_6 
number_6 ::= [0-9] number_6 | 
number_7 ::= [.] number_8 
number_8 ::= [0-9] number_8 | [0-9] 
number_9 ::= number_7 | 
number_10 ::= [eE] number_11 number_12 
number_11 ::= [-+] | 
number_12 ::= [0-9] number_12 | [0-9] 
number_13 ::= number_10 | 
numbers ::= [[] space numbers_16 numbers_18 []] space 
numbers_15 ::= number 
numbers_16 ::= numbers_15 | 
numbers_17 ::= [,] space number 
numbers_18 ::= numbers_17 numbers_18 | 
root ::= [{] space ["] [n] [u] [m] [b] [e] [r] [s] ["] space [:] space numbers [}] space 

Llama.generate: prefix-match hit

llama_print_timings:        load time =     572.01 ms
llama_print_timings:      sample time =    5160.62 ms /    41 runs   (  125.87 ms per token,     7.94 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00

In [21]:
# Show only the generated message text from the chat completion response.
print(output['choices'][0]['message']['content'])

{ "numbers": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] }
