In [1]:
import os
import sys
import json

In [2]:
# Load the local configuration file. JSON is UTF-8 by specification, so the
# file is decoded explicitly instead of with the platform's default encoding.
with open('config.json', 'r', encoding='utf-8') as config_file:
    config_data = json.load(config_file)

# Expose the API key through the OPENAI_API_KEY environment variable so the
# key itself never has to appear in the notebook.
# A config file without an 'api_key' entry raises KeyError on the next line.
api_key = config_data['api_key']
os.environ['OPENAI_API_KEY'] = api_key

## Token usage logging

In [3]:
import tiktoken
from llama_index.callbacks import CallbackManager, TokenCountingHandler
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext

# The tokenizer must be a callable that takes text and returns a list of tokens.
# Passing one explicitly is optional; when omitted, the handler falls back to
# its default tokenizer.
gpt4_tokenize = tiktoken.encoding_for_model("gpt-4").encode

# verbose=True prints the token usage to the console as it is recorded
token_counter = TokenCountingHandler(tokenizer=gpt4_tokenize, verbose=True)

# The callback manager is what gets handed to the service context later on
callback_manager = CallbackManager([token_counter])

## create Vector Index for Actions


In [4]:
# Load your data into 'Documents', a custom type provided by LlamaIndex.
# recursive=True also picks up files in sub-folders of ./data/Actions.
from llama_index import SimpleDirectoryReader

actions_reader = SimpleDirectoryReader(input_dir='./data/Actions', recursive=True)
documents = actions_reader.load_data()

### add metadata

In [5]:
import re

# Captures the text that follows "description: " on a line which is both
# preceded and followed by a newline. A description on the very first line,
# or on a last line without a trailing newline, is therefore not matched.
description_pattern = re.compile(r'(?<=\ndescription: ).*(?=\n)')

# Save the description and the name of each document in its metadata
for doc in documents:
    contr = description_pattern.search(doc.text)

    # Documents without a description line are left untouched
    if not contr:
        continue

    # The name is taken from the third line (index 2) of the text.
    # It is looked up only after a successful match: a match implies the text
    # contains at least two newlines, so index 2 always exists here. Looking it
    # up unconditionally raised IndexError for documents shorter than three lines.
    title = doc.text.split('\n')[2]

    doc.metadata['short_description'] = contr.group(0)
    doc.metadata['name'] = title
    print(doc.metadata['short_description'] + ' : ' + title)


creates a new space for the user. : LeiaActionsCreateSpace
changes the background color of the LiquidEarth Workspace. : LeiaActionsSwitchBackgroundColor


In [6]:
from llama_index.node_parser import SimpleNodeParser

# Alternative route: build an index from nodes instead of Documents.
# Nodes are useful when the same nodes should be part of several indexes.
# NOTE(review): `nodes` is not read by any later cell visible in this notebook.
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents=documents)

In [7]:
from llama_index import LangchainEmbedding, ServiceContext, VectorStoreIndex

# Default service settings, with the token logger's callback manager attached
service_context = ServiceContext.from_defaults(
    callback_manager=callback_manager,
)

# Vector index over the Actions documents loaded above
actions_index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

Embedding Token Usage: 329


In [8]:
# Load everything under ./data/Other (sub-folders included).
# NOTE: this rebinds `documents`. Earlier cells that read `documents` will see
# these files instead of the Actions files if they are re-run after this cell,
# so run the notebook top to bottom.
documents = SimpleDirectoryReader('./data/Other', recursive=True).load_data()


# Save the description and the name of each document in its metadata
for doc in documents:

    # Capture the text after "description: " on a line that is both preceded
    # and followed by a newline
    contr = re.search(r'(?<=\ndescription: ).*(?=\n)', doc.text)

    # Documents without a description line are left untouched
    if not contr:
        continue

    # The name is taken from the third line (index 2) of the text.
    # It is looked up only after a successful match: a match implies the text
    # contains at least two newlines, so index 2 always exists here. Looking it
    # up unconditionally raised IndexError for documents shorter than three lines.
    title = doc.text.split('\n')[2]

    doc.metadata['short_description'] = contr.group(0)
    doc.metadata['name'] = title
    print(doc.metadata['short_description'] + ' : ' + title)

# Vector index over these documents, sharing the service context (and therefore
# the token counter) already used for the Actions index
docu_index = VectorStoreIndex.from_documents(documents, service_context=service_context)

Embedding Token Usage: 605
Embedding Token Usage: 329
Embedding Token Usage: 310
Embedding Token Usage: 395
Embedding Token Usage: 975
Embedding Token Usage: 523
Embedding Token Usage: 1458
Embedding Token Usage: 811
Embedding Token Usage: 257
Embedding Token Usage: 835
Embedding Token Usage: 681
Embedding Token Usage: 540
Embedding Token Usage: 728
Embedding Token Usage: 365
Embedding Token Usage: 404
Embedding Token Usage: 280
Embedding Token Usage: 548
Embedding Token Usage: 692
Embedding Token Usage: 263
Embedding Token Usage: 537
Embedding Token Usage: 454
Embedding Token Usage: 799
Embedding Token Usage: 326
Embedding Token Usage: 371
Embedding Token Usage: 1451
Embedding Token Usage: 1182
Embedding Token Usage: 9062
Embedding Token Usage: 412
Embedding Token Usage: 931
Embedding Token Usage: 1048
Embedding Token Usage: 2510
Embedding Token Usage: 1103
Embedding Token Usage: 4367
Embedding Token Usage: 3477


In [9]:
# Embedding-token total reported by the token counter attached to the
# callback manager used for both indexes
embedding_tokens_used = token_counter.total_embedding_token_count
print(embedding_tokens_used)

39358


## save the indexes

Saves both indexes in JSON format under `./storage` (`./storage/actions` and `./storage/documentation`).

In [10]:
# Write each index's storage context to its own directory
actions_index.storage_context.persist(persist_dir="./storage/actions")
docu_index.storage_context.persist(persist_dir="./storage/documentation")