In [1]:
import os
import sys
import json

In [2]:
# Open and read the config file
with open('config.json', 'r') as config_file:
    config_data = json.load(config_file)

# Retrieve the API key from the config data
api_key = config_data['api_key']
os.environ['OPENAI_API_KEY'] = api_key

## logging

In [3]:
import tiktoken
from llama_index.callbacks import CallbackManager, TokenCountingHandler
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext

# you can set a tokenizer directly, or optionally let it default
# to the same tokenizer that was used previously for token counting
# NOTE: The tokenizer should be a function that takes in text and returns a list of tokens
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-4").encode,
    verbose=True  # set to true to see usage printed to the console
    )
callback_manager = CallbackManager([token_counter])

## create Vector Index


In [10]:
# Load you data into 'Documents' a custom type by LlamaIndex
from llama_index import SimpleDirectoryReader

documents = SimpleDirectoryReader('./data', recursive=True).load_data()

### add metadata

In [11]:
import re

# save control fields in the documents metadata
for doc in documents:

    # extract the Value of the control from the text with regex. controls are in the format: /nControl: Value/n
    contr = re.search(r'(?<=\nControl: ).*(?=\n)', doc.text)
    #get the title. it is the first line of the text
    title = doc.text.split('\n')[2]

    #check if a control value was found and write it to the metadata
    if contr:
        doc.metadata['Control'] = contr.group(0)
        print(doc.metadata['Control'] + ' : ' + title)


NULL : 3D Workspace
NULL : Activate a License
LogWindow : Activity Feed
AcceptAddSpace : Add Space
NULL : Advanced Interactions
AnalysisTabButton : Analysis Explorer
NULL : Annotation
AnnotationsTabButton : Annotations Explorer
AnnotationsList : Annotations List
NULL : Asynchronous collaboration
AutomaticCollabToggle : Automatic Collaboration Mode
AvailableDataListView : Available Data
ProjectAvailableOfflineToggle : Available Offline
AvailableSessionsList : Available Sessions
NULL : Blocks
BreadCrumbs : Breadcrumb Trail
NULL : Cache
NULL : Callout Block
NULL : Clipping Block
CollabMenu : Collab Menu
StartCollab : Collab Start
StopCollab : Collab Stop
NULL : Collaborative Session
ConnectedUsersList : Connected Users
CopyPublicLinkToSpaceButton : Copy a Public Sharing Link to a Space
CopyPublicLinkToViewButton : Copy a Public Sharing Link to a View Block
NULL : Create a New Project
NULL : Create a User Account
NULL : Delete Data
NULL : Delete Your User Account
DeleteExplorerButton : Del

In [12]:
# This is for when building the index from nodes instead Documents:
# Nodes are usefull when the same nodes should be part of several indexes
from llama_index.node_parser import SimpleNodeParser

parser = SimpleNodeParser()

nodes = parser.get_nodes_from_documents(documents)

In [13]:
from llama_index import VectorStoreIndex
from llama_index import LangchainEmbedding, ServiceContext

service_context = ServiceContext.from_defaults(callback_manager=callback_manager) # use default settings but connect the token logger
vector_index = VectorStoreIndex.from_documents(documents, service_context=service_context)

Embedding Token Usage: 441
Embedding Token Usage: 628
Embedding Token Usage: 335
Embedding Token Usage: 329
Embedding Token Usage: 421
Embedding Token Usage: 1004
Embedding Token Usage: 545
Embedding Token Usage: 1510
Embedding Token Usage: 831
Embedding Token Usage: 279
Embedding Token Usage: 851
Embedding Token Usage: 706
Embedding Token Usage: 560
Embedding Token Usage: 757
Embedding Token Usage: 383
Embedding Token Usage: 425
Embedding Token Usage: 300
Embedding Token Usage: 565
Embedding Token Usage: 723
Embedding Token Usage: 285
Embedding Token Usage: 573
Embedding Token Usage: 499
Embedding Token Usage: 826
Embedding Token Usage: 345
Embedding Token Usage: 388
Embedding Token Usage: 1471
Embedding Token Usage: 1206
Embedding Token Usage: 9062
Embedding Token Usage: 412
Embedding Token Usage: 931
Embedding Token Usage: 1048
Embedding Token Usage: 2510
Embedding Token Usage: 1103
Embedding Token Usage: 4367
Embedding Token Usage: 3477


In [14]:
print(token_counter.total_embedding_token_count)

79751


## create a List Index

In [9]:
from llama_index import ListIndex
documents = SimpleDirectoryReader('./data/ListData', recursive=True).load_data()
nodes = parser.get_nodes_from_documents(documents)
list_index= ListIndex.from_nodes(nodes)

AttributeError: type object 'ListIndex' has no attribute 'from_nodes'

## save the index

saves the index under /storage in json format

In [15]:
vector_index.storage_context.persist()