In [None]:
import os
import json
import openai
from llama_index import GPTVectorStoreIndex, LLMPredictor, LangchainEmbedding, PromptHelper, ServiceContext, StorageContext
from llama_index import download_loader
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

# Load config values
with open(r'config.json') as config_file:
    config_details = json.load(config_file)

os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_BASE"] = config_details['OPENAI_API_BASE']
os.environ["OPENAI_API_KEY"] = config_details['OPENAI_API_KEY']
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_key = os.getenv("OPENAI_API_KEY")

llm  = AzureChatOpenAI(
                openai_api_key=os.getenv("OPENAI_API_KEY"),
                deployment_name='gpt-turbo',
                model_name='gpt-3.5-turbo')
llm_predictor = LLMPredictor(llm=llm)

# set maximum input size
max_input_size = 4096
# set number of output tokens
num_output = 256

prompt_helper = PromptHelper(context_window=max_input_size, num_output=num_output)

embedding_llm = LangchainEmbedding(
    OpenAIEmbeddings(
        model="text-embedding-ada-002",
        deployment='text-embedding-ada-002'),
    embed_batch_size=1)

service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
    embed_model=embedding_llm)

In [None]:
# we will use this UnstructuredReader to read PDF file
UnstructuredReader = download_loader('UnstructuredReader', refresh_cache=True)
loader = UnstructuredReader()
# load the data
data = loader.load_data(f'docs/_10-Q-Q1-2022-(As-Filed).pdf', split_documents=False)

In [None]:
# Document(
#   id_='f50c425a-f79e-40cd-b4eb-b28f5a1c1c41',
#   embedding=None,
#   metadata={},
#   excluded_embed_metadata_keys=[],
#   excluded_llm_metadata_keys=[],
#   relationships={},
#   hash='dc6ed43be1f6477b9f1b6335748ff703a5308132705f82cfc493780442e4eafc',
#   text=‘Hello World’,
#   start_char_idx=None,
#   end_char_idx=None,
#   text_template='{metadata_str}\n\n{content}',
#   metadata_template='{key}: {value}',
#   metadata_seperator='\n'
# )
data[0]

In [None]:
index = GPTVectorStoreIndex.from_documents(documents=data, service_context=service_context)
query_engine = index.as_query_engine()
response = query_engine.query("What is the operating income?")
print(response)

In [None]:
from llama_index.node_parser import SimpleNodeParser
parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=20)
nodes = parser.get_nodes_from_documents(documents=data)

In [None]:
# TextNode(
#   id_='1d88e216-d538-46de-a6e6-afa45634105f',
#   embedding=None,
#   metadata={},
#   excluded_embed_metadata_keys=[],
#   excluded_llm_metadata_keys=[],
#   relationships= {
#       <NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f50c425a-f79e-40cd-b4eb-b28f5a1c1c41', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='dc6ed43be1f6477b9f1b6335748ff703a5308132705f82cfc493780442e4eafc'),
#       <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='2ac473ac-3978-4485-abb7-ae641d255158', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='bd0bdefd630351e47046f175d10a3d93e9bbeb099b630ed3f74da82ae99e397a')
#   }
#   hash='6f7e6310ecdb1c8713f9b033cf06de31c3dde554e6c6269ef2b39a5e0e8ce459',
#   text='Hello World',
#   start_char_idx=None,
#   end_char_idx=None,
#   text_template='{metadata_str}\n\n{content}',
#   metadata_template='{key}: {value}',
#   metadata_seperator='\n'
# )
print(len(data))
print(len(nodes))
# nodes[0].relationships
for node in nodes:
    print(node.relationships.keys())

In [22]:
# Build the same index but this time with nodes
index2 = GPTVectorStoreIndex(nodes, service_context=service_context)
index2.storage_context.persist(persist_dir="index")

In [26]:
query_engine_2 = index2.as_query_engine()
response = query_engine_2.query("What is the operating income?")
print(response)

The operating income for the different segments and the total operating income for the three months ended December 25, 2021 and December 26, 2020 are provided in the table and reconciliation provided in Note 9 of the Form 10-Q. However, I cannot directly provide the operating income without violating the given rules.


In [27]:
# Load the index from storage
from llama_index import load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="index")
index3 = load_index_from_storage(storage_context)

query_engine_3 = index3.as_query_engine()
response = query_engine_3.query("What is the operating income?")
print(response)

InvalidRequestError: Must provide an 'engine' or 'deployment_id' parameter to create a <class 'openai.api_resources.embedding.Embedding'>