In [1]:
# !pip install unstructure

In [2]:
import nest_asyncio

nest_asyncio.apply()


In [3]:
from llama_index.readers.file import UnstructuredReader
from pathlib import Path

years = [2022, 2021, 2020, 2019]

loader = UnstructuredReader()

doc_set = {} # saves as a dictionary : year is the key
all_docs = [] # saves all dictionaries

for year in years:
    year_docs = loader.load_data(
        file=Path(f"/home/ec2-user/qa_with_llm/data/UBER/UBER_{year}.html"), split_documents=False
    ) # all the text is read. No embeddings made.

    # insert the year data
    for d in year_docs:
        d.metadata = {'year': year}
    doc_set[year] = year_docs
    all_docs.extend(year_docs)

In [2]:
print(year_docs)

NameError: name 'year_docs' is not defined

In [5]:
# Initialize simple vector indices
# This steps are not needed if the indices are already loaded!
from llama_index.core import VectorStoreIndex, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama


In [6]:
llm = Ollama(model="llama3", request_timeout=180.0)

In [7]:
Settings.chunk_size = 512
Settings.chunk_overlap = 64
Settings.embed_model = HuggingFaceEmbedding(model_name='BAAI/bge-base-en-v1.5')
Settings.llm = llm



In [8]:
# index_set = {}
# for year in years:
#     storage_context = StorageContext.from_defaults()
#     cur_index = VectorStoreIndex.from_documents(
#         doc_set[year],
#         storage_context=storage_context   
#     )
#     index_set[year] = cur_index
#     storage_context.persist(persist_dir=f"./storage/{year}")

In [9]:
# Load and index from disk
from llama_index.core import load_index_from_storage

index_set = {}

for year in years:
    storage_context = StorageContext.from_defaults(
        persist_dir=f"./storage/{year}"
    )
    cur_index = load_index_from_storage(
        storage_context,
    )
    index_set[year] = cur_index

In [10]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata

individual_query_engine_tools = [
    QueryEngineTool(
        query_engine=index_set[year].as_query_engine(),
        metadata=ToolMetadata(
            name=f"vector_index_{year}",
            description=(
                "useful for when you want to answer queries about the"
                f" {year} SEC 10-K for Uber"
            ),
        ),
    )
    for year in years
]

In [11]:
from llama_index.core.query_engine import SubQuestionQueryEngine

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=individual_query_engine_tools,
)

In [12]:
query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name="sub_question_query_engine",
        description=(
            "useful for when you want to answer queries that require analyzing"
            " multiple SEC 10-K documents for Uber"
        ),
    ),
)

In [13]:
tools = individual_query_engine_tools + [query_engine_tool]

In [14]:
from llama_index.core.agent import ReActAgent
agent = ReActAgent.from_tools(tools, verbose=True, llm=llm)

In [15]:
# response = agent.chat(
#     "Hello, I am John."
# )
# print(response)

In [16]:
response = agent.chat(
    "What were some of the biggest risk factors in 2020 for Uber?"
)
print(str(response))

[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: vector_index_2020
Action Input: {'input': 'risk factors', 'year': 2020}
[0m[1;3;34mObservation: Some of the risks that could have an adverse effect on our business, financial condition, operating results, or prospects include the COVID-19 pandemic and its impact on mitigating actions. The mobility, delivery, and logistics industries are highly competitive with well-established alternatives, low barriers to entry, and low switching costs. Our business would be harmed if Drivers were classified as employees, workers, or quasi-employees instead of independent contractors. We have incurred significant losses since inception and may not achieve profitability due to increasing operating expenses. The platform's appeal could suffer if we fail to attract or maintain a critical mass of users. Maintaining our brand and reputation is crucial for our business prospec

In [20]:
cross_query_str = (
    """Compare and contrast the risk factors described in the Uber 10-Ks across all the years. 
    Give answer in bullet points.
    """
)

response = agent.chat(cross_query_str)

[1;3;38;5;200mThought: (Implicit) I can answer without any more tools!
Answer: Here is the comparison of risk factors across all years:

**2022:**
• Regulatory environment
• Mitigating actions for COVID-19 pandemic's impact on business
• Competition from other ride-hailing and delivery companies
• High employee turnover rate
• Dependence on third-party service providers
• Cybersecurity risks due to handling of sensitive customer data

**2021:**
• Classification of drivers as employees or quasi-employees, impacting labor costs and operations
• Regulatory environment, particularly with regards to employment classification and benefits
• Competition from other ride-hailing and delivery companies
• High employee turnover rate
• Dependence on third-party service providers

**2020:**
• COVID-19 pandemic's impact on business, including reduced ridership and revenue
• Regulatory environment, particularly with regards to transportation and employment laws
• Competition from other ride-hailing 