In [1]:
import os
import json
import openai
from langchain.embeddings import OpenAIEmbeddings
from llama_index.llms import AzureOpenAI
from llama_index import LangchainEmbedding
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index import GPTVectorStoreIndex, LLMPredictor, PromptHelper
import logging
import sys

In [2]:
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_TYPE = os.getenv('OPENAI_API_TYPE')
OPENAI_API_VERSION = os.getenv('OPENAI_API_VERSION')
OPENAI_API_BASE = os.getenv('OPENAI_API_BASE')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
openai.api_base = os.getenv("OPENAI_API_BASE")  # Your Azure OpenAI resource's endpoint value.
openai.api_key = os.getenv("OPENAI_API_KEY")


In [3]:
print(os.getenv("OPENAI_API_TYPE"))

azure


In [4]:
logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [6]:
# download files
!mkdir data_10k
!wget https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1 -O data_10k/UBER.zip
!unzip data_10k/UBER.zip -d data_10k

--2023-07-16 16:52:15--  https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.4.18
Connecting to www.dropbox.com (www.dropbox.com)|162.125.4.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/dl/948jr9cfs7fgj99/UBER.zip [following]
--2023-07-16 16:52:15--  https://www.dropbox.com/s/dl/948jr9cfs7fgj99/UBER.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc22a8628412f805c791a4c3db9c.dl.dropboxusercontent.com/cd/0/get/B_8zaL9CSz6R1bE0MTP9jhm19WiMVuwO4wvXWXD54ZyRg7J01UaJlvv5BDnuTRyARjnFhZJeuUj6KiVZOuDj5442DQdfjQCjCu6yixEyXh8nhrT5TpPa2M5j4n50LtcIUoRwUYNhnvgWmX0zMtiApj6zxIOzv-G_jPxoxk3iiYibLg/file?dl=1# [following]
--2023-07-16 16:52:15--  https://uc22a8628412f805c791a4c3db9c.dl.dropboxusercontent.com/cd/0/get/B_8zaL9CSz6R1bE0MTP9jhm19WiMVuwO4wvXWXD54ZyRg7J01UaJlvv5BDnuTRyARjnFhZJeuUj6KiVZOuDj5442DQdfjQCjCu6yixEyXh8nhrT5TpPa2M

In [7]:
# set text wrapping
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [9]:
from llama_index import download_loader, GPTVectorStoreIndex
from pathlib import Path

In [10]:
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

Collecting unstructured (from -r /Users/yashdixit/opt/anaconda3/envs/gpt_basic/lib/python3.10/site-packages/llama_index/readers/llamahub_modules/file/unstructured/requirements.txt (line 1))
  Downloading unstructured-0.8.1-py3-none-any.whl (1.4 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 8.9 MB/s eta 0:00:001
[?25hCollecting nltk (from -r /Users/yashdixit/opt/anaconda3/envs/gpt_basic/lib/python3.10/site-packages/llama_index/readers/llamahub_modules/file/unstructured/requirements.txt (line 2))
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 11.7 MB/s eta 0:00:00
[?25hCollecting chardet (from unstructured->-r /Users/yashdixit/opt/anaconda3/envs/gpt_basic/lib/python3.10/site-packages/llama_index/readers/llamahub_modules/file/unstructured/requirements.txt (line 1))
  Downloading chardet-5.1.0-py3-none-any.whl (199 kB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.1/199.1 kB 5.7 MB/s eta 0:00:00
[

In [12]:
loader = UnstructuredReader()
doc_set = {}
all_docs = []
years = [2022, 2021, 2020, 2019]
for year in years:
    year_docs = loader.load_data(file=Path(f'./data_10k/UBER/UBER_{year}.html'), split_documents=False)
    # insert year metadata into each year
    for d in year_docs:
        d.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs.extend(year_docs)

[nltk_data] Downloading package punkt to /Users/yashdixit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yashdixit/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...
INFO:unstructured:Reading document from string ...
Reading document from string ...
INFO:unstructured:Reading document ...
Reading document ...


In [13]:
llm = AzureOpenAI(engine="gpt-35-turbo", model="gpt-3.5-turbo")

# You need to deploy your own embedding model as well as your own chat completion model
embedding_llm = LangchainEmbedding(
    OpenAIEmbeddings(
        model="text-embedding-ada-002",
        deployment="text-embedding-ada-002",
        openai_api_key=openai.api_key,
        openai_api_base=openai.api_base,
        openai_api_type=openai.api_type,
        openai_api_version=openai.api_version,
    ),
    embed_batch_size=1,
)

# documents = SimpleDirectoryReader("data").load_data()

In [16]:
from llama_index import set_global_service_context

service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embedding_llm,
    chunk_size_limit=512
)

set_global_service_context(service_context)

chunk_size_limit is deprecated, please specify chunk_size instead


In [18]:
# initialize simple vector indices + global vector index
# NOTE: don't run this cell if the indices are already loaded! 
index_set = {}
for year in years:
    cur_index = VectorStoreIndex.from_documents(doc_set[year], service_context=service_context)
    index_set[year] = cur_index
    # cur_index.save_to_disk(f'index_{year}.json')
    

In [20]:
### ONLY RUN THIS IF YOU WANT TO ASK GLOBAL QUESTIONS ACROSS ALL YEARS, AS MENTIONED BELOW
# global_index = VectorStoreIndex.from_documents(all_docs, service_context=service_context)

In [24]:
response = (index_set[2020].as_query_engine(similarity_top_k=3)).query("What were some of the biggest risk factors in 2020?")

In [25]:
print(response)

Some of the biggest risk factors in 2020 were the adverse impact of the COVID-19 pandemic on the business and operations, including reduced demand for mobility offerings, changes in travel behavior and demand, and the need for remote work arrangements. Other risk factors included privacy, cybersecurity, and fraud risks, regulatory challenges, workforce reductions, and changes to pricing models. The uncertainty and unpredictability of the pandemic's impact on future business operations, liquidity, financial condition, and results of operations were also significant risk factors.


In [None]:
##ASK GLOBAL QUESTIONS ACROSS ALL YEARS

# risk_query_str = "What are some of the biggest risk factors in each year?"
# response = (global_index.as_query_engine(similarity_top_k=3)).query(risk_query_str)