In [None]:
! pip install llama-index nltk milvus pymilvus langchain openai python-dotenv requests

In [1]:
import ssl

import nltk

# WARNING: when the private factory exists, this makes every default HTTPS
# context in the process skip certificate verification. Presumably a
# workaround for certificate errors during the NLTK downloads below.
_unverified_context_factory = getattr(ssl, "_create_unverified_context", None)
if _unverified_context_factory is not None:
    ssl._create_default_https_context = _unverified_context_factory

# Fetch the NLTK resources; the last call stays a bare expression so its
# result is shown as the cell output.
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yujiantang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yujiantang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from llama_index import (
    VectorStoreIndex, 
    SimpleKeywordTableIndex, 
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
    StorageContext
)
from langchain.llms.openai import OpenAIChat

In [3]:
import os
from dotenv import load_dotenv
import openai
# Load environment variables via python-dotenv (presumably from a local .env
# file) before the API key is read below.
load_dotenv()
# The key is taken from the environment instead of being hard-coded;
# os.getenv returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

from llama_index.vector_stores import MilvusVectorStore
from milvus import default_server

In [4]:
# Start the embedded Milvus server from the `milvus` package; the final cell
# of the notebook stops it and calls cleanup().
default_server.start()

In [7]:
# Vector store that talks to the local Milvus server on the port it reports.
# NOTE(review): dim=1536 presumably matches the size of the embeddings produced
# by the default embedding model — confirm if the embedder is changed.
vector_store = MilvusVectorStore(
    host="localhost",
    port=default_server.listen_port,
    dim=1536,
)

In [6]:
# Wikipedia page titles to download; each one becomes its own document index.
wiki_titles = [
    "Toronto",
    "Seattle",
    "San Francisco",
    "Chicago",
    "Boston",
    "Washington, D.C.",
    "Cambridge, Massachusetts",
    "Houston",
]

In [8]:
from pathlib import Path

import requests

# Directory that receives one "<title>.txt" file per article. It is created
# once up front (no error if it already exists) rather than on every iteration.
data_path = Path('data')
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    # Request the plain-text extract of the page from the MediaWiki API.
    http_response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            # 'exintro': True,
            'explaintext': True,
        },
        # Do not hang the notebook forever on a stalled connection.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    http_response.raise_for_status()
    response = http_response.json()

    # The result is keyed by page id; only one title was requested, so take
    # the first (only) entry.
    page = next(iter(response['query']['pages'].values()))
    wiki_text = page['extract']

    # Write as UTF-8 explicitly: article text contains non-ASCII characters
    # and the platform default encoding is not guaranteed to handle them.
    with open(data_path / f"{title}.txt", 'w', encoding='utf-8') as fp:
        fp.write(wiki_text)

In [9]:
# Read each downloaded article back from disk, keyed by its wiki title
city_docs = {
    city: SimpleDirectoryReader(input_files=[f"data/{city}.txt"]).load_data()
    for city in wiki_titles
}

In [10]:
# gpt-3.5-turbo chat model at temperature 0, wrapped in an LLMPredictor and
# exposed to llama-index through a ServiceContext.
chat_llm = OpenAIChat(temperature=0, model_name="gpt-3.5-turbo")
llm_predictor_chatgpt = LLMPredictor(llm=chat_llm)
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_chatgpt,
)



In [11]:
# Storage context that uses the Milvus-backed vector_store created earlier;
# it is passed to every index built in the next cell.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [12]:
# Build one vector index per city from its loaded documents.
# NOTE(review): every index is created with the same storage_context (and so
# the same vector_store object) — confirm that sharing it across cities is intended.
city_indices = {}
for city in wiki_titles:
    city_indices[city] = VectorStoreIndex.from_documents(
        city_docs[city],
        service_context=service_context,
        storage_context=storage_context,
    )

# One-line summary text per city, later handed to the composable graph
index_summaries = {
    city: f"Wikipedia articles about {city}" for city in wiki_titles
}

In [13]:
from llama_index.indices.composability import ComposableGraph

In [14]:
# Compose the per-city indices under a keyword-table root index. Both dicts
# were filled in the same key order, so indices and summaries line up.
graph = ComposableGraph.from_indices(
    SimpleKeywordTableIndex,
    list(city_indices.values()),
    list(index_summaries.values()),
    max_keywords_per_chunk=50,
)

In [15]:
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform

# Query transform driven by the ChatGPT predictor; with verbose=True it prints
# the "Current query" / "New query" lines seen in the query cell's output.
decompose_transform = DecomposeQueryTransform(llm_predictor_chatgpt, verbose=True)

In [16]:
from llama_index.query_engine.transform_query_engine import TransformQueryEngine
# Maps an index_id to the query engine that should serve that index; the
# following cell fills it in.
custom_query_engines = {}

In [17]:
# Give every city index a query engine wrapped in the decompose transform,
# passing the index's own summary as transform metadata.
for index in city_indices.values():
    query_engine = index.as_query_engine(service_context=service_context)
    custom_query_engines[index.index_id] = TransformQueryEngine(
        query_engine,
        decompose_transform,
        transform_metadata={'index_summary': index.index_struct.summary},
    )

# The root (keyword table) index gets a plain engine that tree-summarizes the
# answers coming back from the city engines.
root_index = graph.root_index
custom_query_engines[root_index.index_id] = root_index.as_query_engine(
    retriever_mode='simple',
    response_mode='tree_summarize',
    service_context=service_context,
)

# Graph-level engine that dispatches to the per-index engines registered above
query_engine_decompose = graph.as_query_engine(
    custom_query_engines=custom_query_engines,
)

In [18]:
# Ask the decomposing engine a question that spans two city indices
airport_question = "Compare and contrast the airports in Seattle and Houston. "
response_chatgpt = query_engine_decompose.query(airport_question)

[33;1m[1;3m> Current query: Compare and contrast the airports in Seattle and Houston. 
[0m[38;5;200m[1;3m> New query: What is the name of the airport in Seattle?
[0m[33;1m[1;3m> Current query: Compare and contrast the airports in Seattle and Houston. 
[0m[38;5;200m[1;3m> New query: What is the name of the airport in Seattle?
[0m[33;1m[1;3m> Current query: Compare and contrast the airports in Seattle and Houston. 
[0m[38;5;200m[1;3m> New query: What are the major airports in Houston?
[0m[33;1m[1;3m> Current query: Compare and contrast the airports in Seattle and Houston. 
[0m[38;5;200m[1;3m> New query: What are the major airports in Houston?
[0m

In [19]:
# print() already converts its argument with str(), so no explicit cast is needed
print(response_chatgpt)

The major airport in Seattle is Seattle-Tacoma International Airport, also known as Sea-Tac Airport. In Houston, there are two major airports: George Bush Intercontinental Airport and William P. Hobby Airport.


In [None]:
# Baseline for comparison: the same graph, but the city engines are NOT
# wrapped in the decompose transform.
custom_query_engines = {
    city_index.index_id: city_index.as_query_engine(service_context=service_context)
    for city_index in city_indices.values()
}

# Root index engine configured exactly as in the decompose variant
root_index = graph.root_index
custom_query_engines[root_index.index_id] = root_index.as_query_engine(
    retriever_mode='simple',
    response_mode='tree_summarize',
    service_context=service_context,
)

query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)

In [None]:
# Same airport question as before, this time against the non-decomposing engine
airport_question = "Compare and contrast the airports in Seattle and Houston. "
response_chatgpt = query_engine.query(airport_question)
# Bare last expression so the answer text is displayed as the cell result
str(response_chatgpt)

In [None]:
# Second cross-city question for the decomposing engine
sports_question = "Compare and contrast the sports environment of Houston and Boston. "
response_chatgpt = query_engine_decompose.query(sports_question)

In [None]:
# Bare last expression so the notebook displays the answer text as the cell result.
str(response_chatgpt)

In [None]:
from pymilvus import utility, connections

# Tear-down: drop every collection on the embedded Milvus server, then shut
# the server down.
try:
    connections.connect(host="127.0.0.1", port=default_server.listen_port)
    for collection in utility.list_collections():
        utility.drop_collection(collection)
finally:
    # Always stop and clean up the embedded server, even when connecting or
    # dropping a collection raised; otherwise it would be left running.
    default_server.stop()
    default_server.cleanup()