In [6]:
! pip install llama-index nltk milvus pymilvus langchain openai python-dotenv requests llama-index-vector-stores-milvus langchain-openai



In [11]:
import nltk
import ssl

# Workaround for certificate errors when downloading NLTK data: if the private
# ssl helper exists, install it as the default HTTPS context factory.
# NOTE(review): this disables certificate verification for anything that uses
# ssl._create_default_https_context in this process — confirm that is acceptable.
if hasattr(ssl, "_create_unverified_context"):
    ssl._create_default_https_context = ssl._create_unverified_context

# Download the "stopwords" and "punkt" NLTK packages (no-op when already up to date).
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /Users/Yuhao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Yuhao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
from llama_index.core import (
    VectorStoreIndex, 
    SimpleKeywordTableIndex, 
    SimpleDirectoryReader,
    StorageContext,
    Settings
)
from llama_index.llms.openai import OpenAI

In [17]:
import os
from dotenv import load_dotenv
# Read the OpenAI key from the environment instead of embedding it in the notebook.
# load_dotenv() additionally picks up a local .env file (keep that file out of
# version control).
# NOTE: the key that used to be hardcoded in this cell must be treated as leaked
# and revoked.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )

from llama_index.vector_stores.milvus import MilvusVectorStore
from milvus import default_server

In [4]:
# Start the local Milvus server instance (`default_server` from the `milvus`
# package). The final cell of the notebook stops it and cleans it up.
default_server.start()
# To shut the server down manually instead, call: default_server.stop()

In [5]:
# Vector store backed by the local Milvus server.
# The llama-index-vector-stores-milvus integration is configured through `uri`
# rather than separate host/port arguments, so the server's actual listen port is
# written into the URI explicitly.
# dim=1536 must equal the embedding vector size — presumably the default OpenAI
# embedding model; confirm if the embedding model is changed.
vector_store = MilvusVectorStore(
    uri=f"http://localhost:{default_server.listen_port}",
    dim=1536,
)

In [6]:
# Wikipedia article titles used throughout the notebook (one index per city).
wiki_titles = [
    "Toronto",
    "Seattle",
    "San Francisco",
    "Chicago",
    "Boston",
    "Washington, D.C.",
    "Cambridge, Massachusetts",
    "Houston",
]

In [7]:
from pathlib import Path

import requests
# Download the plain-text extract of every article in `wiki_titles` and store it
# as data/<title>.txt.

# Loop-invariant: create the output directory once, tolerating an existing one.
data_path = Path('data')
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    http_response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            # 'exintro' is intentionally not set, so the whole article is requested.
            'explaintext': True,
        },
        timeout=30,  # do not let a stalled connection hang the notebook
    )
    # Surface HTTP errors here instead of failing later on an unexpected payload.
    http_response.raise_for_status()
    response = http_response.json()

    page = next(iter(response['query']['pages'].values()))
    if 'extract' not in page:
        # The API returned a page entry without text (e.g. unknown title).
        raise ValueError(f"No Wikipedia extract returned for title {title!r}")
    wiki_text = page['extract']

    # Write explicitly as UTF-8: the articles contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(data_path / f"{title}.txt", 'w', encoding='utf-8') as fp:
        fp.write(wiki_text)

In [12]:
# Read each saved article (data/<title>.txt) back in; maps title -> loaded documents.
city_docs = {
    title: SimpleDirectoryReader(input_files=[f"data/{title}.txt"]).load_data()
    for title in wiki_titles
}

In [13]:
# LLM shared by every query engine in this notebook (also registered as the
# global default through Settings.llm).
# The LlamaIndex OpenAI class selects the model through the `model` argument;
# `model_name` is not one of its parameters, so the requested model was not
# actually being passed.
llm_predictor_chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=openai_api_key)
Settings.llm = llm_predictor_chatgpt

In [14]:
# Storage context that routes vector storage to the Milvus-backed `vector_store`;
# it is passed to every VectorStoreIndex built below.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [18]:
# One vector index per city, built from that city's documents.
# NOTE(review): every index is created with the same `storage_context`, i.e. the
# same Milvus-backed vector store — confirm the per-city indices are meant to
# share one store.
city_indices = {
    city: VectorStoreIndex.from_documents(city_docs[city], storage_context=storage_context)
    for city in wiki_titles
}

# Short description of each index, keyed by the same titles.
index_summaries = {city: f"Wikipedia articles about {city}" for city in wiki_titles}

In [20]:
from llama_index.core import ComposableGraph

In [21]:
# Compose the per-city indices under a keyword-table root index. The summaries
# are passed in the same order as the indices (both dicts were filled per title).
graph = ComposableGraph.from_indices(
    SimpleKeywordTableIndex,
    list(city_indices.values()),
    list(index_summaries.values()),
    max_keywords_per_chunk=50,
)

In [55]:
from llama_index.core.indices.query.query_transform.base import DecomposeQueryTransform
# Query transform driven by the shared LLM; verbose=True prints the
# "Current query" / "New query" pairs seen in the query output.
decompose_transform = DecomposeQueryTransform(llm_predictor_chatgpt, verbose=True)

In [57]:
from llama_index.core.query_engine.transform_query_engine import TransformQueryEngine
# Maps index_id -> query engine; filled in the next cell and handed to
# graph.as_query_engine(custom_query_engines=...).
custom_query_engines = {}

In [62]:
# Register, for every city index, a query engine wrapped in the decompose
# transform; the index summary is passed along as transform metadata.
for city_index in city_indices.values():
    base_engine = city_index.as_query_engine(llm=Settings.llm)
    custom_query_engines[city_index.index_id] = TransformQueryEngine(
        base_engine,
        decompose_transform,
        transform_metadata={'index_summary': city_index.index_struct.summary},
    )

# The root (keyword table) index gets its own engine configuration.
root_index = graph.root_index
custom_query_engines[root_index.index_id] = root_index.as_query_engine(
    retriever_mode='simple',
    response_mode='tree_summarize',
    llm=Settings.llm,
)

# Graph-level engine that dispatches to the engines registered above.
query_engine_decompose = graph.as_query_engine(custom_query_engines=custom_query_engines)

In [63]:
# Ask a cross-city question through the decomposing query engine.
response_chatgpt = query_engine_decompose.query("Compare and contrast the airports in Seattle and Houston. ")

[1;3;33m> Current query: Compare and contrast the airports in Seattle and Houston. 
[0m[1;3;38;5;200m> New query: What are some key features of the airports in Houston according to Wikipedia?
[0m[1;3;33m> Current query: Compare and contrast the airports in Seattle and Houston. 
[0m[1;3;38;5;200m> New query: What are some key features of the airports in Houston according to Wikipedia?
[0m[1;3;33m> Current query: Compare and contrast the airports in Seattle and Houston. 
[0m[1;3;38;5;200m> New query: What are some key features of the airport in Seattle according to the Wikipedia article?
[0m[1;3;33m> Current query: Compare and contrast the airports in Seattle and Houston. 
[0m[1;3;38;5;200m> New query: What are some key features of the Seattle airport according to the Wikipedia article?
[0m

In [64]:
# print() applies str() to its argument, so the response text is shown as-is.
print(response_chatgpt)

The airports in Houston are overseen by the Houston Airport System and operate three major public airports: George Bush Intercontinental Airport, William P. Hobby Airport, and Ellington Airport. These airports offer commercial aviation services to domestic and international destinations, serving a significant number of passengers annually. They have received awards for airport improvement programs and customer service, and are recognized for their international travel services. Additionally, they serve as hubs for major airlines like United Airlines.

On the other hand, the Seattle airport is known as Seattle–Tacoma International Airport, or Sea-Tac Airport, and is operated by the Port of Seattle. It offers commercial air service to destinations worldwide. Seattle also has Boeing Field, which is closer to downtown and is utilized for general aviation, cargo flights, and for testing and delivery of Boeing airliners. Paine Field, another airport in Seattle, opened in 2019 and is primaril

In [65]:
# Baseline for comparison: the same graph, but with plain per-city query engines
# (no decompose transform).
custom_query_engines = {}
for index in city_indices.values():
    custom_query_engines[index.index_id] = index.as_query_engine(llm=Settings.llm)

# Root index engine. The keyword is `llm` (it was misspelled as `lm`, which is not
# an argument used anywhere else in this notebook), matching the decompose setup.
custom_query_engines[graph.root_index.index_id] = graph.root_index.as_query_engine(
    retriever_mode='simple',
    response_mode='tree_summarize',
    llm=Settings.llm,
)

query_engine = graph.as_query_engine(
    custom_query_engines=custom_query_engines,
)

NameError: name 'service_context' is not defined

In [None]:
# Same question as before, this time through the engine without query decomposition.
response_chatgpt = query_engine.query("Compare and contrast the airports in Seattle and Houston. ")
str(response_chatgpt)

In [None]:
# A second comparison question for the decomposing query engine.
response_chatgpt = query_engine_decompose.query("Compare and contrast the sports environment of Houston and Boston. ")

In [None]:
# Display the text of the most recent response as the cell output.
str(response_chatgpt)

In [None]:
from pymilvus import utility, connections

# Teardown: drop every collection on the local Milvus server, then stop it.
# The try/finally guarantees the server is stopped and cleaned up even when
# connecting or dropping a collection raises, so no server process is left behind.
try:
    connections.connect(host="127.0.0.1", port=default_server.listen_port)
    for collection_name in utility.list_collections():
        utility.drop_collection(collection_name)
finally:
    default_server.stop()
    default_server.cleanup()