# NYC Wikipedia Embeddings Demo

In [15]:
%pip install langchain
%pip install --upgrade llama_index langchain

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [16]:
import logging
import sys

# Send log records to stdout at INFO level.
# force=True replaces whatever handlers are already attached to the root logger,
# so the configuration always takes effect and exactly one stdout handler is
# active. Adding a second StreamHandler on top of basicConfig would make every
# record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

This notebook demonstrates the embedding-based query capabilities of GPTTreeIndex and GPTListIndex.

### Setup + Data Prep

In [17]:
import logging
import sys

# Switch logging to DEBUG for the rest of the notebook.
# Without force=True, basicConfig does nothing once the root logger already has
# handlers (as it does after the earlier logging cell), so the level would
# silently stay where it was. force=True also drops the previously attached
# handlers, leaving a single stdout handler and no duplicated log lines.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

In [18]:
# Fetch the plain-text extract of the "New York City" page from the Wikipedia API
# and store it as data/nyc_text.txt.
from pathlib import Path

import requests

http_response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'New York City',
        'prop': 'extracts',
        # 'exintro': True,
        'explaintext': True,
    },
    timeout=30,  # fail instead of hanging indefinitely on a stalled connection
)
# Surface HTTP errors here rather than as a confusing KeyError/JSON error below.
http_response.raise_for_status()
response = http_response.json()

# The API returns a mapping keyed by page id; a single title was requested,
# so take the first (only) entry.
page = next(iter(response['query']['pages'].values()))
nyc_text = page['extract']

data_path = Path('data')
data_path.mkdir(exist_ok=True)

# Write as UTF-8 explicitly: the extract may contain non-ASCII characters that
# the platform's default encoding cannot represent.
with open(data_path / 'nyc_text.txt', 'w', encoding='utf-8') as fp:
    fp.write(nyc_text)

In [19]:
# Expose the OpenAI key as OPENAI_API_KEY.
# If API_KEY is set it is copied over (overwriting any existing value, as before).
# If it is not set but OPENAI_API_KEY already is, nothing needs to be done.
# Otherwise fail with a message that says what is missing.
import os

if 'API_KEY' in os.environ:
    os.environ['OPENAI_API_KEY'] = os.environ['API_KEY']
elif 'OPENAI_API_KEY' not in os.environ:
    raise KeyError(
        "Set API_KEY (or OPENAI_API_KEY) in the environment before running this notebook"
    )

### GPTTreeIndex - Embedding-based Query

In [20]:
from llama_index import GPTTreeIndex, SimpleDirectoryReader
from IPython.display import Markdown

In [21]:
# Load the documents from the data/ directory, then build a tree index from them.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = GPTTreeIndex.from_documents(documents)


AttributeError: type object 'GPTTreeIndex' has no attribute 'from_documents'

In [22]:
# Persist the tree index to index.json.
tree_index_file = 'index.json'
index.save_to_disk(tree_index_file)

NameError: name 'index' is not defined

In [None]:
# Load the tree index back from index.json.
tree_index_file = 'index.json'
new_index = GPTTreeIndex.load_from_disk(tree_index_file)

In [None]:
# Query the tree index in embedding mode.
# (Set logging to DEBUG for more detailed output.)
question = "What is the name of the professional women's basketball team in New York City?"
response = new_index.query(question, mode="embedding")

In [None]:
# Render the answer as bold Markdown.
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

In [None]:
# Query the tree index in embedding mode.
question = "What battles took place in New York City in the American Revolution?"
response = new_index.query(question, mode="embedding")

In [None]:
# Render the answer as bold Markdown.
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

In [None]:
# Query the tree index in embedding mode.
# (Set logging to DEBUG for more detailed output.)
question = "What are the airports in New York City?"
response = new_index.query(question, mode="embedding")

In [None]:
# Render the answer as bold Markdown.
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

### GPTListIndex - Embedding-based Query

In [None]:
from llama_index import GPTListIndex, SimpleDirectoryReader
from IPython.display import Markdown

In [None]:
# Load the documents from the data/ directory, then build a list index from them.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = GPTListIndex.from_documents(documents)

In [None]:
# Persist the list index to index_list_emb.json.
list_index_file = 'index_list_emb.json'
index.save_to_disk(list_index_file)

In [None]:
# Load the list index back from index_list_emb.json.
list_index_file = 'index_list_emb.json'
new_index = GPTListIndex.load_from_disk(list_index_file)

In [None]:
# Query the list index in embedding mode.
# (Set logging to DEBUG for more detailed output.)
question = "What is the name of the professional women's basketball team in New York City?"
response = new_index.query(question, mode="embedding")

In [None]:
# Render the answer as bold Markdown.
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

In [None]:
# Query the list index in embedding mode.
# (Set logging to DEBUG for more detailed output.)
question = "What battles took place in New York City in the American Revolution?"
response = new_index.query(question, mode="embedding")

In [None]:
# Render the answer as bold Markdown.
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

In [None]:
# Query the list index in embedding mode.
# (Set logging to DEBUG for more detailed output.)
question = "What are the airports in New York City?"
response = new_index.query(question, mode="embedding")

In [None]:
# Render the answer as bold Markdown.
answer_markup = f"<b>{response}</b>"
display(Markdown(answer_markup))

## Try out other embeddings! 
(courtesy of LangChain)

In [None]:
from llama_index import GPTListIndex, SimpleDirectoryReader, ServiceContext
from IPython.display import Markdown

In [None]:
# Create a HuggingFace embedding model via langchain (constructed with no
# arguments) and wrap it in llama_index's LangchainEmbedding adapter.
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding

hf_embeddings = HuggingFaceEmbeddings()
embed_model = LangchainEmbedding(hf_embeddings)

In [None]:
# Load the list index from index_list_emb.json again.
list_index_file = 'index_list_emb.json'
new_index = GPTListIndex.load_from_disk(list_index_file)

In [None]:
# Build a service context around the HuggingFace embedding model.
service_context = ServiceContext.from_defaults(embed_model=embed_model)

# Query the list index in embedding mode, passing the service context along.
# (Set logging to DEBUG for more detailed output.)
question = "What is the name of the professional women's basketball team in New York City?"
response = new_index.query(question, mode="embedding", service_context=service_context)

In [None]:
# Show the response object from the previous query as the cell's output.
response