# Introduction

This notebook shows how to use GPT index to store embeddings in a weaviate instance.

# Imports

In [1]:
import logging
import sys

import weaviate

from gpt_index import Document, GPTWeaviateIndex
from gpt_index.langchain_helpers.text_splitter import TokenTextSplitter
from gpt_index.readers import WikipediaReader
from gpt_index import LLMPredictor
from langchain import Cohere, HuggingFaceHub, OpenAI

from IPython.display import Markdown, HTML

In [2]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Dataset

Download an article from wikipedia and split it into chunks of 200 words.

In [3]:
# don't use the wikipedia reader for now, it's not working
# see: https://github.com/jerryjliu/gpt_index/issues/581
# docs = WikipediaReader().load_data(["cats"])
text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=200, chunk_overlap=20)

from gpt_index.readers import SimpleDirectoryReader

docs = SimpleDirectoryReader("./sample_data").load_data()

In [4]:
text_chunks = text_splitter.split_text(docs[0].text)
doc_chunks = [Document(t) for t in text_chunks]


# Index

In [5]:
client = weaviate.Client("http://weaviate:8080")


In [6]:
client.schema.delete_all()


A service can sometimes be flaky so choose one of the following options:

In [7]:
# llm_predictor = LLMPredictor(llm=HuggingFaceHub())
# llm_predictor = LLMPredictor(llm=Cohere())
llm_predictor = LLMPredictor(llm=OpenAI())

In [8]:
index = GPTWeaviateIndex(doc_chunks, weaviate_client=client, llm_predictor=llm_predictor)


INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens


> [build_index_from_documents] Total LLM token usage: 0 tokens


INFO:root:> [build_index_from_documents] Total embedding token usage: 12421 tokens


> [build_index_from_documents] Total embedding token usage: 12421 tokens


In [9]:
# check that the docs are really in weaviate
client.cluster.get_nodes_status()

[{'gitHash': '5ce21bb',
  'name': 'node1',
  'shards': [{'class': 'Gpt_Index_8588076422103233428_Node',
    'name': 'RVZpGnqDvq57',
    'objectCount': 74}],
  'stats': {'objectCount': 74, 'shardCount': 1},
  'status': 'HEALTHY',
  'version': '1.17.3'}]

# Query

In [10]:
question = "What is the scientific name for cats?"

response = index.query(question)

print(response.response)

INFO:root:> [query] Total LLM token usage: 226 tokens


> [query] Total LLM token usage: 226 tokens


INFO:root:> [query] Total embedding token usage: 8 tokens


> [query] Total embedding token usage: 8 tokens

The scientific name for cats is Felis catus.
