### Weaviate Vector DB Creation

In [1]:
!docker compose up -d

 Container rag-comprehensive-weaviate-1  Starting
Error response from daemon: driver failed programming external connectivity on endpoint rag-comprehensive-weaviate-1 (8762104eacd08910fa6f73f5dff6fbeb0203242860fb295b6d1dc1b3085f0146): Bind for 0.0.0.0:50051 failed: port is already allocated


In [2]:
import os
import weaviate
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Create weaviate client
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    # os.getenv returns None when the variable is missing; sending None as a
    # header value only fails later with an unrelated-looking error, so stop
    # here with an actionable message instead.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to your .env file."
    )

client = weaviate.Client(
    url="http://localhost:8080",  # Replace with your endpoint
    additional_headers={
        # Forwarded to Weaviate on every request.
        "X-Google-Api-Key": google_api_key,  # Replace with your inference API key
    },
)
# Last expression of the cell: displays whether the Weaviate instance is ready.
client.is_ready()

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


True

### Data loading

In [3]:
from llama_index.core import SimpleDirectoryReader
# Read everything under ./data into llama-index documents.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

### Hierarchical splitter

In [4]:
from llama_index.core.node_parser import TokenTextSplitter
# One token-based splitter per hierarchy level; each id doubles as the
# chunk size (in tokens) of that level's splitter.
text_splitter_ids = ["1024", "510"]
text_splitter_map = {
    splitter_id: TokenTextSplitter(chunk_size=int(splitter_id), chunk_overlap=200)
    for splitter_id in text_splitter_ids
}

In [12]:
from llama_index.core import SimpleDirectoryReader, Document
from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes
from llama_index.core.schema import MetadataMode

# This function loads every file in a directory and returns the parsed nodes:
# - hierarchical: the files are merged into one large document, which is parsed into a hierarchical node structure
# - non-hierarchical: a simple (flat) node structure

def hierachical_splitter(filepath, hierarchical=True):
    """Load every file under ``filepath`` and split the content into nodes.

    Args:
        filepath: Directory handed to ``SimpleDirectoryReader``.
        hierarchical: When True, merge all loaded documents into a single
            document and parse it with ``HierarchicalNodeParser`` using the
            module-level ``text_splitter_ids`` / ``text_splitter_map``.
            When False, split the documents with a single splitter.

    Returns:
        ``(nodes, leaf_nodes)`` when ``hierarchical`` is True, where ``nodes``
        is everything the hierarchical parser produced and ``leaf_nodes`` is
        the subset selected by ``get_leaf_nodes``. When ``hierarchical`` is
        False, a single flat list of nodes.
    """
    documents = SimpleDirectoryReader(filepath).load_data()

    if not hierarchical:
        # This path used to return `parent_nodes`, which is never assigned
        # here, so it always raised UnboundLocalError. Produce flat nodes with
        # the last (finest-grained) splitter configured for the hierarchy.
        flat_splitter = text_splitter_map[text_splitter_ids[-1]]
        return flat_splitter.get_nodes_from_documents(documents)

    # Combine all documents into one so the hierarchy spans the whole corpus.
    combined_text = "\n\n".join(
        document.get_content(metadata_mode=MetadataMode.ALL)
        for document in documents
    )
    documents = [Document(text=combined_text)]

    node_parser = HierarchicalNodeParser.from_defaults(
        node_parser_ids=text_splitter_ids,
        node_parser_map=text_splitter_map,
    )

    parent_nodes = node_parser.get_nodes_from_documents(documents)
    return parent_nodes, get_leaf_nodes(parent_nodes)

# Build the node hierarchy for the "data" directory; keep both the full node
# list and its leaf nodes for the indexing steps.
parent_nodes, leaf_nodes = hierachical_splitter(filepath="data", hierarchical=True)

### Data Indexing

In [13]:
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

# Global llama-index defaults used by the index and query engine.
# The model is selected with `model_name`; the previous `models=` keyword is
# not the constructor's model parameter, so the intended model choice was not
# actually applied.
# NOTE(review): newer llama-index-llms-gemini releases prefer `model=` and keep
# `model_name` as a deprecated alias - confirm against the installed version.
Settings.llm = Gemini(model_name="models/gemini-pro", api_key=google_api_key)
# Pass the key explicitly so the LLM and the embedding model are configured the same way.
Settings.embed_model = GeminiEmbedding(
    model_name="models/embedding-001",
    api_key=google_api_key,
)
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900

  from .autonotebook import tqdm as notebook_tqdm


#### Load and Parse Documents with Node Parser

We store the full list of nodes produced by the hierarchical splitter (parents and leaves) in a `SimpleDocumentStore`, and index only the leaf nodes in a `VectorStoreIndex`.

In [14]:
from llama_index.core.storage.docstore import SimpleDocumentStore

# Keep every node returned by the splitter in a document store.
# `parent_nodes` is the complete list from the hierarchical parser
# (`leaf_nodes` was derived from it with get_leaf_nodes).
docstore = SimpleDocumentStore()
docstore.add_documents(parent_nodes)

In [15]:
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

# Vector store backed by the Weaviate client created earlier.
vector_store = WeaviateVectorStore(weaviate_client=client)

# Storage context that pairs the populated docstore with the vector store.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    docstore=docstore,
)

# Build the index from the leaf nodes only.
index = VectorStoreIndex(nodes=leaf_nodes, storage_context=storage_context)

### Data Querying

##### Query with Default Vector Search

In [16]:
from llama_index.core.response.notebook_utils import display_response

# Ask a sample question through the index's default query engine and render the answer.
question = "What did the author do growing up?"
query_engine = index.as_query_engine()
response = query_engine.query(question)

display_response(response)


**`Final Response:`** The author wrote short stories and programmed on an IBM 1401 computer.