In [1]:
from dotenv import load_dotenv

load_dotenv()

True

## Basic RAG with Weaviate

Now - let's try performing RAG with the chunks that we've created. 

We will:
- Load & chunk a document
- Add the chunks to Weaviate, and generate vectors
- And perform RAG

We assume some familiarity with Weaviate here. 

(If not, check out the [Weaviate Quickstart](https://docs.weaviate.io/weaviate/quickstart), or ask questions in the live session!)

### Load and chunk a document

In [2]:
from pathlib import Path

def get_chunks_using_markers(src_text: str) -> list[str]:
    """
    Split the source text into chunks using markers.
    """
    marker = "\n##"

    # Split by marker and reconstruct with markers (except first chunk)
    parts = src_text.split(marker)
    chunks = []

    # Add first chunk if it exists and isn't empty
    if parts[0].strip():
        chunks.append(parts[0].strip())

    # Add remaining chunks with markers reattached
    for part in parts[1:]:
        if part.strip():
            chunks.append(marker + part.strip())

    return chunks


md_file = Path("data/parsed/manual_bosch_WGG254Z0GR-parsed-text.md")
md_text = md_file.read_text(encoding="utf-8")
chunk_texts = get_chunks_using_markers(md_text)

### Set up Weaviate

In [3]:
import weaviate
import os

# Use Embedded Weaviate with:
# version: latest available (e.g. "1.32.0"), headers = {"X-Cohere-Api-Key": os.getenv("COHERE_API_KEY")}, env vars: {"LOG_LEVEL": "error"}
# BEGIN_SOLUTION
client = weaviate.connect_to_embedded(
    version="1.32.0",
    headers={
        "X-Cohere-Api-Key": os.getenv("COHERE_API_KEY"),
    },
    environment_variables={"LOG_LEVEL": "error"}  # Reduce amount of logs
)
# END_SOLUTION

### Set up a collection

In [4]:
client.collections.delete("Chunks")

{"action":"load_all_shards","build_git_commit":"7cebee0421","build_go_version":"go1.24.5","build_image_tag":"HEAD","build_wv_version":"1.32.0","level":"error","msg":"failed to load all shards: context canceled","time":"2025-07-17T13:32:44+01:00"}


In [None]:
from weaviate.classes.config import Property, DataType, Configure, Tokenization

client.collections.create(
    name="Chunks",
    properties=[
        Property(
            name="document_title",
            data_type=DataType.TEXT,
        ),
        Property(
            name="chunk",
            data_type=DataType.TEXT,
        ),
        Property(
            name="chunk_number",
            data_type=DataType.INT,
        ),
        Property(
            name="filename",
            data_type=DataType.TEXT,
            tokenization=Tokenization.FIELD
        ),
    ],
    vector_config=[
        # Add `Configure.Vectors.text2vec_cohere` vector to the collection with:
        # name: "default", source properties: ["document_title", "chunk"], and model: "embed-v4.0"
        # BEGIN_SOLUTION
        Configure.Vectors.text2vec_cohere(
            name="default",
            source_properties=["document_title", "chunk"],
            model="embed-v4.0"
        )
        # END_SOLUTION
    ]
)

<weaviate.collections.collection.sync.Collection at 0x123c734c0>

### Import data

In [6]:
chunks = client.collections.get("Chunks")

In [None]:
from tqdm import tqdm

with chunks.batch.fixed_size(batch_size=100) as batch:
    for i, chunk_text in tqdm(enumerate(chunk_texts)):
        obj = {
            "document_title": "Bosch WGG254Z0GR Manual",
            "filename": "data/pdfs/manual_bosch_WGG254Z0GR.pdf",
            "chunk": chunk_text,
            "chunk_number": i + 1,
        }

        # Add object to batch for import with (batch.add_object())
        # BEGIN_SOLUTION
        batch.add_object(
            properties=obj
        )
        # END_SOLUTION

127it [00:00, 35072.20it/s]


### RAG queries



In [8]:
# Try a RAG query with:
# query (what to search for): "how to clean the washing machine" and
# grouped_task (prompt): "Briefly, what tasks do I need to perform to regularly maintain and clean the washing machine?"
# limit (how many objects to fetch): 10
# BEGIN_SOLUTION
response = chunks.generate.hybrid(
    query="how to clean the washing machine",
    limit=10,
    grouped_task="Briefly, what tasks do I need to perform to regularly maintain and clean the washing machine?"
)
# END_SOLUTION

print("Query response:")
print(response.generative.text)

Query response:
To regularly maintain and clean the washing machine, you need to perform the following tasks:

1. Clean the detergent drawer by removing it, cleaning it with water and a brush, and then drying it before fitting it back in place.
2. Start an empty washing cycle to remove any residual water by pouring water and washing powder into the detergent drawer and running the cycle without any laundry.
3. Regularly run a cleaning cycle for the drum or wash at temperatures of at least 60°C to prevent damage and maintain hygiene.
4. Prepare your laundry properly by brushing off sand and soil, sorting laundry by color and fabric type, and following care labels for washing instructions.
5. Rinse pretreated laundry thoroughly with water before washing to prevent any risk of explosion.
6. Use the washing machine only for machine-washable fabrics and hand-washable wool, with tap water and suitable detergents, in a domestic environment up to 4000m above sea level.
7. Follow the recommende

### Recap - what's happening under the hood

![assets/llm_3_rag_weaviate.png](assets/llm_3_rag_weaviate.png)

We can review the passages:

In [9]:
print("Supporting passages:")
for o in response.objects:
    print(f"\n> Object: {o.uuid}:")
    print(o.properties['chunk'][:200]+"...")

Supporting passages:

> Object: b32ac3ce-ae7f-5260-908c-717edd3c2b21:

##17.2 Cleaning the detergent drawer

1. Pull out the detergent drawer.
2. Press down on the insert and remove the detergent drawer.

<!-- image -->

- en Cleaning and servicing
3. Pull out the inser...

> Object: 912f65d6-4652-598c-a1f4-8bb277f0a3a5:

##6.1 Starting an empty washing cycle

Your appliance was inspected thoroughly before leaving the factory. To remove any residual water, run the first wash cycle without any laundry.

1. Turn the pro...

> Object: dc708050-4e09-5b4d-bc24-ba1bb2526d67:

##Risk of injury!

Permanently washing at low temperatures and a lack of ventilation for the appliance may damage the drum and cause injury.

- Regularly run a programme for cleaning the drum or wash...

> Object: 6d7eb232-171b-559f-b049-fb799102997c:

##Note

The appliance and fabrics are protected when you prepare your laundry.

- ¡ Brush off sand and soil
- ¡ Sort the laundry according to colour and textiles and obse

In [10]:
client.close()

{"build_git_commit":"7cebee0421","build_go_version":"go1.24.5","build_image_tag":"HEAD","build_wv_version":"1.32.0","error":"context canceled","level":"error","msg":"replication engine failed to start after FSM caught up","time":"2025-07-17T13:32:59+01:00"}
{"build_git_commit":"7cebee0421","build_go_version":"go1.24.5","build_image_tag":"HEAD","build_wv_version":"1.32.0","error":"cannot find peer","level":"error","msg":"transferring leadership","time":"2025-07-17T13:32:59+01:00"}
