### Load chunks as we did before

In [None]:
from chonkie import SemanticChunker
from pathlib import Path

# Read the parsed markdown file that contains the document text and its image references.
md_filepath = Path("data/parsed/WEF_Artificial_Intelligence_in_Financial_Services_2025-parsed-w-imgs.md")
# Decode explicitly as UTF-8: without an encoding argument read_text() falls back to the
# platform locale, which can mis-decode or fail on non-ASCII characters (e.g. on Windows).
# NOTE(review): assumes the parsed markdown was written as UTF-8 — confirm.
md_txt = md_filepath.read_text(encoding="utf-8")

# Split the document into chunks with chonkie's SemanticChunker.
chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-8M",  # Default model
    threshold=0.5,                               # Similarity threshold (0-1) or (1-100) or "auto"
    chunk_size=2048,                             # Maximum tokens per chunk
    min_sentences=1                              # Initial sentences per chunk
)
# Despite the name, these are chunk objects; the import cell reads each one's `.text`.
chunk_texts = chunker.chunk(md_txt)

### Set up Weaviate Collection

In [None]:
from helpers import update_creds

# update_creds() returns three values, unpacked here as the AWS access key, secret key and
# session token; they are sent as request headers when connecting to Weaviate.
AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_SESSION_TOKEN = update_creds()

# Restore WEAVIATE_IP from IPython's %store; it must have been saved earlier with `%store WEAVIATE_IP`.
%store -r WEAVIATE_IP

In [None]:
import weaviate

# AWS credentials travel as request headers on the Weaviate connection.
aws_auth_headers = {
    "X-AWS-Access-Key": AWS_ACCESS_KEY,
    "X-AWS-Secret-Key": AWS_SECRET_KEY,
    "X-AWS-Session-Token": AWS_SESSION_TOKEN,
}

# Connect to the Weaviate instance at the restored address.
client = weaviate.connect_to_local(WEAVIATE_IP, headers=aws_auth_headers)

# Last expression of the cell, so the readiness check result is displayed.
client.is_ready()

In [None]:
# Delete the "Chunks" collection so the create call in the next cell starts from a clean slate.
client.collections.delete("Chunks")

In [None]:
from weaviate.classes.config import Property, DataType, Configure, Tokenization

# Schema of the "Chunks" collection: one object per text chunk of a source document.
chunk_properties = [
    Property(name="document_title", data_type=DataType.TEXT),
    Property(name="chunk", data_type=DataType.TEXT),
    Property(name="chunk_number", data_type=DataType.INT),
    # NOTE(review): FIELD tokenization is presumably chosen so the whole path is
    # treated as a single token — confirm against the Weaviate tokenization docs.
    Property(name="filename", data_type=DataType.TEXT, tokenization=Tokenization.FIELD),
]

# One named vector ("default"), embedded from the title and chunk text via AWS Bedrock.
bedrock_vector = Configure.Vectors.text2vec_aws(
    name="default",
    source_properties=["document_title", "chunk"],
    region="us-west-2",
    service="bedrock",
    model="amazon.titan-embed-text-v2:0",
)

# Last expression of the cell, so the created collection is displayed.
client.collections.create(
    name="Chunks",
    properties=chunk_properties,
    vector_config=[bedrock_vector],
)

In [None]:
# Handle to the "Chunks" collection; used below for the batch import and the hybrid query.
chunks = client.collections.use("Chunks")

### Import data

In [None]:
from tqdm import tqdm

# Import every chunk through a fixed-size batch of 100 objects.
with chunks.batch.fixed_size(batch_size=100) as batch:
    # Wrap the chunk list itself (not the enumerate iterator) so tqdm can show a total;
    # start=1 makes chunk numbers 1-based.
    for chunk_number, chunk_text in enumerate(tqdm(chunk_texts), start=1):
        obj = {
            "document_title": "WEF AI Financial Services 2025",
            "filename": "data/pdfs/WEF_Artificial_Intelligence_in_Financial_Services_2025.pdf",
            "chunk": chunk_text.text,
            "chunk_number": chunk_number,
        }

        # Add object to batch for import with (batch.add_object())
        batch.add_object(properties=obj)

# Report objects that could not be imported instead of letting them go unnoticed.
failed_objects = chunks.batch.failed_objects
if failed_objects:
    print(f"Failed to import {len(failed_objects)} objects")
    print(f"First failure: {failed_objects[0]}")

### RAG queries

In this scenario, let's:

- Retrieve text chunks
- Get images referred to in the text
- Convert the images to base64
- Send (retrieved text + images + prompt) to LLM for RAG

In [None]:
# Hybrid search over the collection for the ten best-matching chunks.
response = chunks.query.hybrid(query="AI spending by industry and sector", limit=10)

# Show a preview (first 1000 characters) of each retrieved chunk under a separator line.
separator = "\n" + "=" * 40
for hit in response.objects:
    preview = hit.properties["chunk"][:1000]
    print(separator)
    print(preview + "...")

In [None]:
import re

def extract_image_paths(text):
    """Return the target of every markdown image reference in ``text``.

    An image reference has the form ``![alt](target)``. Targets are returned
    as a list of strings in order of appearance; the list is empty when
    ``text`` contains no image references.
    """
    image_ref = re.compile(r'!\[.*?\]\((.*?)\)')
    return [match.group(1) for match in image_ref.finditer(text)]

In [None]:
def get_image_base64s(image_paths, base_path=None):
    """Read each image file and return its contents as a base64 string.

    Args:
        image_paths: Iterable of image file paths.
        base_path: Optional directory; when truthy, each path is resolved
            relative to it, otherwise paths are used as given.

    Returns:
        A list of base64-encoded strings (UTF-8 decoded), one per input
        path and in the same order.
    """
    import base64

    def _encode(rel_path):
        # Resolve against base_path only when one was supplied.
        location = Path(rel_path) if not base_path else Path(base_path) / rel_path
        return base64.b64encode(location.read_bytes()).decode("utf-8")

    return [_encode(rel_path) for rel_path in image_paths]

In [None]:
# Build the RAG context from the retrieved objects: the concatenated chunk text plus
# the base64-encoded images those chunks reference.
all_chunks = ""
all_images = []

for hit in response.objects:
    chunk_text = hit.properties["chunk"]

    # Image references inside the chunk are resolved relative to data/parsed.
    image_paths = extract_image_paths(chunk_text)
    print(f"Adding image paths: {image_paths}")
    all_images += get_image_base64s(image_paths, base_path="data/parsed")

    # Chunks are separated by a blank line.
    all_chunks = f"{all_chunks}\n\n{chunk_text}"

In [None]:
# One content block per retrieved image (base64 string, declared as PNG).
image_blocks = [
    {"image": {"format": "png", "source": {"bytes": img_b64}}}
    for img_b64 in all_images
]

# The question, followed by the retrieved chunk text as context.
task_text = """
Based on this, which industries are spending more on AI and why might that be?

Make sure to also describe any and all key details from the figures used in the analysis.
""" + "\n\n" + all_chunks

# A single user message: all images first, then the text prompt.
message_list = [{
    "role": "user",
    "content": image_blocks + [{"text": task_text}],
}]

In [None]:
import boto3
import json

# Use a dedicated name for the Bedrock runtime client: rebinding `client` here would
# shadow the Weaviate connection created earlier, so earlier cells could not be re-run
# and the final `client.close()` would no longer close the Weaviate connection.
bedrock_client = boto3.client(
    "bedrock-runtime",
    region_name="us-west-2",
)

# MODEL_ID = "us.amazon.nova-lite-v1:0"
# MODEL_ID = "us.amazon.nova-pro-v1:0"
MODEL_ID = "us.amazon.nova-premier-v1:0"

# Define your system prompt(s).
system_list = [{"text": "You are an expert. Answer the questions thoughtfully but succinctly based on the provided text and images."}]

# Request body: the user message (images + text), the system prompt and sampling settings.
native_request = {
    "schemaVersion": "messages-v1",
    "messages": message_list,
    "system": system_list,
    "inferenceConfig": {"maxTokens": 2000, "topP": 0.1, "topK": 20, "temperature": 0.3},
}

# Invoke the model and extract the response body.
# Stored under its own name so the Weaviate query result in `response` stays available
# and the context-building cell above can be re-run.
bedrock_response = bedrock_client.invoke_model(modelId=MODEL_ID, body=json.dumps(native_request))
model_response = json.loads(bedrock_response["body"].read())

# Print the text content for easy readability.
content_text = model_response["output"]["message"]["content"][0]["text"]
print("\n[Response Content Text]")
print(content_text)

In [None]:
# Close the connection held by `client`.
# NOTE(review): this is presumably meant to close the Weaviate connection — confirm that
# `client` has not been rebound to a different object by the time this cell runs.
client.close()