In [2]:
import weaviate
from weaviate.classes.config import Property, DataType

from llama_index.core import SimpleDirectoryReader

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

import ollama
import chunking as ck
from tqdm import tqdm

# Connect to the Weaviate instance running on this machine.
client: weaviate.WeaviateClient = weaviate.connect_to_local()

# Global llama_index defaults: a HuggingFace embedding model and an Ollama-served LLM.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.llm = Ollama(
    model="llama3",
    request_timeout=60.0,
)

[nltk_data] Downloading package punkt to /Users/timotewb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Directory of text files to ingest.
source_dir = "data/processed/pymupdf/txt"

# Read every file found under source_dir into llama_index documents.
data = SimpleDirectoryReader(input_dir=source_dir).load_data()

In [4]:
# Drop any existing "Pymupdf" collection so the create cell below starts fresh.
# Deletion stays best-effort, but a failure is reported instead of being silently
# swallowed, and KeyboardInterrupt / SystemExit are no longer caught.
try:
    client.collections.delete("Pymupdf")
except Exception as err:
    print(f"Could not delete collection 'Pymupdf': {err}")

In [5]:
# Create the "Pymupdf" collection; each object stores the chunk text ("content")
# and the name of the file it came from ("file_name"), both as TEXT properties.
collection = client.collections.create(
    name="Pymupdf",
    properties=[
        Property(name=field, data_type=DataType.TEXT)
        for field in ("content", "file_name")
    ],
)

In [6]:
# Chunk every loaded document, embed each chunk through Ollama, and store the chunk
# text, its source file name and its vector in the Weaviate collection.
# NOTE(review): the vectors stored here come from Ollama's "all-minilm", while
# Settings.embed_model is "BAAI/bge-small-en-v1.5" — confirm that queries against
# this collection are embedded with the same model that is used here.
with collection.batch.dynamic() as batch:
    for d in data:
        # Serialise the document once; it is the same for every chunk of this file.
        doc: dict = d.to_dict()
        file_name: str = doc["metadata"]["file_name"]

        # Chunk data (second argument is presumably sentences per chunk —
        # TODO confirm against the local `chunking` module).
        chunks: list[str] = ck.sentence_based_chunking(doc["text"], 5)

        # file_name is hoisted so the f-string needs no nested double quotes,
        # which are a syntax error on Python versions before 3.12.
        for c in tqdm(chunks, desc=f"Embeding chunks for '{file_name}'"):

            # Generate embeddings
            response: ollama.EmbeddingsResponse = ollama.embeddings(model="all-minilm", prompt=c)

            # Add data object with text and embedding
            batch.add_object(
                properties={"content": c, "file_name": file_name},
                vector=response["embedding"],
            )

# Objects that fail to import are collected on the batch rather than raised inside
# the loop, so surface them here instead of finishing silently.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

Embeding chunks for 'An Overview on RAG Evaluation.txt': 100%|██████████| 57/57 [00:00<00:00, 59.85it/s]
Embeding chunks for 'Electric and hybrid cars.txt': 100%|██████████| 17/17 [00:00<00:00, 93.53it/s]
Embeding chunks for 'Evaluation Metrics for Search and Recommendation Systems.txt': 100%|██████████| 9/9 [00:00<00:00, 60.15it/s]
Embeding chunks for 'Geocoding via LINZ Address Matching.txt': 100%|██████████| 684/684 [00:06<00:00, 105.36it/s]
Embeding chunks for 'Is it worth buying a second-hand Nissan Leaf.txt': 100%|██████████| 20/20 [00:00<00:00, 118.03it/s]
Embeding chunks for 'Monthly operating report for February 2025 (PDF).txt': 100%|██████████| 3/3 [00:00<00:00, 81.62it/s]
Embeding chunks for 'Monthly operating report for January 2025 (PDF).txt': 100%|██████████| 3/3 [00:00<00:00, 82.41it/s]
Embeding chunks for 'NZC - English (Years 0-6).txt': 100%|██████████| 115/115 [00:01<00:00, 86.13it/s]
Embeding chunks for 'People Policy.txt': 100%|██████████| 5/5 [00:00<00:00, 76.45it/

In [7]:
# Close the Weaviate client that was opened with connect_to_local() at the top of the notebook.
client.close()