In [1]:
import weaviate
from weaviate.classes.config import Property, DataType

from llama_index.core import SimpleDirectoryReader

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

import ollama
import chunking as ck
from tqdm import tqdm

# Connect to a locally running Weaviate instance; `client` is reused by the later cells.
client: weaviate.WeaviateClient = weaviate.connect_to_local()

# Register the embedding model and the LLM on the llama_index `Settings` object.
# NOTE(review): the ingestion cell embeds chunks with Ollama's "all-minilm", not this
# model — confirm that whatever embeds queries uses the same model as the stored vectors.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.llm = Ollama(model="llama3", request_timeout=60.0)

[nltk_data] Downloading package punkt to /Users/timotewb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Input directory for ingestion (relative to the notebook's working directory).
source_dir = "data/processed/pymupdf/txt"

# Load the files found in `source_dir` as llama_index documents.
data = SimpleDirectoryReader(input_dir=source_dir).load_data()

In [3]:
# Start from a clean slate: drop the "Pymupdf" collection if a previous run left it behind.
# Checking for existence explicitly (instead of swallowing every exception) lets real
# problems — e.g. a lost connection to Weaviate — surface here rather than in a later cell.
if client.collections.exists("Pymupdf"):
    client.collections.delete("Pymupdf")

In [4]:
# Create the "Pymupdf" collection with two TEXT properties:
# "content" and "file_name".
collection = client.collections.create(
    name="Pymupdf",
    properties=[
        Property(name=field, data_type=DataType.TEXT)
        for field in ("content", "file_name")
    ],
)

In [6]:
# Chunk each loaded document, embed every chunk with Ollama, and queue the chunk
# text, its source file name, and its vector for batched insertion into `collection`.
with collection.batch.dynamic() as batch:
  for d in data:
    # Serialise once per document rather than once per chunk.
    doc: dict = d.to_dict()
    file_name: str = doc["metadata"]["file_name"]

    # Chunk data
    # NOTE(review): 1024 / 512 are presumably chunk size and overlap — confirm
    # against the signature of chunking.recursive_chunking.
    chunks: list[str] = ck.recursive_chunking(doc["text"], 1024, 512)

    for c in tqdm(chunks, desc=f"Embeding chunks for '{file_name}'"):

      # Generate embeddings
      response: ollama.EmbeddingsResponse = ollama.embeddings(model="all-minilm", prompt=c)

      # Add data object with text and embedding
      batch.add_object(
          properties={"content": c, "file_name": file_name},
          vector=response["embedding"],
      )

# Per-object failures inside a batch are collected rather than raised, so report
# them explicitly once the batch context has flushed.
failed_objects = collection.batch.failed_objects
if failed_objects:
  print(f"Number of failed imports: {len(failed_objects)}")
  print(f"First failure: {failed_objects[0].message}")

Embeding chunks for 'An Overview on RAG Evaluation.txt': 100%|██████████| 64/64 [00:01<00:00, 57.63it/s]
Embeding chunks for 'Electric and hybrid cars.txt': 100%|██████████| 16/16 [00:00<00:00, 77.40it/s]
Embeding chunks for 'Evaluation Metrics for Search and Recommendation Systems.txt': 100%|██████████| 16/16 [00:00<00:00, 72.00it/s]
Embeding chunks for 'Geocoding via LINZ Address Matching.txt': 100%|██████████| 256/256 [00:03<00:00, 70.88it/s]
Embeding chunks for 'Is it worth buying a second-hand Nissan Leaf.txt': 100%|██████████| 16/16 [00:00<00:00, 70.63it/s]
Embeding chunks for 'Monthly operating report for February 2025 (PDF).txt': 100%|██████████| 32/32 [00:00<00:00, 64.05it/s]
Embeding chunks for 'Monthly operating report for January 2025 (PDF).txt': 100%|██████████| 32/32 [00:00<00:00, 67.25it/s]


In [None]:
# Close the Weaviate client opened in the first cell. Run this last: the cells
# above all use `client` (directly or through `collection`).
client.close()