In [1]:
import weaviate
import requests, json
from weaviate.classes.config import Configure

client = weaviate.connect_to_local()

In [3]:
import weaviate.classes as wvc

collection = client.collections.create(
        name="medbooks",
        properties=[
            wvc.config.Property(name="path", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="chunk_id", data_type=wvc.config.DataType.INT),
            wvc.config.Property(name="text", data_type=wvc.config.DataType.TEXT),
        ],
        # no vectorizer_config => BYO vectors
    )

UnexpectedStatusCodeError: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name Medbooks already exists'}]}.

In [6]:
import os, glob, uuid, json, time
from typing import List
import requests
import weaviate
from weaviate.util import generate_uuid5
from tqdm import tqdm

OLLAMA_URL = "http://localhost:11434/api/embeddings"
OLLAMA_MODEL = "hf.co/mradermacher/MedEmbed-large-v0.1-GGUF:Q4_K_M"  # change to 'nomic-embed-text' if needed
DATA_DIR = "./med_txt"  # folder that contains your 10 .txt files
CHUNK_SIZE = 1200       # chars
CHUNK_OVERLAP = 200

def chunk_text(text: str, size: int, overlap: int) -> List[str]:
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + size, n)
        chunks.append(text[start:end])
        if end == n: break
        start = end - overlap
        if start < 0: start = 0
    return chunks

def embed(text: str) -> List[float]:
    # Ollama embeddings API uses 'prompt'
    payload = {"model": OLLAMA_MODEL, "prompt": text}
    r = requests.post(OLLAMA_URL, json=payload, timeout=120)
    r.raise_for_status()
    vec = r.json().get("embedding") or r.json().get("data", [{}])[0].get("embedding")
    if not vec:
        raise RuntimeError(f"Missing 'embedding' in Ollama response: {r.text[:200]}")
    return vec

In [None]:
coll = client.collections.get("medbooks")

files = sorted(glob.glob(os.path.join(DATA_DIR, "*.txt")))
if not files:
    raise SystemExit(f"No .txt files found under {DATA_DIR}")

to_insert = []   # accumulate for insert_many
vectors = []     # parallel list of vectors

for path in files:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    chunks = chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
    for i, chunk in enumerate(chunks):
        # small guard against empty/whitespace
        c = chunk.strip()
        if not c:
            continue

        vec = embed(c)
        obj = {
            "path": os.path.basename(path),
            "chunk_id": i,
            "text": c,
        }
        to_insert.append(obj)
        vectors.append(vec)

# Efficient upsert: insert_many supports passing parallel vectors
# For very large corpora prefer collection.batch (dynamic batching). :contentReference[oaicite:5]{index=5}
res = coll.data.insert_many(objects=to_insert, vectors=vectors)
if res.has_errors:
    print("Some insert errors:", res.errors)
else:
    print(f"Inserted {len(to_insert)} chunks into 'med-books'")

In [6]:
import weaviate, requests

OLLAMA_URL = "http://localhost:11434/api/embeddings"
OLLAMA_MODEL = "hf.co/mradermacher/MedEmbed-large-v0.1-GGUF:Q4_K_M"

def embed(q: str):
    r = requests.post(OLLAMA_URL, json={"model": OLLAMA_MODEL, "prompt": q}, timeout=60)
    r.raise_for_status()
    return r.json()["embedding"]

client = weaviate.connect_to_local()
try:
    coll = client.collections.get("medbooks")
    qv = embed("pathophysiology")
    # near_vector search; request back selected properties
    out = coll.query.near_vector(
        near_vector=qv,
        limit=5,
        return_metadata=wvc.query.MetadataQuery(distance=True),
        return_properties=["path", "chunk_id", "text"]
    )
    for o in out.objects:
        print(f"[{o.properties['path']}#{o.properties['chunk_id']}] dist={o.metadata.distance:.4f}")
        print(o.properties["text"][:300], "\n---")
finally:
    client.close()

In [7]:
client = weaviate.connect_to_local()

try:
    coll = client.collections.get("medbooks")

    res = coll.query.fetch_objects(limit=5, return_properties=["path", "chunk_id", "text"])

    for o in res.objects:
        print(f"{o.properties['path']}#{o.properties['chunk_id']}")
        print(o.properties["text"][:200], "...\n---")
finally:
    client.close()