In [3]:
import weaviate
import requests, json
from weaviate.classes.config import Configure

client = weaviate.connect_to_local()

In [4]:
client.close()

In [6]:
jeopardy = client.collections.get("medbooks")
response = jeopardy.query.fetch_objects()
print(response)
for o in response.objects:
    print(o.properties)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('00265a1b-5533-4dc4-b28e-7a6211bad80d'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': 'bers that emerge from spinal cord level\n\nT1 and ascend in the sympathetic trunks to the highest ganglion in the neck (the superior cervical ganglion), where they synapse. Postganglionic fibers then travel along blood vessels to target tissues in the head, including blood vessels, sweat glands, small smooth muscles associated with the upper eyelids, and the dilator of the pupil.\n\n3.\u2002Sympathetic innervation of thoracic\n\nPreganglionic sympathetic fibers may synapse with postganglionic motor neurons', 'path': 'Anatomy_Gray.txt', 'chunk_id': 141}, references=None, vector={}, collection='Medbooks'), Object(uuid=_WeaviateUUIDInt('006cf58a-e9d0-408e-a8f0-548ac07a0747'), metadata=MetadataReturn(creation_tim

In [5]:
import weaviate
from weaviate.classes.config import Configure, Property, DataType

client = weaviate.connect_to_local()  # http://localhost:8080

# IMPORTANT: if Weaviate is in Docker and Ollama is on the HOST,
# use host.docker.internal:11434 for the api_endpoint.
# If both are on the same host (no Docker for Weaviate), you can use http://localhost:11434

try:
    client.collections.delete("Medbooks")  # drop if exists, else ignore
except:
    pass

client.collections.create(
    name="Medbooks",
    properties=[
        Property(name="path", data_type=DataType.TEXT),
        Property(name="chunk_id", data_type=DataType.INT),
        Property(name="text", data_type=DataType.TEXT),
    ],
    vector_config=[
        Configure.Vectors.text2vec_ollama(
            name="text_vector",
            source_properties=["text"],  # vectorize the `text` field
            api_endpoint="http://host.docker.internal:11434",  # if Weaviate runs in Docker
            model="hf.co/mradermacher/MedEmbed-large-v0.1-GGUF:Q4_K_M",  # or "nomic-embed-text"
        )
    ],
)


<weaviate.collections.collection.sync.Collection at 0x18287b426d0>

In [10]:
import os, glob, uuid, json, time
from typing import List
import requests
import weaviate
from weaviate.util import generate_uuid5
from tqdm import tqdm

OLLAMA_URL = "http://localhost:11434/api/embeddings"
OLLAMA_MODEL = "hf.co/mradermacher/MedEmbed-large-v0.1-GGUF:Q4_K_M"  # change to 'nomic-embed-text' if needed
DATA_DIR = "./med_txt"  # folder that contains your 10 .txt files
CHUNK_SIZE = 1200       # chars
CHUNK_OVERLAP = 200

def chunk_text(text: str, size: int, overlap: int) -> List[str]:
    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + size, n)
        chunks.append(text[start:end])
        if end == n: break
        start = end - overlap
        if start < 0: start = 0
    return chunks

def embed(text: str) -> List[float]:
    # Ollama embeddings API uses 'prompt'
    payload = {"model": OLLAMA_MODEL, "prompt": text}
    r = requests.post(OLLAMA_URL, json=payload, timeout=120)
    r.raise_for_status()
    vec = r.json().get("embedding") or r.json().get("data", [{}])[0].get("embedding")
    if not vec:
        raise RuntimeError(f"Missing 'embedding' in Ollama response: {r.text[:200]}")
    return vec

In [None]:
coll = client.collections.get("medbooks")

files = sorted(glob.glob(os.path.join(DATA_DIR, "*.txt")))
if not files:
    raise SystemExit(f"No .txt files found under {DATA_DIR}")

to_insert = []   # accumulate for insert_many
vectors = []     # parallel list of vectors

for path in files[-1:]:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    chunks = chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
    for i, chunk in enumerate(chunks):
        # small guard against empty/whitespace
        c = chunk.strip()
        if not c:
            continue

        vec = embed(c)
        obj = {
            "path": os.path.basename(path),
            "chunk_id": i,
            "text": c,
        }
        to_insert.append(obj)
        vectors.append(vec)
        print(f"processing {i + 1}/{len(chunks)} chunks from {os.path.basename(path)}")

# Efficient upsert: insert_many supports passing parallel vectors
# For very large corpora prefer collection.batch (dynamic batching). :contentReference[oaicite:5]{index=5}
res = coll.data.insert_many(objects=to_insert, vectors=vectors)
if res.has_errors:
    print("Some insert errors:", res.errors)
else:
    print(f"Inserted {len(to_insert)} chunks into 'med-books'")

processing 1/11442 chunks from Surgery_Schwartz.txt
processing 2/11442 chunks from Surgery_Schwartz.txt
processing 3/11442 chunks from Surgery_Schwartz.txt
processing 4/11442 chunks from Surgery_Schwartz.txt


KeyboardInterrupt: 

: 

In [13]:
import weaviate, requests
import weaviate.classes as wvc


OLLAMA_URL = "http://localhost:11434/api/embeddings"
OLLAMA_MODEL = "hf.co/mradermacher/MedEmbed-large-v0.1-GGUF:Q4_K_M"

def embed(q: str):
    r = requests.post(OLLAMA_URL, json={"model": OLLAMA_MODEL, "prompt": q}, timeout=60)
    r.raise_for_status()
    return r.json()["embedding"]

client = weaviate.connect_to_local()
try:
    coll = client.collections.get("medbooks")
    qv = embed("CT scanner")
    # near_vector search; request back selected properties
    out = coll.query.hybrid(
        query="CT scanner",
        alpha=0.5,
        vector=qv,
        limit=3
        )
    print(out)

    # for o in out.objects:
    #     print(f"[{o.properties['path']}#{o.properties['chunk_id']}] dist={o.metadata.distance:.4f}")
    #     print(o.properties["text"][:300], "\n---")
finally:
    client.close()

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('0254090f-35a4-47f0-98f1-5b61e61ccd69'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': 'f blood to the left arm.\n\nThe patient was transferred from the emergency department to the CT scanner, and a scan was performed that included the chest, abdomen, and pelvis.\n\nThe CT scan demonstrated a dissecting thoracic aortic aneurysm. Aortic dissection occurs when the tunica intima and part of the tunica media of the wall of the aorta become separated from the remainder of the tunica media and the tunica adventitia of the aorta wall. This produces a false lumen. Blood passes not only in the', 'path': 'Anatomy_Gray.txt', 'chunk_id': 460}, references=None, vector={}, collection='Medbooks'), Object(uuid=_WeaviateUUIDInt('9a2f340f-943a-4605-9939-1e59dd7028fa'), metadata=MetadataReturn(creation_time=None, 

In [9]:
client = weaviate.connect_to_local()

try:
    coll = client.collections.get("medbooks")

    res = coll.query.fetch_objects(limit=5, return_properties=["path", "chunk_id", "text"])

    for o in res.objects:
        print(f"{o.properties['path']}#{o.properties['chunk_id']}")
        print(o.properties["text"][:200], "...\n---")
finally:
    client.close()

In [9]:
from weaviate.classes.query import MetadataQuery

client = weaviate.connect_to_local()

try:
    medbooks = client.collections.get("medbooks")
    response = medbooks.query.near_text(
        query="How CT scanners work",
    )

    for o in response.objects:
        print(o.properties)
        print(o.metadata.distance)
finally:
    client.close()


{'text': 'ions of CT scanners.\n\nA CT scanner obtains a series of images of the body (slices) in the axial plane. The patient lies on a bed, an X-ray tube passes around the body (Fig. 1.7), and a series of images are obtained. A computer carries out a complex mathematical transformation on the multitude of images to produce the final image (Fig. 1.8).\n\nNuclear magnetic resonance imaging was first described in 1946 and used to determine the structure of complex molecules. The process of magnetic resonance i', 'path': 'Anatomy_Gray.txt', 'chunk_id': 31}
None
{'text': 'shift determines whether the object is moving away from or toward the probe and the speed at which it is traveling. Precise measurements of blood flow and blood velocity can therefore be obtained, which in turn can indicate sites of blockage in blood vessels.\n\nComputed tomography (CT) was invented in the 1970s by Sir Godfrey Hounsfield, who was awarded the Nobel Prize in Medicine in 1979. Since this inspired invention 