In [21]:
import psycopg2

def get_cursor():
    # Connect to DB
    conn = psycopg2.connect(
        dbname="vector_db",
        user="user",
        password="123456",
        host="pgvector",
        port=5432
    )
    cur = conn.cursor()
    
    # 1. Enable pgvector extension
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
    return cur, conn

def select(sql_statement):
    cur, conn = get_cursor()
    cur.execute(sql_statement)
    res = cur.fetchall()
    return res

In [22]:
import json
# Store all documents in memory
docs = []
titles = []

def insert_json_entries(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        for entry in data:
            title = entry.get("title", "")
            text = entry.get("text", "")
            docs.append(text)
            titles.append(title)

insert_json_entries("test_data/entries_response.json")
print("Saved {} docs".format(len(docs)))

Saved 89 docs


In [23]:
def chunk_text(text, chunk_size=300, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start+chunk_size])
        start += chunk_size - overlap
    return chunks

def get_all_chunks(docs, chunk_size=300, overlap=50):
    all_chunks = set()
    for doc in docs:
        chunks = chunk_text(doc, chunk_size=chunk_size, overlap=overlap)
        all_chunks.update(chunks)
    return list(all_chunks)

chunks = get_all_chunks(docs)
print(f"🔹 Total unique chunks: {len(chunks)}")

🔹 Total unique chunks: 188


In [24]:
import joblib
global vectorizer, doc_vectors
vectorizer = TfidfVectorizer().fit(chunks)
joblib.dump(vectorizer, 'vectorizer.joblib')
vectorizer = joblib.load('vectorizer.joblib')

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def build_and_store_tfidf_vectors():

    doc_vectors = vectorizer.transform(chunks).toarray()
    cur, conn = get_cursor()
    for i, vec in enumerate(doc_vectors):
        chunk_text = chunks[i]
        embedding_str = str(list(vec))

        sql = """
        INSERT INTO documents (content, embedding)
        VALUES (%s, %s::vector)
        """
        try:
            cur.execute(sql, (chunk_text, embedding_str))
        except Exception as e:
            print(f"❌ Error inserting chunk {i}: {e}")
            conn.rollback()
        else:
            conn.commit()
            print(f"✅ Inserted chunk {i+1}/{len(chunks)}")

build_and_store_tfidf_vectors()
print("✅ Done")

✅ Inserted chunk 1/188
✅ Inserted chunk 2/188
✅ Inserted chunk 3/188
✅ Inserted chunk 4/188
✅ Inserted chunk 5/188
✅ Inserted chunk 6/188
✅ Inserted chunk 7/188
✅ Inserted chunk 8/188
✅ Inserted chunk 9/188
✅ Inserted chunk 10/188
✅ Inserted chunk 11/188
✅ Inserted chunk 12/188
✅ Inserted chunk 13/188
✅ Inserted chunk 14/188
✅ Inserted chunk 15/188
✅ Inserted chunk 16/188
✅ Inserted chunk 17/188
✅ Inserted chunk 18/188
✅ Inserted chunk 19/188
✅ Inserted chunk 20/188
✅ Inserted chunk 21/188
✅ Inserted chunk 22/188
✅ Inserted chunk 23/188
✅ Inserted chunk 24/188
✅ Inserted chunk 25/188
✅ Inserted chunk 26/188
✅ Inserted chunk 27/188
✅ Inserted chunk 28/188
✅ Inserted chunk 29/188
✅ Inserted chunk 30/188
✅ Inserted chunk 31/188
✅ Inserted chunk 32/188
✅ Inserted chunk 33/188
✅ Inserted chunk 34/188
✅ Inserted chunk 35/188
✅ Inserted chunk 36/188
✅ Inserted chunk 37/188
✅ Inserted chunk 38/188
✅ Inserted chunk 39/188
✅ Inserted chunk 40/188
✅ Inserted chunk 41/188
✅ Inserted chunk 42/188
✅

In [26]:
res = select("SELECT * FROM documents")
print(res)

[(1, 'm Patientenkontakt, nach Kontakt mit potenziell kontaminierten Oberflächen sowie nach dem Toilettengang oder Husten/Niesen müssen die Hände mit Wasser und Seife mindestens 30 Sekunden lang gewaschen werden.</p></li><li><p><strong>Händedesinfektion</strong>: Eine alkoholbasierte Desinfektion sollte v', '[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.17303483,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.19211617,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0