In [1]:
import weaviate
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import os
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment.
load_dotenv()

# Make this cell safe to re-run: starting a second embedded instance while the
# previous one is still alive typically fails because its ports are occupied,
# so close any client left over from an earlier execution first.
_stale_client = globals().get("client")
if _stale_client is not None:
    _stale_client.close()

# NOTE: call client.close() once the analysis is finished to release the connection.
client = weaviate.connect_to_embedded()
print("Connected to Weaviate Embedded")

{"action":"startup","build_git_commit":"62dcafac32","build_go_version":"go1.24.3","build_image_tag":"HEAD","build_wv_version":"1.30.5","level":"info","msg":"Feature flag LD integration disabled: could not locate WEAVIATE_LD_API_KEY env variable","time":"2026-01-17T18:31:02+02:00"}
{"action":"startup","build_git_commit":"62dcafac32","build_go_version":"go1.24.3","build_image_tag":"HEAD","build_wv_version":"1.30.5","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2026-01-17T18:31:02+02:00"}
{"action":"startup","auto_schema_enabled":{},"build_git_commit":"62dcafac32","build_go_version":"go1.24.3","build_image_tag":"HEAD","build_wv_version":"1.30.5","level":"info","msg":"auto schema enabled setting is set to \"\u0026{\u003cnil\u003e {{{} {0 0}} 0 0 {{} 0} {{} 0}} true}\"","time":"2026-01-17T18:31:02+02:00"}
{"build_git_commi

Connected to Weaviate Embedded


In [3]:
# Load every stored chunk (properties + embedding) into a DataFrame.
FETCH_LIMIT = 2000     # maximum number of objects requested in the single fetch
PREVIEW_CHARS = 100    # characters of chunk text kept as a preview

collection = client.collections.get("ChaosKnowledgeBase")

response = collection.query.fetch_objects(
    limit=FETCH_LIMIT,
    include_vector=True
)

# A full page means the collection may hold more objects than were requested.
if len(response.objects) >= FETCH_LIMIT:
    print(f"Warning: fetch limit of {FETCH_LIMIT} reached; some chunks may be missing")

data = []
for obj in response.objects:
    props = obj.properties
    # `or` (rather than a .get default) also covers keys that exist with a None
    # value, which would otherwise break the slice below.
    text = props.get("text") or ""
    data.append({
        "uuid": str(obj.uuid),
        "page": props.get("page"),
        "type": props.get("type") or "image",
        # Only add the ellipsis when the text was actually cut.
        "content_preview": text[:PREVIEW_CHARS] + ("..." if len(text) > PREVIEW_CHARS else ""),
        "vector": obj.vector["default"]
    })

df = pd.DataFrame(data)
print(f"{len(df)} chunks")
df.head(10)

678 chunks


Unnamed: 0,uuid,page,type,content_preview,vector
0,0088b529-0cf7-45e8-a6aa-8aed46fb78b5,45.0,text,"include the coefficient of friction, whose val...","[0.009702964685857296, 0.06464481353759766, -0..."
1,00ccc42e-1098-4e59-a35c-e0f72ff6f598,216.0,text,. “A Sound of Thunder.” In The Stories of Ray ...,"[0.022291049361228943, 0.02111099660396576, -0..."
2,011c6524-6f0c-448d-ae51-6cef698e6063,82.0,text,". To the often-heard question, “Why can’t we m...","[0.026930369436740875, 0.04832194373011589, 0...."
3,01b4a750-fe75-488d-bdd3-302579e1f5f7,12.0,text,. In the present instance the colloquial defin...,"[-0.012948527000844479, 0.08835332095623016, -..."
4,01d3322e-508f-4eac-93be-f9a16f482b15,12.0,text,manageable. It seems appropriate to call a rea...,"[-0.013000682927668095, 0.06436554342508316, -..."
5,020eeabb-7fc2-4703-810d-a221b583a393,107.0,text,Interest has not been so much in chaos itself ...,"[-0.019084632396697998, 0.0637802854180336, -0..."
6,025caa8c-52eb-474e-a547-a76ca3429508,208.0,text,APPENDIX 3 A Brief Dynamical-Systems Glossary ...,"[-0.0009571363916620612, 0.05603436008095741, ..."
7,036f9db9-7c06-4ecc-acb0-2402c4a4b2b2,117.0,text,the indicated arithmetic operations. For the t...,"[-0.003881527343764901, 0.003138219937682152, ..."
8,03a6994a-9eb5-44fa-b19f-9026810a1e51,140.0,text,". In fact, the differences more or less steadi...","[0.01966932788491249, 0.02676539309322834, -0...."
9,03dde052-972d-4c15-9cbf-59d4eeb4ab8d,178.0,text,. The remarkable thing is that if you choose t...,"[-0.015231113880872726, 0.03900129348039627, -..."


In [15]:
# Share of text chunks vs. image-description chunks in the knowledge base.
fig_pie = px.pie(
    df,
    names='type',
    # color_discrete_map is keyed on the values of the `color` column; without
    # `color` the mapping is not applied and the default palette is used.
    color='type',
    title='Distribution of Text vs. Image Descriptions',
    # Same text/image colours as the histogram and the 3D scatter in this notebook.
    color_discrete_map={'text': '#636EFA', 'image': '#FFB6C1'}
)
fig_pie.show()

In [13]:
# Number of chunks per page range, split by chunk type.
fig_hist = px.histogram(
    df,
    x="page",
    color="type",
    nbins=50,
    title="Knowledge Density per Book Page",
    labels={"page": "Page Number"},
    color_discrete_map={'text': '#636EFA', 'image': '#FFB6C1'}
)
fig_hist.update_layout(
    bargap=0.1,
    # The histogram's y-axis is a generated aggregate, not a DataFrame column,
    # so its title is set on the layout instead of through `labels`.
    yaxis_title="Chunk Count"
)
fig_hist.show()

In [17]:
# Project the chunk embeddings onto their first three principal components
# and plot them as an interactive 3D scatter.
vectors = df['vector'].tolist()

# With svd_solver='auto' scikit-learn can select the randomized solver, so the
# seed is fixed to make the projection reproducible across re-runs.
pca = PCA(n_components=3, random_state=42)
components = pca.fit_transform(vectors)

df['pca_x'] = components[:, 0]
df['pca_y'] = components[:, 1]
df['pca_z'] = components[:, 2]

# Fraction of the total variance captured by each component, shown on the axes.
explained = pca.explained_variance_ratio_

fig_3d = px.scatter_3d(
    df,
    x='pca_x',
    y='pca_y',
    z='pca_z',
    color='type',
    hover_data=['page', 'content_preview'],
    title='3D Semantic Map of "The Essence of Chaos"',
    opacity=0.7,
    color_discrete_map={'text': '#636EFA', 'image': '#FFB6C1'}
)

# Uniform marker size (no `size` column is mapped, so `size_max` would do nothing).
fig_3d.update_traces(marker=dict(size=5))
fig_3d.update_layout(
    scene=dict(
        xaxis_title=f'PC1 ({explained[0]:.1%} var)',
        yaxis_title=f'PC2 ({explained[1]:.1%} var)',
        zaxis_title=f'PC3 ({explained[2]:.1%} var)'
    ),
    margin=dict(l=0, r=0, b=0, t=40)
)

fig_3d.show()