# Vectro embedding visualization (PCA + UMAP)

This notebook loads the sample embeddings and shows a quick 2D visualization using PCA (scikit-learn) and UMAP (optional, higher-quality manifold layout).

If you want an interactive demo, the "Streamlit demo" section near the end of the notebook contains a minimal Streamlit app you can run locally.

Requirements:
- numpy, matplotlib, scikit-learn
- optionally: umap-learn (`pip install umap-learn`) and streamlit (`pip install streamlit`)

In [None]:
# Cell 2: imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# UMAP is an optional dependency: attempt the import and record the outcome in
# _HAS_UMAP so later cells can skip the UMAP projection when it is missing.
try:
    import umap
except Exception:
    # leave a placeholder so the name `umap` is always defined
    umap = None
    _HAS_UMAP = False
else:
    _HAS_UMAP = True

print('UMAP available:', _HAS_UMAP)

In [None]:
# Cell 3: load the sample embeddings into `emb`
# (adjust the path below if your data lives in a different location)
import os
emb_path = os.path.join('..', 'data', 'sample_embeddings.npy')
if os.path.exists(emb_path):
    emb = np.load(emb_path)
else:
    # No file on disk: fall back to a small, seeded random dataset so the
    # rest of the notebook can still be run as a demo.
    print('sample_embeddings.npy not found; generating demo data')
    rng = np.random.default_rng(0)
    emb = rng.standard_normal((500, 128)).astype(np.float32)

print('embeddings shape =', emb.shape)

In [None]:
# Cell 4: PCA 2D projection and scatter plot
# random_state is fixed (same seed as the UMAP cell) so the projection is
# reproducible even when scikit-learn selects a randomized SVD solver, which
# its 'auto' solver policy may do for larger inputs.
pca = PCA(n_components=2, random_state=42)
proj = pca.fit_transform(emb)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(proj[:, 0], proj[:, 1], s=6, alpha=0.8)
ax.set_title('PCA (2D) projection of embeddings')
ax.set_xlabel('pc1')
ax.set_ylabel('pc2')
ax.grid(True, alpha=0.2)
plt.show()

In [None]:
# Cell 5: UMAP 2D projection (if available)
if not _HAS_UMAP:
    print('UMAP not installed. To install: pip install umap-learn')
else:
    # fixed seed so the layout is reproducible across runs
    reducer = umap.UMAP(n_components=2, random_state=42)
    umap_proj = reducer.fit_transform(emb)

    fig, ax = plt.subplots(figsize=(8, 6))
    # No cmap here: a colormap only applies when per-point values are passed
    # via `c`; without them Matplotlib emits a warning and ignores it.
    ax.scatter(umap_proj[:, 0], umap_proj[:, 1], s=6, alpha=0.8)
    ax.set_title('UMAP (2D) projection of embeddings')
    ax.set_xlabel('u1')
    ax.set_ylabel('u2')
    ax.grid(True, alpha=0.2)
    plt.show()

## Streamlit demo (minimal)

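The following cell stores a minimal Streamlit app in the `streamlit_app` string. Save the contents of that string as `app.py` and run `streamlit run app.py` from the repository root (the app loads `data/sample_embeddings.npy` relative to the working directory and falls back to random demo data if the file is missing). The app lets you choose interactively between a PCA and a UMAP projection and set the number of components; the first two components are plotted.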

In [None]:
# Cell 7: Minimal Streamlit app snippet (save to app.py)
# The app source is kept in a string so the notebook itself does not need
# Streamlit installed. The snippet must be self-contained: it imports `os`
# itself because it calls os.path.exists when run as a standalone script.
streamlit_app = '''
import os
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

st.title('Vectro embeddings demo')
# load embeddings file relative to repo
p = 'data/sample_embeddings.npy'
emb = np.load(p) if os.path.exists(p) else np.random.standard_normal((500,128)).astype(np.float32)
method = st.selectbox('Projection', ['PCA', 'UMAP'])
ncomp = st.slider('n components', 2, 8, 2)
if method == 'PCA':
    proj = PCA(n_components=ncomp).fit_transform(emb)
else:
    try:
        import umap
        proj = umap.UMAP(n_components=ncomp).fit_transform(emb)
    except Exception:
        st.error('UMAP not installed')
        proj = PCA(n_components=ncomp).fit_transform(emb)
# only the first two components are drawn, whatever ncomp is
fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(proj[:,0], proj[:,1], s=6)
st.pyplot(fig)
'''
print('Streamlit app snippet: save streamlit_app into app.py to run')

In [None]:
# Cell 8: Reconstruct a single vector from a compressed file and show how to index it into Qdrant/Weaviate (pseudo-code)
# Flow: try the streaming helper first; if it raises for any reason, fall back to
# reading the individual arrays from the container and reconstructing one row.
# Relies on `np` from the imports cell and on the project-local `python` package
# being importable from the notebook's working directory.
from python.cli import reconstruct_slice_from_file
from python import storage, interface
import os

# Adjust path to your compressed file (VECTRO2 or VTRB01). Examples: data/sample_compressed.v2 or data/sample_compressed.vtrb
comp_path = os.path.join('..', 'data', 'sample_compressed.vtrb')
idx = 0  # index of vector to reconstruct

# Try VECTRO2 streaming helper first (works for VECTRO2 files)
try:
    # request the half-open range [idx, idx+1) and drop the singleton axis
    vec = reconstruct_slice_from_file(comp_path, idx, idx+1).squeeze()
    print('Reconstructed vector (via streaming) shape =', vec.shape)
except Exception as e:
    # Fallback: read arrays from VTRB01 container and reconstruct single vector
    # NOTE(review): this branch is taken on ANY exception, including a missing
    # file, in which case the reads below will most likely fail as well.
    print('Streaming reconstruct failed, attempting VTRB01 fallback:', e)
    # NOTE(review): hdr is not used afterwards in this cell; presumably the call
    # is kept to validate the header — confirm before removing.
    hdr = storage.read_header(comp_path)
    q = storage.read_array(comp_path, 'q')
    scales = storage.read_array(comp_path, 'scales')
    dims = int(storage.read_array(comp_path, 'dims')[0])
    # normalize q shape: if flattened, reshape to (n,dims)
    try:
        n = int(storage.read_array(comp_path, 'n')[0])
        q_arr = q.reshape((n, dims))
    except Exception:
        # 'n' could not be read or the reshape failed; use q as stored
        # (assumes it is already shaped as (n, dims) — TODO confirm)
        q_arr = q
    # reconstruct only row idx, passing its scale as a one-element float32 array
    vec = interface.reconstruct_embeddings(q_arr[idx].ravel(), np.array([scales[idx]], dtype=np.float32), dims).squeeze()
    print('Reconstructed vector (via VTRB01) shape =', vec.shape)

# Pseudo-code: index into Qdrant
# (illustrative only — check the call signatures against your qdrant-client version)
# from qdrant_client import QdrantClient
# client = QdrantClient(url='http://localhost:6333')
# client.upsert(collection_name='embeddings', points=[{ 'id': idx, 'vector': vec.tolist(), 'payload': {'source': 'vectro'} }])

# Pseudo-code: index into Weaviate
# (illustrative only — check the call signatures and the accepted uuid format against your weaviate client version)
# import weaviate
# client = weaviate.Client('http://localhost:8080')
# obj = { 'vector': vec.tolist(), 'meta': {'source': 'vectro'} }
# client.data_object.create(data_object=obj, class_name='EmbeddingClass', uuid=str(idx))

print('Ready to index vector into Qdrant/Weaviate (see pseudo-code above)')