In [1]:
!pip install -qU "numpy==2.0.2" "pandas==2.2.2"

!pip install -qU \
  "torch==2.8.0+cu126" "torchvision==0.19.0+cu126" "torchaudio==2.8.0+cu126" \
  --index-url https://download.pytorch.org/whl/cu126

!pip install -qU sentence-transformers aperturedb

[31mERROR: Could not find a version that satisfies the requirement torchvision==0.19.0+cu126 (from versions: 0.1.6, 0.2.0, 0.21.0+cu126, 0.22.0+cu126, 0.22.1+cu126, 0.23.0+cu126)[0m[31m
[0m[31mERROR: No matching distribution found for torchvision==0.19.0+cu126[0m[31m
[0m  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.6/486.6 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.8/137.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0

In [2]:
import os, json, math
import numpy as np
import pandas as pd
from datetime import datetime
from google.colab import userdata
import uuid
import re

import torch
from sentence_transformers import SentenceTransformer

from aperturedb.CommonLibrary import create_connector
from aperturedb.Utils import Utils
from aperturedb.ParallelLoader import ParallelLoader
from aperturedb import Connector

In [3]:
# ---------- CONFIG ----------
APERTUREDB_KEY=userdata.get('APERTUREDB_KEY') # HF_TOKEN is also needed
CSV_PATH = "/content/mlops-events-enriched.csv"
TALK_TITLE_COL = "Talk Title"          # must exist
YOUTUBE_ID_COL = "YouTube ID"          # preferred; we’ll fall back to URL
YOUTUBE_URL_COL = "YouTube Link"       # fallback for extracting ID
TRANSCRIPT_COL = "yt_transcript"         # JSON array of {text, timestamp}

TALK_CLASS = "Talk"
SET_NAME = "ds_transcript_chunks_v1"
CONNECTION_CLASS = "TalkHasTranscriptChunk"

CHUNK_LEN = 10
OVERLAP = 2
STRIDE = CHUNK_LEN - OVERLAP

EMBED_MODEL = "google/embeddinggemma-300m"
EMBED_DIM = 768
EMBED_BATCH = 64

DB_THREADS = 4
DB_BATCHSIZE = 16                                 # commands per transaction


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", DEVICE)

# ---------- Connect to ApertureDB ----------
con = create_connector(key=APERTUREDB_KEY)
utils = Utils(con)

# ---------- Ensure DescriptorSet exists (HNSW + cosine) ----------
# Safe to call repeatedly; adds set only if it doesn't already exist.
# Engine "HNSW" and metric ["CS"] (cosine similarity).
utils.add_descriptorset(SET_NAME, EMBED_DIM, metric=["CS"], engine="HNSW")
print("DescriptorSet ready:", SET_NAME)

# ---------- Load model ----------
model = SentenceTransformer("google/embeddinggemma-300m", device=DEVICE)
model.max_seq_length = 512

device: cuda


[{"AddDescriptorSet": {"name": "ds_transcript_chunks_v1", "dimensions": 768, "metric": ["CS"], "engine": "HNSW"}}]
[{"AddDescriptorSet": {"info": "A descriptor set with this name already exists!", "status": 2}}]


DescriptorSet ready: ds_transcript_chunks_v1


modules.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/18.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/58.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/312 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/9.44M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

3_Dense/model.safetensors:   0%|          | 0.00/9.44M [00:00<?, ?B/s]

In [4]:
# ---------- Helpers ----------
def extract_youtube_id(url: str) -> str | None:
    if not isinstance(url, str):
        return None
    m = re.search(r"[?&]v=([A-Za-z0-9_-]{6,})", url)
    if m: return m.group(1)
    m = re.search(r"youtu\.be/([A-Za-z0-9_-]{6,})", url)
    if m: return m.group(1)
    return None

def make_talk_id(talk_title: str, youtube_id: str | None) -> str:
    base = f"{(talk_title or '').strip()}|{(youtube_id or '').strip()}"
    return str(uuid.uuid5(uuid.NAMESPACE_URL, base))

def ts_to_seconds(ts: str) -> int:
    """
    ts examples: '0:07', '1:05:12'
    """
    parts = [int(p) for p in ts.split(":")]
    if len(parts) == 2:
        m, s = parts
        return m * 60 + s
    elif len(parts) == 3:
        h, m, s = parts
        return h * 3600 + m * 60 + s
    return 0

def chunk_transcript(items, talk_id):
    """
    items: list of {'text': str, 'timestamp': 'M:SS' or 'H:MM:SS'}
    returns list of chunk dicts with seq, start_sec, end_sec, chunk_text, chunk_id, talk_id
    """
    chunks = []
    n = len(items)
    if n == 0:
        return chunks

    i = 0
    seq = 1
    while i < n:
        part = items[i:i+CHUNK_LEN]
        if not part:
            break
        start_sec = ts_to_seconds(part[0].get("timestamp", "0:00"))
        # End time: next item right after this window; for last window fall back to start_sec
        next_idx = i + CHUNK_LEN
        if next_idx < n:
            end_sec = ts_to_seconds(items[next_idx].get("timestamp", "0:00"))
        else:
            end_sec = start_sec

        chunk_text = " ".join([p.get("text", "").strip() for p in part if p.get("text")])
        chunk_id = f"{talk_id}#ch{seq:04d}"

        chunks.append({
            "seq": seq,
            "start_sec": int(start_sec),
            "end_sec": int(end_sec),
            "chunk_text": chunk_text,
            "chunk_id": chunk_id,
            "talk_id": talk_id,
        })
        seq += 1
        i += STRIDE
    return chunks

def load_transcript_cell(raw):
    """
    Returns a Python list of {text, timestamp} or None if parse fails.
    Handles (a) already-parsed list; (b) JSON string; (c) double-encoded JSON string.
    """
    if raw is None or (isinstance(raw, float) and np.isnan(raw)):
        return None
    if isinstance(raw, list):
        return raw
    if isinstance(raw, str) and raw.strip():
        try:
            data = json.loads(raw)
            if isinstance(data, str):
                # double-encoded: decode once more
                data = json.loads(data)
            if isinstance(data, list):
                return data
        except Exception:
            return None
    return None

def to_blob(vec: np.ndarray) -> bytes:
    """
    ApertureDB expects raw float32 little-endian bytes for descriptors.
    """
    return np.asarray(vec, dtype="<f4").tobytes()

In [5]:
# ---------- Build queries ----------
def build_queries_for_df(df: pd.DataFrame):
    queries = []

    for _, row in df.iterrows():
        title = str(row.get(TALK_TITLE_COL, "")).strip()
        if not title:
            continue

        yt_id = str(row.get(YOUTUBE_ID_COL)).strip() if pd.notna(row.get(YOUTUBE_ID_COL)) else None
        if not yt_id:
            yt_id = extract_youtube_id(str(row.get(YOUTUBE_URL_COL, "")).strip())

        talk_id = make_talk_id(title, yt_id)

        raw = row.get(TRANSCRIPT_COL, "")
        items = load_transcript_cell(raw)
        if not items:
            continue

        chunks = chunk_transcript(items, talk_id)
        if not chunks:
            continue

        # embed texts (normalized for cosine)
        texts = [c["chunk_text"] for c in chunks]
        embs = model.encode(texts, batch_size=EMBED_BATCH, show_progress_bar=False, normalize_embeddings=True)
        assert embs.shape[1] == EMBED_DIM, f"expected {EMBED_DIM}, got {embs.shape[1]}"

        for c, vec in zip(chunks, embs):
            cmd = [
                # find the Talk by talk_id (unique) to connect the descriptor
                {
                    "FindEntity": {
                        "_ref": 1,
                        "with_class": TALK_CLASS,
                        "unique": True,
                        "constraints": {"talk_id": ["==", c["talk_id"]]},
                        "results": {"count": True}
                    }
                },
                {
                    "AddDescriptor": {                                              # AddDescriptor command
                        "set": SET_NAME,
                        "properties": {
                            "chunk_id": c["chunk_id"],
                            "talk_id": c["talk_id"],           # keep as metadata
                            "seq": c["seq"],
                            "start_sec": c["start_sec"],
                            "end_sec": c["end_sec"],
                            "chunk_text": c["chunk_text"],
                        },
                        "if_not_found": {"chunk_id": ["==", c["chunk_id"]]},        # idempotent upsert
                        "connect": { "class": CONNECTION_CLASS, "ref": 1, "direction": "in" }  # Talk -> Descriptor
                    }
                }
            ]
            queries.append((cmd, [to_blob(vec)]))
    return queries

In [None]:
df = pd.read_csv(CSV_PATH)
print("rows read:", len(df))
has_tx = df[TRANSCRIPT_COL].notna() & (df[TRANSCRIPT_COL].astype(str).str.len() > 2)
print("rows with non-empty transcript:", has_tx.sum())

# Peek one parsed transcript to verify shape
sample_raw = df.loc[has_tx.idxmax(), TRANSCRIPT_COL]
sample_items = load_transcript_cell(sample_raw)
print("sample items len:", len(sample_items))
print("first 2 items:", sample_items[:2])


rows read: 278
rows with non-empty transcript: 278
sample items len: 316
first 2 items: [{'text': '[Applause] [Music]', 'timestamp': '0:00'}, {'text': 'hello everyone my name is OSI and together with DAR Adam will to today', 'timestamp': '0:07'}]


In [None]:
# ---------- Run ----------
df = pd.read_csv(CSV_PATH)
print("rows read:", len(df))

queries = build_queries_for_df(df)
print("descriptor chunks to ingest:", len(queries))

loader = ParallelLoader(con)
loader.ingest(queries, batchsize=DB_BATCHSIZE, numthreads=DB_THREADS, stats=True)  # parallel/batched
print("Done.")

rows read: 278
descriptor chunks to ingest: 16887


Progress: 100%|██████████| 16.9k/16.9k [03:56<00:00, 71.5items/s]

Total time (s): 236.04042387008667
Total queries executed: 1056
Avg Query time (s): 0.8875651413744147
Query time std: 0.27220850394400053
Avg Query Throughput (q/s): 4.506711466615182
Overall insertion throughput (element/s): 71.5428303471204
Total inserted elements: 16887
Total successful commands: 33774
Done.





In [20]:
# List all talk titles from Talk entities
q = [{
  "FindEntity": {
    "with_class": "Talk",
    "sort": { "key": "talk_title", "order": "ascending" },
    "limit": 10000,
    "results": { "list": ["talk_title"] }
  }
}]

resp, _ = con.query(q)
rows = resp[0]["FindEntity"].get("entities", [])
titles = [r["talk_title"] for r in rows]

# Pretty print
for i, t in enumerate(titles, 1):
    print(f"{i:3d}. {t}")


  1. 2022 AI Index Report Briefing
  2. 7 Questions for Data Scientists
  3. A Data Scientist Guide to Unit & End to End Testing
  4. A Guide to Putting Together a Continuous ML Stack
  5. A machine learning-driven phenotyping platform for rapid in vivo target validation and precision medicine development
  6. A Practical Guide to Efficient AI
  7. A Practitioner's Guide To Safeguarding Your LLM Applications
  8. Advances in Algorithmic Recourse: Ensuring Causal Consistency, Fairness, & Robustness
  9. Agentic AI: Learning Iteratively, Acting Autonomously
 10. Agentic AI: Unlocking Emergent Behavior in LLMs for Adaptive Workflow Automation
 11. AI @ Scale to Understand Impact of Weather on Businesses
 12. AI & Sustainability: A $50 trillion opportunity
 13. AI Agents with Function Calling/Tool Use
 14. AI as an Engineering Discipline
 15. AI Features Demand Evidence-Based Decisions
 16. AI for AI-Scotiabank's Award-Winning ML Models
 17. AI for Hospitals at Scale
 18. AI Governance: Ac

### Sanity check queries

In [6]:
client = create_connector(key=APERTUREDB_KEY)

def run(q):
    resp, blobs = client.query(q)
    client.print_last_response()
    return resp

1) How many transcript chunks are in the set?

In [None]:
run([{
  "FindDescriptor": {
    "set": "ds_transcript_chunks_v1",
    "results": { "count": True }
  }
}])


[
    {
        "FindDescriptor": {
            "count": 16887,
            "returned": 0,
            "status": 0
        }
    }
]


[{'FindDescriptor': {'count': 16887, 'returned': 0, 'status': 0}}]


2) How many chunks for a specific talk (by talk_title)?



In [None]:
run([
  {
    "FindEntity": {
      "_ref": 1,
      "with_class": "Talk",
      "unique": True,
      "constraints": { "talk_title": ["==", "LLMs, from Playgrounds to Production-ready Pipelines"] },
      "results": { "list": ["talk_id","talk_title"] }
    }
  },
  {
    "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "results": { "count": True }
    }
  }
]
)

[
    {
        "FindEntity": {
            "entities": [
                {
                    "talk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de",
                    "talk_title": "LLMs, from Playgrounds to Production-ready Pipelines"
                }
            ],
            "returned": 1,
            "status": 0
        }
    },
    {
        "FindDescriptor": {
            "count": 36,
            "returned": 0,
            "status": 0
        }
    }
]


[{'FindEntity': {'entities': [{'talk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de',
     'talk_title': 'LLMs, from Playgrounds to Production-ready Pipelines'}],
   'returned': 1,
   'status': 0}},
 {'FindDescriptor': {'count': 36, 'returned': 0, 'status': 0}}]

3) Show the first 3 chunks (by seq) for that talk

In [None]:
run([
  {
    "FindEntity": {
      "_ref": 1,
      "with_class": "Talk",
      "unique": True,
      "constraints": { "talk_title": ["==", "LLMs, from Playgrounds to Production-ready Pipelines"] },
      "results": { "list": ["talk_id"] }
    }
  },
  {
    "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "sort": { "key": "seq", "order": "ascending" },
      "limit": 3,
      "results": { "list": ["chunk_id","seq","start_sec","end_sec","chunk_text"] }
    }
  }
]
)

[
    {
        "FindEntity": {
            "entities": [
                {
                    "talk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de"
                }
            ],
            "returned": 1,
            "status": 0
        }
    },
    {
        "FindDescriptor": {
            "entities": [
                {
                    "chunk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0001",
                    "chunk_text": "yeah thanks for coming my name is Inu I'm a CPO and co-founder of SL aai and um yeah I'm also uh UT Austin computer science alumni so I can see some few longer so welcome welcome to uh Austin so yeah I'm going to start with um today's goal here uh we're going to start from the inference demo with llama 2 C and then uh starting from there we can do uh we'll do some training fine-tuning serving and um lastly we do some uh putting everything together as a cicd pipelines uh I'm going to do some rag examples so yeah I don't know if you guys have familiar with 

[{'FindEntity': {'entities': [{'talk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de'}],
   'returned': 1,
   'status': 0}},
 {'FindDescriptor': {'entities': [{'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0001',
     'chunk_text': "yeah thanks for coming my name is Inu I'm a CPO and co-founder of SL aai and um yeah I'm also uh UT Austin computer science alumni so I can see some few longer so welcome welcome to uh Austin so yeah I'm going to start with um today's goal here uh we're going to start from the inference demo with llama 2 C and then uh starting from there we can do uh we'll do some training fine-tuning serving and um lastly we do some uh putting everything together as a cicd pipelines uh I'm going to do some rag examples so yeah I don't know if you guys have familiar with these terms um so yeah llama 2 is uh published by meta and Lama 2.c is uh uh is a mini uh Mini model of llama 2 uh which is made by uh Andre Kacy from open AI so you can learn uh I mean you can training on

4) Fetch chunks for a time range within a talk (e.g., 60s–300s)

In [None]:
run([
  {
    "FindEntity": {
      "_ref": 1,
      "with_class": "Talk",
      "unique": True,
      "constraints": { "talk_title": ["==", "LLMs, from Playgrounds to Production-ready Pipelines"] },
      "results": { "list": ["talk_id"] }
    }
  },
  {
    "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "constraints": {
        "start_sec": [">=", 60],
        "end_sec": ["<=", 300]
      },
      "sort": "seq",
      "results": { "list": ["seq","start_sec","end_sec","chunk_id"] }
    }
  }
])


[
    {
        "FindEntity": {
            "entities": [
                {
                    "talk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de"
                }
            ],
            "returned": 1,
            "status": 0
        }
    },
    {
        "FindDescriptor": {
            "entities": [
                {
                    "chunk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0002",
                    "end_sec": 128,
                    "seq": 2,
                    "start_sec": 60
                },
                {
                    "chunk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0003",
                    "end_sec": 185,
                    "seq": 3,
                    "start_sec": 116
                },
                {
                    "chunk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0004",
                    "end_sec": 240,
                    "seq": 4,
                    "start_sec": 170
                },
                {
                   

[{'FindEntity': {'entities': [{'talk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de'}],
   'returned': 1,
   'status': 0}},
 {'FindDescriptor': {'entities': [{'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0002',
     'end_sec': 128,
     'seq': 2,
     'start_sec': 60},
    {'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0003',
     'end_sec': 185,
     'seq': 3,
     'start_sec': 116},
    {'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0004',
     'end_sec': 240,
     'seq': 4,
     'start_sec': 170},
    {'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0005',
     'end_sec': 297,
     'seq': 5,
     'start_sec': 229}],
   'returned': 4,
   'status': 0}}]

5) Grab a specific chunk by chunk_id

In [None]:
run([{
  "FindDescriptor": {
    "set": "ds_transcript_chunks_v1",
    "unique": True,
    "constraints": { "chunk_id": ["==", "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0004"] },
    "results": { "list": ["chunk_id","talk_id","seq","start_sec","end_sec","chunk_text"] }
  }
}]
)

[
    {
        "FindDescriptor": {
            "entities": [
                {
                    "chunk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0004",
                    "chunk_text": "language model system.org leaderboard um unfortunately no open source model has beaten the propriety models yet so the top five models are propriety models it's uh GPT 4 Cloud one two and another cloud and GPT 3.5 turbo so um yeah uh also there's like hyper gap between open and close model uh due to the investment because um I mean uh for instance like an entropic interview uh a CEO of the entropic they're going to uh invest like1 billion by 2025 uh that's a lot of investment and it's about like 10% of Enterprise value of open AI um yeah well let alone with the investment open AI has a great uh infrastructure um so I do a little bit of math here so let's say you're going to fine tuning with open AI versus hiring engineers and tuning on",
                    "end_sec": 240,
                    "s

[{'FindDescriptor': {'entities': [{'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0004',
     'chunk_text': "language model system.org leaderboard um unfortunately no open source model has beaten the propriety models yet so the top five models are propriety models it's uh GPT 4 Cloud one two and another cloud and GPT 3.5 turbo so um yeah uh also there's like hyper gap between open and close model uh due to the investment because um I mean uh for instance like an entropic interview uh a CEO of the entropic they're going to uh invest like1 billion by 2025 uh that's a lot of investment and it's about like 10% of Enterprise value of open AI um yeah well let alone with the investment open AI has a great uh infrastructure um so I do a little bit of math here so let's say you're going to fine tuning with open AI versus hiring engineers and tuning on",
     'end_sec': 240,
     'seq': 4,
     'start_sec': 170,
     'talk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de'}],
   'returned': 1,
   

6) List the first & last chunk for a talk (check boundaries)

In [None]:
run([
  {
    "FindEntity": {
      "_ref": 1,
      "with_class": "Talk",
      "unique": True,
      "constraints": { "talk_title": ["==", "LLMs, from Playgrounds to Production-ready Pipelines"] }
    }
  },
  {
    "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "sort": { "key": "seq", "order": "ascending" },
      "limit": 1,
      "results": { "list": ["seq","start_sec","end_sec","chunk_id"] }
    }
  },
  {
    "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "sort": { "key": "seq", "order": "descending" },
      "limit": 1,
      "results": { "list": ["seq","start_sec","end_sec","chunk_id"] }
    }
  }
])


[
    {
        "FindEntity": {
            "returned": 0,
            "status": 0
        }
    },
    {
        "FindDescriptor": {
            "entities": [
                {
                    "chunk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0001",
                    "end_sec": 76,
                    "seq": 1,
                    "start_sec": 4
                }
            ],
            "returned": 1,
            "status": 0
        }
    },
    {
        "FindDescriptor": {
            "entities": [
                {
                    "chunk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0036",
                    "end_sec": 1911,
                    "seq": 36,
                    "start_sec": 1911
                }
            ],
            "returned": 1,
            "status": 0
        }
    }
]


[{'FindEntity': {'returned': 0, 'status': 0}},
 {'FindDescriptor': {'entities': [{'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0001',
     'end_sec': 76,
     'seq': 1,
     'start_sec': 4}],
   'returned': 1,
   'status': 0}},
 {'FindDescriptor': {'entities': [{'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0036',
     'end_sec': 1911,
     'seq': 36,
     'start_sec': 1911}],
   'returned': 1,
   'status': 0}}]

7) Pull chunks for multiple talks and group by source (nice visual check)

In [None]:
run([
  {
    "FindEntity": {
      "_ref": 1,
      "with_class": "Talk",
      "constraints": {
        "talk_title": ["in", ["LLMs, from Playgrounds to Production-ready Pipelines","Leverage Kubernetes To Optimize the Utilization of Your AI Accelerators","LLMs, Big Data, and Audio: Breaching an Untapped Gold Mine"]]
      },
      "results": { "list": ["talk_title","talk_id"] }
    }
  },
  {
    "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "group_by_source": True,
      "sort": "seq",
      "limit": 2,
      "results": { "list": ["seq","chunk_id"] }
    }
  }
]
)

[
    {
        "FindEntity": {
            "entities": [
                {
                    "talk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de",
                    "talk_title": "LLMs, from Playgrounds to Production-ready Pipelines"
                },
                {
                    "talk_id": "02f5e445-8821-5cb1-bada-88586b0f9ac8",
                    "talk_title": "Leverage Kubernetes To Optimize the Utilization of Your AI Accelerators"
                },
                {
                    "talk_id": "95c82848-d9f9-54c3-a7eb-2f3fed38adac",
                    "talk_title": "LLMs, Big Data, and Audio: Breaching an Untapped Gold Mine"
                }
            ],
            "returned": 3,
            "status": 0
        }
    },
    {
        "FindDescriptor": {
            "entities": {
                "7.12.260": [],
                "7.2.260": [
                    {
                        "chunk_id": "cade4e0c-5922-57c4-9dba-98ea942da9de#ch0001",
                  

[{'FindEntity': {'entities': [{'talk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de',
     'talk_title': 'LLMs, from Playgrounds to Production-ready Pipelines'},
    {'talk_id': '02f5e445-8821-5cb1-bada-88586b0f9ac8',
     'talk_title': 'Leverage Kubernetes To Optimize the Utilization of Your AI Accelerators'},
    {'talk_id': '95c82848-d9f9-54c3-a7eb-2f3fed38adac',
     'talk_title': 'LLMs, Big Data, and Audio: Breaching an Untapped Gold Mine'}],
   'returned': 3,
   'status': 0}},
 {'FindDescriptor': {'entities': {'7.12.260': [],
    '7.2.260': [{'chunk_id': 'cade4e0c-5922-57c4-9dba-98ea942da9de#ch0001',
      'seq': 1}],
    '7.3.260': [{'chunk_id': '02f5e445-8821-5cb1-bada-88586b0f9ac8#ch0001',
      'seq': 1}]},
   'group_by_source': True,
   'returned': 2,
   'status': 0}}]

8) Quick K-NN sanity check from Python (embed a short phrase)

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("google/embeddinggemma-300m")
qvec = model.encode(["query: Strategies to drive adoption of GenAI tools"], normalize_embeddings=True)[0].astype("<f4")

q = [{
  "FindDescriptor": {
    "set": "ds_transcript_chunks_v1",
    "k_neighbors": 20,
    "distances": True,
    "results": { "list": ["_distance","chunk_id","talk_id","seq","start_sec","end_sec","chunk_text"] }
  }
}]

resp, _ = con.query(q, blobs=[qvec.tobytes()])  # vector must be supplied as a blob
print(resp)


[{'FindDescriptor': {'entities': [{'_distance': 0.6840075254440308, 'chunk_id': '9726ea3b-4357-5b5b-823e-9d396e90da8c#ch0091', 'chunk_text': 'wonderful talk', 'end_sec': 1723, 'seq': 91, 'start_sec': 1723, 'talk_id': '9726ea3b-4357-5b5b-823e-9d396e90da8c'}, {'_distance': 0.6814993619918823, 'chunk_id': '2a206d50-b361-55a8-88ef-038e761ca717#ch0108', 'chunk_text': 'audience thank you', 'end_sec': 5344, 'seq': 108, 'start_sec': 5344, 'talk_id': '2a206d50-b361-55a8-88ef-038e761ca717'}, {'_distance': 0.678869366645813, 'chunk_id': 'a2d77613-e36e-5abd-bce1-89e08c2a075d#ch0010', 'chunk_text': "Enterprises are saying that they're already using generative AI or they have plans to use generative AI in the next one to two years and I would say that the these organizations are very smart to do so I truly believe that those organizations who are embracing and employing generative AI across their business are going to succeed and they are going to win over the organizations", 'end_sec': 212, 'seq': 

### Delete filler chunks
As observed from above query, a lot of small filler chunks are addig a lot of noise so lets delete them

In [None]:
MIN_LEN = 200           # threshold
PAGE = 2000            # descriptors per page to scan
BATCH_DELETE = 500     # delete up to this many chunk_ids per DeleteDescriptor
DRY_RUN = True         # set to False to actually delete

SET_NAME = "ds_transcript_chunks_v1"

def page_descriptors(offset=0, limit=PAGE):
    q = [{
        "FindDescriptor": {
            "set": SET_NAME,
            "offset": offset,             # shared param: skip N results
            "limit": limit,               # shared param: cap result size
            "results": {"list": ["chunk_id", "chunk_text"]}
        }
    }]
    resp, _ = con.query(q)
    items = resp[0]["FindDescriptor"].get("entities", [])
    return items, len(items)

In [None]:
# 1) Scan & collect short ones
to_delete = []
to_del_text = []
offset = 0
total_scanned = 0

while True:
    items, n = page_descriptors(offset=offset, limit=PAGE)
    if n == 0:
        break
    total_scanned += n
    for it in items:
        txt = (it.get("chunk_text") or "").strip()
        if len(txt) < MIN_LEN:
            cid = it.get("chunk_id")
            ct = it.get("chunk_text")
            if cid:
                to_delete.append(cid)
                to_del_text.append(ct)
    offset += n
    print(f"scanned={total_scanned}, collected_for_delete={len(to_delete)}")

print(f"\nScan complete. total_scanned={total_scanned}, short_chunks={len(to_delete)}, threshold={MIN_LEN} chars")



scanned=2000, collected_for_delete=13
scanned=4000, collected_for_delete=26
scanned=6000, collected_for_delete=41
scanned=8000, collected_for_delete=60
scanned=10000, collected_for_delete=72
scanned=12000, collected_for_delete=82
scanned=14000, collected_for_delete=98
scanned=16000, collected_for_delete=110
scanned=16887, collected_for_delete=117

Scan complete. total_scanned=16887, short_chunks=117, threshold=200 chars


In [None]:

# 2) Dry-run sample
print("\nshort chunks):")
for c in to_del_text:
  print(c)



short chunks):
usually that's hidden in the developer platforms
now well yeah okay I I'll do it and I'll do it in a sec and then if you want to if you want to come see it come down by the booth and I can I can show you the URL cool all right thanks everyone
right there neb. me and my cooworker Christian in the back will'll be around to answer any questions thank you all so much and have a wonderful day zun everybody zun
and follow the LinkedIn um yeah any questions I mean there's a my there so you can um okay thanks for calling [Applause] me
all right so time is 2:30 thank you guys again for for your for being here I hope you enjoy the rest of the summit thank you
also if you need my email address there we are so thank you very much [Applause]
again Nathan we greatly appreciate you taking time out of your day to be here with us thanks everyone for joining by now
diand for providing us with the unique data set and for NVIDIA for providing the amazing Hardware to build these large model

In [None]:
# 3) Delete in batches
if to_delete:
    for i in range(0, len(to_delete), BATCH_DELETE):
        batch_ids = to_delete[i:i+BATCH_DELETE]
        q = [{
            "DeleteDescriptor": {
                "set": SET_NAME,
                "constraints": { "chunk_id": ["in", batch_ids] }
            }
        }]
        resp, _ = con.query(q)
        print("Deleted batch", i, "->", i+len(batch_ids), resp)
    print("Deletion completed.")
else:
    print("\nNothing to delete.")

Deleted batch 0 -> 117 [{'DeleteDescriptor': {'count': 117, 'status': 0}}]
Deletion completed.


No filler results now :D

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("google/embeddinggemma-300m")
qvec = model.encode(["query: Strategies to drive adoption of GenAI tools"], normalize_embeddings=True)[0].astype("<f4")

q = [{
  "FindDescriptor": {
    "set": "ds_transcript_chunks_v1",
    "k_neighbors": 20,
    "distances": True,
    "results": { "list": ["_distance","chunk_id","talk_id","seq","start_sec","end_sec","chunk_text"] }
  }
}]

resp, _ = con.query(q, blobs=[qvec.tobytes()])  # vector must be supplied as a blob
print(resp)


[{'FindDescriptor': {'entities': [{'_distance': 0.678869366645813, 'chunk_id': 'a2d77613-e36e-5abd-bce1-89e08c2a075d#ch0010', 'chunk_text': "Enterprises are saying that they're already using generative AI or they have plans to use generative AI in the next one to two years and I would say that the these organizations are very smart to do so I truly believe that those organizations who are embracing and employing generative AI across their business are going to succeed and they are going to win over the organizations", 'end_sec': 212, 'seq': 10, 'start_sec': 184, 'talk_id': 'a2d77613-e36e-5abd-bce1-89e08c2a075d'}, {'_distance': 0.6416062712669373, 'chunk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7#ch0001', 'chunk_text': "[Applause] [Music] yep thanks everyone so nice to meet you Michael triffle here with Rea AI um so to rea the future of AI is multimodal so our mission is to develop Next Generation AI to empower the most capable agents that can see hear as well as speak we'll play a litt

In [18]:
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("google/embeddinggemma-300m")
qvec = model.encode(["query: How are models deployed in production?"], normalize_embeddings=True)[0].astype("<f4")

q = [{
  "FindDescriptor": {
    "set": "ds_transcript_chunks_v1",
    "k_neighbors": 20,
    "distances": True,
    "results": { "list": ["_distance","chunk_id","talk_id","seq","start_sec","end_sec","chunk_text"] }
  }
}]

resp, _ = con.query(q, blobs=[qvec.tobytes()])  # vector must be supplied as a blob
print(resp)


[{'FindDescriptor': {'entities': [{'_distance': 0.656283974647522, 'chunk_id': '9e8a70e7-08e2-5335-87f2-2e959c0253ea#ch0088', 'chunk_text': 'later you realize that that model is deprecated and you really have to change your entire system um I would think that is a problem that should get resolved at the model provider level itself like we should have more stable models over time and like I was mentioning if there are metrics that you can kind of um', 'end_sec': 1769, 'seq': 88, 'start_sec': 1749, 'talk_id': '9e8a70e7-08e2-5335-87f2-2e959c0253ea'}, {'_distance': 0.6542110443115234, 'chunk_id': '902d023c-6c80-5dda-94f8-783668244f8e#ch0044', 'chunk_text': 'this so that we ingest model inputs and outputs into our data warehouse to enable observability this way we can achieve some division of labor so this is how our vision for the ml stack looks like we are already familiar with the model sending inputs and their inferences', 'end_sec': 950, 'seq': 44, 'start_sec': 926, 'talk_id': '902d023

A) Semantic search → talks (dedupe to the source talks)

Goal: Find the top-10 chunks semantically; then get the Talk objects they belong to (one per talk).

In [7]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("google/embeddinggemma-300m")
qvec = model.encode(["query: tools adoption in genai orgs"], normalize_embeddings=True)[0].astype("<f4")

q = [
  {
    "FindDescriptor": {
      "_ref": 1,
      "set": "ds_transcript_chunks_v1",
      "k_neighbors": 10,
      "results": { "list": ["_distance","chunk_id","talk_id","seq","start_sec","end_sec"] }
    }
  },
  {
    "FindEntity": {
      "with_class": "Talk",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "results": { "list": ["talk_title","talk_id","youtube_url","speaker_name"] }
    }
  }
]

resp, _ = con.query(q, blobs=[qvec.tobytes()])  # the vector goes as a blob for the kNN call
print(resp)


[{'FindDescriptor': {'entities': [{'_distance': None, 'chunk_id': 'fc3efe32-e399-5dfc-9cd5-70912f545122#ch0030', 'end_sec': 537, 'seq': 30, 'start_sec': 513, 'talk_id': 'fc3efe32-e399-5dfc-9cd5-70912f545122'}, {'_distance': None, 'chunk_id': '73e19415-188a-5f42-bbbe-21d291723217#ch0036', 'end_sec': 1807, 'seq': 36, 'start_sec': 1807, 'talk_id': '73e19415-188a-5f42-bbbe-21d291723217'}, {'_distance': None, 'chunk_id': '4b5ed1f3-6dc6-5b0f-9dfb-03f9e027f218#ch0005', 'end_sec': 117, 'seq': 5, 'start_sec': 89, 'talk_id': '4b5ed1f3-6dc6-5b0f-9dfb-03f9e027f218'}, {'_distance': None, 'chunk_id': 'f3320a7b-3e1f-5af6-9697-c040e107e22e#ch0042', 'end_sec': 941, 'seq': 42, 'start_sec': 901, 'talk_id': 'f3320a7b-3e1f-5af6-9697-c040e107e22e'}, {'_distance': None, 'chunk_id': 'd717b6a8-bb78-52e0-bbaa-e8b45ca81348#ch0042', 'end_sec': 941, 'seq': 42, 'start_sec': 901, 'talk_id': 'd717b6a8-bb78-52e0-bbaa-e8b45ca81348'}, {'_distance': None, 'chunk_id': 'df52f16e-7e4c-56b8-a5b7-215b1ef25406#ch0037', 'end_se

In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer("google/embeddinggemma-300m")
qvec = model.encode(["query: which talks discuss using ai agent memory in production?"], normalize_embeddings=True)[0].astype("<f4")

q = [
  {
    "FindDescriptor": {
      "_ref": 1,
      "set": "ds_transcript_chunks_v1",
      "k_neighbors": 10,
      "results": { "list": ["_distance","chunk_id","talk_id","seq","start_sec","end_sec"] }
    }
  },
  {
    "FindEntity": {
      "with_class": "Talk",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "results": { "list": ["talk_title","talk_id","youtube_url","speaker_name"] }
    }
  }
]

resp, _ = con.query(q, blobs=[qvec.tobytes()])  # the vector goes as a blob for the kNN call
print(resp)


[{'FindDescriptor': {'entities': [{'_distance': None, 'chunk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7#ch0001', 'end_sec': 38, 'seq': 1, 'start_sec': 0, 'talk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7'}, {'_distance': None, 'chunk_id': 'ae406be8-88aa-584a-adc8-5e46f08655b2#ch0003', 'end_sec': 73, 'seq': 3, 'start_sec': 45, 'talk_id': 'ae406be8-88aa-584a-adc8-5e46f08655b2'}, {'_distance': None, 'chunk_id': '4b5ed1f3-6dc6-5b0f-9dfb-03f9e027f218#ch0015', 'end_sec': 282, 'seq': 15, 'start_sec': 282, 'talk_id': '4b5ed1f3-6dc6-5b0f-9dfb-03f9e027f218'}, {'_distance': None, 'chunk_id': 'a2d77613-e36e-5abd-bce1-89e08c2a075d#ch0001', 'end_sec': 29, 'seq': 1, 'start_sec': 0, 'talk_id': 'a2d77613-e36e-5abd-bce1-89e08c2a075d'}, {'_distance': None, 'chunk_id': '4b5ed1f3-6dc6-5b0f-9dfb-03f9e027f218#ch0013', 'end_sec': 268, 'seq': 13, 'start_sec': 245, 'talk_id': '4b5ed1f3-6dc6-5b0f-9dfb-03f9e027f218'}, {'_distance': None, 'chunk_id': '542c989f-4348-54a3-a1bc-bb7461956ef3#ch0064', 'end_sec': 1504, '

B) Semantic search → best talk → show that talk’s top 3 earliest chunks

Goal: Hit kNN, pick the most similar chunk, hop to its talk, then list the first 3 chunks of that talk (by seq) to inspect context.

In [8]:
model = SentenceTransformer("google/embeddinggemma-300m")
qvec = model.encode(["query: ai agents with memory"], normalize_embeddings=True)[0].astype("<f4")

q = [
  { "FindDescriptor": {
      "_ref": 1,
      "set": "ds_transcript_chunks_v1",
      "k_neighbors": 1,
      "results": { "list": ["chunk_id","talk_id","seq"] }
  }},
  { "FindEntity": {
      "_ref": 2,
      "with_class": "Talk",
      "unique": True,
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "results": { "list": ["talk_title","talk_id","speaker_name"] }
  }},
  { "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 2, "connection_class": "TalkHasTranscriptChunk" },
      "sort": { "key": "seq", "order": "ascending" },
      "limit": 3,
      "results": { "list": ["seq","start_sec","end_sec","chunk_id","chunk_text"] }
  }}
]

resp, _ = con.query(q, blobs=[qvec.tobytes()])
print(resp)


[{'FindDescriptor': {'entities': [{'chunk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7#ch0001', 'seq': 1, 'talk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7'}], 'returned': 1, 'status': 0}}, {'FindEntity': {'entities': [{'speaker_name': 'Michael Thriffiley', 'talk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7', 'talk_title': 'Multimodal Agents You Can Deploy Anywhere'}], 'returned': 1, 'status': 0}}, {'FindDescriptor': {'entities': [{'chunk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7#ch0001', 'chunk_text': "[Applause] [Music] yep thanks everyone so nice to meet you Michael triffle here with Rea AI um so to rea the future of AI is multimodal so our mission is to develop Next Generation AI to empower the most capable agents that can see hear as well as speak we'll play a little", 'end_sec': 38, 'seq': 1, 'start_sec': 0}, {'chunk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7#ch0002', 'chunk_text': "as speak we'll play a little video at Rea we recognize that most data extends Beyond text it also cont

C) Talk → semantic search confined to that talk only

Goal: Start from a specific talk, then restrict kNN to only its chunks.

In [8]:
model = SentenceTransformer("google/embeddinggemma-300m")
qvec = model.encode(["query: ai agents with memory"], normalize_embeddings=True)[0].astype("<f4")

q = [
  { "FindEntity": {
      "_ref": 1,
      "with_class": "Talk",
      "unique": True,
      "constraints": { "talk_title": ["==", "Multimodal Agents You Can Deploy Anywhere"] },
      "results": { "list": ["talk_id","talk_title"] }
  }},
  { "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "k_neighbors": 5,
      "results": { "list": ["_distance","seq","start_sec","end_sec","chunk_id","chunk_text"] }
  }}
]

resp, _ = con.query(q, blobs=[qvec.tobytes()])
print(resp)


[{'FindEntity': {'entities': [{'talk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7', 'talk_title': 'Multimodal Agents You Can Deploy Anywhere'}], 'returned': 1, 'status': 0}}, {'FindDescriptor': {'entities': [{'_distance': None, 'chunk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7#ch0001', 'chunk_text': "[Applause] [Music] yep thanks everyone so nice to meet you Michael triffle here with Rea AI um so to rea the future of AI is multimodal so our mission is to develop Next Generation AI to empower the most capable agents that can see hear as well as speak we'll play a little", 'end_sec': 38, 'seq': 1, 'start_sec': 0}], 'returned': 1, 'status': 0}}]


D) Multiple talks → grouped semantic search (see which talk matches best)

Goal: Pick a few talks of interest, then run kNN only within those talks, and group results by talk.

In [9]:
model = SentenceTransformer("google/embeddinggemma-300m")
qvec = model.encode(["query: what are ai agents?"], normalize_embeddings=True)[0].astype("<f4")

q = [
  { "FindEntity": {
      "_ref": 1,
      "with_class": "Talk",
      "constraints": { "talk_title": ["in", ["Multimodal Agents You Can Deploy Anywhere","Demystifying Large Language Models: ChatGPT, GPT-4, and the Future of AI Communication","Panel: How companies can navigate and approach the new advancements in generative AI"]] },
      "results": { "list": ["talk_title","talk_id"] }
  }},
  { "FindDescriptor": {
      "set": "ds_transcript_chunks_v1",
      "is_connected_to": { "ref": 1, "connection_class": "TalkHasTranscriptChunk" },
      "k_neighbors": 8,
      "group_by_source": True,
      "results": { "list": ["_distance","seq","chunk_id","chunk_text"] }
  }}
]

resp, _ = con.query(q, blobs=[qvec.tobytes()])
print(resp)


[{'FindEntity': {'entities': [{'talk_id': 'ce87713c-db71-561a-8ec9-c840306348ca', 'talk_title': 'Demystifying Large Language Models: ChatGPT, GPT-4, and the Future of AI Communication'}, {'talk_id': '7255dc0f-0f29-5aad-9a95-a7956c44673a', 'talk_title': 'Panel: How companies can navigate and approach the new advancements in generative AI'}, {'talk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7', 'talk_title': 'Multimodal Agents You Can Deploy Anywhere'}], 'returned': 3, 'status': 0}}, {'FindDescriptor': {'entities': {'7.141.262': [], '7.19.260': [], '7.241.262': [{'_distance': None, 'chunk_id': '14b1947c-c2ac-5777-8a3b-4593460325f7#ch0001', 'chunk_text': "[Applause] [Music] yep thanks everyone so nice to meet you Michael triffle here with Rea AI um so to rea the future of AI is multimodal so our mission is to develop Next Generation AI to empower the most capable agents that can see hear as well as speak we'll play a little", 'seq': 1}]}, 'group_by_source': True, 'returned': 1, 'status': 0}

### Creating Remaining DescriptorSets

In [7]:
SET_META = "ds_talk_meta_v1"
SET_BIO  = "ds_speaker_bio_v1"

utils.add_descriptorset(SET_META, EMBED_DIM, metric=["CS"], engine="HNSW")
utils.add_descriptorset(SET_BIO,  EMBED_DIM, metric=["CS"], engine="HNSW")
print("DescriptorSets ready:", SET_META, SET_BIO)

[{"AddDescriptorSet": {"name": "ds_talk_meta_v1", "dimensions": 768, "metric": ["CS"], "engine": "HNSW"}}]
[{"AddDescriptorSet": {"info": "A descriptor set with this name already exists!", "status": 2}}]
[{"AddDescriptorSet": {"name": "ds_speaker_bio_v1", "dimensions": 768, "metric": ["CS"], "engine": "HNSW"}}]
[{"AddDescriptorSet": {"info": "A descriptor set with this name already exists!", "status": 2}}]


DescriptorSets ready: ds_talk_meta_v1 ds_speaker_bio_v1


In [8]:
# CSV column names used
ABSTRACT_COL   = "Abstract"
WYL_COL        = "What You'll Learn"
PREREQ_COL     = "Prerequiste Knowledge (if required)"
CAT1_COL       = "Category 1"
KW_COL         = "Top 3 keywords (in order)"
UNIQUE_COL     = "What is Unique about your session"
JOB_COL        = "Job Title"
BIO_COL        = "Bio"

MIN_TEXT_LEN = 30  # skip ultra-short/noisy rows

def _safe(x):
    return str(x).strip() if (isinstance(x, str) and x.strip()) else ""

def meta_text_from_row(r: pd.Series) -> str:
    parts = [
        _safe(r.get(TALK_TITLE_COL)),
        _safe(r.get(ABSTRACT_COL)),
        _safe(r.get(WYL_COL)),
        _safe(r.get(PREREQ_COL)),
        _safe(r.get(CAT1_COL)),
        _safe(r.get(KW_COL)),
        _safe(r.get(UNIQUE_COL)),
    ]
    txt = " | ".join([p for p in parts if p])
    return txt

def bio_text_from_row(r: pd.Series) -> str:
    parts = [_safe(r.get(JOB_COL)), _safe(r.get(BIO_COL))]
    txt = " | ".join([p for p in parts if p])
    return txt


In [9]:
def build_meta_queries(df: pd.DataFrame):
    queries, skipped = [], 0
    for _, row in df.iterrows():
        title = _safe(row.get(TALK_TITLE_COL))
        if not title:
            continue

        yt_id = _safe(row.get(YOUTUBE_ID_COL)) or extract_youtube_id(_safe(row.get(YOUTUBE_URL_COL)))
        talk_id = make_talk_id(title, yt_id)

        txt = meta_text_from_row(row)
        if len(txt) < MIN_TEXT_LEN:
            skipped += 1
            continue

        vec = model.encode([txt], normalize_embeddings=True, show_progress_bar=False)[0]
        uid = f"{talk_id}::{SET_META}"

        cmd = [
            { "FindEntity": {
                "_ref": 1, "with_class": TALK_CLASS, "unique": True,
                "constraints": {"talk_id": ["==", talk_id]},
                "results": {"count": True}
            }},
            { "AddDescriptor": {                                     # AddDescriptor + connect
                "set": SET_META,
                "properties": {
                    "descriptor_uid": uid,       # set-unique for idempotency across sets
                    "talk_id": talk_id,
                    "meta_text": txt
                },
                "if_not_found": { "descriptor_uid": ["==", uid] },  # safe conditional-add
                "connect": { "class": "TalkHasMeta", "ref": 1, "direction": "in" }
            }}
        ]
        queries.append((cmd, [to_blob(np.asarray(vec, dtype="<f4"))]))
    print(f"[meta] prepared={len(queries)} skipped_short_or_empty={skipped}")
    return queries

def build_bio_queries(df: pd.DataFrame):
    queries, skipped = [], 0
    for _, row in df.iterrows():
        title = _safe(row.get(TALK_TITLE_COL))
        if not title:
            continue

        yt_id = _safe(row.get(YOUTUBE_ID_COL)) or extract_youtube_id(_safe(row.get(YOUTUBE_URL_COL)))
        talk_id = make_talk_id(title, yt_id)

        txt = bio_text_from_row(row)
        if len(txt) < MIN_TEXT_LEN:
            skipped += 1
            continue

        vec = model.encode([txt], normalize_embeddings=True, show_progress_bar=False)[0]
        uid = f"{talk_id}::{SET_BIO}"

        cmd = [
            { "FindEntity": {
                "_ref": 1, "with_class": TALK_CLASS, "unique": True,
                "constraints": {"talk_id": ["==", talk_id]},
                "results": {"count": True}
            }},
            { "AddDescriptor": {
                "set": SET_BIO,
                "properties": {
                    "descriptor_uid": uid,
                    "talk_id": talk_id,
                    "bio_text": txt
                },
                "if_not_found": { "descriptor_uid": ["==", uid] },
                "connect": { "class": "TalkHasSpeakerBio", "ref": 1, "direction": "in" }
            }}
        ]
        queries.append((cmd, [to_blob(np.asarray(vec, dtype="<f4"))]))
    print(f"[bio] prepared={len(queries)} skipped_short_or_empty={skipped}")
    return queries


In [10]:
df = pd.read_csv(CSV_PATH)
print("rows read:", len(df))

meta_queries = build_meta_queries(df)
bio_queries  = build_bio_queries(df)

loader = ParallelLoader(con)

if meta_queries:
    loader.ingest(meta_queries, batchsize=DB_BATCHSIZE, numthreads=DB_THREADS, stats=True)   #
if bio_queries:
    loader.ingest(bio_queries,  batchsize=DB_BATCHSIZE, numthreads=DB_THREADS, stats=True)

print("Done.")

rows read: 278
[meta] prepared=278 skipped_short_or_empty=0
[bio] prepared=269 skipped_short_or_empty=9


Progress: 100%|██████████| 278/278 [00:08<00:00, 34.7items/s]


Total time (s): 8.00862741470337
Total queries executed: 20
Avg Query time (s): 1.5235175132751464
Query time std: 0.6177937849136516
Avg Query Throughput (q/s): 2.6255031301879117
Overall insertion throughput (element/s): 34.71256503824967
Total inserted elements: 278
Total successful commands: 556


Progress: 100%|██████████| 269/269 [00:08<00:00, 33.6items/s]

Total time (s): 8.008178949356079
Total queries executed: 20
Avg Query time (s): 1.4736083269119262
Query time std: 0.5892354024135589
Avg Query Throughput (q/s): 2.7144254867114834
Overall insertion throughput (element/s): 33.59065796370968
Total inserted elements: 269
Total successful commands: 538
Done.





### sanity check queries

1) Counts per set

In [11]:
for s in [SET_META, SET_BIO]:
    q = [{"FindDescriptor": {"set": s, "results": {"count": True}}}]
    print(s, con.query(q)[0])


ds_talk_meta_v1 [{'FindDescriptor': {'count': 278, 'returned': 0, 'status': 0}}]
ds_speaker_bio_v1 [{'FindDescriptor': {'count': 269, 'returned': 0, 'status': 0}}]


2) Hop from one talk to each set

In [12]:
title = "Panel: How companies can navigate and approach the new advancements in generative AI"
q = [
  {"FindEntity": {"_ref": 1, "with_class": "Talk", "unique": True,
                  "constraints": {"talk_title": ["==", title]}, "results": {"list": ["talk_id"]}}},
  {"FindDescriptor": {"set": SET_META, "is_connected_to": {"ref": 1, "connection_class": "TalkHasMeta"},
                      "results": {"list": ["descriptor_uid","meta_text"]}}},
  {"FindDescriptor": {"set": SET_BIO,  "is_connected_to": {"ref": 1, "connection_class": "TalkHasSpeakerBio"},
                      "results": {"list": ["descriptor_uid","bio_text"]}}}
]
print(con.query(q))


([{'FindEntity': {'entities': [{'talk_id': '7255dc0f-0f29-5aad-9a95-a7956c44673a'}], 'returned': 1, 'status': 0}}, {'FindDescriptor': {'entities': [{'descriptor_uid': '7255dc0f-0f29-5aad-9a95-a7956c44673a::ds_talk_meta_v1', 'meta_text': 'Panel: How companies can navigate and approach the new advancements in generative AI | Business and stakeholder alignment | Generative AI, Business Strategy, Panel Discussion'}], 'returned': 1, 'status': 0}}, {'FindDescriptor': {'entities': [{'bio_text': "Principal Director, Experimental Development, Chief Commercialization Officer & VP, Industry Innovation, Chief Operating Officer, Interim Head, Sara - Head of Cohere for AI | Michel - As Director, AI Activation at Mila, the Quebec Artificial Intelligence Institute, Michel Dubois actively participates in the development of AI for the benefit of all. He holds a master's degree in mathematics and is currently a PhD candidate in engineering (machine learning). He is also the author of a patent on the math

3) kNN on meta set → back to talks

In [13]:
qvec = model.encode(["query: evaluation pipelines and deployment"], normalize_embeddings=True)[0].astype("<f4")
q = [
  {"FindDescriptor": {"_ref": 1, "set": SET_META, "k_neighbors": 5,
                      "results": {"list": ["_distance","descriptor_uid","talk_id","meta_text"]}}},
  {"FindEntity": {"with_class": "Talk", "is_connected_to": {"ref": 1, "connection_class": "TalkHasMeta"},
                  "results": {"list": ["talk_title","speaker_name","youtube_url"]}}}
]
print(con.query(q, blobs=[qvec.tobytes()]))


([{'FindDescriptor': {'entities': [{'_distance': None, 'descriptor_uid': '6b8d79ff-1417-57e1-9263-12d335800919::ds_talk_meta_v1', 'meta_text': 'Optimized AI Deployment Platform | Showcasing CentMLs ability to streamline the process of deploying and optimizing LLMs in production. | Deployment and integration | CentML, LLM deployment, Optimization', 'talk_id': '6b8d79ff-1417-57e1-9263-12d335800919'}, {'_distance': None, 'descriptor_uid': '0c7e727c-214e-5fc1-954e-3aa19bd8a513::ds_talk_meta_v1', 'meta_text': 'Deploying and Evaluating RAG pipelines with Lightning Studios | Learn how to use Lightning Studios to quickly deploy AI agents and accelerate your evaluation of RAG pipelines. | Learn how to use Lightning Studios to quickly deploy AI agents and accelerate your evaluation of RAG pipelines. | Deployment and integration | Lightning Studios, RAG Pipelines, AI Agents', 'talk_id': '0c7e727c-214e-5fc1-954e-3aa19bd8a513'}, {'_distance': None, 'descriptor_uid': '4e0c7f79-d08a-5eed-8500-eda5faa

In [14]:
run([
  { "GetSchema": { } }
]
)

[
    {
        "GetSchema": {
            "connections": {
                "classes": {
                    "TalkHasMeta": {
                        "dst": "_Descriptor",
                        "matched": 278,
                        "properties": null,
                        "src": "Talk"
                    },
                    "TalkHasSpeaker": {
                        "dst": "Person",
                        "matched": 373,
                        "properties": null,
                        "src": "Talk"
                    },
                    "TalkHasSpeakerBio": {
                        "dst": "_Descriptor",
                        "matched": 269,
                        "properties": null,
                        "src": "Talk"
                    },
                    "TalkHasTranscriptChunk": {
                        "dst": "_Descriptor",
                        "matched": 16770,
                        "properties": null,
                        "src": "Talk"
     

[{'GetSchema': {'connections': {'classes': {'TalkHasMeta': {'dst': '_Descriptor',
      'matched': 278,
      'properties': None,
      'src': 'Talk'},
     'TalkHasSpeaker': {'dst': 'Person',
      'matched': 373,
      'properties': None,
      'src': 'Talk'},
     'TalkHasSpeakerBio': {'dst': '_Descriptor',
      'matched': 269,
      'properties': None,
      'src': 'Talk'},
     'TalkHasTranscriptChunk': {'dst': '_Descriptor',
      'matched': 16770,
      'properties': None,
      'src': 'Talk'},
     '_DescriptorSetToDescriptor': {'dst': '_Descriptor',
      'matched': 17317,
      'properties': None,
      'src': '_DescriptorSet'}},
    'returned': 5},
   'entities': {'classes': {'Person': {'matched': 338,
      'properties': {'name': [338, True, 'String']}},
     'Talk': {'matched': 278,
      'properties': {'abstract': [274, False, 'String'],
       'bio': [269, False, 'String'],
       'category_primary': [277, False, 'String'],
       'company_name': [278, False, 'String'],