Meant to work with sequential vector indices and some prompting, following the instructions
in `prompts/user_per_speaker_01.md`, to create action items per speaker.

Here we summarize each person and what they said. This may not work well for interwoven conversations.

In [None]:
import json
import os
from datetime import datetime

from langchain_core.prompts import PromptTemplate
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from pathlib import Path

import whisperx_transcribe.utils as ut
from whisperx_transcribe import logger, start_time

In [None]:
# File locations used by this notebook (relative paths).
HISTORY = "../.history"
PROMPT_FILE = "../prompts/instructions_01.md"
SEG_FILE = "../assets/sample/audio_diarized.json"
# The grouped segments are written next to SEG_FILE as "<name>_grouped.json".
# os.path.splitext strips only the final extension. The earlier
# SEG_FILE.split('.')[0] returned "" because the path begins with "..",
# which collapsed the target to "../_grouped.json".
GROUP_SEG_FILE = f"{os.path.splitext(SEG_FILE)[0]}_grouped.json"

In [None]:
# Load the segments from SEG_FILE and group them with ut.group_speakers.
# speaker_names is passed empty here (presumably a label -> display-name
# mapping; confirm against whisperx_transcribe.utils).
segments = ut.load_segments(segments_file=SEG_FILE)
grouped_segments = ut.group_speakers(segments=segments, speaker_names={})

# Keep a copy of the grouped result on disk.
with open(GROUP_SEG_FILE, "w", encoding="utf-8") as f:
    json.dump(grouped_segments, f, indent=4)

# Explicit UTF-8 so the prompt text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open(PROMPT_FILE, "r", encoding="utf-8") as f:
    instructions = f.read()

In [None]:
# Sanity check: how many entries ut.group_speakers produced.
len(grouped_segments)

In [None]:
# Preview what ut.speaker_segment builds for a single grouped segment.
# Index 2 is a hard-coded sample; this fails if that entry does not exist.
ut.speaker_segment(grouped_segments[2])

In [None]:
# Speakers found in the grouped segments, as returned by ut.get_all_speakers.
SPEAKERS = ut.get_all_speakers(segments=grouped_segments)
SPEAKERS  # bare expression so the notebook displays the value

In [None]:
# --- LLM: Ollama-served model, registered as the global default ------------
model = "gemma3:4b"
llm = Ollama(model=model, request_timeout=1000)
Settings.llm = llm

# --- Embeddings: HuggingFace model cached under ../model --------------------
embed_model = HuggingFaceEmbedding(
    "Qwen/Qwen3-Embedding-0.6B",
    trust_remote_code=True,
    cache_folder="../model",
)
Settings.embed_model = embed_model

# --- Storage: make sure the persistence directory exists --------------------
persist_dir = "storage"
Path(persist_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the prompt text from prompts/user_final_01.md. The context manager
# closes the file handle (the bare open(...).read() never did), and the
# explicit encoding makes decoding independent of the platform locale.
with open("../prompts/user_final_01.md", "r", encoding="utf-8") as prompt_file:
    QUERY_PROMPT = prompt_file.read()

In [None]:
from llama_index.core import VectorStoreIndex, StorageContext, SummaryIndex

# Start from an empty SummaryIndex and insert one entry per grouped segment.
index = SummaryIndex.from_documents(documents=[], show_progress=True)
# storage_context=StorageContext.from_defaults(persist_dir=persist_dir)
index.set_index_id(f"vector_index_{start_time}")

len_segments = len(grouped_segments)
for segment in grouped_segments:
    doc = ut.speaker_segment(segment)
    index.insert(doc)

# Persist only once every segment has been inserted. Persisting before the
# loop (as this cell did previously) wrote the still-empty index to disk.
index.storage_context.persist(persist_dir)

In [None]:
from llama_index.core.vector_stores import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
    ExactMatchFilter,
)
from llama_index.core.prompts import RichPromptTemplate, SelectorPromptTemplate
from llama_index.core.query_engine import RetrieverQueryEngine

In [None]:
def query_per_speaker(speaker: str):
    """Run the per-speaker summary prompt against the module-level ``index``.

    Builds a query engine in ``tree_summarize`` mode whose summary template is
    read from ``../prompts/user_per_speaker_01.md``, prints the prompts the
    engine ends up with, and issues a placeholder query.

    Args:
        speaker: Speaker label. It is used as the value of the ``speaker``
            exact-match metadata filter and is handed to the prompt template.

    Returns:
        Whatever ``qe.query`` returns for the placeholder query.
    """
    # Read the template through a context manager so the file handle is
    # closed right away instead of being left to the garbage collector.
    with open("../prompts/user_per_speaker_01.md", "r", encoding="utf-8") as fh:
        template_str = fh.read()

    # NOTE(review): template_var_mappings is given the speaker *value* here.
    # In LlamaIndex this argument appears to rename template variables rather
    # than supply values; if the speaker never shows up in the printed prompt,
    # try ``base.partial_format(speaker=speaker)`` instead. TODO confirm.
    template_var_mappings = {
        "speaker": speaker,
        # context_str mapping is done internally
    }
    base = RichPromptTemplate(
        template_str=template_str,
        template_format="f-string",
        template_var_mappings=template_var_mappings,
    )

    # NOTE(review): verify that a SummaryIndex query engine actually applies
    # metadata filters; if it ignores them, every speaker is summarized from
    # all segments rather than only their own. TODO confirm.
    filters = MetadataFilters(filters=[ExactMatchFilter(key="speaker", value=speaker)])
    qe = index.as_query_engine(
        # The template goes in as summary_template (not text_qa_template)
        # because the response mode is tree_summarize.
        summary_template=base,
        response_mode="tree_summarize",
        filters=filters,
    )

    # Debug aid: show the prompts the engine resolved.
    prompts_dict = qe.get_prompts()
    print(prompts_dict)

    return qe.query(" ")  # query_str is not used.

Here we summarize each person and what they said. This may not work well for interwoven conversations, but it is OK for single speakers.

In [None]:
# Print one summary per speaker, each followed by an 80-character rule.
for speaker in SPEAKERS:
    response = query_per_speaker(speaker)
    print(f"Summary for {speaker}:", response)
    print(f"\n{'=' * 80}\n")