Meant to work with sequential vector indices and some prompting, following the instructions
in `prompts/user_per_speaker_01.md`, to create action items per speaker.

Here we summarize each speaker and what they said. This may not work well for interwoven conversations.

In [None]:
import json
import os
from datetime import datetime

from langchain_core.prompts import PromptTemplate
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from pathlib import Path

import whisperx_transcribe.utils as ut
from whisperx_transcribe import logger, start_time

In [None]:
# Notebook-relative paths used by the cells below.
HISTORY = "../.history"
PROMPT_FILE = "../prompts/instructions_01.md"
SEG_FILE = "../assets/sample/audio_diarized.json"
# Sibling of SEG_FILE with "_grouped" inserted before the extension.
# os.path.splitext only splits at the extension dot; a plain split(".")
# breaks on the leading ".." and would collapse this to "../_grouped.json".
GROUP_SEG_FILE = "{}_grouped{}".format(*os.path.splitext(SEG_FILE))

No grouping is strictly necessary: the transcript is fed into the index step by step.

In [None]:
# Load the diarized transcript segments through the project helper.
segments = ut.load_segments(segments_file=SEG_FILE)
# Read the prompt instructions. The encoding is pinned to UTF-8 so the text
# does not depend on the platform's default locale encoding.
with open(PROMPT_FILE, "r", encoding="utf-8") as f:
    instructions = f.read()

It is still a good idea to group the continuous sections of the transcript to make retrieval easier.

In [None]:
# Group the raw segments with the project helper; presumably this merges
# consecutive segments from the same speaker — confirm in ut.group_speakers.
grouped_segments = ut.group_speakers(segments=segments)

In [None]:
# Show the number of raw segments next to the number of grouped segments.
len(segments), len(grouped_segments)

In [None]:
# Preview what the helper builds from the first grouped segment.
ut.speaker_segment(grouped_segments[0])

Each Document in LlamaIndex will here represent a single turn or a small group of related turns.

In [None]:
# Collect the speakers found in the grouped segments; the per-speaker loop
# at the end of the notebook iterates over this collection.
SPEAKERS = ut.get_all_speakers(segments=grouped_segments)
SPEAKERS

In [None]:
# --- LLM and embedding backends ---------------------------------------------
# The chat model is served through Ollama; embeddings come from a Hugging Face
# model whose files are cached under ../model.
model = "gemma3:4b"
llm = Ollama(model=model, request_timeout=1000)
embed_model = HuggingFaceEmbedding(
    "Qwen/Qwen3-Embedding-0.6B",
    cache_folder="../model",
    # NOTE(review): this allows code shipped with the model repo to run locally.
    trust_remote_code=True,
)

# Register both as the llama_index defaults (embedding model first, then LLM).
Settings.embed_model, Settings.llm = embed_model, llm

# --- On-disk location for persisted indices ----------------------------------
persist_dir = "storage"
Path(persist_dir).mkdir(parents=True, exist_ok=True)

In [None]:
from llama_index.core import VectorStoreIndex, StorageContext, SummaryIndex

# Start from an empty summary index; the transcript is inserted turn by turn.
summary_index = SummaryIndex.from_documents(documents=[], show_progress=True)
# vector_index = VectorStoreIndex.from_documents(documents=[], show_progress=True)
# storage_context=StorageContext.from_defaults(persist_dir=persist_dir)
summary_index.set_index_id(f"summary_index_{start_time}")
len_segments = len(grouped_segments)
for idx, segment in enumerate(grouped_segments):
    # One document per grouped segment, tagged with its position as turn_id.
    doc = ut.speaker_segment(segment, turn_id=idx)
    summary_index.insert(doc)

# Persist only after all turns are inserted. Persisting before the loop wrote
# an empty index to disk, and later inserts were never saved.
summary_index.storage_context.persist(persist_dir)

In [None]:
# Number of entries currently held in the index's document store.
len(summary_index.docstore.docs)

In [None]:
# Inspect the third stored entry (position 2 in the docstore's insertion order).
list(summary_index.docstore.docs.values())[2]

In [None]:
from llama_index.core.vector_stores import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
    ExactMatchFilter,
)
from llama_index.core.prompts import RichPromptTemplate, SelectorPromptTemplate
from llama_index.core.query_engine import RetrieverQueryEngine

In [None]:
from llama_index.core.tools import QueryEngineTool

In [None]:
# Read the conversation-summary prompt inside a context manager so the file
# handle is closed deterministically, and pin the encoding to UTF-8.
with open("../prompts/user_conversation_01.md", "r", encoding="utf-8") as f:
    base = RichPromptTemplate(
        template_str=f.read(),
        template_format="f-string",
    )
# Query engine over the summary index that answers with the template above,
# using the "tree_summarize" response mode.
qe_summarizer = summary_index.as_query_engine(
    summary_template=base,
    response_mode="tree_summarize",
)

Multiple query tools that focus on different aspects of the conversation.

In [None]:
# Expose the summarizer as a tool; the router reads the description when
# deciding which tool(s) should handle a query.
summary_tool = QueryEngineTool.from_defaults(
    description="Useful for summarization of the full conversation between speakers",
    query_engine=qe_summarizer,
)

In [None]:
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector, LLMMultiSelector
from llama_index.core.selectors import (
    PydanticMultiSelector,
    PydanticSingleSelector,
)

# Router that lets an LLM-based multi-selector choose among the registered
# tools. Only the summary tool is registered so far.
query_engine = RouterQueryEngine(
    selector=LLMMultiSelector.from_defaults(),
    query_engine_tools=[
        summary_tool,
        # add more indices for each task
    ],
)

In [None]:
# Ask the router for an overall summary and print the answer text
# (print() applies str() to the response object).
response = query_engine.query("What is the summary of the document?")
print(response)

In [None]:
# Show the reasons the selector recorded for its tool choice on the last query.
response.metadata["selector_result"].reasons

Here we summarize each speaker and what they said. This may not work well for interwoven conversations, but it is OK for single speakers.

In [None]:
# Print one summary per speaker, separated by a ruler line.
# NOTE(review): query_per_speaker is not defined or imported in any visible
# cell of this notebook; unless it lives in a cell outside this view, this
# loop raises NameError — confirm and restore the helper.
for speaker in SPEAKERS:
    response = query_per_speaker(speaker)
    print(f"Summary for {speaker}:", response)
    print("\n" + "=" * 80 + "\n")