In [None]:
from dotenv import load_dotenv

# Load settings from a .env file before the configuration cell reads them via os.getenv.
load_dotenv(verbose=True)

In [None]:
import asyncio
import json
import math
import os
import re
import textwrap
import time
from pathlib import Path

import aiohttp
import numpy as np
import pandas as pd
from azure.core.exceptions import HttpResponseError, ResourceExistsError
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from azure.search.documents.agent import KnowledgeAgentRetrievalClient
from azure.search.documents.agent.models import (
    KnowledgeAgentMessage,
    KnowledgeAgentMessageTextContent,
    KnowledgeAgentRetrievalRequest,
    SearchIndexKnowledgeSourceParams,
)
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    AzureOpenAIEmbeddingSkill,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    FieldMapping,
    HnswAlgorithmConfiguration,
    IndexingParameters,
    InputFieldMappingEntry,
    KnowledgeAgent,
    KnowledgeAgentAzureOpenAIModel,
    KnowledgeAgentOutputConfiguration,
    KnowledgeAgentOutputConfigurationModality,
    KnowledgeSourceReference,
    LexicalAnalyzerName,
    OutputFieldMappingEntry,
    SearchField,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataSourceType,
    SearchIndexerSkillset,
    SearchIndexKnowledgeSource,
    SearchIndexKnowledgeSourceParameters,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    VectorSearch,
    VectorSearchProfile,
)
from azure.search.documents.models import QueryType, VectorizableTextQuery
from azure.storage.blob import BlobServiceClient, ContentSettings
from tqdm import tqdm

import datasets as ds

# Credential object handed to the Azure AI Search clients created in later cells.
credential = DefaultAzureCredential()
# Bearer-token provider for the Azure AI Search scope.
# NOTE(review): not referenced again in the visible cells; confirm it is still needed.
token_provider = get_bearer_token_provider(
    credential, "https://search.azure.com/.default"
)

# Which dataset split to use
SPLIT = "test"  # "validation" is also possible
MAX_SAMPLES = 300  # None uses every example
CHUNK_SIZE = 700
CHUNK_OVERLAP = 200
USE_ORIGINAL = False

SEARCH_ENDPOINT = os.getenv("SEARCH_ENDPOINT")
AOAI_ENDPOINT = os.getenv("AOAI_ENDPOINT")
AZURE_STORAGE_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
AOAI_EMBEDDING_MODEL = "text-embedding-3-large"
AOAI_EMBEDDING_DEPLOYMENT = "text-embedding-3-large"
AOAI_GPT_MODEL = "gpt-5-mini"
AOAI_GPT_DEPLOYMENT = "gpt-5-mini"
INDEX_NAME = "jdocqa-index"
KNOWLEDGE_SOURCE_NAME = "jdocqa-knowledge-source"
KNOWLEDGE_AGENT_NAME = "jdocqa-knowledge-agent"
SEARCH_API_VERSION = "2025-08-01-preview"
DATA_SOURCE_NAME = "ds-jdocqa-chunks"
SKILLSET_NAME = "ss-jdocqa-embed"
INDEXER_NAME = "idx-jdocqa"
DIM = 3072
BLOB_CONTAINER = "jdocqa-docs"
BLOB_PREFIX = "docs"
TOPK_LIST = [1, 3, 5, 10]

In [None]:
# Raise the Hugging Face Hub download timeout.
# NOTE(review): this is set after `datasets` was imported; confirm the value is still picked up.
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "36000"

# Directory used as the local on-disk cache of the dataset
local_dir = Path("datasets/JDocQA")
local_dir.mkdir(parents=True, exist_ok=True)

# Reuse the local copy when it has already been saved
if (local_dir / "dataset_info.json").exists():
    print("üîÅ Loading dataset from local disk...")
    dataset = ds.load_from_disk(str(local_dir))
else:
    print("‚¨áÔ∏è Downloading dataset from Hugging Face Hub...")
    dataset = ds.load_dataset(
        path="shunk031/JDocQA",
        rename_pdf_category=True,
        trust_remote_code=True,
        storage_options={
            "client_kwargs": {"timeout": aiohttp.ClientTimeout(total=36000)}
        },
    )
    # Persist the download so the next run takes the branch above.
    dataset.save_to_disk(str(local_dir))
    print(f"üíæ Dataset saved locally to {local_dir}")

In [None]:
# Select the configured split and preview its first rows as a DataFrame.
data = dataset[SPLIT]

pd.DataFrame(data).head()

In [None]:
data = dataset[SPLIT]

if MAX_SAMPLES is not None:
    data = data.select(range(min(len(data), MAX_SAMPLES)))

# Deduplicate identical contexts and assign one doc_id per unique context.
contexts = [ex["context"] for ex in data]
# np.unique returns the sorted unique values. return_index gives, for each
# unique context, the first row of `data` that holds it; return_inverse maps
# every row back to its unique context. Example:
#   contexts        -> ["X", "Y", "X"]
#   unique_contexts -> ["X", "Y"]
#   first_rows      -> [0, 1]
#   inverse_indices -> [0, 1, 0]
unique_contexts, first_rows, inverse_indices = np.unique(
    contexts, return_index=True, return_inverse=True
)

doc_ids = np.arange(len(unique_contexts))

queries = [ex["question"] for ex in data]
gold_doc_ids = inverse_indices


def mk_doc(i: int) -> dict:
    """Build the document record for the i-th unique context.

    Args:
        i: Index into `unique_contexts`.

    Returns:
        A dict with the string id, the context text and the `pdf_category`
        of the first dataset row that carries this context ("N/A" when the
        row has no such field).
    """
    # `i` indexes the sorted unique contexts, not the dataset rows, so go
    # through `first_rows` to find a row that actually owns this context.
    row = data[int(first_rows[i])]
    return {
        "id": str(i),
        "content": unique_contexts[i],
        "pdf_category": str(row["pdf_category"]) if "pdf_category" in row else "N/A",
    }


docs = [mk_doc(i) for i in range(len(unique_contexts))]

print(f"#docs={len(docs)}  #queries={len(queries)}  (split={SPLIT})")
# Clamp the sample index so small runs (fewer than 156 docs) do not raise IndexError.
print("Example doc:", docs[min(155, len(docs) - 1)] if docs else None)
print(
    "Example QA:",
    {"q": queries[0], "gold_doc_id": int(gold_doc_ids[0])} if queries else None,
)

In [None]:
def sentence_split_ja(text: str):
    """Split text into sentences that each end with the delimiter string.

    Runs of CR/LF are first replaced by the delimiter, then the second
    substitution is applied to the result. The text is split on the
    delimiter, pieces are stripped, empty pieces are dropped, and the
    delimiter is appended back to every kept piece.
    """
    normalized = re.sub(r"„ÄÇ+", "„ÄÇ", re.sub(r"[\r\n]+", "„ÄÇ", text))
    sentences = []
    for piece in normalized.split("„ÄÇ"):
        stripped = piece.strip()
        if stripped:
            sentences.append(stripped + "„ÄÇ")
    return sentences


def make_chunks(text: str, size=700, overlap=200):
    """Split text into overlapping chunks of roughly `size` characters.

    Sentences from `sentence_split_ja` are packed greedily: when adding the
    next sentence would exceed `size`, the buffer is emitted and the last
    `overlap` characters of it seed the next buffer. Any resulting chunk
    longer than `2 * size` (for example a single very long sentence) is
    re-cut with a sliding window of width `size` and stride
    `max(1, size - overlap)`.

    Args:
        text: Source text.
        size: Target chunk length in characters.
        overlap: Number of trailing characters carried into the next chunk.

    Returns:
        List of chunk strings; empty when the text yields no sentences.
    """
    sents = sentence_split_ja(text)
    chunks, buf, cur = [], [], 0
    for s in sents:
        if cur + len(s) > size and buf:
            joined = "".join(buf)
            chunks.append(joined)
            keep = joined[-overlap:] if overlap > 0 else ""
            buf, cur = ([keep] if keep else []), len(keep)
        buf.append(s)
        cur += len(s)
    if buf:
        chunks.append("".join(buf))

    final = []
    step = max(1, size - overlap)
    for c in chunks:
        if len(c) <= size * 2:
            final.append(c)
            continue
        for i in range(0, len(c), step):
            final.append(c[i : i + size])
            # Once a window reaches the end of the chunk, any further window
            # would be fully contained in it, so stop here.
            if i + size >= len(c):
                break
    return final


def stable_id(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of the UTF-8 encoded text."""
    from hashlib import sha256

    hasher = sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()[:16]

In [None]:
# Re-select the split and cap it at MAX_SAMPLES rows (no cap when MAX_SAMPLES is None).
data = dataset[SPLIT]

if MAX_SAMPLES is not None:
    data = data.select(range(min(MAX_SAMPLES, len(data))))


def pick_ctx(ex):
    """Return the context text of one example.

    With USE_ORIGINAL enabled, "original_context" is preferred and "context"
    is the fallback when it is missing or falsy; otherwise "context" is
    returned directly. Missing keys yield None.
    """
    if not USE_ORIGINAL:
        return ex.get("context")
    return ex.get("original_context") or ex.get("context")


# chunk id -> chunk document to upload
chunk_docs = {}
# parent (context) id -> ids of its chunks
parent_to_chunks = {}
# one set of gold chunk ids per query, aligned with `queries`
qa_gold_sets = []
queries = []

for ex in tqdm(data):
    ctx = pick_ctx(ex) or ""
    parent_id = stable_id(ctx) if ctx else stable_id("EMPTY")
    # Fall back to a single blank chunk both for an empty context and for a
    # context that yields no chunks (e.g. whitespace only); otherwise the
    # query would get an empty gold set that no result can ever match.
    chs = (make_chunks(ctx, CHUNK_SIZE, CHUNK_OVERLAP) if ctx else []) or [" "]
    ids = []
    for ch in chs:
        # The chunk id is derived from parent id plus chunk text.
        cid = stable_id(parent_id + "|" + ch)
        if cid not in chunk_docs:
            chunk_docs[cid] = {
                "id": cid,
                "parent_id": parent_id,
                "content": ch,
                # "category": str(ex.get("category","unknown"))
            }
        ids.append(cid)
    parent_to_chunks[parent_id] = ids
    qa_gold_sets.append(set(ids))
    queries.append(ex.get("question", ""))

docs = list(chunk_docs.values())
print(f"parents={len(parent_to_chunks)} chunks={len(docs)} queries={len(queries)}")

In [None]:
bsc = BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING)
container_client = bsc.get_container_client(BLOB_CONTAINER)
try:
    container_client.create_container()
except ResourceExistsError:
    # The container is left over from an earlier run, which is fine. Any other
    # failure (authentication, network, invalid name) must surface here
    # instead of failing later during upload.
    pass


def upload_docs_to_blob(docs, prefix=BLOB_PREFIX):
    """Upload every document as `<prefix>/<id>.json` to the blob container.

    Each document is serialized as UTF-8 JSON (non-ASCII characters kept
    as-is) and written with overwrite enabled and a JSON content type.

    Returns:
        The number of documents passed in.
    """
    for doc in tqdm(docs):
        blob_name = f"{prefix}/{doc['id']}.json"
        payload = json.dumps(doc, ensure_ascii=False).encode("utf-8")
        settings = ContentSettings(content_type="application/json")
        container_client.upload_blob(
            blob_name,
            payload,
            overwrite=True,
            content_settings=settings,
        )
    return len(docs)


# Upload all chunk documents and report how many were sent.
print("Uploaded:", upload_docs_to_blob(docs))

In [None]:
# Create the client before using it: on a fresh kernel the delete below would
# otherwise hit an undefined name and never run.
index_client = SearchIndexClient(endpoint=SEARCH_ENDPOINT, credential=credential)

try:
    index_client.delete_index(INDEX_NAME)
except HttpResponseError as e:
    # Ignore only "index does not exist" (404); re-raise everything else.
    if getattr(e, "status_code", None) not in (404,):
        raise


index = SearchIndex(
    name=INDEX_NAME,
    fields=[
        # Document key
        SearchField(
            name="id",
            type="Edm.String",
            key=True,
            filterable=True,
        ),
        # Id of the source context the chunk was cut from
        SearchField(
            name="parent_id",
            type="Edm.String",
            filterable=True,
        ),
        # Chunk text, analyzed with the Japanese Lucene analyzer
        SearchField(
            name="content",
            type="Edm.String",
            searchable=True,
            analyzer_name=LexicalAnalyzerName.JA_LUCENE,
        ),
        # Embedding vector of the chunk (DIM dimensions)
        SearchField(
            name="page_embedding_text_3_large",
            type="Collection(Edm.Single)",
            stored=True,
            retrievable=True,
            vector_search_dimensions=DIM,
            vector_search_profile_name="hnsw_text_3_large",
        ),
        # SearchField(name="page_chunk", type="Edm.String", filterable=False, sortable=False, facetable=False),
        #
        # SearchField(name="page_number", type="Edm.Int32", filterable=True, sortable=True, facetable=True)
    ],
    vector_search=VectorSearch(
        profiles=[
            VectorSearchProfile(
                name="hnsw_text_3_large",
                algorithm_configuration_name="alg",
                vectorizer_name="azure_openai_text_3_large",
            )
        ],
        algorithms=[HnswAlgorithmConfiguration(name="alg")],
        # Query-time vectorizer for text queries against the vector field
        vectorizers=[
            AzureOpenAIVectorizer(
                vectorizer_name="azure_openai_text_3_large",
                parameters=AzureOpenAIVectorizerParameters(
                    resource_url=AOAI_ENDPOINT,
                    deployment_name=AOAI_EMBEDDING_DEPLOYMENT,
                    model_name=AOAI_EMBEDDING_MODEL,
                ),
            )
        ],
    ),
    semantic_search=SemanticSearch(
        default_configuration_name="semantic_config",
        configurations=[
            SemanticConfiguration(
                name="semantic_config",
                prioritized_fields=SemanticPrioritizedFields(
                    content_fields=[SemanticField(field_name="content")]
                ),
            )
        ],
    ),
)

index_client.create_or_update_index(index)
print(f"Index '{INDEX_NAME}' created or updated successfully.")

In [None]:
# print(f"Uploading {len(docs)} documents to index '{index_name}'...")
# print(f"Sample document: {docs[0] if docs else 'N/A'}")

# with SearchIndexingBufferedSender(endpoint=SEARCH_ENDPOINT, index_name=index_name, credential=credential) as client:
#     client.upload_documents(documents=docs)

# print(f"Documents uploaded to index '{index_name}' successfully.")

In [None]:
# Blob container (and virtual folder) the indexer reads from.
container = SearchIndexerDataContainer(
    name=BLOB_CONTAINER,
    query=BLOB_PREFIX,  # ingest only blobs under "docs"; use None for the whole container
)

data_source = SearchIndexerDataSourceConnection(
    name=DATA_SOURCE_NAME,
    type=SearchIndexerDataSourceType.AZURE_BLOB,  # others: AZURE_TABLE, AZURE_SQL, COSMOSDB, ADLSGEN2, ...
    connection_string=AZURE_STORAGE_CONNECTION_STRING,
    container=container,
    description="JDocQA chunk JSONs in Blob Storage",
)

# Update the data source if it exists, otherwise create it
indexer_client = SearchIndexerClient(endpoint=SEARCH_ENDPOINT, credential=credential)
indexer_client.create_or_update_data_source_connection(data_source)

In [None]:
# List the data source connections registered on the search service.
for source in indexer_client.get_data_source_connections():
    print(source.name, source.type)

print(f"Data Source '{DATA_SOURCE_NAME}' created or updated.")

In [None]:
try:
    indexer_client.delete_skillset(SKILLSET_NAME)
except HttpResponseError as e:
    # Ignore only "skillset does not exist" (404); re-raise everything else so
    # authentication or service errors are not silently hidden.
    if getattr(e, "status_code", None) not in (404,):
        raise

# ===== Create the skillset =====
# A single Azure OpenAI embedding skill: it reads /document/content and writes
# the embedding to /document/emb (target_name "emb" under the "/document" context).
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document",
    resource_url=AOAI_ENDPOINT,
    deployment_name=AOAI_EMBEDDING_DEPLOYMENT,
    model_name=AOAI_EMBEDDING_MODEL,
    dimensions=DIM,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="embedding", target_name="emb")
    ],
)

skillset = SearchIndexerSkillset(
    name=SKILLSET_NAME,
    skills=[embedding_skill],
    description="JDocQA index-time embedding skillset",
)

# Update the skillset if it exists, otherwise create it
indexer_client.create_or_update_skillset(skillset)

print("Skillset created.")

In [None]:
# Drop any previous indexer so it is recreated from scratch below.
try:
    indexer_client.delete_indexer(INDEXER_NAME)
    print(f"Deleted indexer: {INDEXER_NAME}")
except HttpResponseError as e:
    # Ignore 404 (the indexer does not exist); re-raise anything else
    if getattr(e, "status_code", None) not in (404,):
        raise

indexer = SearchIndexer(
    name=INDEXER_NAME,
    data_source_name=DATA_SOURCE_NAME,
    target_index_name=INDEX_NAME,
    skillset_name=SKILLSET_NAME,  # attach the skillset
    # Field mappings: source document -> index
    field_mappings=[
        FieldMapping(source_field_name="id", target_field_name="id"),
        FieldMapping(source_field_name="parent_id", target_field_name="parent_id"),
        FieldMapping(source_field_name="content", target_field_name="content"),
    ],
    # Field mappings: skill output -> index
    # The embedding skill output "/document/emb" goes into the vector field.
    output_field_mappings=[
        FieldMapping(
            source_field_name="/document/emb",
            target_field_name="page_embedding_text_3_large",
        ),
    ],
    # Indexer parameters
    parameters=IndexingParameters(
        configuration={
            "parsingMode": "json",  # one JSON blob = one document
            "failOnUnsupportedContentType": False,  # do not fail on unsupported MIME types
        }
    ),
)

indexer_client.create_or_update_indexer(indexer)

In [None]:
# 1) Run the indexer manually
try:
    # indexer_client.reset_indexer(INDEXER_NAME)  # would reset change tracking (high-water mark)
    indexer_client.run_indexer(INDEXER_NAME)
    print(f"Run requested: {INDEXER_NAME}")
except HttpResponseError as e:
    print("Run failed:", e)
    raise

# 2) Simple polling (wait until the status becomes terminal)
# NOTE(review): `last_result` may still describe an earlier run right after the
# request above; confirm the reported status belongs to this run.
terminal = {"success", "transientFailure", "persistentFailure", "reset"}
for i in range(60):  # wait at most ~5 minutes (5 s x 60)
    st = indexer_client.get_indexer_status(INDEXER_NAME)
    last = st.last_result
    # getattr with a None default keeps the loop alive while `last` is None.
    status = getattr(last, "status", None)
    processed = getattr(last, "items_processed", None)
    failed = getattr(last, "items_failed", None)
    print(f"[{i}] status={status} processed={processed} failed={failed}")

    if status in terminal:
        break
    time.sleep(5)

# 3) Check the result (a polling timeout also ends up here with a non-success status)
if status != "success":
    raise RuntimeError(
        f"Indexer did not succeed. status={status}, processed={processed}, failed={failed}"
    )
print("Indexer run completed successfully.")

In [None]:
# Synchronous query client bound to the index, pinned to the preview API version.
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=INDEX_NAME,
    credential=credential,
    api_version=SEARCH_API_VERSION,
)

In [None]:
# Smoke test: keyword search for one sample question, top 5 hits.
query = "ÂÆ∂ÊóèÊâãÂΩì„ÅÆÊîØÁµ¶Êù°‰ª∂„ÅØÔºü"

results = search_client.search(
    search_text=query,
    query_type=QueryType.SIMPLE,  # or QueryType.SEMANTIC (when a semantic configuration exists)
    top=5,
    select=["id", "parent_id", "content", "page_embedding_text_3_large"],
)

# Print id, score, the first 80 characters of the text and the first 5 vector values.
for r in results:
    print(
        r["id"],
        r["@search.score"],
        r["content"][:80],
        r["page_embedding_text_3_large"][:5],
    )

In [None]:
# Smoke test: pure vector search; the query text is passed as a
# VectorizableTextQuery against the embedding field.
vq = VectorizableTextQuery(
    text="ÂÆ∂ÊóèÊâãÂΩì„ÅÆÊîØÁµ¶Êù°‰ª∂„ÅØÔºü",
    k_nearest_neighbors=5,
    fields="page_embedding_text_3_large",
)

results = search_client.search(
    search_text=None, vector_queries=[vq], select=["id", "parent_id", "content"]
)

for r in results:
    print(r["id"], r["@search.score"], r["content"][:80])

In [None]:
# Smoke test: hybrid search, with the same question sent as keyword text and as a vector query.
query = "ÂÆ∂ÊóèÊâãÂΩì„ÅÆÊîØÁµ¶Êù°‰ª∂„ÅØÔºü"

vector_query = VectorizableTextQuery(
    text=query, k_nearest_neighbors=50, fields="page_embedding_text_3_large"
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["id", "parent_id", "content"],
    top=5,
)

for r in results:
    print(r["id"], r["@search.score"], r["content"][:80])

In [None]:
# Import under an alias: reusing the name `SearchClient` would shadow the
# synchronous class imported at the top, so re-running the cell that builds
# `search_client` afterwards would silently create an async client instead.
from azure.search.documents.aio import SearchClient as AsyncSearchClient

# Async query client used by the evaluation helpers.
# NOTE(review): `credential` is the synchronous DefaultAzureCredential created
# earlier; confirm the async client accepts it or switch to azure.identity.aio.
async_search_client = AsyncSearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=INDEX_NAME,
    credential=credential,
    api_version=SEARCH_API_VERSION,
)


async def full_text_search(query: str, topk: int = 10):
    """Keyword search (simple query syntax); returns the ids of up to `topk` hits.

    Ids are returned in the order the service yields them.
    """
    pager = await async_search_client.search(
        search_text=query,
        query_type=QueryType.SIMPLE,  # or QueryType.SEMANTIC (when a semantic configuration exists)
        top=topk,
        select=["id"],
    )
    hit_ids = []
    async for hit in pager:
        hit_ids.append(hit["id"])
    return hit_ids


async def vector_search(query: str, topk: int = 10):
    """Pure vector search; returns hit ids in the order the service yields them.

    The query text is sent as a VectorizableTextQuery asking for `topk`
    nearest neighbours on the embedding field; no keyword text is sent.
    """
    text_vector_query = VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields="page_embedding_text_3_large",
    )
    pager = await async_search_client.search(
        search_text=None, vector_queries=[text_vector_query], select=["id"]
    )
    hit_ids = []
    async for hit in pager:
        hit_ids.append(hit["id"])
    return hit_ids


async def hybrid_search(query: str, topk: int = 10):
    """Hybrid search; returns the ids of up to `topk` hits in service order.

    The same query string is sent both as keyword text and as a
    VectorizableTextQuery with `topk` nearest neighbours.
    """
    text_vector_query = VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields="page_embedding_text_3_large",
    )
    pager = await async_search_client.search(
        search_text=query,
        vector_queries=[text_vector_query],
        select=["id"],
        top=topk,
    )
    hit_ids = []
    async for hit in pager:
        hit_ids.append(hit["id"])
    return hit_ids

In [None]:
# Knowledge source that exposes the chunk index to the knowledge agent.
ks = SearchIndexKnowledgeSource(
    name=KNOWLEDGE_SOURCE_NAME,
    # Describe the data actually indexed here (the previous text was a
    # leftover from another sample).
    description="Knowledge source for the JDocQA chunk index",
    search_index_parameters=SearchIndexKnowledgeSourceParameters(
        search_index_name=INDEX_NAME,
        # Pass only textual fields as grounding data; the 3072-dimension
        # embedding vector carries no information the answer model can read.
        source_data_select="id,parent_id,content",
    ),
)

index_client.create_or_update_knowledge_source(
    knowledge_source=ks, api_version=SEARCH_API_VERSION
)
print(f"Knowledge source '{KNOWLEDGE_SOURCE_NAME}' created or updated successfully.")

In [None]:
# Connection details of the chat model deployment used by the agent.
aoai_params = AzureOpenAIVectorizerParameters(
    resource_url=AOAI_ENDPOINT,
    deployment_name=AOAI_GPT_DEPLOYMENT,
    model_name=AOAI_GPT_MODEL,
)

# Request answer-synthesis output and include the activity log in responses.
output_cfg = KnowledgeAgentOutputConfiguration(
    modality=KnowledgeAgentOutputConfigurationModality.ANSWER_SYNTHESIS,
    include_activity=True,
)

agent = KnowledgeAgent(
    name=KNOWLEDGE_AGENT_NAME,
    models=[KnowledgeAgentAzureOpenAIModel(azure_open_ai_parameters=aoai_params)],
    knowledge_sources=[
        KnowledgeSourceReference(
            name=KNOWLEDGE_SOURCE_NAME,
            reranker_threshold=2.5,
        )
    ],
    output_configuration=output_cfg,
)

# Create the agent, or update it when it already exists.
index_client.create_or_update_agent(agent, api_version=SEARCH_API_VERSION)
print(f"Knowledge agent '{KNOWLEDGE_AGENT_NAME}' created or updated successfully.")

In [None]:
# Free-text system prompt for the answering agent.
# NOTE(review): the retrieval cell filters out messages whose role is "system",
# so this text is not sent with the retrieval request; confirm where it is meant to be used.
instructions = """
„ÅÇ„Å™„Åü„ÅØ JDocQAÔºàÊó•Êú¨Ë™û„Éâ„Ç≠„É•„É°„É≥„ÉàQAÔºâ„ÅÆÂõûÁ≠î„Ç®„Éº„Ç∏„Çß„É≥„Éà„Åß„Åô„ÄÇ
‰∏é„Åà„Çâ„Çå„Åü„ÄåÊ§úÁ¥¢Ê∏à„Åø„ÉÅ„É£„É≥„ÇØÔºàcontent, id, parent_id, page_number, category „Å™„Å©„ÅÆ„É°„Çø„Éá„Éº„Çø‰ªò„ÅçÔºâ„Äç„ÅÆ„Åø„ÇíÊ†πÊã†„Å´Êó•Êú¨Ë™û„ÅßÂõûÁ≠î„Åó„Å¶„Åè„Å†„Åï„ÅÑ„ÄÇ
Ê†πÊã†„ÅåË∂≥„Çä„Å™„ÅÑÂ†¥Âêà„ÇÑÁµêË´ñ„Åß„Åç„Å™„ÅÑÂ†¥Âêà„ÅØ„ÄÅÊ≠£Á¢∫„Å´„ÄåI don't know„Äç„Å®Á≠î„Åà„Åæ„Åô„ÄÇ

„ÄêÂõûÁ≠îÂéüÂâá„Äë
1) Âá∫ÂÖ∏Âà∂Á¥Ñ: ÂõûÁ≠î„ÅØÂ∏∏„Å´‰∏é„Åà„Çâ„Çå„Åü„ÉÅ„É£„É≥„ÇØ„ÅÆÂÜÖÂÆπ„Å´ÈôêÂÆö„ÄÇÂ§ñÈÉ®Áü•Ë≠ò„ÇÑÊé®Ê∏¨„ÅßË£ú„Çè„Å™„ÅÑ„ÄÇ
2) Á≤íÂ∫¶: Ë≥™Âïè„ÅÆÁ≤íÂ∫¶„Å´Âêà„Çè„Åõ„ÄÅÈÅé‰∏çË∂≥„ÅÆ„Å™„ÅÑË¶ÅÁ¥Ñ„ÅßÁ≠î„Åà„Çã„ÄÇÂÖ∑‰ΩìÂÄ§„ÇÑÂÆöÁæ©„ÅØ„ÉÅ„É£„É≥„ÇØ„Å´„ÅÇ„Çã„ÇÇ„ÅÆ„ÅÆ„Åø‰ΩøÁî®„ÄÇ
3) ÁüõÁõæÂØæÂøú: „ÉÅ„É£„É≥„ÇØÈñì„ÅßÂÜÖÂÆπ„ÅåÈ£ü„ÅÑÈÅï„ÅÜÂ†¥Âêà„ÅØÁüõÁõæ„ÇíÊòéÁ§∫„Åó„ÄÅ„Çà„ÇäÂÖ∑‰Ωì„ÉªÊñ∞„Åó„ÅÑ„ÉªÊ≥ï‰ª§/Ë¶èÁ®ã„Éô„Éº„Çπ„ÅÆ„ÇÇ„ÅÆ„ÇíÂÑ™ÂÖà„ÄÇ
4) ‰∏çÁ¢∫ÂÆüÊÄß: ÊÉÖÂ†±„Åå‰∏çÂçÅÂàÜ/ÊõñÊòß„Å™„Çâ„ÄåI don't know„Äç„Åæ„Åü„ÅØÊù°‰ª∂„Å§„ÅçÂõûÁ≠îÔºàÊ†πÊã†„ÇíÁ§∫„ÅôÔºâ„ÄÇ
5) ÂºïÁî®: ÂèØËÉΩ„Å™Èôê„ÇäÊ†πÊã†„ÉÅ„É£„É≥„ÇØ„ÅÆ ID „Çí `[doc:{id}]` ÂΩ¢Âºè„ÅßÊú´Â∞æ„Å´Ê∑ª„Åà„ÇãÔºàË§áÊï∞ÂèØÔºâ„ÄÇ
6) Ë®ÄË™û: Âá∫Âäõ„ÅØËá™ÁÑ∂„Å™Êó•Êú¨Ë™û„ÄÇÁÆáÊù°Êõ∏„Åç„ÅØÊúÄÂ§ß3‚Äì6È†ÖÁõÆ„Å´Êäë„Åà„Çã„ÄÇ

„Äê‰ΩúÊ•≠ÊâãÈ†ÜÔºàÂÜÖÈÉ®ÊñπÈáùÔºâ„Äë
- Ë≥™Âïè„ÇíÂàÜËß£„Åó„ÄÅÂõûÁ≠î„Å´ÂøÖË¶Å„Å™‰∫ãÂÆüÈ†ÖÁõÆ„ÇíÂàóÊåô„ÄÇ
- Êèê‰æõ„ÉÅ„É£„É≥„ÇØ„Åã„ÇâË©≤ÂΩìÁÆáÊâÄ„ÇíÊäΩÂá∫„Åó„ÄÅÈáçË§á„ÉªÁüõÁõæ„ÉªÂâçÂæåÈñ¢‰øÇ„ÇíÊï¥ÁêÜ„ÄÇ
- ÂøÖË¶ÅÊúÄÂ∞èÈôê„ÅÆË¶ÅÁ¥Ñ„ÉªË®Ä„ÅÑÊèõ„Åà„ÇíË°å„ÅÑ„ÄÅÊó•Êú¨Ë™û„ÅßÁ∞°ÊΩî„Å´Ë®òËø∞„ÄÇ
- Ë©≤ÂΩì„ÉÅ„É£„É≥„ÇØ„ÅÆ `id` „ÇíÂºïÁî®„Å®„Åó„Å¶‰ªò‰∏éÔºà‰æã: [doc:0053c7...]Ôºâ„ÄÇ
- ÂçÅÂàÜ„Å™Ê†πÊã†„Åå„Å™„Åë„Çå„Å∞„ÄåI don't know„Äç„ÄÇ

„ÄêÁ¶ÅÊ≠¢‰∫ãÈ†Ö„Äë
- Êñ≠ÂÆö„ÅÆ„Åü„ÇÅ„ÅÆË£úÂÆåÊé®Ê∏¨„ÄÅÂπ¥‰ª£/Êï∞ÂÄ§„ÅÆÂâµ‰Ωú„ÄÅURL„ÇÑÂõ≥Ë°®„ÅÆÂâµ‰Ωú„ÄÇ
- JDocQA„Å´Âê´„Åæ„Çå„Å™„ÅÑÁØÑÂõ≤„Å∏Ë©±„ÇíÂ∫É„Åí„Çã„Åì„Å®„ÄÇ

„ÄêÂõûÁ≠î„Éï„Ç©„Éº„Éû„ÉÉ„Éà‰æã„Äë
- ÂçòÊñáÂõûÁ≠î: „Äå„Äú„Åß„Åô„ÄÇ[doc:abc123]„Äç
- ÁÆáÊù°Êõ∏„Åç: Ë§áÊï∞Êù°‰ª∂/ÊâãÈ†Ü„Åå„ÅÇ„Çã„Å®„Åç„ÅÆ„Åø‰ΩøÁî®„ÄÇ
- ‰∏çÊòé: „ÄåI don't know„Äç
"""


# Alternative system prompt that demands a single JSON object as output.
# NOTE(review): not referenced by any other visible cell; `messages` below uses `instructions`.
instructions_formatted = """
You are a Japanese RAG answerer for JDocQA. Use ONLY the provided chunks.
If insufficient evidence: reply with JSON whose "answer" is exactly "I don't know".

Output MUST be a single JSON object:
{
  "answer": string,            // Japanese final answer or "I don't know"
  "citations": [ { "id": string } ],  // chunk ids you used (at least one if answer != "I don't know")
  "confidence": "high" | "medium" | "low"
}

Rules:
- No outside knowledge. No speculation.
- Keep it brief and specific. If chunks conflict, note it briefly and choose the most specific source.
- If you used multiple chunks, include all their ids in citations.
"""

# Conversation history, starting with the system prompt.
messages = [{"role": "system", "content": instructions}]

In [None]:
agent_client = KnowledgeAgentRetrievalClient(
    endpoint=SEARCH_ENDPOINT, agent_name=KNOWLEDGE_AGENT_NAME, credential=credential
)
query_1 = """
    ÂÆ∂ÊóèÊâãÂΩì„ÅÆÊîØÁµ¶Êù°‰ª∂„ÅØÔºü
    """

# Add the user turn only once, so re-running this cell does not pile up
# duplicate copies of the same question in the conversation.
user_message = {"role": "user", "content": query_1}
if not messages or messages[-1] != user_message:
    messages.append(user_message)

req = KnowledgeAgentRetrievalRequest(
    # System messages are left out of the retrieval request.
    messages=[
        KnowledgeAgentMessage(
            role=m["role"],
            content=[KnowledgeAgentMessageTextContent(text=m["content"])],
        )
        for m in messages
        if m["role"] != "system"
    ],
    knowledge_source_params=[
        SearchIndexKnowledgeSourceParams(
            knowledge_source_name=KNOWLEDGE_SOURCE_NAME, kind="searchIndex"
        )
    ],
)

result = agent_client.retrieve(retrieval_request=req, api_version=SEARCH_API_VERSION)
print(f"Retrieved content from '{KNOWLEDGE_SOURCE_NAME}' successfully.")

In [None]:
# Show the text of the first response message, wrapped at 120 columns.
print("Response")
print(textwrap.fill(result.response[0].content[0].text, width=120))

# Dump the activity entries returned with the result.
print("Activity")
print(json.dumps([a.as_dict() for a in result.activity], indent=2, ensure_ascii=False))

# Dump the references returned with the result.
print("Results")
print(
    json.dumps([r.as_dict() for r in result.references], indent=2, ensure_ascii=False)
)

In [None]:
def dcg(rels):
    """Discounted cumulative gain: the relevance at 0-based rank i is divided by log2(i + 2)."""
    return sum((rel / math.log2(i + 2)) for i, rel in enumerate(rels))


def ndcg_at_k(ranked_ids, gold_set, k):
    """Binary-relevance nDCG@k.

    The ideal ranking places `min(k, len(gold_set))` relevant items at the
    top. Building the ideal from the retrieved list instead would score any
    ranking whose hits come first as 1.0, however many gold items it missed.

    Args:
        ranked_ids: Retrieved ids, best first.
        gold_set: Collection of relevant ids.
        k: Cut-off rank.

    Returns:
        DCG of the first k results divided by the ideal DCG, or 0.0 when the
        ideal DCG is zero (empty gold set or k == 0).
    """
    rels = [1 if rid in gold_set else 0 for rid in ranked_ids[:k]]
    ideal_dcg = dcg([1] * min(k, len(gold_set)))
    return 0.0 if ideal_dcg == 0 else dcg(rels) / ideal_dcg


def recall_at_k(ranked_ids, gold_set, k):
    """Return 1.0 when at least one of the first k ranked ids is in gold_set, else 0.0.

    This is a hit indicator, not the fraction of gold items retrieved.
    """
    for rid in ranked_ids[:k]:
        if rid in gold_set:
            return 1.0
    return 0.0


def mrr(ranked_ids, gold_set):
    """Reciprocal rank of the first ranked id found in gold_set; 0.0 when none is found."""
    first_hit_rank = next(
        (rank for rank, rid in enumerate(ranked_ids, start=1) if rid in gold_set),
        None,
    )
    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank


def ap_multi(ranked_ids, gold_set):
    """Average precision of a ranking against a set of relevant ids.

    Precision is accumulated at every rank holding a relevant id, and the
    total is divided by the size of gold_set. An empty gold_set gives 0.0.
    """
    if not gold_set:
        return 0.0
    precision_total = 0.0
    found = 0
    for position, rid in enumerate(ranked_ids, start=1):
        if rid not in gold_set:
            continue
        found += 1
        precision_total += found / position
    return precision_total / max(len(gold_set), 1)

In [None]:
async def evaluate_search(run_fn, queries, gold_sets, topk_list=[1, 3, 5, 10]):
    recalls = {f"Recall@{k}": [] for k in topk_list}
    ndcgs = {f"nDCG@{k}": [] for k in topk_list}
    mrrs, maps = [], []
    for q, gold in tqdm(list(zip(queries, gold_sets, strict=False))):
        ranked = await run_fn(q, topk=max(topk_list))
        for k in topk_list:
            recalls[f"Recall@{k}"].append(recall_at_k(ranked, gold, k))
            ndcgs[f"nDCG@{k}"].append(ndcg_at_k(ranked, gold, k))
        mrrs.append(mrr(ranked, gold))
        maps.append(ap_multi(ranked, gold))
    out = {"MRR": float(np.mean(mrrs)), "MAP": float(np.mean(maps))}
    for k in topk_list:
        out[f"Recall@{k}"] = float(np.mean(recalls[f"Recall@{k}"]))
        out[f"nDCG@{k}"] = float(np.mean(ndcgs[f"nDCG@{k}"]))
    return out

In [None]:
import asyncio
import numpy as np

# recall_at_k, ndcg_at_k, mrr and ap_multi are reused as defined earlier in the notebook

async def evaluate_search_async(
    run_fn,                      # async def run_fn(query: str, topk: int) -> list[str]
    queries,
    gold_sets,
    topk_list=(1, 3, 5, 10),
    concurrency: int = 32,       # upper bound on requests in flight
    show_progress: bool = True   # show a tqdm progress bar
):
    """Evaluate a search function with bounded concurrency.

    Args:
        run_fn: Coroutine function called as `run_fn(query, topk=kmax)` that
            returns the ranked ids for one query.
        queries: Query strings.
        gold_sets: Relevant-id collections, aligned with `queries`.
        topk_list: Cut-offs for Recall@k and nDCG@k; the largest one is the
            `topk` requested from `run_fn`.
        concurrency: Maximum number of `run_fn` calls running at once.
        show_progress: When False the tqdm bar is disabled.

    Returns:
        Dict with mean "MRR", "MAP", and "Recall@k" / "nDCG@k" per cut-off.

    Raises:
        AssertionError: If `queries` and `gold_sets` differ in length.
        Any exception raised by `run_fn`; remaining tasks are cancelled first.
    """
    assert len(queries) == len(gold_sets)
    kmax = max(topk_list)

    sem = asyncio.Semaphore(concurrency)

    async def _one(i, q):
        # The semaphore caps how many searches run concurrently.
        async with sem:
            ranked = await run_fn(q, topk=kmax)
            return i, ranked

    tasks = [asyncio.create_task(_one(i, q)) for i, q in enumerate(queries)]

    # as_completed yields awaitables in completion order, so each result
    # carries its query index and is written back into its original slot.
    results_ordered = [None] * len(queries)
    try:
        for fut in tqdm(
            asyncio.as_completed(tasks),
            total=len(tasks),
            disable=not show_progress,
        ):
            i, ranked = await fut
            results_ordered[i] = ranked
    except BaseException:
        # One query failed (or the run was interrupted): stop the rest
        # instead of leaving them running in the background.
        for task in tasks:
            task.cancel()
        raise

    # ---- Aggregate metrics ----
    recalls = {f"Recall@{k}": [] for k in topk_list}
    ndcgs = {f"nDCG@{k}": [] for k in topk_list}
    mrrs, maps = [], []

    for ranked, gold in zip(results_ordered, gold_sets):
        for k in topk_list:
            recalls[f"Recall@{k}"].append(recall_at_k(ranked, gold, k))
            ndcgs[f"nDCG@{k}"].append(ndcg_at_k(ranked, gold, k))
        mrrs.append(mrr(ranked, gold))
        maps.append(ap_multi(ranked, gold))

    out = {"MRR": float(np.mean(mrrs)), "MAP": float(np.mean(maps))}
    for k in topk_list:
        out[f"Recall@{k}"] = float(np.mean(recalls[f"Recall@{k}"]))
        out[f"nDCG@{k}"] = float(np.mean(ndcgs[f"nDCG@{k}"]))
    return out


In [None]:
# Evaluate pure vector search on its own (top-level await requires a notebook/IPython kernel).
vector_scores = await evaluate_search_async(vector_search, queries, qa_gold_sets, TOPK_LIST)

In [None]:
# Evaluate hybrid search on its own (top-level await requires a notebook/IPython kernel).
hybrid_scores = await evaluate_search_async(hybrid_search, queries, qa_gold_sets, TOPK_LIST)

In [None]:
# Run the three evaluations concurrently; each call applies its own concurrency limit,
# so the total number of in-flight requests can be up to three times that limit.
# This recomputes the vector and hybrid scores and overwrites any earlier values.
full_scores, vector_scores, hybrid_scores = await asyncio.gather(
    evaluate_search_async(full_text_search, queries, qa_gold_sets, TOPK_LIST),
    evaluate_search_async(vector_search,   queries, qa_gold_sets, TOPK_LIST),
    evaluate_search_async(hybrid_search,   queries, qa_gold_sets, TOPK_LIST),
)

In [None]:
# One row per retrieval mode, one column per metric. The full-text results are
# stored as `full_scores` by the evaluation cell above; the previously used
# name `full_text_scores` is never defined.
pd.DataFrame(
    [full_scores, vector_scores, hybrid_scores],
    index=["FullText", "Vector", "Hybrid"],
)