In [None]:
from dotenv import load_dotenv

# Populate os.environ before the configuration cell reads it via os.getenv.
load_dotenv(verbose=True)

In [None]:
import asyncio
import json
import math
import os
import re
import textwrap
import time
from pathlib import Path
from math import ceil

import aiohttp
import pandas as pd
from azure.core.exceptions import HttpResponseError
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    AzureOpenAIEmbeddingSkill,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    FieldMapping,
    HnswAlgorithmConfiguration,
    IndexingParameters,
    InputFieldMappingEntry,
    LexicalAnalyzerName,
    OutputFieldMappingEntry,
    SearchField,
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataSourceType,
    SearchIndexerSkillset,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    VectorSearch,
    VectorSearchProfile,

)
from azure.search.documents.models import QueryType, VectorizableTextQuery
from azure.storage.blob import BlobServiceClient, ContentSettings
from tqdm import tqdm
from azure.search.documents import SearchClient
from azure.search.documents.aio import SearchClient as AsyncSearchClinet
from azure.search.documents.indexes.models import SearchIndexKnowledgeSource, SearchIndexKnowledgeSourceParameters, KnowledgeAgentOutputConfiguration, KnowledgeAgentOutputConfigurationModality, KnowledgeAgent, KnowledgeAgentAzureOpenAIModel, KnowledgeSourceReference
from azure.search.documents.agent import KnowledgeAgentRetrievalClient
from azure.search.documents.agent.models import KnowledgeAgentRetrievalRequest, KnowledgeAgentMessage, KnowledgeAgentMessageTextContent, SearchIndexKnowledgeSourceParams

import datasets as ds

# Credential object passed to the Azure AI Search clients created below.
credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    credential, "https://search.azure.com/.default"
)

# Which dataset split to use
SPLIT = "test"  # "validation" is also possible
MAX_SAMPLES = 300  # None means all records
CHUNK_SIZE = 700
CHUNK_OVERLAP = 200
USE_ORIGINAL = False

# Service endpoints / secrets come from the environment (see load_dotenv above);
# os.getenv returns None when a variable is missing.
SEARCH_ENDPOINT = os.getenv("SEARCH_ENDPOINT")
AOAI_ENDPOINT = os.getenv("AOAI_ENDPOINT")
AZURE_STORAGE_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
AZURE_AI_FOUNDRY_PROJECT_ENDPOINT = os.getenv("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT")

# Azure OpenAI model / deployment names
AOAI_EMBEDDING_MODEL = "text-embedding-3-large"
AOAI_EMBEDDING_DEPLOYMENT = "text-embedding-3-large"
AOAI_GPT_MODEL = "gpt-5-mini"
AOAI_GPT_DEPLOYMENT = "gpt-5-mini"

AGENT_MODEL = "gpt-5-mini"
AGENT_NAME = "jqara-agent"

# Names of the Azure AI Search resources created by this notebook
INDEX_NAME = "jqara-index"
KNOWLEDGE_SOURCE_NAME = "jqara-knowledge-source"
KNOWLEDGE_AGENT_NAME = "jqara-knowledge-agent"
SEARCH_API_VERSION = "2025-08-01-preview"
DATA_SOURCE_NAME = "ds-jqara-chunks"
SKILLSET_NAME = "ss-jqara-embed"
INDEXER_NAME = "idx-jqara"
DIM = 3072  # vector dimension used by the index field and the embedding skill
BLOB_CONTAINER = "jqara-docs"
BLOB_PREFIX = "docs"
TOPK_LIST = [1, 3, 5, 10]

In [None]:
# Directory used as the local on-disk cache for the dataset
local_dir = Path("datasets/JQaRA")
local_dir.mkdir(parents=True, exist_ok=True)

# `load_dataset` without a `split` returns a DatasetDict, and
# `DatasetDict.save_to_disk` writes `dataset_dict.json` at the top level (the
# per-split `dataset_info.json` files live in sub-directories). Checking only
# for a top-level `dataset_info.json` never hits the cache, so accept either
# marker file.
cache_markers = ("dataset_dict.json", "dataset_info.json")

# If the dataset was already saved locally, load that copy
if any((local_dir / marker).exists() for marker in cache_markers):
    print("üîÅ Loading dataset from local disk...")
    dataset = ds.load_from_disk(str(local_dir))
else:
    print("‚¨áÔ∏è Downloading dataset from Hugging Face Hub...")
    # NOTE(review): trust_remote_code=True executes the dataset repository's
    # loading script locally; keep it only while the source is trusted.
    dataset = ds.load_dataset(
        path="hotchpotch/JQaRA",
        trust_remote_code=True,
        storage_options={
            "client_kwargs": {"timeout": aiohttp.ClientTimeout(total=36000)}
        },
    )
    dataset.save_to_disk(str(local_dir))
    print(f"üíæ Dataset saved locally to {local_dir}")

In [None]:
# dev_data = dataset["dev"]
test_data = dataset[SPLIT]
# Preview the first five rows. Slicing the Dataset first avoids materialising
# the entire split as a DataFrame just to keep its head.
df = pd.DataFrame(test_data[:5])
json_str = json.dumps(df.to_dict(orient="records"), ensure_ascii=False, indent=2)
print(json_str)

In [None]:
print(test_data.features)

In [None]:
def normalize_id(raw_id: str) -> str:
    """Return ``raw_id`` with every character outside ``A-Z a-z 0-9 _ -`` replaced by ``_``."""
    disallowed = re.compile(r"[^A-Za-z0-9_-]")
    return disallowed.sub("_", raw_id)

# Keep only id, text and title, and shape each row into the record that is
# serialised to JSON Lines by the upload cell
columns_to_keep = ["id", "text", "title"]

subset = test_data.select_columns(columns_to_keep)


def _to_doc(row):
    """Build one corpus record: sanitised key plus the untouched original id."""
    return {
        "id": normalize_id(row["id"]),
        "raw_id": row["id"],
        "text": row["text"],
        "title": row["title"],
    }


docs = list(map(_to_doc, subset))

In [None]:
from azure.core.exceptions import ResourceExistsError

bsc = BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING)
container_client = bsc.get_container_client(BLOB_CONTAINER)
try:
    container_client.create_container()
except ResourceExistsError:
    # The container is already there, which is the only outcome that is safe to
    # ignore. Authentication, network or naming errors must surface here rather
    # than as a confusing failure in the upload cell.
    pass

In [None]:

def chunked(iterable, n):
    """Yield consecutive chunks of at most ``n`` items from ``iterable``.

    Sized, sliceable inputs (list, tuple, str, ...) are sliced, so each chunk
    has the same type as the input. Inputs without ``len()`` (generators,
    iterators) are consumed lazily and yielded as lists.

    Raises:
        ValueError: if ``n`` is not a positive integer. A non-positive size
            would otherwise fail obscurely (0) or silently yield nothing (<0).
    """
    if n <= 0:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")

    try:
        size = len(iterable)
    except TypeError:
        # No len(): fall back to pulling n items at a time from an iterator.
        from itertools import islice

        iterator = iter(iterable)
        while True:
            batch = list(islice(iterator, n))
            if not batch:
                return
            yield batch
    else:
        for start in range(0, size, n):
            yield iterable[start:start + n]

def upload_ndjson_batches(docs, prefix=BLOB_PREFIX, batch_size=10_000):
    """Upload ``docs`` to the blob container as JSON Lines files and return the count.

    The documents are split into groups of ``batch_size``; each group becomes one
    blob named ``{prefix}/batch_NNNNNN.jsonl`` (overwriting an existing blob of
    the same name) with one JSON document per line.
    """
    uploaded = 0
    batch_index = 0
    for batch in chunked(docs, batch_size):
        # NDJSON: one document per line
        lines = [json.dumps(doc, ensure_ascii=False) for doc in batch]
        body = "\n".join(lines).encode("utf-8")
        blob_name = f"{prefix}/batch_{batch_index:06d}.jsonl"
        container_client.upload_blob(
            blob_name,
            body,
            overwrite=True,
            content_settings=ContentSettings(content_type="application/x-ndjson"),
        )
        uploaded += len(batch)
        batch_index += 1
    return uploaded

print("Uploaded:", upload_ndjson_batches(docs))

# Client„Çí‰ΩúÊàê„Åô„Çã

In [None]:
# Client for index-level resources (indexes, knowledge sources, agents).
index_client = SearchIndexClient(
    endpoint=SEARCH_ENDPOINT, 
    credential=credential
)

# Client for indexer-side resources (data sources, skillsets, indexers).
indexer_client = SearchIndexerClient(
    endpoint=SEARCH_ENDPOINT, 
    credential=credential,
)

# Echo the endpoint so it is visible which search service is being targeted.
print(f"{SEARCH_ENDPOINT}")

# データソースを作成する

In [None]:
# Blob container (and virtual folder) the indexer reads from.
container = SearchIndexerDataContainer(
    name=BLOB_CONTAINER,
    query=BLOB_PREFIX,  # ingest only blobs under "docs"; use None for the whole container
)

# Data source connection pointing the indexer at the blob container.
# The storage connection string is sent to the search service as part of this definition.
data_source = SearchIndexerDataSourceConnection(
    name=DATA_SOURCE_NAME,
    type=SearchIndexerDataSourceType.AZURE_BLOB,
    connection_string=AZURE_STORAGE_CONNECTION_STRING,
    container=container,
    description="JQaRA JSONs in Blob Storage",
)

indexer_client.create_or_update_data_source_connection(data_source)

# Index„ÅÆÂÆöÁæ©„ÇíË°å„ÅÜ

In [None]:
# index_client.delete_index(INDEX_NAME)

# Index schema: sanitised key, original id, two Japanese-analysed text fields,
# and the embedding vector filled in by the skillset.
fields = [
    # Key field (holds the value produced by normalize_id)
    SearchField(
        name="id",
        type="Edm.String",
        key=True,
        filterable=True,
        sortable=True,
    ),
    # Original dataset id; this is what the evaluation cells select and compare against qrels
    SearchField(
        name="raw_id",
        type="Edm.String",
        filterable=True,
        sortable=True,
    ),
    SearchField(
        name="title",
        type="Edm.String",
        searchable=True,
        analyzer_name=LexicalAnalyzerName.JA_LUCENE,
    ),
    SearchField(
        name="text",
        type="Edm.String",
        searchable=True,
        analyzer_name=LexicalAnalyzerName.JA_LUCENE,
    ),
    # Embedding of `text`; dimension must match the embedding skill's `dimensions`
    SearchField(
        name="text_vector",
        type="Collection(Edm.Single)",
        searchable=True,
        stored=True,
        vector_search_dimensions=DIM,
        vector_search_profile_name="aoai-hnsw",
    ),
]

# Vector search configuration: one HNSW algorithm, one profile that ties the
# algorithm to the Azure OpenAI vectorizer.
# NOTE(review): a later cell defines a function that is also named
# `vector_search`; re-running the index cell after that cell would pass the
# function to SearchIndex instead of this object.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(name="hnsw")
    ],
    profiles=[
        VectorSearchProfile(
            name="aoai-hnsw",
            algorithm_configuration_name="hnsw",
            vectorizer_name="aoai-vectorizer",
        ),
    ],
    # Automatic vectorization of the query text at query time
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="aoai-vectorizer",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url=AOAI_ENDPOINT,
                deployment_name=AOAI_EMBEDDING_DEPLOYMENT,
                model_name=AOAI_EMBEDDING_MODEL,
            ),
        ),
    ],
)

# Semantic ranker configuration; "semantic_config" is the name the query cells
# pass as semantic_configuration_name.
semantic_search = SemanticSearch(
    default_configuration_name="semantic_config",
    configurations=[
        SemanticConfiguration(
            name="semantic_config",
            prioritized_fields=SemanticPrioritizedFields(
                title_field=SemanticField(field_name="title"),
                content_fields=[SemanticField(field_name="text")]
            ),
        )
    ],
)

# Assemble the index definition from the pieces above and create/update it on the service.
index = SearchIndex(
    name=INDEX_NAME,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

index_client.create_or_update_index(index)
print(f"Index '{INDEX_NAME}' created or updated successfully.")

# skillset„ÅÆÂÆöÁæ©

In [None]:
# Index-time embedding: reads /document/text and writes the vector to
# /document/text_vector (the path the indexer's output field mapping reads).
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document",
    resource_url=AOAI_ENDPOINT,
    deployment_name=AOAI_EMBEDDING_DEPLOYMENT,
    model_name=AOAI_EMBEDDING_MODEL,
    dimensions=DIM,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/text"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="embedding", target_name="text_vector")
    ],
)

# Skillset containing only the embedding skill.
skillset = SearchIndexerSkillset(
    name=SKILLSET_NAME,
    skills=[embedding_skill],
    description="JQaRA index-time embedding skillset",
)

indexer_client.create_or_update_skillset(skillset)
print("Skillset created.")

# Indexer„ÅÆÂÆöÁæ©

In [None]:
# Indexer wiring: blob data source -> skillset -> target index.
indexer = SearchIndexer(
    name=INDEXER_NAME,
    data_source_name=DATA_SOURCE_NAME,
    target_index_name=INDEX_NAME,
    skillset_name=SKILLSET_NAME,  # attach the skillset
    # Field mappings: source document -> index
    field_mappings=[
        FieldMapping(source_field_name="id", target_field_name="id"),
        FieldMapping(source_field_name="raw_id", target_field_name="raw_id"),
        FieldMapping(source_field_name="title", target_field_name="title"),
        FieldMapping(source_field_name="text", target_field_name="text"),
    ],
    # Field mappings: skill output -> index
    output_field_mappings=[
        FieldMapping(
            source_field_name="/document/text_vector",
            target_field_name="text_vector",
        ),
    ],
    # Indexer parameters
    parameters=IndexingParameters(
        configuration={
            "parsingMode": "jsonLines",  # each JSON line is one document
            "failOnUnsupportedContentType": False,  # do not fail on unsupported MIME types
        }
    ),
)
indexer_client.create_or_update_indexer(indexer)

# Indexing„ÇíÂÆüË°å

In [None]:
# 1) Trigger the indexer manually
try:
    # indexer_client.reset_indexer(INDEXER_NAME)  # reset change tracking (high-water mark)
    indexer_client.run_indexer(INDEXER_NAME)
    print(f"Run requested: {INDEXER_NAME}")
except HttpResponseError as e:
    print("Run failed:", e)
    raise

# 2) Simple polling: wait until the last run reports a terminal status
poll_interval_sec = 5
max_polls = 60  # up to ~5 minutes (5 s x 60)
terminal = {"success", "transientFailure", "persistentFailure", "reset"}
status = processed = failed = error_message = None
for i in range(max_polls):
    st = indexer_client.get_indexer_status(INDEXER_NAME)
    last = st.last_result
    status = getattr(last, "status", None)
    processed = getattr(last, "items_processed", None)
    failed = getattr(last, "items_failed", None)
    error_message = getattr(last, "error_message", None)
    print(f"[{i}] status={status} processed={processed} failed={failed}")

    if status in terminal:
        break
    time.sleep(poll_interval_sec)
else:
    # The loop ran out without seeing a terminal status: report this as a
    # timeout instead of as an indexing failure, since the run may still finish.
    raise RuntimeError(
        f"Indexer '{INDEXER_NAME}' did not reach a terminal state within "
        f"{max_polls * poll_interval_sec}s. "
        f"status={status}, processed={processed}, failed={failed}"
    )

# 3) Check the result
if status != "success":
    raise RuntimeError(
        f"Indexer did not succeed. status={status}, processed={processed}, "
        f"failed={failed}, error={error_message}"
    )
print("Indexer run completed successfully.")

# Search Client„ÇíÂÆöÁæ©

In [None]:
# Synchronous query client for the JQaRA index (used by the ad-hoc query cells).
search_client = SearchClient(
    endpoint=SEARCH_ENDPOINT,
    index_name=INDEX_NAME,
    credential=credential,
    api_version=SEARCH_API_VERSION,
)

# Async query client used by the concurrent evaluation helpers.
# NOTE(review): `AsyncSearchClinet` is the (misspelled) alias chosen in the import cell.
async_search_client = AsyncSearchClinet(
    endpoint=SEARCH_ENDPOINT,
    index_name=INDEX_NAME,
    credential=credential,
    api_version=SEARCH_API_VERSION,
)

In [None]:
query = "ÊëÇÊ∞è„Åß„ÅØ„Éû„Ç§„Éä„Çπ273.15Â∫¶„Å´„ÅÇ„Åü„Çã„ÄÅÂÖ®„Å¶„ÅÆÂéüÂ≠ê„ÅÆÊåØÂãï„ÅåÂÅúÊ≠¢„Åô„ÇãÊúÄ„ÇÇ‰Ωé„ÅÑÊ∏©Â∫¶„Çí‰Ωï„Å®„ÅÑ„ÅÜ„Åß„Åó„Çá„ÅÜ?"
# Semantic query with extractive captions/answers and generative query rewrites;
# debug="queryRewrites" asks the service to include rewrite debug information.
results = search_client.search(
        search_text=query,
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name="semantic_config",
        query_caption="extractive",
        query_answer="extractive",
        query_answer_threshold=0.1,
        query_answer_count=3,
        query_rewrites="generative",
        query_rewrites_count=3,
        query_language="ja-jp",        
        debug="queryRewrites",
        top=5,
        select=["raw_id", "title", "text"],
    )

def to_jsonable(x):
    """Recursively convert ``x`` into plain data suitable for ``json.dumps``.

    Objects exposing ``as_dict()`` are replaced by its result, lists and dicts
    are converted element by element, and anything else is returned unchanged.
    """
    if hasattr(x, "as_dict"):
        converted = x.as_dict()
    elif isinstance(x, list):
        converted = list(map(to_jsonable, x))
    elif isinstance(x, dict):
        converted = {key: to_jsonable(value) for key, value in x.items()}
    else:
        converted = x
    return converted

# Collect the hits under their own name. Reusing `docs` here would overwrite
# the corpus list built earlier, so re-running the Blob upload cell afterwards
# would upload search results instead of the dataset.
semantic_hits = []
for r in results:
    d = to_jsonable(dict(r))
    debug_info = r.get("@search.debugInfo")
    if debug_info:
        d["@search.debugInfo"] = to_jsonable(debug_info)
    semantic_hits.append(d)

print(json.dumps(semantic_hits, ensure_ascii=False, indent=2))

In [None]:
async def full_text_search(query: str, topk: int = 10):
    """Run a simple full-text query and return the ``raw_id`` of the top ``topk`` hits.

    NOTE(review): a synchronous function with the same name is defined in a
    later cell and replaces this coroutine once that cell has run.
    """
    hits = await async_search_client.search(
        search_text=query,
        # or QueryType.SEMANTIC when a semantic configuration exists
        query_type=QueryType.SIMPLE,
        top=topk,
        select=["raw_id"],
    )
    raw_ids = []
    async for hit in hits:
        raw_ids.append(hit["raw_id"])
    return raw_ids


async def vector_search(query: str, topk: int = 10):
    """Run a pure vector query and return the ``raw_id`` of the ``topk`` nearest hits.

    The query text is vectorized by the service (``VectorizableTextQuery``).
    ``top`` is passed explicitly so that ``topk`` values above the service's
    default page size are honoured, matching ``hybrid_search``.

    NOTE(review): this definition rebinds the module-level name
    ``vector_search`` that the index-definition cell uses for its
    ``VectorSearch`` configuration object.
    """
    vq = VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields="text_vector",
    )

    results = await async_search_client.search(
        search_text=None,
        vector_queries=[vq],
        select=["raw_id"],
        top=topk,
    )

    return [r["raw_id"] async for r in results]


async def hybrid_search(query: str, topk: int = 10):
    """Run a hybrid (keyword + vector) query and return the ``raw_id`` of the top ``topk`` hits."""
    vector_query = VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields="text_vector",
    )
    search_kwargs = dict(
        search_text=query,
        vector_queries=[vector_query],
        select=["raw_id"],
        top=topk,
    )
    hits = await async_search_client.search(**search_kwargs)

    raw_ids = []
    async for hit in hits:
        raw_ids.append(hit["raw_id"])
    return raw_ids

In [None]:
# index_client.delete_agent(KNOWLEDGE_AGENT_NAME)
# index_client.delete_knowledge_source(KNOWLEDGE_SOURCE_NAME)

# Knowledge Source„ÅÆ‰ΩúÊàê

In [None]:
# Knowledge source that exposes the JQaRA search index to the knowledge agent.
# The description previously read "Earth at night data", which does not match
# the JQaRA index configured here; it now describes the actual content.
ks = SearchIndexKnowledgeSource(
    name=KNOWLEDGE_SOURCE_NAME,
    description="Knowledge source for the JQaRA passage index",
    search_index_parameters=SearchIndexKnowledgeSourceParameters(
        search_index_name=INDEX_NAME,
        # Index fields returned with each reference
        source_data_select="id,raw_id,title,text",
    ),
)

index_client.create_or_update_knowledge_source(
    knowledge_source=ks, api_version=SEARCH_API_VERSION
)
print(f"Knowledge source '{KNOWLEDGE_SOURCE_NAME}' created or updated successfully.")

# Knowledge Agent„ÅÆ‰ΩúÊàê

In [None]:
# Azure OpenAI chat deployment the agent uses. The SDK's
# AzureOpenAIVectorizerParameters type carries the endpoint/deployment/model triple.
aoai_params = AzureOpenAIVectorizerParameters(
    resource_url=AOAI_ENDPOINT,
    deployment_name=AOAI_GPT_DEPLOYMENT,
    model_name=AOAI_GPT_MODEL,
)

# Ask for a synthesized answer and include the activity log in responses.
output_cfg = KnowledgeAgentOutputConfiguration(
    modality=KnowledgeAgentOutputConfigurationModality.ANSWER_SYNTHESIS,
    include_activity=True,
)

# Agent definition: one model, one knowledge source.
agent = KnowledgeAgent(
    name=KNOWLEDGE_AGENT_NAME,
    models=[KnowledgeAgentAzureOpenAIModel(azure_open_ai_parameters=aoai_params)],
    knowledge_sources=[
        KnowledgeSourceReference(
            name=KNOWLEDGE_SOURCE_NAME,
            # presumably the minimum reranker score a document needs to be used — TODO confirm
            reranker_threshold=2.0,
        )
    ],
    output_configuration=output_cfg,
)

index_client.create_or_update_agent(agent, api_version=SEARCH_API_VERSION)
print(f"Knowledge agent '{KNOWLEDGE_AGENT_NAME}' created or updated successfully.")

# KnoldegeAgentRetrievalClient„ÅÆ‰ΩúÊàê

In [None]:
# Retrieval client bound to the knowledge agent created above.
agent_client = KnowledgeAgentRetrievalClient(
    endpoint=SEARCH_ENDPOINT, 
    agent_name=KNOWLEDGE_AGENT_NAME, 
    credential=credential
)

In [None]:

messages = []

query_1 = """    
    „Ç¢„É°„É™„Ç´„Å®„É≠„Ç∑„Ç¢„ÅåÂØæÂ≥ô„Åô„Çã„Åì„Å®„Åã„Çâ„ÄåÁ±≥„É≠Êµ∑Â≥°„Äç„ÅÆÂà•Âêç„ÇÇ„ÅÇ„Çã„ÄÅ„Ç¢„É©„Çπ„Ç´„ÅÆ„Çπ„ÉØ„Éº„ÉâÂçäÂ≥∂„Å®„ÄÅÊù±„Ç∑„Éô„É™„Ç¢„ÅÆ„ÉÅ„É•„ÇØ„ÉÅÂçäÂ≥∂„Å®„ÅÆÈñì„Å´„ÅÇ„ÇãÊµ∑Â≥°„ÅØ‰Ωï„Åß„Åó„Çá„ÅÜ?
    „Åæ„Åü„ÄÅ„Åù„ÅÆÊµ∑Â≥°„ÇíÊåü„Çì„ÅßÂçóÂåó„Å´‰ΩçÁΩÆ„Åô„Çã‰∫å„Å§„ÅÆÊµ∑„ÅØ‰Ωï„Åß„Åó„Çá„ÅÜ?
    """
query_2 = """
    ÁõÆ„ÅÆÊÑõË≠∑„Éá„Éº„ÅØ‰ΩïÊúà‰ΩïÊó•„Åß„Åó„Çá„ÅÜ?„Åæ„Åü„ÄÅËÄ≥„ÅÆÊó•„ÅØ‰ΩïÊúà‰ΩïÊó•„Åß„Åó„Çá„ÅÜ?„Åù„ÅÆÁêÜÁî±„ÇÇÊïô„Åà„Å¶„Åè„Å†„Åï„ÅÑ„ÄÇ
    When is the International Day of Nonviolence?
    „Åï„Çâ„Å´„ÄÅ„Åù„Çå„Åû„Çå„ÅÆÊó•„Å´‰Ωï„Çí„Åô„Çå„Å∞ËâØ„ÅÑ„ÅãÊïô„Åà„Å¶„Åè„Å†„Åï„ÅÑ
"""
messages.append({"role": "user", "content": query_2})

# Build the retrieval request from the chat history; messages with the
# "system" role are filtered out before being sent.
req = KnowledgeAgentRetrievalRequest(
    messages=[
        KnowledgeAgentMessage(
            role=m["role"],
            content=[KnowledgeAgentMessageTextContent(text=m["content"])],
        )
        for m in messages
        if m["role"] != "system"
    ],
    # Restrict retrieval to the JQaRA knowledge source.
    knowledge_source_params=[
        SearchIndexKnowledgeSourceParams(
            knowledge_source_name=KNOWLEDGE_SOURCE_NAME, kind="searchIndex"
        )
    ],
)

result = agent_client.retrieve(retrieval_request=req, api_version=SEARCH_API_VERSION)
print(f"Retrieved content from '{KNOWLEDGE_SOURCE_NAME}' successfully.")

In [None]:
# Synthesized answer, wrapped to 120 columns
print("Response")
answer_text = result.response[0].content[0].text
print(textwrap.fill(answer_text, width=120))

# Activity log returned by the agent
print("Activity")
activity_records = [a.as_dict() for a in result.activity]
print(json.dumps(activity_records, indent=2, ensure_ascii=False))

# Source references backing the answer
print("Results")
reference_records = [r.as_dict() for r in result.references]
print(json.dumps(reference_records, indent=2, ensure_ascii=False))

# 検索評価

In [None]:
from typing import Dict, List, Tuple, Iterable


# qid -> question text (first occurrence wins) and qid -> {doc id -> relevance label}
qid_to_query: Dict[str, str] = {}
qrels: Dict[str, Dict[str, int]] = {}

cnt = 0
for cnt, rec in enumerate(test_data, start=1):
    qid = rec["q_id"]
    query = rec.get("question")
    docid = rec.get("id")
    rel = int(rec.get("label", 0))

    qid_to_query.setdefault(qid, query)
    per_query = qrels.setdefault(qid, {})
    per_query[docid] = rel

print(f"Loaded {cnt} records")
print(f"Unique queries: {len(qid_to_query)}")
print(f"Unique queries: {len(qid_to_query)}")


In [None]:
def full_text_search(query: str, topk: int = 10):
    """Run a simple full-text query synchronously.

    Returns a list of ``(raw_id, score)`` tuples for the top ``topk`` hits, where
    the score is the hit's ``@search.score`` (0.0 when the key is absent).

    NOTE(review): this rebinds ``full_text_search``, which an earlier cell
    defines as a coroutine returning only the ids.
    """
    hits = search_client.search(
        search_text=query,
        # or QueryType.SEMANTIC when a semantic configuration exists
        query_type=QueryType.SIMPLE,
        top=topk,
        select=["raw_id"],
    )
    return [(hit["raw_id"], float(hit.get("@search.score", 0.0))) for hit in hits]

In [None]:
from tqdm import tqdm

# Sequential baseline run: {qid: {docid: score}}
run_dict = {}
for qid, query in tqdm(qid_to_query.items(), total=len(qid_to_query)):
    ranked = full_text_search(query=query)
    run_dict[qid] = dict(ranked)

# Sanity check: number of queries and a few of their ids
len(run_dict), list(run_dict)[:3]

In [None]:
async def async_full_text_search(query: str, topk: int = 10) -> Tuple[List[Tuple[str, float]], float]:
    """Run a simple full-text query and time it.

    Args:
        query: Search text.
        topk: Maximum number of hits to request.

    Returns:
        ``(ranked, elapsed)`` where ``ranked`` is a list of ``(raw_id, score)``
        tuples in the order returned by the service and ``elapsed`` is the
        wall-clock time in seconds, including result iteration.
    """
    start = time.perf_counter()

    results = await async_search_client.search(
        search_text=query,
        query_type=QueryType.SIMPLE,   # use QueryType.SEMANTIC for semantic ranking
        top=topk,
        select=["raw_id"],             # add more fields if needed
    )
    ranked: List[Tuple[str, float]] = []
    async for r in results:
        docid = r.get("raw_id")
        if docid is None:
            continue
        # The score key can be present with a None value; treat that as 0.0.
        ranked.append((docid, float(r.get("@search.score") or 0.0)))

    elapsed = time.perf_counter() - start
    return ranked, elapsed

In [None]:
async def async_vector_search(query: str, topk: int = 10) -> Tuple[List[Tuple[str, float]], float]:
    """Run a pure vector query (service-side vectorization) and time it.

    ``top`` is passed explicitly so that ``topk`` values above the service's
    default page size are honoured.

    Returns:
        ``(ranked, elapsed)``: ``(raw_id, score)`` tuples and the wall-clock
        time in seconds, including result iteration.
    """
    start = time.perf_counter()

    vq = VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields="text_vector",
    )

    results = await async_search_client.search(
        search_text=None,
        vector_queries=[vq],
        select=["raw_id"],
        top=topk,
    )

    ranked: List[Tuple[str, float]] = []
    async for r in results:
        docid = r.get("raw_id")
        if docid is None:
            continue
        ranked.append((docid, float(r.get("@search.score") or 0.0)))

    elapsed = time.perf_counter() - start
    return ranked, elapsed

In [None]:
async def async_hybrid_search(query: str, topk: int = 10) -> Tuple[List[Tuple[str, float]], float]:
    """Run a hybrid (keyword + vector) query and time it.

    ``top=topk`` is required here: without it the keyword side of the hybrid
    query makes the service return its default page size rather than ``topk``
    hits, which makes this run incomparable with the other search modes.

    Returns:
        ``(ranked, elapsed)``: ``(raw_id, score)`` tuples and the wall-clock
        time in seconds, including result iteration.
    """
    start = time.perf_counter()

    vq = VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields="text_vector",
    )

    results = await async_search_client.search(
        search_text=query,
        vector_queries=[vq],
        select=["raw_id"],
        top=topk,
    )

    ranked: List[Tuple[str, float]] = []
    async for r in results:
        docid = r.get("raw_id")
        if docid is None:
            continue
        ranked.append((docid, float(r.get("@search.score") or 0.0)))

    elapsed = time.perf_counter() - start
    return ranked, elapsed

In [None]:
async def async_full_text_semantic_search(query: str, topk: int = 10) -> Tuple[List[Tuple[str, float]], float]:
    """Run a full-text query with semantic reranking and time it.

    The score recorded for each hit is the semantic reranker score. Recording
    ``@search.score`` (the keyword score) would make a score-sorted evaluation
    ignore the semantic order, so it is used only as a fallback when the
    reranker score is missing.

    Returns:
        ``(ranked, elapsed)``: ``(raw_id, score)`` tuples and the wall-clock
        time in seconds, including result iteration.
    """
    start = time.perf_counter()

    results = await async_search_client.search(
        search_text=query,
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name="semantic_config",
        top=topk,
        select=["raw_id"],             # add more fields if needed
    )
    ranked: List[Tuple[str, float]] = []
    async for r in results:
        docid = r.get("raw_id")
        if docid is None:
            continue
        score = r.get("@search.reranker_score")
        if score is None:
            score = r.get("@search.score") or 0.0
        ranked.append((docid, float(score)))

    elapsed = time.perf_counter() - start
    return ranked, elapsed

In [None]:
async def async_vector_semantic_search(query: str, topk: int = 10) -> Tuple[List[Tuple[str, float]], float]:
    """Run a vector query with semantic reranking and time it.

    Changes from the earlier version:
    - ``query_type=QueryType.SEMANTIC`` is set; ``semantic_query`` and
      ``semantic_configuration_name`` alone do not switch the query to
      semantic ranking.
    - ``top=topk`` is passed so the hit count follows ``topk``.
    - The reranker score is recorded (falling back to ``@search.score``), so a
      score-sorted evaluation reflects the semantic order.

    Returns:
        ``(ranked, elapsed)``: ``(raw_id, score)`` tuples and the wall-clock
        time in seconds, including result iteration.
    """
    start = time.perf_counter()

    vq = VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields="text_vector",
    )

    results = await async_search_client.search(
        search_text=None,
        vector_queries=[vq],
        select=["raw_id"],
        query_type=QueryType.SEMANTIC,
        semantic_query=query,
        semantic_configuration_name="semantic_config",
        top=topk,
    )

    ranked: List[Tuple[str, float]] = []
    async for r in results:
        docid = r.get("raw_id")
        if docid is None:
            continue
        score = r.get("@search.reranker_score")
        if score is None:
            score = r.get("@search.score") or 0.0
        ranked.append((docid, float(score)))

    elapsed = time.perf_counter() - start
    return ranked, elapsed

In [None]:
async def async_hybrid_semantic_search(query: str, topk: int = 10) -> Tuple[List[Tuple[str, float]], float]:
    """Run a hybrid (keyword + vector) query with semantic reranking and time it.

    Changes from the earlier version:
    - ``query_type=QueryType.SEMANTIC`` is set so the semantic configuration is
      actually applied.
    - ``top=topk`` is passed; without it the service returns its default page
      size instead of ``topk`` hits.
    - The reranker score is recorded (falling back to ``@search.score``), so a
      score-sorted evaluation reflects the semantic order.

    Returns:
        ``(ranked, elapsed)``: ``(raw_id, score)`` tuples and the wall-clock
        time in seconds, including result iteration.
    """
    start = time.perf_counter()

    vq = VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields="text_vector",
    )

    results = await async_search_client.search(
        search_text=query,
        vector_queries=[vq],
        query_type=QueryType.SEMANTIC,
        semantic_query=query,
        semantic_configuration_name="semantic_config",
        select=["raw_id"],
        top=topk,
    )

    ranked: List[Tuple[str, float]] = []
    async for r in results:
        docid = r.get("raw_id")
        if docid is None:
            continue
        score = r.get("@search.reranker_score")
        if score is None:
            score = r.get("@search.score") or 0.0
        ranked.append((docid, float(score)))

    elapsed = time.perf_counter() - start
    return ranked, elapsed

In [None]:
from typing import Callable, Awaitable, List, Tuple, Dict

async def build_run_dict_async(
        search_fn: Callable[..., Awaitable[Tuple[List[Tuple[str, float]], float]]],
        qid_to_query: Dict[str, str],
        concurrency: int = 10,
    ) -> Tuple[Dict[str, Dict[str, float]], List[float]]:
    """Run ``search_fn`` for every query with bounded concurrency.

    Args:
        search_fn: Coroutine function called as ``search_fn(query_text)`` that
            returns ``(ranked, elapsed)``, where ``ranked`` is a list of
            ``(docid, score)`` tuples and ``elapsed`` is seconds.
        qid_to_query: Mapping of query id to query text.
        concurrency: Maximum number of searches in flight at once.

    Returns:
        ``(run_dict, latencies)``: ``run_dict`` maps qid to ``{docid: score}``;
        ``latencies`` holds one elapsed time per query, in completion order.

    Raises:
        ValueError: if ``concurrency`` is less than 1 (a semaphore of 0 would
            block every search forever).
    """
    if concurrency < 1:
        raise ValueError(f"concurrency must be >= 1, got {concurrency!r}")

    sem = asyncio.Semaphore(concurrency)
    run_dict: Dict[str, Dict[str, float]] = {}
    latencies: List[float] = []

    async def _one(qid: str, q: str):
        # The semaphore caps how many requests hit the search service at once.
        async with sem:
            ranked, elapsed = await search_fn(q)
            return qid, ranked, elapsed

    tasks = [_one(qid, q) for qid, q in qid_to_query.items()]

    # as_completed yields in completion order, so the progress bar advances
    # as soon as any query finishes.
    for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
        qid, ranked, elapsed = await coro
        run_dict[qid] = dict(ranked)
        latencies.append(elapsed)

    return run_dict, latencies

In [None]:
fulltext_search_run_dict, fulltext_search_latencies = await build_run_dict_async(async_full_text_search, qid_to_query)

In [None]:
fulltext_semantic_search_run_dict, fulltext_semantic_search_latencies = await build_run_dict_async(async_full_text_semantic_search, qid_to_query)

In [None]:

vector_search_run_dict, vector_search_latencies = await build_run_dict_async(async_vector_search, qid_to_query)

In [None]:

vector_semantic_search_run_dict, vector_semantic_search_latencies = await build_run_dict_async(async_vector_semantic_search, qid_to_query)

In [None]:
hybrid_search_run_dict, hybrid_search_latencies = await build_run_dict_async(async_hybrid_search, qid_to_query)

In [None]:
hybrid_semantic_search_run_dict, hybrid_semantic_search_latencies = await build_run_dict_async(async_hybrid_semantic_search, qid_to_query)

In [None]:
# result = await async_vector_search(query="ÊëÇÊ∞è„Åß„ÅØ„Éû„Ç§„Éä„Çπ273.15Â∫¶„Å´„ÅÇ„Åü„Çã")
# result

query = "ÊëÇÊ∞è„Åß„ÅØ„Éû„Ç§„Éä„Çπ273.15Â∫¶„Å´„ÅÇ„Åü„Çã"
# Vector query over the embedding field; the service vectorizes the text.
vq = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=10,
    fields="text_vector",
)

# query_type must be SEMANTIC for the semantic configuration to be applied;
# semantic_query supplies the text to rerank against because search_text is None.
results = search_client.search(
    search_text=None,
    vector_queries=[vq],
    select=["raw_id"],
    query_type=QueryType.SEMANTIC,
    semantic_query=query,
    semantic_configuration_name="semantic_config",
)

def to_jsonable(x):
    """Recursively convert ``x`` into plain data suitable for ``json.dumps``.

    Objects exposing ``as_dict()`` are replaced by its result, lists and dicts
    are converted element by element, and anything else is returned unchanged.

    NOTE(review): this repeats the ``to_jsonable`` defined in an earlier cell.
    """
    if hasattr(x, "as_dict"):
        converted = x.as_dict()
    elif isinstance(x, list):
        converted = list(map(to_jsonable, x))
    elif isinstance(x, dict):
        converted = {key: to_jsonable(value) for key, value in x.items()}
    else:
        converted = x
    return converted

# Collect the hits under their own name. `docs` is the corpus list that the
# Blob upload cell reads, so it must not be overwritten with search results.
vector_semantic_hits = []
for r in results:
    d = to_jsonable(dict(r))
    debug_info = r.get("@search.debugInfo")
    if debug_info:
        d["@search.debugInfo"] = to_jsonable(debug_info)
    vector_semantic_hits.append(d)

print(json.dumps(vector_semantic_hits, ensure_ascii=False, indent=2))
   

In [None]:
from ranx import Qrels, Run, evaluate

def evaluate_run_dict(run_dict: Dict[str, Dict[str, float]], qrels: Dict[str, Dict[str, int]]):
    """Score ``run_dict`` against ``qrels`` with ranx and print one line per metric.

    Args:
        run_dict: ``{qid: {docid: score}}`` produced by a search run.
        qrels: ``{qid: {docid: relevance}}`` ground truth.
    """
    ndcg_metrics = ["ndcg@1", "ndcg@3", "ndcg@5", "ndcg@10"]
    other_metrics = ["map", "mrr", "precision@10", "recall@10", "recall@100"]
    metrics = ndcg_metrics + other_metrics

    scores = evaluate(Qrels(qrels), Run(run_dict), metrics=metrics)
    for metric_name in metrics:
        value = scores[metric_name]
        print(f"{metric_name:>12}: {value:.4f}")

In [None]:
import numpy as np

def print_latency_stats(latencies: List[float]):
    """Print count, mean, median, p90/p95/p99 and min/max of ``latencies`` (seconds).

    An empty input prints the count and a short notice instead of raising,
    since min/max are undefined for an empty array.
    """
    samples = np.asarray(latencies, dtype=float)

    print(f"Count    : {len(samples)}")
    if samples.size == 0:
        print("No latency samples to summarize.")
        return

    print(f"Mean     : {samples.mean():.4f} sec")
    print(f"Median   : {np.median(samples):.4f} sec")
    print(f"p90      : {np.percentile(samples, 90):.4f} sec")
    print(f"p95      : {np.percentile(samples, 95):.4f} sec")
    print(f"p99      : {np.percentile(samples, 99):.4f} sec")
    print(f"Min/Max  : {samples.min():.4f} / {samples.max():.4f} sec")

In [None]:
evaluate_run_dict(fulltext_search_run_dict, qrels)
print_latency_stats(fulltext_search_latencies)

In [None]:
evaluate_run_dict(fulltext_semantic_search_run_dict, qrels)
print_latency_stats(fulltext_semantic_search_latencies)

In [None]:
evaluate_run_dict(vector_search_run_dict, qrels)
print_latency_stats(vector_search_latencies)

In [None]:
evaluate_run_dict(vector_semantic_search_run_dict, qrels)
print_latency_stats(vector_semantic_search_latencies)

In [None]:
evaluate_run_dict(hybrid_search_run_dict, qrels)
print_latency_stats(hybrid_search_latencies)

# Ragas„Å´„Çà„ÇãË©ï‰æ°