# MIRACL Benchmark with analyzers

This experiment compares the retrieval quality of the Microsoft and Lucene language analyzers in Azure AI Search (with a no-analyzer index as a baseline) on the Japanese subset of the MIRACL benchmark.

In [None]:
%pip install datasets pandas azure-identity "azure-search-documents==11.6.0b1" azure-cosmos

In [None]:
## Load the MIRACL corpus (for 'ja' this is a dump of a bunch of data from Japanese Wikipedia)
import datasets

lang = 'ja'  # or any of the 16 languages

# load_dataset returns every split of the corpus; only 'train' is used by this notebook.
corpus_splits = datasets.load_dataset(
    'miracl/miracl-corpus',
    lang,
    cache_dir='.cache',
    trust_remote_code=True,
)
miracl_corpus = corpus_splits['train']


In [None]:
## Upload the documents in the corpus to Cosmos DB in batches (50 at a time)
# This will take hours..
from azure.cosmos.aio import CosmosClient
from azure.cosmos.partition_key import PartitionKey
import os
import asyncio

# Replace the connection string with your own.
client = CosmosClient(os.environ["AZURE_COSMOS_URI"], credential=os.environ["AZURE_COSMOS_KEY"])

db = await client.create_database_if_not_exists(id='miracl')
# setup container for this sample
container = await db.create_container_if_not_exists(id='corpus',
                                             partition_key=PartitionKey(path='/docid', kind='Hash'))

for i in range(0, 40): # ~270,000 documents
   test_corpus = miracl_corpus.shard(1000, i)

   for batch in test_corpus.to_pandas(batched=True, batch_size=50):
      documents = batch.to_dict(orient='records')
      tasks = []
      for doc in documents:
         doc['id'] = doc['docid'].replace("#", "i")
         tasks.append(container.upsert_item(doc))
      await asyncio.gather(*tasks)
      print(f"Added batch of 50 documents to CosmosDB in batch {i}, last index - {doc['docid']}.")
   print(f"Added batch {i} to CosmosDB index.")


In [None]:
""" Capture a list of all the documents we did upload with the same sharding criteria as above """
# Walk the same 40 shards (of 1000), 50 rows at a time, keeping only each row's docid.
indexed_docs = []
for shard_index in range(40):
   shard = miracl_corpus.shard(1000, shard_index)
   for frame in shard.to_pandas(batched=True, batch_size=50):
      indexed_docs += frame['docid'].tolist()


In [None]:
import datasets
import os
import json

token = os.environ["HUGGING_FACE_TOKEN"]
lang='ja'  # or any of the 16 languages
# `token` replaces the deprecated `use_auth_token` argument of load_dataset.
# NOTE(review): the corpus cell passes trust_remote_code=True; confirm whether this dataset needs it too.
miracl = datasets.load_dataset('miracl/miracl', lang, token=token, cache_dir='.cache')

# Membership is tested once per judged passage, so use a set rather than
# scanning the full list of uploaded docids every time.
indexed_docids = set(indexed_docs)

# training set:
# Maps query_id -> {'query', 'positive_passages', 'negative_passages'}, where the
# passage lists only contain docids that were actually uploaded.
questions = {}

for data in miracl['train']:  # or 'dev', 'testA'
  question = questions.setdefault(
    data['query_id'],
    {'query': data['query'], 'positive_passages': [], 'negative_passages': []})

  for passage_kind in ('positive_passages', 'negative_passages'):
    question[passage_kind].extend(
      entry['docid'] for entry in data[passage_kind] if entry['docid'] in indexed_docids)

# Clean up the questions and remove any that don't have positive passages
searchable_questions = {
  query_id: question
  for query_id, question in questions.items()
  if question['positive_passages']
}

# The output directory may not exist on a fresh checkout.
os.makedirs('data', exist_ok=True)
with open('data/miracl_questions.json', 'w') as f:
  json.dump(searchable_questions, f)


In [None]:
# Number of rows in the MIRACL 'train' split loaded above (before filtering to uploaded passages).
len(miracl['train'])

In [None]:
# TODO: Automate the following:
# - Create a datasource in Azure Search for Cosmos DB with data change tracking
# - Create three indexes in Azure Search for the datasource with the right fields. One for each analyzer option
# - Create a skillset in Azure Search with vectorizers
# - Create an indexer in Azure Search to index the Cosmos DB data into the index
# - Test

In [10]:
from azure.identity.aio import DefaultAzureCredential
from azure.search.documents.aio import SearchClient
from azure.search.documents.models import VectorizableTextQuery
import os
import json

service_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
azure_cred = DefaultAzureCredential()

search_clients = {
  "ja-microsoft": SearchClient(service_endpoint, 'test-miracl-index-ja-microsoft', azure_cred),
  "ja-lucene": SearchClient(service_endpoint, 'test-miracl-index-ja-lucene', azure_cred),
  "no-analyzer": SearchClient(service_endpoint, 'test-miracl-index-no-analyzer', azure_cred)
}

with open('data/miracl_questions.json', 'r') as f:
  questions = json.load(f)

results = {}

for query_id, query in questions.items():
  print(f"Query: {query['query']}")
  results[query_id] = {
    "query": query['query'],
    "positive_passages": query['positive_passages'],
    "negative_passages": query['negative_passages'],
    "ja-lucene-hybrid-results": [],
    "ja-lucene-hybrid-answers": [],
    "ja-lucene-semantic-results": [],
    "ja-lucene-semantic-answers": [],
    "ja-microsoft-hybrid-results": [],
    "ja-microsoft-hybrid-answers": [],
    "ja-microsoft-semantic-results": [],
    "ja-microsoft-semantic-answers": [],
    "no-analyzer-hybrid-results": [],
    "no-analyzer-hybrid-answers": [],
    "no-analyzer-semantic-results": [],
    "no-analyzer-semantic-answers": [],
  }
  for analyzer, client in search_clients.items():
    # 1: Keyword Search (skip for now because it would be hopeless with this dataset)
    vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=15, fields="embedding", exhaustive=True)
    # 2: Hybrid Search
    response = await client.search(
      search_text=query['query'],
      vector_queries=[vector_query],
      top=3,
      query_language="ja-JP")
    matches = []
    async for doc in response:
      matches.append(doc['docid'])
    answers = await response.get_answers()
    if answers:
      results[query_id][f"{analyzer}-hybrid-answers"] = [answer.key.replace("i", "#") for answer in answers]
    results[query_id][f"{analyzer}-hybrid-results"] = matches

    # 3: Semantic Search
    response = await client.search(
      search_text=query['query'],
      query_type="semantic",
      semantic_configuration_name="miracl-semantic",
      vector_queries=[
        vector_query
      ],
      top=3,
      query_answer="extractive",
      query_answer_count=3,
      query_caption="extractive",
      query_language="ja-JP")
    matches = []
    async for doc in response:
      matches.append(doc['docid'])

    results[query_id][f"{analyzer}-semantic-results"] = matches
    answers = await response.get_answers()
    if answers:
      results[query_id][f"{analyzer}-semantic-answers"] = [answer.key.replace("i", "#") for answer in answers]

  print(results[query_id])

with open('miracl-results.json', 'w') as f:
  json.dump(results, f)

Query: 海底ケーブルが初めて結ばれたのはどこ？


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D2827F290>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D282FDB90>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D27E47890>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D260AD1D0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D2882C910>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D280168D0>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D28B8C310>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D27E99B90>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D28AA2190>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000029D28BB7790>


TypeError: 'QueryAnswerResult' object is not subscriptable

In [None]:
# Analyze results

import json
import pandas as pd

with open('miracl-results.json', 'r') as f:
  results = json.load(f)

# Recall = share of a query's positive passages that appear in the returned hits.
# Results are stored per analyzer AND per search mode ("<analyzer>-<mode>-results"),
# so recall is computed for each combination.
for query_id, query in results.items():
  expected = set(query['positive_passages'])
  # TODO: Calculate NDCG@3 for each query
  for analyzer in ['ja-lucene', 'ja-microsoft', 'no-analyzer']:
    for mode in ['hybrid', 'semantic']:
      actual = set(query[f"{analyzer}-{mode}-results"])
      # Recall is undefined without positive passages; record NaN rather than dividing by zero.
      recall = len(expected & actual) / len(expected) if expected else float('nan')

      results[query_id][f"{analyzer}-{mode}-recall"] = recall

# One row per query, one column per stored field / recall value.
df = pd.DataFrame.from_dict(results, orient='index')

df.to_csv('miracl-results.csv')

