# MIRACL Benchmark with analyzers

This experiment compares the performance of the Microsoft and Lucene analyzers on the MIRACL benchmark.

In [None]:
%pip install datasets pandas azure-identity "azure-search-documents==11.6.0b1" azure-cosmos

In [None]:
## Fetch the MIRACL passage corpus (for 'ja' this is a dump of data from Japanese Wikipedia)
import datasets

# Language code of the corpus to load ('ja', or any of the 16 languages).
lang = 'ja'
_corpus_splits = datasets.load_dataset(
    'miracl/miracl-corpus',
    lang,
    cache_dir='.cache',
    trust_remote_code=True,
)
# Only the 'train' split is used by the rest of the notebook.
miracl_corpus = _corpus_splits['train']


In [None]:
## Upload the documents in the corpus to Cosmos DB in batches (50 at a time)
# This will take hours..
from azure.cosmos.aio import CosmosClient
from azure.cosmos.partition_key import PartitionKey
import os
import asyncio

# Replace the connection string with your own.
client = CosmosClient(os.environ["AZURE_COSMOS_URI"], credential=os.environ["AZURE_COSMOS_KEY"])

db = await client.create_database_if_not_exists(id='miracl')
# setup container for this sample
container = await db.create_container_if_not_exists(id='corpus',
                                             partition_key=PartitionKey(path='/docid', kind='Hash'))

for i in range(0, 40): # ~270,000 documents
   test_corpus = miracl_corpus.shard(1000, i)

   for batch in test_corpus.to_pandas(batched=True, batch_size=50):
      documents = batch.to_dict(orient='records')
      tasks = []
      for doc in documents:
         doc['id'] = doc['docid'].replace("#", "i")
         tasks.append(container.upsert_item(doc))
      await asyncio.gather(*tasks)
      print(f"Added batch of 50 documents to CosmosDB in batch {i}, last index - {doc['docid']}.")
   print(f"Added batch {i} to CosmosDB index.")


In [None]:
# Rebuild the list of uploaded docids by walking the same 40 shards (out of
# 1000), in the same batches of 50, that the upload cell iterates over.
indexed_docs = [
    record['docid']
    for shard_index in range(40)
    for frame in miracl_corpus.shard(1000, shard_index).to_pandas(batched=True, batch_size=50)
    for record in frame.to_dict(orient='records')
]


In [None]:
import datasets
import os
import json

token = os.environ["HUGGING_FACE_TOKEN"]
lang = 'ja'  # or any of the 16 languages
# `token` replaces the deprecated `use_auth_token` argument of load_dataset.
miracl = datasets.load_dataset('miracl/miracl', lang, token=token, cache_dir='.cache')

# Build the lookup set once: membership tests against the plain list are a
# linear scan, repeated for every passage of every query.
indexed_doc_ids = set(indexed_docs)

# Group the judged passages per query, keeping only passages that were
# actually uploaded to the index.
questions = {}

for data in miracl['train']:  # or 'dev', 'testA'
    question = questions.setdefault(
        data['query_id'],
        {'query': data['query'], 'positive_passages': [], 'negative_passages': []},
    )
    for label in ('positive_passages', 'negative_passages'):
        question[label].extend(
            entry['docid'] for entry in data[label] if entry['docid'] in indexed_doc_ids
        )

# Drop questions that have no indexed positive passage.
searchable_questions = {
    query_id: question
    for query_id, question in questions.items()
    if question['positive_passages']
}

# Make sure the output directory exists before writing into it.
os.makedirs('data', exist_ok=True)
with open('data/miracl_questions.json', 'w') as f:
    json.dump(searchable_questions, f)


In [None]:
# Number of rows in the MIRACL 'train' split (shown as the cell output).
len(miracl['train'])

In [None]:
from azure.identity.aio import DefaultAzureCredential
from azure.search.documents.aio import SearchClient
from azure.search.documents.indexes.aio import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchableField,
    SearchIndex,
    SearchIndexer,
    SearchFieldDataType,
)
from azure.core.exceptions import ResourceNotFoundError
import os
# Analyzer name assigned to the searchable "text" field of the index.
ANALYSER = "ja.microsoft"

# Names of the index and indexer managed by this cell.
index_name = 'test-miracl-index-ja-microsoft'
indexer_name = 'test-miracl-indexer-ja-microsoft'

# The endpoint is read from the environment (None when the variable is unset).
service_endpoint = os.environ.get("AZURE_SEARCH_ENDPOINT")
azure_cred = DefaultAzureCredential()

# Async clients for index management, indexer management and querying.
index_client = SearchIndexClient(service_endpoint, azure_cred)
indexer_client = SearchIndexerClient(service_endpoint, azure_cred)
search_client = SearchClient(service_endpoint, index_name, azure_cred)

async def create_miracl_corpus_index(name):
    """Create the MIRACL corpus search index unless it already exists.

    Args:
        name: Name of the search index to look up and, if missing, create.

    Returns:
        The result of ``index_client.create_index`` when the index was
        created, or ``None`` when an index called ``name`` was already there.
    """
    try:
        existing_index = await index_client.get_index(name)
    except ResourceNotFoundError:
        existing_index = None
    if existing_index:
        return

    # Only the passage text is searchable; it uses the configured analyzer.
    text_field = SearchableField(
        name="text", type=SearchFieldDataType.String, analyzer_name=ANALYSER
    )
    schema = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SimpleField(name="docid", type=SearchFieldDataType.String),
        SimpleField(name="title", type=SearchFieldDataType.String),
        text_field,
    ]
    return await index_client.create_index(SearchIndex(name=name, fields=schema))

async def create_indexer(name, index_name):
    """Create the indexer that feeds ``index_name`` unless it already exists.

    The indexer is registered under ``f"{name}-indexer"`` and reads from the
    ``miracl-cosmos`` data source.

    Args:
        name: Base name of the indexer; ``-indexer`` is appended to it.
        index_name: Name of the search index the indexer writes into.

    Returns:
        The result of ``indexer_client.create_or_update_indexer`` when the
        indexer was created, or ``None`` when it already existed.
    """
    # Look up the same name the indexer is created under; checking the bare
    # ``name`` would never find an indexer made by this function.
    indexer_name = f"{name}-indexer"

    try:
        if await indexer_client.get_indexer(indexer_name):
            return
    except ResourceNotFoundError:
        pass

    indexer = SearchIndexer(
        name=indexer_name,
        description="Indexer to index documents and generate embeddings",
        target_index_name=index_name,
        data_source_name="miracl-cosmos",
    )

    return await indexer_client.create_or_update_indexer(indexer)

# Create the index first; the indexer created on the next line points at it
# through target_index_name.
await create_miracl_corpus_index(index_name)
await create_indexer(indexer_name, index_name)


In [None]:
from azure.identity.aio import DefaultAzureCredential
from azure.search.documents.aio import SearchClient

import os
ANALYSER = "ja.microsoft"

service_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
azure_cred = DefaultAzureCredential()

search_clients = {
  "ja-microsoft": SearchClient(service_endpoint, 'test-miracl-index-ja-microsoft', azure_cred),
  "ja-lucene": SearchClient(service_endpoint, 'test-miracl-index-ja-lucene', azure_cred),
  "no-analyzer": SearchClient(service_endpoint, 'test-miracl-index-no-analyzer', azure_cred)
}

with open('data/miracl_questions.json', 'r') as f:
  questions = json.load(f)

results = {}

for query_id, query in questions.items():
  print(f"Query: {query['query']}")
  results[query_id] = {
    "query": query['query'],
    "positive_passages": query['positive_passages'],
    "negative_passages": query['negative_passages'],
    "ja-lucene-results": [],
    "ja-microsoft-results": [],
    "no-analyzer-results": []
  }
  for analyzer, client in search_clients.items():
    response = await search_client.search(
      search_text=query['query'],
      query_type="semantic",
      semantic_configuration_name="miracl-semantic",
      query_answer="extractive",
      query_answer_count=3,
      query_caption="extractive",
      query_language="ja-JP")
    answers = await response.get_answers()
    matches = [answer.as_dict()['key'].replace('i', '#') for answer in answers]
    results[query_id][f"{analyzer}-results"] = matches

with open('miracl-results.json', 'w') as f:
  json.dump(results, f)