In [None]:
%pip install datasets pandas azure-identity "azure-search-documents==11.6.0b1" azure-cosmos

In [None]:
import datasets

# Language code of the MIRACL corpus to load.
lang = 'ja'  # or any of the 16 languages

# Keep only the 'train' split; downloaded files are cached under ./.cache.
miracl_corpus = datasets.load_dataset(
    path='miracl/miracl-corpus',
    name=lang,
    cache_dir='.cache',
)['train']


In [None]:
from azure.cosmos.aio import CosmosClient
from azure.cosmos.partition_key import PartitionKey
import os
import asyncio

# Replace the connection string with your own.
client = CosmosClient(os.environ["AZURE_COSMOS_URI"], credential=os.environ["AZURE_COSMOS_KEY"])

db = await client.create_database_if_not_exists(id='miracl')
# setup container for this sample
container = await db.create_container_if_not_exists(id='corpus',
                                             partition_key=PartitionKey(path='/docid', kind='Hash'))

for i in range(35, 40):
   test_corpus = miracl_corpus.shard(1000, i)

   for batch in test_corpus.to_pandas(batched=True, batch_size=50):
      documents = batch.to_dict(orient='records')
      tasks = []
      for doc in documents:
         doc['id'] = doc['docid'].replace("#", "i")
         tasks.append(container.upsert_item(doc))
      await asyncio.gather(*tasks)
      print(f"Added batch of 50 documents to CosmosDB in batch {i}, last index - {doc['docid']}.")
   print(f"Added batch {i} to CosmosDB index.")


In [None]:
lang = 'ja'  # or any of the 16 languages
# `token=True` uses the locally stored Hugging Face access token. It replaces
# `use_auth_token`, which has been deprecated since datasets 2.14 in favour of `token`.
miracl = datasets.load_dataset('miracl/miracl', lang, token=True, cache_dir='.cache')

# training set:
# Walk every query to show the record layout; the values are only bound to names here.
for data in miracl['train']:  # or 'dev', 'testA'
    query_id = data['query_id']
    query = data['query']
    positive_passages = data['positive_passages']
    negative_passages = data['negative_passages']

    for entry in positive_passages:  # OR 'negative_passages'
        docid = entry['docid']
        title = entry['title']
        text = entry['text']

In [None]:
from azure.identity.aio import DefaultAzureCredential
from azure.search.documents.aio import SearchClient
from azure.search.documents.indexes.aio import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchableField,
    SearchIndex,
    SearchIndexer,
    SearchFieldDataType,
)
import os
# Analyzer name passed to the searchable `text` field of the index.
ANALYSER = "ja.microsoft"

# Fail fast with a KeyError when the endpoint is not configured, matching how the
# Cosmos DB settings are read; `os.getenv` would hand `None` to the clients below.
service_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]
index_name = 'test-miracl-index-ja-microsoft'
azure_cred = DefaultAzureCredential()

# One async client per concern: querying documents, managing indexes, managing indexers.
search_client = SearchClient(service_endpoint, index_name, azure_cred)
index_client = SearchIndexClient(service_endpoint, azure_cred)
indexer_client = SearchIndexerClient(service_endpoint, azure_cred)

async def create_miracl_corpus_index(name):
    """Create the MIRACL corpus search index unless one named `name` already exists.

    Args:
        name: Name of the search index to create.

    Returns:
        The result of `index_client.create_index` when the index was created,
        or None when an index called `name` is already present.
    """
    # `get_index` raises for a missing index instead of returning a falsy value,
    # so existence is checked against the service's list of index names.
    async for existing_name in index_client.list_index_names():
        if existing_name == name:
            return None

    fields = [
        SimpleField(name="docid", type=SearchFieldDataType.String, key=True),
        SimpleField(name="title", type=SearchFieldDataType.String),
        SearchableField(name="text", type=SearchFieldDataType.String, analyzer_name=ANALYSER),
    ]
    index = SearchIndex(name=name, fields=fields)
    return await index_client.create_index(index)

async def create_indexer(name, data_source_name="miracl-cosmos", skillset_name=None):
    """Create or update the indexer that fills the search index `name`.

    Args:
        name: Name of the target index; the indexer is named "<name>-indexer".
        data_source_name: Name of the data source the indexer reads from.
        skillset_name: Name of the skillset attached to the indexer; defaults to
            "<name>-skillset" when omitted.

    Returns:
        The result of `indexer_client.create_or_update_indexer`.
    """
    # NOTE(review): neither the data source nor the skillset is created in the visible
    # code; presumably both must already exist on the search service - confirm.
    if skillset_name is None:
        skillset_name = f"{name}-skillset"

    indexer = SearchIndexer(
        name=f"{name}-indexer",
        description="Indexer to index documents and generate embeddings",
        skillset_name=skillset_name,
        target_index_name=name,
        data_source_name=data_source_name,
    )

    return await indexer_client.create_or_update_indexer(indexer)

# Create the index first: the indexer built next targets it by name.
await create_miracl_corpus_index(index_name)
# Create or update the indexer for that index; as the cell's last expression its result is displayed.
await create_indexer(index_name)
