## Import Required Packages and Dependencies 

In [1]:
from ir_datasets import load
from pymongo import MongoClient
from pymongo.errors import BulkWriteError


## data set name and variables

In [2]:
dataset_name1='nano-beir/arguana'
dataset_name2='beir/webis-touche2020/v2'
dataset_name3='beir/quora/test'
dataset_name4='antique/test'
dataset_name=dataset_name3
batch_size=1000
name=dataset_name.replace("/", "-").replace("\\", "_").strip()

## connecting to mongodb

In [3]:

client = MongoClient("mongodb://localhost:27017/")
db = client["ir_project"]
collection = db[name]


## make index 

In [4]:

collection.create_index("doc_id", unique=True)

'doc_id_1'

## loading the dataset 

In [5]:

dataset = load(dataset_name)
count_inserted = 0
batch = []

print(f"🚀 Starting to load dataset: {name}")

for i, doc in enumerate(dataset.docs_iter()):
    doc_id = doc.doc_id
    text = doc.text

    batch.append({
        "doc_id": doc_id,
        "body": text
    })

    if len(batch) == batch_size:
        try:
            result = collection.insert_many(batch, ordered=False)
            count_inserted += len(result.inserted_ids)
            print(f"✅ Inserted {count_inserted} documents so far...")
        except BulkWriteError as bwe:
            # Skip duplicates gracefully
            num_errors = len(bwe.details.get("writeErrors", []))
            count_inserted += batch_size - num_errors
            print(f"⚠️ Skipped {num_errors} duplicates, inserted {batch_size - num_errors}")
            batch = []

# Final batch
if batch:
    try:
        result = collection.insert_many(batch, ordered=False)
        count_inserted += len(result.inserted_ids)
        print(f"✅ Final batch inserted. Total: {count_inserted}")
    except BulkWriteError as bwe:
        num_errors = len(bwe.details.get("writeErrors", []))
        count_inserted += len(batch) - num_errors
        print(f"⚠️ Final batch: Skipped {num_errors} duplicates, inserted {len(batch) - num_errors}")

print(f"🎉 Done! Inserted {count_inserted} documents into '{dataset_name}' collection.")


🚀 Starting to load dataset: beir-quora-test
✅ Inserted 1000 documents so far...
⚠️ Final batch: Skipped 1000 duplicates, inserted 521931
🎉 Done! Inserted 522931 documents into 'beir/quora/test' collection.
