# In [None]:
import requests
import logging
import pandas as pd
import os
import time
from config import config  # Ensure this defines get_vespa_config()

# Emit timestamped, INFO-level log lines for everything this script logs.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Vespa connection settings, loaded from the project's config module.
vespa_cfg = config.get_vespa_config()
endpoint = vespa_cfg["endpoint"].rstrip("/")  # drop trailing "/" so paths can be appended as "/..."
cert = vespa_cfg["cert_file_path"]  # passed to requests together with `key` as the client cert pair
key = vespa_cfg["key_file_path"]
# Namespace and document type used to build the /document/v1 URL paths.
NAMESPACE = "doc"
DOCUMENT_TYPE = "doc"

# CSV file that receives one row per document found to be missing the field.
OUTPUT_FILE = "vespa_docs_missing_field.csv"


def safe_append_to_csv(record, output_file):
    """Append a single record to a CSV file, writing the header row when needed.

    Args:
        record: Mapping of column name to value; becomes one CSV row.
        output_file: Path of the CSV file to append to. It is created if it
            does not exist.

    The header is written when the file is missing *or* exists but is empty
    (for example, pre-created or left behind by an interrupted run), so the
    resulting file always starts with its column names.
    """
    needs_header = (
        not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    )
    pd.DataFrame([record]).to_csv(
        output_file, mode="a", header=needs_header, index=False
    )


def fetch_all_docs_missing_field(field_name, max_retries=5, retry_delay=10):
    """Scan all Vespa documents and record the ones that lack ``field_name``.

    Pages through the ``/document/v1`` listing for NAMESPACE/DOCUMENT_TYPE
    using the ``continuation`` token, fetches each listed document by id, and
    appends one row to OUTPUT_FILE for every document whose ``fields`` object
    has no ``field_name`` key. Progress is logged every 50 documents.

    Args:
        field_name: Name of the field whose absence is being reported.
        max_retries: Number of consecutive failed attempts tolerated for one
            batch before the scan is abandoned (previously retried forever).
        retry_delay: Seconds to sleep between attempts for a failed batch.

    Returns:
        None. Results go to OUTPUT_FILE; counts are reported through logging.
    """
    # Local import: only this function needs it, and it keeps the edit
    # self-contained with respect to the file's import block.
    from urllib.parse import quote

    listing_url = f"{endpoint}/document/v1/{NAMESPACE}/{DOCUMENT_TYPE}/docid"
    continuation = None
    processed_count = 0
    missing_count = 0
    consecutive_failures = 0

    # One session reuses the TLS connection across the many per-document GETs.
    with requests.Session() as session:
        session.cert = (cert, key)

        while True:
            url = listing_url
            if continuation:
                # Percent-encode so the token cannot corrupt the query string.
                url += f"?continuation={quote(continuation, safe='')}"

            logging.info(f"📡 Fetching batch from Vespa: {url}")
            try:
                response = session.get(url, timeout=30)
                if response.status_code != 200:
                    logging.error(f"❌ Failed to fetch docs: {response.status_code}")
                    break
                # Parsed inside the try so a malformed body is retried
                # instead of crashing the whole scan.
                data = response.json()
            except (requests.RequestException, OSError, ValueError) as e:
                consecutive_failures += 1
                if consecutive_failures > max_retries:
                    logging.error(
                        f"❌ Giving up after {consecutive_failures} consecutive failed attempts: {e}"
                    )
                    break
                logging.error(f"⚠️ Request error: {e}, retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                continue
            consecutive_failures = 0

            documents = data.get("documents", [])
            logging.info(f"✅ Retrieved {len(documents)} documents in this batch")

            for doc in documents:
                # Bound before the try so the error log below can always name it.
                docid = None
                try:
                    docid = doc.get("id")
                    # Split only on the first "::" so user-specified ids that
                    # themselves contain "::" are kept whole.
                    vespa_id = docid.split("::", 1)[-1]
                    # The id is a single path segment; encode "/" and friends.
                    doc_url = f"{listing_url}/{quote(vespa_id, safe='')}"

                    # NOTE(review): the listing entries may already carry
                    # "fields", which would make this per-document GET
                    # redundant — confirm against the Vespa response shape.
                    doc_resp = session.get(doc_url, timeout=30)
                    if doc_resp.status_code != 200:
                        logging.warning(f"⚠️ Could not fetch {vespa_id}: {doc_resp.status_code}")
                        continue

                    fields = doc_resp.json().get("fields", {})
                    doc_name = fields.get("document_name")

                    if field_name not in fields:
                        record = {"id": docid, "doc_name": doc_name, "status": "missing_field"}
                        safe_append_to_csv(record, OUTPUT_FILE)
                        missing_count += 1

                    processed_count += 1
                    if processed_count % 50 == 0:
                        logging.info(f"📈 Processed {processed_count} docs, Missing so far: {missing_count}")

                except Exception as e:
                    # Best-effort scan: one bad document must not stop the run.
                    logging.error(f"❌ Error processing doc {docid}: {e}")

            continuation = data.get("continuation")
            if not continuation:
                logging.info("✅ Completed pagination.")
                break

    logging.info(f"🏁 Finished. Processed: {processed_count}, Missing: {missing_count}")


# Script entry point: report every document that has no "engine_type" field.
if __name__ == "__main__":
    fetch_all_docs_missing_field("engine_type")