In [None]:
import json
import math
import os

import httpx
import pandas as pd
from elasticsearch import Elasticsearch
from tqdm.notebook import tqdm

In [None]:
# Connect to the two Elasticsearch clusters used by this notebook. All
# connection settings are read from environment variables, so a missing
# variable fails here with a KeyError rather than later during a search.
local_host = os.environ["ELASTIC_HOST"]
local_credentials = (
    os.environ["ELASTIC_USERNAME"],
    os.environ["ELASTIC_PASSWORD"],
)
local_es = Elasticsearch(hosts=local_host, http_auth=local_credentials)

remote_host = os.environ["REMOTE_ELASTIC_HOST"]
remote_credentials = (
    os.environ["REMOTE_ELASTIC_USERNAME"],
    os.environ["REMOTE_ELASTIC_PASSWORD"],
)
remote_es = Elasticsearch(hosts=remote_host, http_auth=remote_credentials)

In [None]:
# Collect every work that has a `data.subjects` field from the remote index.
#
# The hits are paged with a point-in-time (PIT) plus `search_after`, sorted by
# `_id`, keeping only the subjects' source identifiers in `_source`.
PAGE_SIZE = 10_000
PIT_KEEP_ALIVE = "1m"

pit = remote_es.open_point_in_time(
    index="works-indexed-2021-07-19", keep_alive=PIT_KEEP_ALIVE
)


def _search_subjects_page(pit, search_after=None, track_total_hits=False):
    """Fetch one page of works that have subjects.

    Args:
        pit: the point-in-time descriptor to search against (a mapping that
            carries the PIT ``id``).
        search_after: the ``sort`` values of the last hit of the previous
            page, or ``None`` to fetch the first page.
        track_total_hits: whether to ask Elasticsearch for the total hit count.

    Returns:
        The raw search response.
    """
    body = {
        "query": {"exists": {"field": "data.subjects"}},
        "sort": [{"_id": {"order": "asc"}}],
        "size": PAGE_SIZE,
        "pit": {**pit, "keep_alive": PIT_KEEP_ALIVE},
    }
    if search_after is not None:
        body["search_after"] = search_after
    return remote_es.search(
        body=body,
        _source="data.subjects.id.sourceIdentifier",
        track_total_hits=track_total_hits,
    )


try:
    # Only the first request needs the total; it sizes the progress bar.
    response = _search_subjects_page(pit, track_total_hits=True)
    # Elasticsearch may hand back a different PIT id on any response; always
    # carry the most recent one forward (and close that one at the end).
    pit = {**pit, "id": response.get("pit_id", pit["id"])}

    results = response["hits"]["hits"]
    total_results = response["hits"]["total"]["value"]
    n_batches = math.ceil(total_results / PAGE_SIZE)

    with tqdm(total=total_results) as loop:
        loop.update(len(results))

        # The first batch has already been fetched above, so only the
        # remaining `n_batches - 1` pages are requested here.
        for _ in range(1, n_batches):
            response = _search_subjects_page(
                pit, search_after=results[-1]["sort"]
            )
            pit = {**pit, "id": response.get("pit_id", pit["id"])}

            page = response["hits"]["hits"]
            if not page:
                # Nothing left to read; stop rather than re-issuing the same
                # `search_after` cursor.
                break
            results.extend(page)
            loop.update(len(page))
finally:
    # Release the PIT even when a search fails part-way through.
    remote_es.close_point_in_time(body=pit)

In [None]:
# Reduce the raw hits to the distinct (identifier type, identifier value)
# pairs found across all subjects.
unique_subjects = set()

for hit in tqdm(results):
    source = hit["_source"]
    if not source:
        # Hits with an empty `_source` carry no subject identifiers.
        continue

    subjects = source["data"]["subjects"]
    if not subjects:
        continue

    unique_subjects.update(
        (
            subject["id"]["sourceIdentifier"]["identifierType"]["id"],
            subject["id"]["sourceIdentifier"]["value"],
        )
        for subject in subjects
    )

In [None]:
# Display the number of distinct (identifier type, identifier value) pairs.
len(unique_subjects)

In [None]:
# Tabulate the unique subject identifiers and save them as a JSON list of
# {"id_type": ..., "id": ...} records.
#
# `unique_subjects` is a set, so its iteration order is arbitrary and can
# differ between kernel sessions. Sorting the rows makes the written file
# identical from run to run for the same input.
df = (
    pd.DataFrame(list(unique_subjects), columns=["id_type", "id"])
    .sort_values(["id_type", "id"])
    .reset_index(drop=True)
)
records = df.to_dict("records")

with open("../data/unique_subjects.json", "w") as f:
    json.dump(records, f)