In [None]:
import os
import httpx
from elasticsearch import Elasticsearch
from tqdm.notebook import tqdm
import json

In [None]:
def traverse(node, parent=None):
    """
    Recursively yield every node of a tree together with its parent.

    Parameters
    ----------
    node : dict
        A tree node. Must contain "label" and "label_type"; may contain
        "children", an iterable of nodes with the same layout. A missing,
        ``None`` or empty "children" entry marks a leaf.
    parent : dict or None
        ``{"label": ..., "label_type": ...}`` summary of the node's parent,
        or ``None`` for the root.

    Yields
    ------
    dict
        ``{"parent": <parent summary or None>, "child": <node summary>}``,
        in depth-first, pre-order sequence.
    """
    yield {
        "parent": parent,
        "child": {"label": node["label"], "label_type": node["label_type"]},
    }

    # .get() rather than [...] so that leaves which omit "children" entirely
    # are handled instead of raising KeyError.
    for child in node.get("children") or ():
        # A fresh summary dict per child, so consumers can mutate one yielded
        # "parent" without affecting its siblings.
        yield from traverse(
            child,
            {"label": node["label"], "label_type": node["label_type"]},
        )

In [None]:
# Elasticsearch client used for indexing; the host and the basic-auth
# credentials are read from the environment (a missing variable raises KeyError).
es = Elasticsearch(
    hosts=os.environ["ELASTIC_HOST"],
    http_auth=tuple(
        os.environ[name] for name in ("ELASTIC_USERNAME", "ELASTIC_PASSWORD")
    ),
)

In [None]:
# Newline-delimited JSON file of subjects; the cells below read it one line
# (one JSON document) at a time.
data_path = "/home/jovyan/data/lcsh.skos.ndjson"

In [None]:
# Count the lines in the data file up front so the progress bar in the
# indexing loop can show a total. The file is opened in a `with` block so the
# handle is closed deterministically, and decoded as UTF-8 rather than with the
# locale default.
with open(data_path, encoding="utf-8") as f:
    n_lines = sum(1 for _ in f)
n_lines

In [None]:
def lines():
    """
    Lazily yield one parsed JSON document per line of the file at `data_path`.

    The file is decoded as UTF-8 (the encoding JSON mandates) instead of the
    locale default. Blank lines, such as a trailing empty line at the end of
    the file, are skipped rather than passed to `json.loads`, which would
    raise and abort the iteration.

    Yields
    ------
    object
        The result of `json.loads` for each non-blank line.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)

In [None]:
# Smoke test: send the first record in the file to the enricher service and
# display the JSON it returns.
subject = next(lines())

# the subject ID is the last "/"-separated segment of the record's
# ["@context"]["about"] value
subject_id = subject["@context"]["about"].rsplit("/", 1)[-1]

response = httpx.get(
    url="http://enricher:80",
    params={"id_type": "lc_subjects", "id": subject_id},
    timeout=10,
).json()
response

In [None]:
# For every subject in the data file: ask the enricher service for its tree of
# names, flatten the tree into a list of labels, and index that list in
# Elasticsearch under the subject's ID.
#
# The loop is best-effort: a failure on one record is reported and the loop
# moves on. Only `Exception` is caught, so KeyboardInterrupt still stops the
# run when the kernel is interrupted.
for subject in tqdm(lines(), total=n_lines):
    try:
        subject_id = subject["@context"]["about"].split("/")[-1]
    except Exception:
        print(f"couldn't parse subject ID in:\n{subject}")
        # Skip this record. Carrying on would reuse the subject_id left over
        # from the previous iteration (re-indexing the wrong document), or
        # raise NameError if this is the very first record.
        continue

    try:
        response = httpx.get(
            url="http://enricher:80",
            params={"id_type": "lc_subjects", "id": subject_id},
            timeout=10,
        ).json()
        variant_names = [a["child"]["label"] for a in traverse(response)]
        es.index(
            index="variant-names-lcsh",
            id=subject_id,
            body={"variant_names": variant_names},
        )
    except Exception as error:
        # include the error so failures can be diagnosed from the output
        print(f"something went wrong with {subject_id}: {error!r}")