In [None]:
import os

import pandas as pd
from elasticsearch import Elasticsearch
from tqdm.notebook import tqdm

In [None]:
# Load the stories export into a DataFrame. The path is relative, so it is
# resolved against the working directory the notebook kernel runs in.
stories_json_path = "../../data/stories.json"
df = pd.read_json(stories_json_path)

In [None]:
# Connection details are read from the environment so that no host or
# credentials are stored in the notebook. A missing variable raises KeyError.
elastic_host = os.environ["ELASTIC_HOST"]
elastic_credentials = (
    os.environ["ELASTIC_USERNAME"],
    os.environ["ELASTIC_PASSWORD"],
)
es = Elasticsearch(elastic_host, http_auth=elastic_credentials)

In [None]:
# Name of the Elasticsearch index; passed to `es.indices.create` when the
# index is created.
INDEX_NAME = "stories"

In [None]:
def _text_with_keyword():
    """Return a fresh mapping for a `text` field with a `keyword` sub-field.

    The field itself is analysed as `text`; the `<field>.keyword` sub-field
    stores the same value as a single un-analysed `keyword` term. A new dict
    is built on every call so that no two fields share a mutable object.
    """
    return {
        "type": "text",
        "fields": {"keyword": {"type": "keyword"}},
    }


# Explicit field mappings for the stories index, keyed by the field names used
# in the documents.
mappings = {
    "properties": {
        "id": {"type": "keyword"},
        "Author": _text_with_keyword(),
        "Date published": {
            "type": "date",
            # Values are expected as milliseconds since the Unix epoch.
            "format": "epoch_millis",
        },
        "Images by": _text_with_keyword(),
        # `csv_analyzer` is a custom analyzer; it has to be declared in the
        # index settings supplied when the index is created.
        "Keywords": {"type": "text", "analyzer": "csv_analyzer"},
        # `index` is a boolean mapping parameter, so pass a real boolean
        # rather than the string "false".
        "Notes": {**_text_with_keyword(), "index": False},
        "Part of": _text_with_keyword(),
        "Quarter": {"type": "keyword"},
        "Relates to": _text_with_keyword(),
        "Title": {"type": "text"},
        "Type": {"type": "keyword"},
        "URL": {"type": "keyword"},
        "Wikidata ID": {"type": "keyword"},
        # NOTE(review): the `keyword` sub-field has no `ignore_above`; very
        # long full-text values may be rejected at index time -- confirm
        # whether this sub-field is needed.
        "fulltext": _text_with_keyword(),
    }
}

# Index analysis settings: declares the custom `csv_analyzer` (used by the
# "Keywords" field mapping), which is backed by a `pattern` tokenizer whose
# pattern is a comma. No token filters are configured.
settings = dict(
    analysis=dict(
        analyzer=dict(csv_analyzer=dict(tokenizer="csv_tokenizer")),
        tokenizer=dict(csv_tokenizer=dict(type="pattern", pattern=",")),
    )
)

In [None]:
# Create the index with the field mappings and analysis settings built in the
# previous cell.
# NOTE(review): this call is expected to fail if the index already exists, so
# re-running the notebook presumably requires deleting the index first --
# confirm.
create_index_kwargs = {
    "index": INDEX_NAME,
    "mappings": mappings,
    "settings": settings,
}
es.indices.create(**create_index_kwargs)

In [None]:
# Index every DataFrame row as one document, sending one request per row.
# The target index comes from INDEX_NAME (instead of a repeated "stories"
# literal) so that it always matches the index created earlier.
# NOTE(review): no document id is passed, so Elasticsearch presumably assigns
# one per request and re-running this cell would append duplicates -- confirm.
for _, row in tqdm(df.iterrows(), total=len(df)):
    es.index(index=INDEX_NAME, document=row.to_dict())