In [None]:
import json
import os

import httpx
import pandas as pd
from elasticsearch import Elasticsearch
from tqdm.notebook import tqdm

In [None]:
# Load the stories dataset; its "Keywords" column is the source of the concepts derived below.
df = pd.read_json("../../data/stories.json")

In [None]:
df["Keywords"]

In [None]:
def clean(input_string):
    """Normalise a keyword or name for comparison.

    Surrounding whitespace is trimmed first, then the text is lowercased and
    every comma is removed.
    """
    trimmed = input_string.strip()
    lowered = trimmed.lower()
    return lowered.replace(",", "")

In [None]:
def _split_keywords(raw_keywords):
    """Turn one raw "Keywords" cell into a list of cleaned concept names.

    Missing values (None / NaN) produce an empty list instead of the literal
    concepts "none" / "nan", and fragments that are empty *after* cleaning
    (e.g. whitespace only) are dropped.
    """
    if raw_keywords is None or (
        isinstance(raw_keywords, float) and raw_keywords != raw_keywords
    ):
        return []
    cleaned = (clean(fragment) for fragment in str(raw_keywords).split(", "))
    return [concept_name for concept_name in cleaned if concept_name]


df["concepts"] = df["Keywords"].apply(_split_keywords)

In [None]:
# Sorted rather than list(set(...)): set iteration order for strings can differ
# between kernel sessions, which would make positional lookups such as
# unique_concepts[2] (and the order of downstream requests) non-reproducible.
unique_concepts = sorted(
    {concept for row_concepts in df["concepts"] for concept in row_concepts}
)

In [None]:
len(unique_concepts)

In [None]:
# Sample concept for ad-hoc experimentation; no later cell in this notebook reads this global.
concept = unique_concepts[2]

In [None]:
def get_variant_names(
    concept, languages=("en", "en-gb", "en-ca", "en-us", "en-simple")
):
    """Look up alternative names for a concept on Wikidata.

    The concept is searched with the ``wbsearchentities`` API and the first
    hit is taken as the matching entity. That entity's labels and aliases in
    the requested languages are cleaned with ``clean`` and de-duplicated.

    Args:
        concept: The text to search for.
        languages: Language codes whose labels and aliases are kept.

    Returns:
        A sorted list of unique, cleaned names. The list is empty when the
        concept is blank, the search has no results, or the responses do not
        have the expected structure.

    Errors raised by httpx itself (network failures, non-JSON bodies) are not
    caught here and propagate to the caller.
    """
    # Nothing to search for: skip the network round-trips entirely.
    if not concept or not str(concept).strip():
        return []

    search_response = httpx.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "language": "en",
            "format": "json",
            "search": concept,
        },
    ).json()

    try:
        # naively select the first result
        wikidata_id = search_response["search"][0]["id"]

        # Use HTTPS directly: the plain-HTTP URL relies on a redirect, and
        # current httpx releases do not follow redirects by default.
        entity_response = httpx.get(
            f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
        ).json()

        data = entity_response["entities"][wikidata_id]
        # A missing "labels" or "aliases" section should not discard the other one.
        labels = [
            label["value"]
            for label in data.get("labels", {}).values()
            if label["language"] in languages
        ]
        aliases = [
            alias["value"]
            for group in data.get("aliases", {}).values()
            for alias in group
            if alias["language"] in languages
        ]
        # Sorted so repeated runs yield the same order.
        variant_names = sorted({clean(name) for name in labels + aliases})

    except (IndexError, KeyError):
        variant_names = []

    return variant_names

In [None]:
get_variant_names("water")

In [None]:
# Map every unique concept to its Wikidata variant names (one lookup per concept).
variants = {}
for concept_name in tqdm(unique_concepts):
    variants[concept_name] = get_variant_names(concept_name)

In [None]:
# Flatten every concept's variant names into a single list (duplicates kept).
v = [variant_name for names in variants.values() for variant_name in names]

In [None]:
# 638
len(v)

In [None]:
# 625
len(set(v))

In [None]:
from collections import Counter

# Tally how often each variant name occurs across all concepts.
c = Counter(v)

In [None]:
# Variant names that were returned for more than one concept.
[name for name, occurrences in c.items() if occurrences > 1]

## find intersections

In [None]:
# Report every concept whose own name appears among the variant names of a
# different concept.
for concept_1, variant_names in tqdm(variants.items()):
    # Build the lookup set once per concept instead of scanning the list for
    # every candidate concept.
    variant_name_set = set(variant_names)
    for concept_2 in variants:
        if concept_1 != concept_2 and concept_2 in variant_name_set:
            print(f"{concept_1} is linked to {concept_2}")

In [None]:
# Report every ordered pair of distinct concepts that share at least one
# variant name. The sets are built once up front rather than being rebuilt
# for each pair inside the nested loop.
variant_sets = {name: set(names) for name, names in variants.items()}

for concept_1, variant_names_1 in tqdm(variant_sets.items()):
    for concept_2, variant_names_2 in variant_sets.items():
        if concept_1 == concept_2:
            continue
        intersection = variant_names_1.intersection(variant_names_2)
        if intersection:
            print(f"{concept_1} is linked to {concept_2} via {intersection}")

In [None]:
variants

In [None]:
def _expand_concepts(concepts):
    """Concatenate the variant names of each concept, in order, into one list."""
    expanded = []
    for concept_name in concepts:
        expanded.extend(variants[concept_name])
    return expanded


df["concepts_expanded"] = df["concepts"].apply(_expand_concepts)

In [None]:
df['concepts_expanded']

# index with variants

In [None]:
import os

import pandas as pd
from elasticsearch import Elasticsearch
from tqdm.notebook import tqdm
import json

In [None]:
# Connection details come from the environment so no credentials live in the notebook.
elastic_host = os.environ["ELASTIC_HOST"]
elastic_credentials = (
    os.environ["ELASTIC_USERNAME"],
    os.environ["ELASTIC_PASSWORD"],
)
es = Elasticsearch(elastic_host, http_auth=elastic_credentials)

In [None]:
INDEX_NAME = "stories"

In [None]:
def _load_json(path):
    """Read and parse the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Index mapping and settings are kept as JSON files next to the data.
mappings = _load_json('../../data/stories/mapping.json')
settings = _load_json('../../data/stories/settings.json')

In [None]:
# Remove any existing index before it is recreated; ignore=404 is passed so a
# missing index (e.g. on a first run) is presumably tolerated rather than raised.
es.indices.delete(index=INDEX_NAME, ignore=404)

In [None]:
# Create the index using the mapping and settings loaded from the JSON files above.
es.indices.create(index=INDEX_NAME, mappings=mappings, settings=settings)

In [None]:
df

In [None]:
# Index every row as one document. List-valued fields (e.g. the concept lists)
# are flattened to a comma-separated string before being sent.
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Build a fresh dict rather than rewriting values while iterating over it.
    document = {
        field: ",".join(map(str, value)) if isinstance(value, list) else value
        for field, value in row.to_dict().items()
    }
    # Write to the same index that was created above instead of a hard-coded name.
    es.index(index=INDEX_NAME, document=document)