By the end of this notebook, I want to have built a graph in Neo4j that looks like this:

![](./structure.png)

In [None]:
import json
import os

import httpx
import pandas as pd
from elasticsearch import Elasticsearch
from neomodel import (
    DateProperty,
    IntegerProperty,
    Relationship,
    RelationshipFrom,
    RelationshipTo,
    StringProperty,
    StructuredNode,
    UniqueIdProperty,
    clear_neo4j_database,
    config,
    db,
)
from tqdm.notebook import tqdm

In [None]:
# Point neomodel at the Neo4j instance named by NEO4J_BOLT_URL, then clear it
# so the ingest below starts from an empty database.
bolt_url = os.environ["NEO4J_BOLT_URL"]
config.DATABASE_URL = bolt_url
db.set_connection(bolt_url)
clear_neo4j_database(db)

## super simple stories ingest

In [None]:
class Concept(StructuredNode):
    """Graph node for a concept, identified by its unique ``name``."""
    uid = UniqueIdProperty()
    # Mandatory and uniquely indexed: two Concept nodes cannot share a name.
    name = StringProperty(unique_index=True, required=True)
    # Outgoing AKA edges to the VariantName nodes holding alternative names.
    variant_name = RelationshipTo("VariantName", "AKA")
    # Outgoing HAS_CONCEPT edges to Story nodes; Story.concept declares the
    # incoming side of the same relationship.
    stories = RelationshipTo("Story", "HAS_CONCEPT")


class Contributor(StructuredNode):
    """Graph node for a story contributor, identified by its unique ``name``."""
    uid = UniqueIdProperty()
    # Mandatory and uniquely indexed: two Contributor nodes cannot share a name.
    name = StringProperty(unique_index=True, required=True)
    # Outgoing AKA edges to VariantName nodes.
    variant_name = RelationshipTo("VariantName", "AKA")
    # Outgoing CONTRIBUTED_TO edges to Story nodes; Story.contributor declares
    # the incoming side of the same relationship.
    stories = RelationshipTo("Story", "CONTRIBUTED_TO")


class Story(StructuredNode):
    """Graph node for a story, identified by its unique ``title``."""
    uid = UniqueIdProperty()
    # Mandatory and uniquely indexed: two Story nodes cannot share a title.
    title = StringProperty(unique_index=True, required=True)
    # Publication date; optional (no ``required=True``).
    published = DateProperty()
    # Incoming CONTRIBUTED_TO edges from Contributor nodes.
    contributor = RelationshipFrom("Contributor", "CONTRIBUTED_TO")
    # Incoming HAS_CONCEPT edges from Concept nodes.
    concept = RelationshipFrom("Concept", "HAS_CONCEPT")


class VariantName(StructuredNode):
    """Graph node for an alternative name, identified by its unique ``name``."""
    uid = UniqueIdProperty()
    # Mandatory and uniquely indexed: each variant spelling is stored once.
    name = StringProperty(unique_index=True, required=True)
    # Incoming AKA edges from Concept nodes.
    # NOTE(review): Contributor also declares an AKA edge to VariantName, but
    # no matching RelationshipFrom("Contributor", "AKA") is defined here.
    concept = RelationshipFrom("Concept", "AKA")

In [None]:
# Load the stories export. Missing values become empty strings so that the
# comma-splitting of text columns further down always operates on a str.
stories_path = "../../data/stories.json"
df = pd.read_json(stories_path).fillna("")

In [None]:
# The publication date is converted from epoch milliseconds to pandas
# datetimes, replacing the column in place.
published_ms = df["Date published"]
df["Date published"] = pd.to_datetime(published_ms, unit="ms")

### create all the stories

In [None]:
# Save one Story node per row and keep the saved nodes keyed by title so the
# wiring step can look them up again.
stories = {}
title_and_date = zip(df["Title"], df["Date published"])
for title, published_at in tqdm(title_and_date, total=len(df)):
    stories[title] = Story(title=title, published=published_at.date()).save()

### create all the contributors

In [None]:
# Authors and image makers are both treated as contributors: split the two
# comma-separated columns, trim each name, drop blanks and de-duplicate.
unique_contributors = list(
    {
        person
        for authors, images_by in df[["Author", "Images by"]].values
        for person in map(str.strip, authors.split(",") + images_by.split(","))
        if person
    }
)


# Save one Contributor node per distinct name, keyed by that name.
contributors = {}
for name in tqdm(unique_contributors):
    contributors[name] = Contributor(name=name).save()

### create all the concepts

In [None]:
# Concepts come from the comma-separated "Keywords" column: trim each keyword,
# drop blanks and de-duplicate.
unique_concepts = list(
    {
        keyword
        for keywords in df["Keywords"].values
        for keyword in map(str.strip, keywords.split(","))
        if keyword
    }
)


# Save one Concept node per distinct keyword, keyed by that keyword.
concepts = {}
for name in tqdm(unique_concepts):
    concepts[name] = Concept(name=name).save()

### connect them up

In [None]:
def _split_names(raw):
    """Split a comma-separated string into stripped, non-empty names."""
    return [part.strip() for part in raw.split(",") if part.strip() != ""]


# Link every story to its contributors (authors and image makers alike) and
# to its concepts, reusing the nodes saved in the earlier cells.
row_values = zip(df["Title"], df["Author"], df["Images by"], df["Keywords"])
for title, authors, images_by, keywords in tqdm(row_values, total=len(df)):
    story = stories[title]

    for name in _split_names(authors) + _split_names(images_by):
        story.contributor.connect(contributors[name])

    for name in _split_names(keywords):
        story.concept.connect(concepts[name])

### create variant names

In [None]:
def clean(input_string):
    """Normalise a name: drop commas, trim surrounding whitespace, lower-case.

    Commas are removed before trimming so that a comma at either end of the
    input cannot leave stray whitespace behind in the result.

    Args:
        input_string: The raw name.

    Returns:
        The normalised name as a new string.
    """
    return input_string.replace(",", "").strip().lower()


def get_variant_names(
    concept, languages=("en", "en-gb", "en-ca", "en-us", "en-simple")
):
    """Look up alternative names for a concept on Wikidata.

    Searches Wikidata for ``concept``, naively takes the first search hit and
    collects that entity's labels and aliases in the requested languages.

    Args:
        concept: Text to search Wikidata for.
        languages: Language codes whose labels and aliases are kept. Any
            container supporting ``in`` works; the default is an immutable
            tuple so it cannot be altered between calls.

    Returns:
        A list of unique names, each passed through ``clean``, in arbitrary
        order. The list is empty when the search returns no results or a
        response lacks the expected keys.
    """
    search_response = httpx.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "language": "en",
            "format": "json",
            "search": concept,
        },
    ).json()

    try:
        # naively select the first result
        wikidata_id = search_response["search"][0]["id"]

        # Fetch over HTTPS directly: the plain-HTTP address only answers with
        # a redirect, and recent httpx versions do not follow redirects unless
        # asked to, which would leave no JSON body to decode.
        entity_response = httpx.get(
            f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
        ).json()

        data = entity_response["entities"][wikidata_id]
        labels = [
            label["value"]
            for label in data["labels"].values()
            if label["language"] in languages
        ]
        aliases = [
            alias["value"]
            for group in data["aliases"].values()
            for alias in group
            if alias["language"] in languages
        ]
    except (IndexError, KeyError):
        # No search hit, or a payload without the expected structure: treat
        # the concept as having no known variants rather than failing.
        return []

    return list({clean(name) for name in labels + aliases})

In [None]:
# Query Wikidata once per concept and remember the variant names it returned.
variants = {}
for concept_name in tqdm(unique_concepts):
    variants[concept_name] = get_variant_names(concept_name)

In [None]:
# Flatten the lookup results into (concept name, variant name) pairs.
# Variant names come out of clean() lower-cased and without commas, so the
# concept name is put through clean() too before comparing; otherwise a
# concept containing upper-case letters would be linked to its own
# lower-cased spelling as if that were a different name.
all_variant_name_edges = [
    (concept_core_name, variant_name)
    for concept_core_name, variant_names in tqdm(variants.items())
    for variant_name in variant_names
    if variant_name != clean(concept_core_name)
]

In [None]:
# Collapse the edge list to the distinct variant names that each need a node.
unique_variant_names = list({variant for _, variant in all_variant_name_edges})

In [None]:
# Save one VariantName node per distinct variant, keyed by the variant text.
variant_dict = {}
for variant_name in tqdm(unique_variant_names):
    variant_dict[variant_name] = VariantName(name=variant_name).save()

### connect them up

In [None]:
# Draw an AKA edge from each concept to every one of its variant names.
for concept_core_name, variant_name in tqdm(all_variant_name_edges):
    concepts[concept_core_name].variant_name.connect(variant_dict[variant_name])