In [None]:
import datetime
import os
from pathlib import Path

import httpx
import pandas as pd
from httpx import ConnectError
from neomodel import (
    ArrayProperty,
    DateProperty,
    Relationship,
    RelationshipFrom,
    RelationshipTo,
    StringProperty,
    StructuredNode,
    UniqueIdProperty,
    clear_neo4j_database,
    config,
    db,
)
from tqdm.notebook import tqdm

In [None]:
# Point neomodel at the Neo4j instance named by the NEO4J_BOLT_URL
# environment variable (a missing variable raises KeyError right here).
neo4j_bolt_url = os.environ["NEO4J_BOLT_URL"]
config.DATABASE_URL = neo4j_bolt_url
db.set_connection(neo4j_bolt_url)

# Clear the connected database before loading, presumably so that re-running
# the notebook starts from an empty graph. Do not point NEO4J_BOLT_URL at
# data you need to keep.
clear_neo4j_database(db)

In [None]:
class BaseConcept(StructuredNode):
    """Abstract base shared by the concept-like nodes `Concept` and `Person`.

    Contributes a `uid` and the `sources` link to the `SourceConcept`
    records attached to the node.
    """
    # Marks this class as abstract for neomodel; only subclasses are saved.
    __abstract_node__ = True
    uid = UniqueIdProperty()
    # Incoming edges: (SourceConcept)-[:HAS_SOURCE_CONCEPT]->(this node).
    sources = RelationshipFrom("SourceConcept", "HAS_SOURCE_CONCEPT")


class Concept(BaseConcept):
    """A named concept that can be linked to stories and to other concepts."""
    name = StringProperty()
    # Outgoing edges: (Concept)-[:HAS_CONCEPT]->(Story); the other end is
    # declared as Story.concepts.
    stories = RelationshipTo("Story", "HAS_CONCEPT")
    # Concept-to-concept IS_NEIGHBOUR_OF edges, declared with `Relationship`
    # rather than RelationshipTo/RelationshipFrom (no fixed direction).
    neighbours = Relationship("Concept", "IS_NEIGHBOUR_OF")


class SourceConcept(StructuredNode):
    """A concept record as described by one external source.

    `source` names the source (restricted to wikidata, lcsh or mesh) and
    `source_id` holds the record's identifier within that source.
    """
    uid = UniqueIdProperty()
    # Identifier of the record in the external source; mandatory and indexed
    # as unique.
    source_id = StringProperty(unique_index=True, required=True)
    # Which external source the record comes from; only the listed choices
    # are accepted.
    source = StringProperty(
        required=True,
        choices={"wikidata": "wikidata", "lcsh": "lcsh", "mesh": "mesh"},
    )
    description = StringProperty()
    preferred_name = StringProperty()
    # Alternative names for the record, stored as a list of strings.
    variant_names = ArrayProperty(StringProperty())
    # Outgoing edge: (SourceConcept)-[:HAS_SOURCE_CONCEPT]->(Concept); the
    # other end is declared as BaseConcept.sources.
    # NOTE(review): Person also inherits `sources`, but this end only names
    # "Concept" as its target — confirm whether that is intended.
    parent = RelationshipTo("Concept", "HAS_SOURCE_CONCEPT")


class Person(BaseConcept):
    """A person who can contribute to stories and/or be the subject of them."""
    # Outgoing edges: (Person)-[:CONTRIBUTED_TO]->(Story); the other end is
    # declared as Story.contributors.
    contributed_to = RelationshipTo("Story", "CONTRIBUTED_TO")
    # Incoming edges: (Story)-[:IS_ABOUT]->(Person); the other end is
    # declared as Story.subjects.
    is_about = RelationshipFrom("Story", "IS_ABOUT")


class Story(StructuredNode):
    """A published story, linked to its concepts, contributors and subjects."""
    uid = UniqueIdProperty()
    # Mandatory identifier of the story, indexed as unique.
    wellcome_id = StringProperty(unique_index=True, required=True)
    # Publication date (date only, no time component); mandatory.
    published = DateProperty(required=True)
    title = StringProperty(required=True)
    # Optional identifier of the story's own Wikidata entity, indexed as unique.
    wikidata_id = StringProperty(unique_index=True)
    # Incoming edges: (Concept)-[:HAS_CONCEPT]->(Story).
    concepts = RelationshipFrom("Concept", "HAS_CONCEPT")
    # Incoming edges: (Person)-[:CONTRIBUTED_TO]->(Story).
    contributors = RelationshipFrom("Person", "CONTRIBUTED_TO")
    # Outgoing edges: (Story)-[:IS_ABOUT]->(Person).
    subjects = RelationshipTo("Person", "IS_ABOUT")

In [None]:
# Load the "Articles" sheet of the stories workbook.
# The path is handed straight to read_excel instead of wrapping it in a
# hand-built pd.ExcelFile: read_excel then owns the workbook handle and closes
# it once the sheet has been read, whereas the explicit ExcelFile was never
# closed.
df = pd.read_excel(
    "../../data/stories/stories.xlsx",
    engine="openpyxl",
    sheet_name="Articles",
    dtype={"Date published": datetime.datetime},
).fillna("")  # blank cells become "" instead of NaN

In [None]:
def clean(input_string):
    """Normalise a name so that equivalent spellings compare equal.

    Commas are removed, surrounding whitespace is stripped and the text is
    lower-cased. Commas are removed *before* stripping so that whitespace
    exposed by a leading or trailing comma (e.g. ``"medicine ,"``) is stripped
    too instead of surviving in the result.

    Args:
        input_string: the text to normalise (must be a ``str``).

    Returns:
        The normalised string.
    """
    return input_string.replace(",", "").strip().lower()


def clean_csv(input_string):
    """Split a comma-separated value into cleaned, non-empty items.

    The value is converted with ``str()`` first, so non-string cells are
    accepted. It is split on the bare comma and every piece is passed through
    ``clean`` (which strips surrounding whitespace), so ``"a,b"`` and
    ``"a, b"`` both yield ``["a", "b"]``. Pieces that are empty after cleaning
    (blank input, doubled or trailing commas, whitespace-only items) are
    dropped.

    Args:
        input_string: the raw cell value.

    Returns:
        A list of cleaned items in their original order.
    """
    cleaned_items = (clean(item) for item in str(input_string).split(","))
    return [item for item in cleaned_items if item]


def get_wikidata_id(concept_name):
    """Search Wikidata for ``concept_name`` and return the first hit's id.

    Calls the ``wbsearchentities`` API action with ``language=en`` and naively
    takes the first entry of the ``search`` list in the JSON response.

    Args:
        concept_name: the text to search for.

    Returns:
        The ``id`` of the first search result, or None when the result list is
        empty or the response has no usable ``search`` entry (for example when
        the API answers with an error payload instead of results).

    Raises:
        Any error raised by ``httpx.get`` or by decoding the response body as
        JSON; those are not handled here.
    """
    response = httpx.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "language": "en",
            "format": "json",
            "search": concept_name,
        },
    ).json()

    # naively select the first result
    try:
        return response["search"][0]["id"]
    except (IndexError, KeyError):
        # IndexError: no results; KeyError: no "search"/"id" in the payload.
        return None


def get_wikidata(wikidata_id):
    """Fetch the Wikidata entity record for ``wikidata_id``.

    Requests ``Special:EntityData/<wikidata_id>.json`` and returns the entry
    stored under ``entities[wikidata_id]`` in the JSON response.

    The request is made over HTTPS. httpx does not follow redirects by
    default, so a plain-HTTP URL that the server redirects to HTTPS would come
    back as a redirect response without a JSON body and fail to decode.

    Args:
        wikidata_id: the entity id to fetch.

    Returns:
        The entity record (a dict) for ``wikidata_id``.

    Raises:
        KeyError: if the response holds no entry under ``wikidata_id``.
        Any error raised by ``httpx.get`` or by decoding the response body as
        JSON; those are not handled here.
    """
    response = httpx.get(
        f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    ).json()

    return response["entities"][wikidata_id]


def get_wikidata_preferred_name(wikidata):
    """Return the English label of a Wikidata entity record, if it has one.

    This only reads the already-fetched record; no network access happens
    here, so only lookup failures are handled.

    Args:
        wikidata: an entity record as returned by ``get_wikidata``.

    Returns:
        ``wikidata["labels"]["en"]["value"]``, or None when that path is
        missing or a section of the record is not a mapping.
    """
    try:
        return wikidata["labels"]["en"]["value"]
    except (KeyError, IndexError, TypeError):
        # TypeError covers a non-mapping section (e.g. a list or None).
        return None


def get_wikidata_variant_names(
    wikidata, languages=("en", "en-gb", "en-ca", "en-us", "en-simple")
):
    """Collect the cleaned labels and aliases of an entity in the given languages.

    Every label and every alias whose ``language`` is in ``languages``
    contributes its ``value``, normalised with ``clean``; duplicates are
    collapsed. The result is sorted so that repeated runs produce the same
    list (a bare ``list(set(...))`` has no stable order for strings).

    This only reads the already-fetched record; no network access happens
    here, so only lookup failures are handled.

    Args:
        wikidata: an entity record as returned by ``get_wikidata``.
        languages: language codes to accept. The default is an immutable
            tuple so it cannot be modified between calls; any container
            supporting ``in`` may be passed.

    Returns:
        A sorted list of unique cleaned names, or ``[]`` when the record lacks
        the ``labels``/``aliases`` sections or they are not shaped as expected.
    """
    try:
        candidates = list(wikidata["labels"].values())
        for alias_group in wikidata["aliases"].values():
            candidates.extend(alias_group)
        unique_names = {
            clean(entry["value"])
            for entry in candidates
            if entry["language"] in languages
        }
    except (KeyError, IndexError, TypeError, AttributeError):
        # AttributeError/TypeError cover sections that are not mappings.
        return []

    return sorted(unique_names)


def get_wikidata_description(wikidata):
    """Return the English description of a Wikidata entity record.

    This only reads the already-fetched record; no network access happens
    here, so only lookup failures are handled.

    Args:
        wikidata: an entity record as returned by ``get_wikidata``.

    Returns:
        ``wikidata["descriptions"]["en"]["value"]``, or ``""`` when that path
        is missing or a section of the record is not a mapping.
    """
    try:
        return wikidata["descriptions"]["en"]["value"]
    except (KeyError, IndexError, TypeError):
        # TypeError covers a non-mapping section (e.g. a list or None).
        return ""


def get_contributor_wikidata_ids(wikidata):
    """Return the entity ids referenced by the ``P50`` claims of an entity.

    Each claim is read at ``mainsnak -> datavalue -> value -> id``. Claims
    that do not carry such a value are skipped one by one, so a single
    incomplete claim does not discard the ids found in the other claims.

    Args:
        wikidata: an entity record as returned by ``get_wikidata``.

    Returns:
        A list of ids in claim order; ``[]`` when the record has no ``claims``
        section or no ``P50`` claims.
    """
    try:
        author_claims = wikidata["claims"]["P50"]
    except (KeyError, TypeError):
        return []

    contributors = []
    for claim in author_claims:
        try:
            contributors.append(claim["mainsnak"]["datavalue"]["value"]["id"])
        except (KeyError, TypeError):
            # This claim has no entity value attached; keep the others.
            continue
    return contributors

In [None]:
# Caches of nodes created so far, so that each one is written to the graph
# once and reused on later rows.
stories = {}  # wellcome_id -> Story
people = {}  # contributor wikidata id -> Person
concepts = {}  # cleaned keyword -> Concept


def _save_wikidata_source_concept(wikidata_id):
    """Fetch ``wikidata_id`` from Wikidata and save it as a SourceConcept node."""
    wikidata = get_wikidata(wikidata_id)
    return SourceConcept(
        source_id=wikidata_id,
        source="wikidata",
        description=get_wikidata_description(wikidata),
        preferred_name=get_wikidata_preferred_name(wikidata),
        variant_names=get_wikidata_variant_names(wikidata),
    ).save()


for _, story_data in tqdm(df.iterrows(), total=len(df)):
    story = Story(
        # The story id is the last path segment of its URL.
        wellcome_id=Path(story_data["URL"]).name,
        title=story_data["Title"],
        published=story_data["Date published"].date(),
        wikidata_id=story_data["Wikidata ID"],
    ).save()
    stories[story.wellcome_id] = story

    # Contributors are read from the story's own Wikidata entity. Rows with a
    # blank Wikidata ID have nothing to look up, so skip the request instead
    # of failing on an empty entity id.
    if story.wikidata_id:
        story_wikidata = get_wikidata(story.wikidata_id)
        contributor_wikidata_ids = get_contributor_wikidata_ids(story_wikidata)
    else:
        contributor_wikidata_ids = []

    for contributor_wikidata_id in contributor_wikidata_ids:
        person = people.get(contributor_wikidata_id)
        if person is None:
            source_concept = _save_wikidata_source_concept(contributor_wikidata_id)
            person = Person().save()
            person.sources.connect(source_concept)
            people[contributor_wikidata_id] = person
        story.contributors.connect(person)

    for concept_name in clean_csv(story_data["Keywords"]):
        concept = concepts.get(concept_name)
        if concept is None:
            # Create the Concept exactly once, up front; attaching a Wikidata
            # source afterwards is best-effort, so a failed lookup can no
            # longer leave a second Concept with the same name behind.
            concept = Concept(name=concept_name).save()
            concepts[concept_name] = concept
            # NOTE(review): two keywords that resolve to the same Wikidata id
            # each get their own SourceConcept although source_id is declared
            # unique_index=True — confirm whether they should be shared.
            try:
                concept_wikidata_id = get_wikidata_id(concept_name)
                # None means the search found nothing: leave the Concept
                # without a source rather than requesting "None.json".
                if concept_wikidata_id is not None:
                    source_concept = _save_wikidata_source_concept(
                        concept_wikidata_id
                    )
                    concept.sources.connect(source_concept)
            except Exception as error:
                # `Exception` rather than a bare `except`, so that
                # KeyboardInterrupt still stops the loop. Report the failure
                # instead of swallowing it silently.
                print(
                    f"No Wikidata source linked for concept {concept_name!r}: "
                    f"{error!r}"
                )
        story.concepts.connect(concept)