In [None]:
import datetime
import os
from pathlib import Path

import httpx
import pandas as pd
from httpx import ConnectError
from neomodel import (
    ArrayProperty,
    DateProperty,
    Relationship,
    RelationshipFrom,
    RelationshipTo,
    StringProperty,
    StructuredNode,
    UniqueIdProperty,
    clear_neo4j_database,
    config,
    db,
)
from tqdm.notebook import tqdm

In [None]:
# Point neomodel at the Neo4j instance named by the NEO4J_BOLT_URL
# environment variable (a KeyError is raised here when it is not set).
neo4j_bolt_url = os.environ["NEO4J_BOLT_URL"]
config.DATABASE_URL = neo4j_bolt_url
db.set_connection(neo4j_bolt_url)

# Clear the connected database with neomodel's clear_neo4j_database so the
# cells below rebuild the graph from scratch. Do not point this at a database
# whose contents you want to keep.
clear_neo4j_database(db)

In [None]:
class BaseConcept(StructuredNode):
    """Abstract base node shared by the concept-like classes in this notebook.

    Subclasses (``Concept`` and ``Person``) inherit a ``uid`` property and a
    ``sources`` relationship from it.
    """

    # Flags the class as abstract for neomodel.
    __abstract_node__ = True
    uid = UniqueIdProperty()
    # Incoming HAS_SOURCE_CONCEPT relationships from SourceConcept nodes.
    sources = RelationshipFrom("SourceConcept", "HAS_SOURCE_CONCEPT")


class Concept(BaseConcept):
    """A named concept node that can be linked to stories and to other concepts."""

    name = StringProperty()
    # Outgoing HAS_CONCEPT relationships to Story nodes; Story.concepts declares
    # the same relationship type from the other end.
    stories = RelationshipTo("Story", "HAS_CONCEPT")
    # IS_NEIGHBOUR_OF links to other Concept nodes, declared with neomodel's
    # direction-agnostic ``Relationship``.
    neighbours = Relationship("Concept", "IS_NEIGHBOUR_OF")


class SourceConcept(StructuredNode):
    """A concept record as described by one external source.

    ``source_type`` names the source (wikidata, lcsh or mesh) and
    ``source_id`` holds the record's identifier.
    """

    uid = UniqueIdProperty()
    # Required, and declared with a unique index across all SourceConcept nodes.
    source_id = StringProperty(unique_index=True, required=True)
    # Required; restricted to the three choices listed below.
    source_type = StringProperty(
        required=True,
        choices={"wikidata": "wikidata", "lcsh": "lcsh", "mesh": "mesh"},
    )
    description = StringProperty()
    preferred_name = StringProperty()
    # Alternative names, stored as an array of strings.
    variant_names = ArrayProperty(StringProperty())
    # Outgoing HAS_SOURCE_CONCEPT relationship. The declared target class is
    # "Concept", although BaseConcept.sources is also inherited by Person.
    parent = RelationshipTo("Concept", "HAS_SOURCE_CONCEPT")


class Person(BaseConcept):
    """A person node: someone who contributed to a story or is its subject."""

    # Outgoing CONTRIBUTED_TO relationships to Story nodes.
    contributed_to = RelationshipTo("Story", "CONTRIBUTED_TO")
    # Incoming IS_ABOUT relationships from Story nodes.
    is_about = RelationshipFrom("Story", "IS_ABOUT")


class Story(StructuredNode):
    """A story node, identified by its ``wellcome_id``.

    ``wellcome_id``, ``published`` and ``title`` are required;
    ``wikidata_id`` is optional.
    """

    uid = UniqueIdProperty()
    wellcome_id = StringProperty(unique_index=True, required=True)
    published = DateProperty(required=True)
    title = StringProperty(required=True)
    # Optional, but declared with a unique index.
    wikidata_id = StringProperty(unique_index=True)
    # Incoming HAS_CONCEPT relationships from Concept nodes.
    concepts = RelationshipFrom("Concept", "HAS_CONCEPT")
    # Incoming CONTRIBUTED_TO relationships from Person nodes.
    contributors = RelationshipFrom("Person", "CONTRIBUTED_TO")
    # Outgoing IS_ABOUT relationships to Person nodes.
    subjects = RelationshipTo("Person", "IS_ABOUT")

In [None]:
# Load the "Articles" sheet of the stories workbook and keep the first 200
# rows. The path is passed straight to read_excel so that pandas opens and
# closes the workbook itself; the previous inline pd.ExcelFile(...) object was
# never closed.
df = (
    pd.read_excel(
        "../../data/stories/stories.xlsx",
        engine="openpyxl",
        sheet_name="Articles",
        dtype={"Date published": datetime.datetime},
    )
    # Missing cells become empty strings in every column.
    .fillna("")
    .head(200)
)

In [None]:
def clean(input_string):
    """Normalise a string: trim surrounding whitespace, lower-case, drop commas."""
    trimmed = input_string.strip()
    lowered = trimmed.lower()
    return lowered.replace(",", "")


def clean_csv(input_string):
    """Split a ", "-separated value into a list of cleaned entries.

    The value is converted with ``str`` first, split on ", ", and each piece
    is passed through ``clean``.

    Entries are filtered *after* cleaning, so pieces that only become empty
    once whitespace and commas are removed (for example " " or ",") are
    dropped instead of being returned as empty-string entries.

    Returns:
        A list of non-empty cleaned strings, in their original order.
    """
    cleaned_entries = (clean(entry) for entry in str(input_string).split(", "))
    return [entry for entry in cleaned_entries if entry]


def get_wikidata_id(concept_name):
    """Search Wikidata for ``concept_name`` and return the first hit's id.

    Calls the ``wbsearchentities`` action of the Wikidata API in English.

    Returns:
        The ``id`` of the first search result, or ``None`` when the result
        list is empty.
    """
    search_params = {
        "action": "wbsearchentities",
        "language": "en",
        "format": "json",
        "search": concept_name,
    }
    payload = httpx.get(
        "https://www.wikidata.org/w/api.php", params=search_params
    ).json()

    # Naive choice: take whatever the search ranks first.
    try:
        return payload["search"][0]["id"]
    except IndexError:
        return None


def get_wikidata(wikidata_id):
    """Fetch the entity record for ``wikidata_id`` from Special:EntityData.

    The request goes to the HTTPS endpoint directly. A plain ``http://`` URL
    only works through a redirect to HTTPS, and recent httpx versions do not
    follow redirects by default, so the redirect response itself would be
    handed to ``.json()``.

    Returns:
        The value stored under ``entities[wikidata_id]`` in the JSON response.

    Raises:
        KeyError: if the response has no entry for ``wikidata_id``.
        Any error raised by ``httpx.get`` or by decoding the JSON body.
    """
    url = (
        "https://www.wikidata.org/wiki/Special:EntityData/"
        f"{wikidata_id}.json"
    )
    payload = httpx.get(url).json()

    return payload["entities"][wikidata_id]


def get_wikidata_preferred_name(wikidata):
    """Return the English label of a Wikidata entity record.

    Args:
        wikidata: entity record; read as ``wikidata["labels"]["en"]["value"]``.

    Returns:
        The English label, or ``None`` when it cannot be read.
    """
    try:
        return wikidata["labels"]["en"]["value"]
    # Only lookup failures can happen here: no request is made, so the
    # previous ConnectError entry was unreachable. TypeError covers a level
    # of the record that is not a mapping (e.g. a list where a dict is
    # expected).
    except (IndexError, KeyError, TypeError):
        return None


def get_wikidata_variant_names(
    wikidata, languages=("en", "en-gb", "en-ca", "en-us", "en-simple")
):
    """Collect the cleaned labels and aliases of an entity in given languages.

    Args:
        wikidata: entity record; ``wikidata["labels"]`` is read as a mapping
            of label entries and ``wikidata["aliases"]`` as a mapping of
            lists of alias entries, each entry carrying ``language`` and
            ``value``.
        languages: language codes to keep. The default is an immutable tuple
            so it cannot be modified between calls; any container supporting
            ``in`` is accepted.

    Returns:
        A sorted list of distinct names, each passed through ``clean``.
        Sorting makes the result reproducible across runs (iteration order of
        a set of strings is not). An empty list is returned when the labels
        or aliases cannot be read.
    """
    try:
        labels = [
            label["value"]
            for label in wikidata["labels"].values()
            if label["language"] in languages
        ]
        aliases = [
            alias["value"]
            for group in wikidata["aliases"].values()
            for alias in group
            if alias["language"] in languages
        ]
    # No request is made here, so only lookup-style errors are handled; the
    # previous ConnectError entry was unreachable.
    except (IndexError, KeyError, TypeError):
        return []

    return sorted({clean(name) for name in labels + aliases})


def get_wikidata_description(wikidata):
    """Return the English description of a Wikidata entity record.

    Args:
        wikidata: entity record; read as
            ``wikidata["descriptions"]["en"]["value"]``.

    Returns:
        The English description, or ``""`` when it cannot be read.
    """
    try:
        return wikidata["descriptions"]["en"]["value"]
    # Only lookup failures can happen here: no request is made, so the
    # previous ConnectError entry was unreachable. TypeError covers a level
    # of the record that is not a mapping.
    except (IndexError, KeyError, TypeError):
        return ""


def get_contributor_wikidata_ids(wikidata):
    """Return the item ids found in an entity's P50 claims.

    Args:
        wikidata: entity record; statements are read from
            ``wikidata["claims"]["P50"]`` and each id from
            ``statement["mainsnak"]["datavalue"]["value"]["id"]``.

    Returns:
        A list of ids in statement order. Statements whose id cannot be read
        (for instance a main snak without a ``datavalue``) are skipped
        individually, so one such statement no longer discards every other
        contributor. An empty list is returned when there are no P50 claims.
    """
    try:
        author_statements = wikidata["claims"]["P50"]
    # No request is made here, so only lookup-style errors are handled; the
    # previous ConnectError entry was unreachable.
    except (IndexError, KeyError, TypeError):
        return []

    contributor_ids = []
    for statement in author_statements:
        try:
            contributor_ids.append(
                statement["mainsnak"]["datavalue"]["value"]["id"]
            )
        except (IndexError, KeyError, TypeError):
            # This statement carries no usable item id; keep the others.
            continue
    return contributor_ids



def get_lcsh_id(wikidata):
    """Return the value of the entity's first P244 claim (used as the LCSH id).

    Returns ``None`` when the claim, or any level on the way to its value, is
    missing.
    """
    try:
        first_statement = wikidata["claims"]["P244"][0]
        return first_statement["mainsnak"]["datavalue"]["value"]
    except (KeyError, IndexError):
        return None


def get_lcsh_data(lcsh_id):
    """Fetch the id.loc.gov subject-authority record for ``lcsh_id``.

    The JSON response is scanned for the element whose ``@id`` equals the
    request URL without its ``.json`` suffix.

    Returns:
        The matching element, or ``None`` when no element matches.

    Raises:
        ValueError: on a 404 response (unknown id) or on any other non-200
            status.
    """
    url = f"http://id.loc.gov/authorities/subjects/{lcsh_id}.json"
    response = httpx.get(url)

    if response.status_code == 404:
        raise ValueError(f"{lcsh_id} is not a valid library of congress ID")
    if response.status_code != 200:
        raise ValueError(
            f"something unexpected happened when calling url: {url}"
        )

    subject_uri = url.replace(".json", "")
    for element in response.json():
        if element["@id"] == subject_uri:
            return element
    return None


def get_lcsh_variant_names(lcsh_data):
    """Return the ``@value`` of every SKOS altLabel entry in an LCSH record.

    Returns an empty list when the record has no altLabel key.
    """
    alt_label_key = "http://www.w3.org/2004/02/skos/core#altLabel"
    if alt_label_key not in lcsh_data:
        return []
    return [entry["@value"] for entry in lcsh_data[alt_label_key]]


def get_lcsh_preferred_name(lcsh_data):
    """Return the ``@value`` of the first SKOS prefLabel entry in an LCSH record.

    Returns ``None`` when the record has no prefLabel key.
    """
    pref_label_key = "http://www.w3.org/2004/02/skos/core#prefLabel"
    if pref_label_key not in lcsh_data:
        return None
    return lcsh_data[pref_label_key][0]["@value"]


def get_mesh_id(wikidata):
    """Return the value of the entity's first P486 claim (used as the MeSH id).

    Returns ``None`` when the claim, or any level on the way to its value, is
    missing.
    """
    try:
        first_statement = wikidata["claims"]["P486"][0]
        return first_statement["mainsnak"]["datavalue"]["value"]
    except (KeyError, IndexError):
        return None


def get_mesh_data(mesh_id):
    """Look up ``mesh_id`` through the meshb.nlm.nih.gov record search API.

    The query asks for a single exact match on the ``ui`` field and returns
    the ``_source._generated`` section of the first hit.

    Raises:
        ValueError: when the search returns no hits, or when the response
            lacks one of the expected keys.
    """
    query = {
        "searchInField": "ui",
        "sort": "",
        "size": "1",
        "searchType": "exactMatch",
        "searchMethod": "FullWord",
        "q": mesh_id,
    }
    response = httpx.get(
        url="https://meshb.nlm.nih.gov/api/search/record", params=query
    )
    try:
        first_hit = response.json()["hits"]["hits"][0]
        return first_hit["_source"]["_generated"]
    except IndexError:
        # An empty hit list means the id matched nothing.
        raise ValueError(f"{mesh_id} is not a valid MeSH ID")
    except KeyError:
        raise ValueError(
            f"something unexpected happened when calling url: {response.url}"
        )


def get_mesh_preferred_name(mesh_data):
    """Return the record's ``RecordName``, or ``None`` when the key is absent."""
    try:
        return mesh_data["RecordName"]
    except KeyError:
        return None


def get_mesh_description(mesh_data):
    """Return a description taken from a MeSH record.

    ``PreferredConceptScopeNote`` is used when present, otherwise
    ``scrNote``; ``None`` is returned when the record has neither key.
    """
    for field in ("PreferredConceptScopeNote", "scrNote"):
        if field in mesh_data:
            return mesh_data[field]
    return None


def get_mesh_variant_names(mesh_data):
    """Return the record's original entry terms followed by its permutated ones.

    An empty list is returned when either ``originalEntryTerms`` or
    ``permutatedEntryTerms`` is missing from the record.
    """
    try:
        return (
            mesh_data["originalEntryTerms"] + mesh_data["permutatedEntryTerms"]
        )
    except KeyError:
        return []


In [None]:
# Nodes created so far, so that a contributor or keyword seen again reuses its
# existing node instead of creating a duplicate.
stories = {}  # wellcome_id -> Story
people = {}  # contributor Wikidata id -> Person
concepts = {}  # cleaned keyword -> Concept

for _, story_data in tqdm(df.iterrows(), total=len(df)):
    story = Story(
        wellcome_id=Path(story_data["URL"]).name,
        title=story_data["Title"],
        published=story_data["Date published"].date(),
        wikidata_id=story_data["Wikidata ID"],
    ).save()
    stories[story.wellcome_id] = story

    # Contributors are read from the story's own Wikidata record.
    # NOTE(review): a blank "Wikidata ID" cell is "" after the fillna above
    # and is not guarded here; confirm every loaded row has an id.
    story_wikidata = get_wikidata(story.wikidata_id)
    contributor_wikidata_ids = get_contributor_wikidata_ids(story_wikidata)
    for contributor_wikidata_id in contributor_wikidata_ids:
        if contributor_wikidata_id in people:
            person = people[contributor_wikidata_id]
        else:
            contributor_wikidata = get_wikidata(contributor_wikidata_id)
            source_concept = SourceConcept(
                source_id=contributor_wikidata_id,
                source_type="wikidata",
                description=get_wikidata_description(contributor_wikidata),
                preferred_name=get_wikidata_preferred_name(contributor_wikidata),
                variant_names=get_wikidata_variant_names(contributor_wikidata),
            ).save()
            person = Person().save()
            person.sources.connect(source_concept)
            people[contributor_wikidata_id] = person
        story.contributors.connect(person)

    for concept_name in clean_csv(story_data["Keywords"]):
        if concept_name in concepts:
            concept = concepts[concept_name]
        else:
            concept = Concept(name=concept_name).save()
            # Enrichment is best effort: the Concept node is kept even when
            # no source can be attached. Failures are reported instead of
            # silently discarded, and only Exception is caught so that a
            # kernel interrupt still stops the loop.
            try:
                concept_wikidata_id = get_wikidata_id(concept_name)
                # No search hit means there is nothing to fetch or link.
                if concept_wikidata_id is not None:
                    concept_wikidata = get_wikidata(concept_wikidata_id)
                    wikidata_source_concept = SourceConcept(
                        source_id=concept_wikidata_id,
                        source_type="wikidata",
                        description=get_wikidata_description(concept_wikidata),
                        preferred_name=get_wikidata_preferred_name(
                            concept_wikidata
                        ),
                        variant_names=get_wikidata_variant_names(
                            concept_wikidata
                        ),
                    ).save()
                    concept.sources.connect(wikidata_source_concept)

                    try:
                        lcsh_id = get_lcsh_id(concept_wikidata)
                        # Skip the request when the entity has no LCSH id.
                        if lcsh_id is not None:
                            concept_lcsh_data = get_lcsh_data(lcsh_id)
                            lcsh_source_concept = SourceConcept(
                                source_id=lcsh_id,
                                source_type="lcsh",
                                preferred_name=get_lcsh_preferred_name(
                                    concept_lcsh_data
                                ),
                                variant_names=get_lcsh_variant_names(
                                    concept_lcsh_data
                                ),
                            ).save()
                            concept.sources.connect(lcsh_source_concept)
                    except Exception as error:
                        print(
                            f"No LCSH source for concept {concept_name!r}: "
                            f"{error!r}"
                        )

                    try:
                        mesh_id = get_mesh_id(concept_wikidata)
                        # Skip the request when the entity has no MeSH id.
                        if mesh_id is not None:
                            concept_mesh_data = get_mesh_data(mesh_id)
                            # The MeSH record is stored under its own id and
                            # source type (this previously reused the LCSH
                            # id and the "lcsh" type).
                            mesh_source_concept = SourceConcept(
                                source_id=mesh_id,
                                source_type="mesh",
                                description=get_mesh_description(
                                    concept_mesh_data
                                ),
                                preferred_name=get_mesh_preferred_name(
                                    concept_mesh_data
                                ),
                                variant_names=get_mesh_variant_names(
                                    concept_mesh_data
                                ),
                            ).save()
                            concept.sources.connect(mesh_source_concept)
                    except Exception as error:
                        print(
                            f"No MeSH source for concept {concept_name!r}: "
                            f"{error!r}"
                        )

            except Exception as error:
                print(
                    f"No Wikidata source for concept {concept_name!r}: "
                    f"{error!r}"
                )
            concepts[concept_name] = concept

        story.concepts.connect(concept)