In [1]:
from neo4j import GraphDatabase, basic_auth
import neo4j
import time
import aiohttp
import asyncio
import json
import os
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import requests
from datetime import datetime

In [2]:
# Load the environment variables.
# "~" is only a shell shorthand: pathlib does not expand it by itself, so without
# expanduser() the path below would be the *relative* path "~/.env" and the file
# in the home directory would never be read.
dotenv_path = Path('~/.env').expanduser()
load_dotenv(dotenv_path=dotenv_path)  # brings the variables defined in .env into os.environ

# Get variables
SUSTAINGRAPH_URI = os.getenv('SUSTAINGRAPH_URI')
SUSTAINGRAPH_USER = os.getenv('SUSTAINGRAPH_USER')
SUSTAINGRAPH_PASSWORD = os.getenv('SUSTAINGRAPH_PASSWORD')
database_name = os.getenv('DATABASE_NAME')

# Connect to database
driver = GraphDatabase.driver(SUSTAINGRAPH_URI, auth=(SUSTAINGRAPH_USER, SUSTAINGRAPH_PASSWORD))

# Verify connectivity: run a trivial query that echoes the database name back
with driver.session(database=database_name) as session:
    print(session.run("RETURN 'Connected to ' + $db", db=database_name).single()[0])

Connected to neo4j


### Constraints

In [3]:
def create_constraint(tx,statement):
    """Run one schema statement (e.g. ``CREATE CONSTRAINT ...``) on the transaction ``tx``.

    Meant to be handed to ``session.execute_write`` as the unit of work.
    Nothing is returned to the caller.
    """
    tx.run(statement)

> Uniqueness

In [4]:
# Key constraints (one Cypher statement per label / relationship type).
# Every statement uses IF NOT EXISTS, so executing it again is a no-op.

# Goal: keyed on (code, title, description)
statement_goal = """ 
CREATE CONSTRAINT goal_unique IF NOT EXISTS FOR (goal:Goal) 
REQUIRE (goal.code,goal.title,goal.description) IS NODE KEY;
"""
# Target: keyed on (code, title, description)
statement_target="""
CREATE CONSTRAINT target_unique IF NOT EXISTS FOR (t:Target) 
REQUIRE (t.code,t.title,t.description) IS NODE KEY;
"""
# Indicator: keyed on (code, description)
statement_indicator ="""
CREATE CONSTRAINT indicator_unique IF NOT EXISTS FOR (i:Indicator) 
REQUIRE (i.code,i.description) IS NODE KEY;
"""

# Series: keyed on (code, description)
statement_series ="""
CREATE CONSTRAINT series_unique IF NOT EXISTS FOR (s:Series) 
REQUIRE (s.code,s.description) IS NODE KEY;
"""

# SeriesMetadata: keyed on (attributesCode, dimensionsCode, seriesCode)
statement_sm ="""
CREATE CONSTRAINT sm_unique IF NOT EXISTS FOR ( seriesmetadata:SeriesMetadata )
 REQUIRE (seriesmetadata.attributesCode, seriesmetadata.dimensionsCode,seriesmetadata.seriesCode) IS NODE KEY;
"""

# HAS_OBSERVATION relationship: keyed on (attributesCode, dimensionsCode, seriesCode, geoCode, time)
statement_has_obs = """ 
CREATE CONSTRAINT has_obs_unique IF NOT EXISTS
FOR ()-[has_obs:HAS_OBSERVATION]-() REQUIRE (has_obs.attributesCode, has_obs.dimensionsCode,has_obs.seriesCode,has_obs.geoCode,has_obs.time) IS RELATIONSHIP KEY
"""

# Source: keyed on name
statement_so ="""CREATE CONSTRAINT source_name IF NOT EXISTS FOR (s:Source) 
REQUIRE s.name IS NODE KEY;
""" 

> Property types

In [5]:
# Property type constraints: each statement pins one property of a label to a type
# via `REQUIRE <property> :: <TYPE>`. Constraint names are distinct per statement.

## Goal: code, description and title are strings
statement_goal_code = """ 
CREATE CONSTRAINT goal_type_code IF NOT EXISTS
FOR (g:Goal) REQUIRE g.code :: STRING
"""
statement_goal_desc = """ 
CREATE CONSTRAINT goal_type_desc IF NOT EXISTS
FOR (g:Goal) REQUIRE g.description :: STRING
"""
statement_goal_title = """ 
CREATE CONSTRAINT goal_type_title IF NOT EXISTS
FOR (g:Goal) REQUIRE g.title :: STRING
"""

## Target: code, description and title are strings
statement_target_code = """ 
CREATE CONSTRAINT target_type_code IF NOT EXISTS
FOR (t:Target) REQUIRE t.code :: STRING
"""
statement_target_desc = """ 
CREATE CONSTRAINT target_type_desc IF NOT EXISTS
FOR (t:Target) REQUIRE t.description :: STRING
"""
statement_target_title = """ 
CREATE CONSTRAINT target_type_title IF NOT EXISTS
FOR (t:Target) REQUIRE t.title :: STRING
"""

## Indicator: code and description are strings
statement_indicator_code = """ 
CREATE CONSTRAINT indicator_type_code IF NOT EXISTS
FOR (i:Indicator)  REQUIRE i.code :: STRING
"""
statement_indicator_desc = """ 
CREATE CONSTRAINT indicator_type_desc IF NOT EXISTS
FOR (i:Indicator)  REQUIRE i.description :: STRING
"""

## Series: code, description and dataProviderURL are strings.
# Each constraint needs its own name: with IF NOT EXISTS, a statement whose name is
# already taken is silently skipped, so a reused name means the constraint is never created.
statement_series_code = """ 
CREATE CONSTRAINT series_type_code IF NOT EXISTS
FOR (s:Series) REQUIRE s.code :: STRING
"""
statement_series_desc = """ 
CREATE CONSTRAINT series_type_desc IF NOT EXISTS
FOR (s:Series)  REQUIRE s.description :: STRING
"""
# Previously named `series_type_desc` (a duplicate of the statement above), which
# left the dataProviderURL type constraint uncreated.
statement_series_provider = """ 
CREATE CONSTRAINT series_type_provider IF NOT EXISTS
FOR (s:Series)  REQUIRE s.dataProviderURL :: STRING
"""

## SeriesMetadata: codes and descriptions are strings.
# Every statement carries a unique constraint name. Three of them used to share
# `sm_type_dimdesc`, so with IF NOT EXISTS only the first one executed was created
# and the other two were silently skipped.

statement_sm_atcode = """ 
CREATE CONSTRAINT sm_type_attcode IF NOT EXISTS
FOR (sm:SeriesMetadata) REQUIRE sm.attributesCode :: STRING
"""

statement_sm_dimcode = """ 
CREATE CONSTRAINT sm_type_dimcode IF NOT EXISTS
FOR (sm:SeriesMetadata) REQUIRE sm.dimensionsCode :: STRING
"""
statement_sm_dimdesc= """ 
CREATE CONSTRAINT sm_type_dimdesc IF NOT EXISTS
FOR (sm:SeriesMetadata) REQUIRE sm.dimensionsDescription :: STRING
"""
statement_sm_attdesc= """ 
CREATE CONSTRAINT sm_type_attdesc IF NOT EXISTS
FOR (sm:SeriesMetadata) REQUIRE sm.attributesDescription :: STRING
"""
statement_sm_seriesCode= """ 
CREATE CONSTRAINT sm_type_seriescode IF NOT EXISTS
FOR (sm:SeriesMetadata) REQUIRE sm.seriesCode :: STRING
"""

### Observation: value is a FLOAT and time is a DATE
statement_obs_value = """ 
CREATE CONSTRAINT obs_type_value IF NOT EXISTS
FOR (o:Observation) REQUIRE o.value :: FLOAT
"""
statement_obs_time = """ 
CREATE CONSTRAINT obs_type_date IF NOT EXISTS
FOR (o:Observation) REQUIRE o.time :: DATE
"""

### Source: name is a string
statement_so_name = """ 
CREATE CONSTRAINT so_type_name IF NOT EXISTS
FOR (so:Source) REQUIRE so.name :: STRING
"""

In [6]:
# Every schema statement defined above, grouped by the label / relationship it
# constrains. The order is the order in which they are sent to the database.
statements = [
    # Goal
    statement_goal, statement_goal_code, statement_goal_desc, statement_goal_title,
    # Indicator
    statement_indicator, statement_indicator_code, statement_indicator_desc,
    # Observation
    statement_obs_time, statement_obs_value,
    # Series
    statement_series, statement_series_code, statement_series_desc, statement_series_provider,
    # SeriesMetadata
    statement_sm, statement_sm_atcode, statement_sm_attdesc, statement_sm_dimcode,
    statement_sm_dimdesc, statement_sm_seriesCode,
    # Source
    statement_so, statement_so_name,
    # Target
    statement_target, statement_target_code, statement_target_desc, statement_target_title,
    # HAS_OBSERVATION relationship
    statement_has_obs,
]

# One managed write transaction per statement
with driver.session(database=database_name) as session:
    for statement in statements:
        session.execute_write(create_constraint, statement)

In [7]:
def cypher_run(tx,statement):
    """Execute a parameter-less Cypher ``statement`` on the transaction ``tx``.

    Used as the unit of work for ``session.execute_write``; the query result is
    not returned.
    """
    tx.run(statement)

def cypher_run_params(tx,statement, params):
    """Execute ``statement`` on ``tx`` with ``params`` exposed to Cypher as ``$parameters``.

    The query result is not returned.
    """
    query_parameters = {"parameters": params}
    tx.run(statement, parameters=query_parameters)

Goals, Targets, Indicators and Series were retrieved from the United Nations SDG API and loaded into the SustainGraph with the help of APOC, Neo4j's standard library.

The final graph schema is provided below:
![Alt text](wiki/SustainGraph-Indicators.png)

### Goals

In [8]:
# Load every SDG goal from the UN API (via apoc.load.json) and upsert one
# Goal node per goal code, refreshing its title and description.
statement_goals = """
    CALL apoc.load.json("https://unstats.un.org/SDGAPI/v1/sdg/Goal/List?includechildren=false")
    YIELD value
    UNWIND value AS goal
    MERGE (g:Goal {code: toString(goal.code)})
    SET g.title = toString(goal.title)
    SET g.description = toString(goal.description)
    RETURN COUNT(DISTINCT g)
"""

# Single managed write transaction; the returned count is not used
with driver.session(database=database_name) as session:
    session.execute_write(cypher_run, statement_goals)

### Targets per Goal

In [9]:
# For each Goal already in the graph, fetch its targets from the UN API,
# upsert one Target node per target code and link it to the goal.
statement_targets = """
    MATCH (g:Goal)
    CALL apoc.load.json("https://unstats.un.org/SDGAPI/v1/sdg/Goal/"+g.code+"/Target/List?includechildren=true")
    YIELD value
    UNWIND value.targets AS target
    MERGE (t:Target {code: toString(target.code)})
    SET t.title = toString(target.title)
    SET t.description = toString(target.description)
    MERGE (t)<-[:HAS_TARGET]-(g)
"""

# Single managed write transaction
with driver.session(database=database_name) as session:
    session.execute_write(cypher_run, statement_targets)

### Indicators per Target

In [10]:
# For each Target in the graph, fetch its indicators from the UN API, upsert one
# Indicator node per indicator code and attach it to the target and to the
# 'UN_SDG' Source node (created on first use).
statement_ind = """
    MATCH (t:Target) 
    CALL apoc.load.json("https://unstats.un.org/SDGAPI/v1/sdg/Target/"+t.code+"/Indicator/List?includechildren=true")
    YIELD value
    UNWIND value.indicators AS indicator
    MERGE (i:Indicator {code: toString(indicator.code)})
    SET i.description = toString(indicator.description)
    MERGE (s:Source{name:'UN_SDG'})
    MERGE (t)-[:HAS_INDICATOR]->(i)-[:COMES_FROM]->(s)
"""

# Single managed write transaction
with driver.session(database=database_name) as session:
    session.execute_write(cypher_run, statement_ind)

### Series per Indicator

In [11]:
# For each UN_SDG indicator, fetch its series from the UN API, upsert one Series
# node per series code and link it to the indicator.
#
# The query is stored under its own name: `statement_series` already holds the
# Series key-constraint statement that is part of the `statements` list, and
# rebinding it here would make a re-run of the constraints cell execute this
# loader instead of the constraint.
statement_series_load = """
    MATCH (i:Indicator)-[:COMES_FROM]-(:Source{name:'UN_SDG'})
    CALL apoc.load.json("https://unstats.un.org/SDGAPI/v1/sdg/Indicator/"+i.code+"/Series/List")
    YIELD value
    UNWIND value.series AS series
    MERGE (s:Series {code: toString(series.code)})
    SET s.description = toString(series.description)
    SET s.dataProviderURL = 'https://unstats.un.org/sdgapi/swagger/'
    MERGE (i)-[:HAS_SERIES]->(s)
"""
with driver.session(database=database_name) as session:
    session.execute_write(cypher_run, statement = statement_series_load)

> Check cypher queries

In [12]:
# Sanity checks: compare node counts in the graph with what the UN SDG API reports.
SDG_API_ROOT = "https://unstats.un.org/SDGAPI/v1/sdg"
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests waits indefinitely on a stalled connection


def _get_json(url):
    """GET ``url`` and return the decoded JSON body.

    Raises ``requests.HTTPError`` on an error status instead of trying to parse
    the error page as data.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def _graph_count(query, key):
    """Run a read-routed counting query and return ``(count, result_available_after)``."""
    records, summary, _keys = driver.execute_query(query, routing_="r", database_=database_name)
    return records[0][key], summary.result_available_after


def _report(label, found, elapsed, expected_count):
    """Print one 'found vs. expected' line in the notebook's usual format."""
    print("{nodes_created} {label} in {time} ms, expected: {expected}".format(
        nodes_created=found,
        label=label,
        time=elapsed,
        expected=expected_count
    ))


# Goals (the goal list is fetched once and reused for the target check below)
goal_list = _get_json(f"{SDG_API_ROOT}/Goal/List?includechildren=false")
nodes_created, elapsed = _graph_count(
    "MATCH (n:Goal) RETURN count(distinct n) as goals", "goals")
expected = len(goal_list)
_report("Goals", nodes_created, elapsed, expected)

# Targets: one API call per goal, summing the targets of every returned group
found_targets, elapsed = _graph_count(
    "MATCH (n:Target) RETURN count(distinct n) as targets", "targets")
expected_targets = sum(
    len(group.get("targets", []))
    for goal in goal_list
    for group in _get_json(f"{SDG_API_ROOT}/Goal/{goal['code']}/Target/List?includechildren=true")
)
_report("Targets", found_targets, elapsed, expected_targets)

# Indicators attached to the UN_SDG source: one API call per target
found_indicators, elapsed = _graph_count(
    'MATCH (n:Source {name: "UN_SDG"})--(i:Indicator) RETURN COUNT(i) as ind', "ind")
expected_indicators = sum(
    len(group.get("indicators", []))
    for target in _get_json(f"{SDG_API_ROOT}/Target/List?includechildren=true")
    for group in _get_json(f"{SDG_API_ROOT}/Target/{target['code']}/Indicator/List?includechildren=true")
)
_report("UN Indicators", found_indicators, elapsed, expected_indicators)

# Series: collected into a set because the graph query counts distinct series
expected_series = {
    series["code"]
    for indicator in _get_json(f"{SDG_API_ROOT}/Indicator/List")
    for group in _get_json(f"{SDG_API_ROOT}/Indicator/{indicator['code']}/Series/List")
    for series in group.get("series", [])
}
len_expected_series = len(expected_series)
found_series, elapsed = _graph_count(
    'MATCH (n:Source {name: "UN_SDG"})--(i:Indicator)--(s:Series) RETURN count(distinct s) as series',
    "series")
_report("UN Series", found_series, elapsed, len_expected_series)

17 Goals in 14 ms, expected: 17
169 Targets in 16 ms, expected: 169
251 UN Indicators in 49 ms, expected: 251
708 UN Series in 60 ms, expected: 708


### Observations per Series

Each UN Series code has multiple observations for each GeoArea, covering different time periods as well as different attributes and dimensions.

In [13]:
def goal_series(tx,statement, goal):
    """Run ``statement`` with ``$goal`` bound to ``goal`` and return the first row's ``s_codes``.

    Raises ``IndexError`` when the query yields no rows.
    """
    rows = tx.run(statement, goal=goal).data()
    first_row = rows[0]
    return first_row['s_codes']

In [14]:
# Build {goal code -> list of distinct series codes} by walking
# Goal -> Target -> Indicator -> Series for each of the 17 goals.
series_per_goal ={}
statement = """
    MATCH (:Goal{code:$goal})-[:HAS_TARGET]->(:Target)-[:HAS_INDICATOR]->(:Indicator)-[:HAS_SERIES]->(s:Series)
    WITH COLLECT(DISTINCT s.code) as s_codes
    RETURN s_codes
    """
with driver.session(database=database_name) as session:
    for goal_code in map(str, range(1, 18)):
        series_per_goal[goal_code] = session.execute_read(
            goal_series, statement=statement, goal=goal_code
        )

# Maps the code of an attribute or a dimension to its description (NaN values are not considered)
def metadata_mapping(metadata) :
    """Flatten ``metadata`` into a single ``{code: description}`` dictionary.

    ``metadata`` is an iterable of entries that each carry a ``"codes"`` list of
    ``{"code": ..., "description": ...}`` items. When a code occurs more than
    once, the description seen last wins.
    """
    return {
        entry["code"]: entry["description"]
        for group in metadata
        for entry in group["codes"]
    }

Collect data (observation per Series) only for European countries and countries of the European Union. <p>
Note: Cyprus belongs to the European Union, but does not belong geographically to Europe.

In [15]:
def areas_codes(tx,statement):
    """Run ``statement`` on ``tx`` and return the ``geocodes`` value of its first row.

    Raises ``IndexError`` when the query yields no rows.
    """
    rows = tx.run(statement).data()
    return rows[0]['geocodes']

# M49 codes of every Area under the 'Europe' region plus every Area that
# belongs to the EuropeanUnion node, de-duplicated into a single list.
statement_areas = """
    MATCH (r:Region{name:'Europe'})-[:HAS_SUBREGION]->(sr:SubRegion)-[:HAS_AREA]->(a:Area)
    MATCH (eu:EuropeanUnion)<-[:BELONGS_TO]-(eua:Area)
    WITH COLLECT(DISTINCT a.M49code)+COLLECT(DISTINCT eua.M49code) as geocodes
    UNWIND geocodes as codes
    RETURN COLLECT(DISTINCT codes) as geocodes
    """

# Read-only lookup in a managed read transaction
with driver.session(database=database_name) as session:
    geocodes = session.execute_read(areas_codes, statement=statement_areas)

In [16]:
def create_params(series_code, geoAreas):
    """Build the query parameters for a Series/Data request without paging.

    ``geoAreas`` is the list of M49 geo-area codes to restrict the request to.
    """
    query = dict(seriesCode=series_code)
    query['areaCode'] = geoAreas
    return query
def create_params_pages(series_code, geoAreas, pageSize):
    """Build the query parameters for a Series/Data request with an explicit page size.

    ``geoAreas`` is the list of M49 geo-area codes; ``pageSize`` is passed through unchanged.
    """
    query = dict(seriesCode=series_code, areaCode=geoAreas)
    query['pageSize'] = pageSize
    return query

#Function to collect the data per series code from the API
def _join_codes(code_values, descriptions):
    """Return ``('|'-joined codes, '|'-joined descriptions)`` for one observation's codes."""
    codes = list(code_values)
    return '|'.join(codes), '|'.join([descriptions[code] for code in codes])


def _usable_value(observation):
    """Return the text to convert to float, or ``None`` when the value cannot be used.

    Float-typed values are returned as they are. Any other value is accepted only
    when it starts with '<' or '>', in which case that sign is dropped.
    """
    raw = observation['value']
    if observation['valueType'] == "Float":
        return raw
    # Slicing instead of raw[0]: an empty string is skipped rather than raising IndexError
    if isinstance(raw, str) and raw[:1] in ('<', '>'):
        return raw[1:]
    return None


async def get_observations(session,series_code, geoAreas):
    """Download every observation of ``series_code`` for ``geoAreas`` from the UN SDG API.

    Two requests are made: the first only reports how many elements exist, the
    second asks for all of them in a single page.

    Parameters:
        session: HTTP client session whose ``get(url, params=...)`` is usable with
            ``async with`` and whose response offers an awaitable ``json()``.
        series_code: code of the series to download.
        geoAreas: list of M49 geo-area codes to restrict the download to.

    Returns:
        A list with one dictionary per usable observation (keys ``geo``, ``year``,
        ``value``, ``att_desc``, ``att_codes``, ``dim_desc``, ``dim_codes``,
        ``s_code``). 'NaN' values are dropped, as are non-float values that do not
        start with '<' or '>' (those are printed).
    """
    url = "https://unstats.un.org/sdgapi/v1/sdg/Series/Data"

    # The first response is released before the second request is issued, so the
    # two requests never hold a connection at the same time.
    async with session.get(url, params = create_params(series_code,geoAreas)) as response:
        overview = await response.json()
    print('Total elements of %s: %s in %s pages'%(series_code,overview['totalElements'], overview['totalPages']))

    page_params = create_params_pages(series_code,geoAreas,str(overview['totalElements']))
    async with session.get(url, params = page_params) as response:
        result = await response.json()

    attributes = metadata_mapping( result['attributes'])
    dimensions = metadata_mapping( result['dimensions'])

    exceptions = 0  # observations whose valueType is not "Float"
    obs = 0         # observations whose value is not 'NaN' (includes those skipped below)
    params = []
    for observation in result['data']:
        if observation['value'] == 'NaN':
            continue
        obs += 1

        attributes_code_string, attributes_description_string = _join_codes(
            observation['attributes'].values(), attributes)
        dimension_code_string, dimension_description_string = _join_codes(
            observation['dimensions'].values(), dimensions)

        if observation['valueType'] != "Float":
            exceptions += 1
        value = _usable_value(observation)
        if value is None:
            print ("Series: %s value" %observation["series"] )
            print(observation['value'])
            continue

        params.append({
            'geo':str(observation['geoAreaCode']),
            'year':str(int(observation['timePeriodStart'])),
            'value':float(value),
            'att_desc': attributes_description_string,
            'att_codes': attributes_code_string,
            'dim_desc': dimension_description_string,
            'dim_codes': dimension_code_string,
            's_code':observation['series']
        })

    print("Observations  %s %d" %(series_code,obs))
    if exceptions > 0:
        print("Exceptions %s %d" %(series_code,exceptions))

    return params #list of dictionaries per series code

### Collect the data from the API

Collect all the data from the API per goal (defining the parameter "goal") and store them in a json file under the folder "Data/2.UN_observations".
We make asynchronous requests for each goal, to collect the data of all its series. Due to the size of the datasets, the asynchronous requests are made in batches of 5 to prevent server request timeout.

In [17]:
def indicators_for_goal(tx, statement, goal):
    """Run ``statement`` with ``$goal`` bound to ``goal`` and return the first row's ``i_codes``.

    Raises ``IndexError`` when the query yields no rows.
    """
    rows = tx.run(statement, goal=goal).data()
    return rows[0]['i_codes']

def series_for_indicator(tx, statement, indicator):
    """Run ``statement`` with ``$indicator`` bound to ``indicator`` and return the first row's ``s_codes``.

    Raises ``IndexError`` when the query yields no rows.
    """
    rows = tx.run(statement, indicator=indicator).data()
    return rows[0]['s_codes']

# store list of indicators per goal so Series data can be passed later
# The queries get their own names: `statement_goal` and `statement_indicator`
# already hold key-constraint statements that are part of the `statements` list,
# and rebinding them here would make a re-run of the constraints cell execute
# these MATCH queries instead.
indicators_per_goal = {}
statement_goal_indicators = """
    MATCH (:Goal {code:$goal})-[:HAS_TARGET]->(:Target)-[:HAS_INDICATOR]->(i:Indicator)
    WITH COLLECT(DISTINCT i.code) as i_codes
    RETURN i_codes
"""
# store list of series per indicator
series_per_indicator = {}
statement_indicator_series = """
    MATCH (:Indicator {code:$indicator})-[:HAS_SERIES]->(s:Series)
    WITH COLLECT(DISTINCT s.code) as s_codes
    RETURN s_codes
"""

with driver.session(database=database_name) as session:
    # build mapping: goal -> indicators
    for i in range(1, 18):
        indicators_per_goal[str(i)] = session.execute_read(
            indicators_for_goal,
            statement=statement_goal_indicators,
            goal=str(i)
        )

    # build mapping: indicator -> series (only the indicator lists are needed here)
    for indicators in indicators_per_goal.values():
        for indicator in indicators:
            series_per_indicator[indicator] = session.execute_read(
                series_for_indicator,
                statement=statement_indicator_series,
                indicator=indicator
            )

# invert series_per_indicator
# NOTE: a series code listed under several indicators keeps only the indicator
# visited last, because each assignment overwrites the previous one.
series_to_indicator = {}
for indicator, series_list in series_per_indicator.items():
    for s_code in series_list:
        series_to_indicator[s_code] = indicator

# NOTE: this redefines `metadata_mapping` from an earlier cell with the same behaviour.
def metadata_mapping(metadata) :
    """Return one ``{code: description}`` dictionary covering every entry of ``metadata``.

    Each element of ``metadata`` must provide a ``"codes"`` list whose items have
    ``"code"`` and ``"description"`` keys; later duplicates overwrite earlier ones.
    """
    mapping = {}
    for group in metadata:
        mapping.update((item["code"], item["description"]) for item in group["codes"])
    return mapping

In [18]:
for goal in range(1,18):
    st = time.time()
    print("Goal", str(goal))
    params = []
    batch_size = 5  # can be changed
    series_codes = series_per_goal[str(goal)]

    async with aiohttp.ClientSession() as session: 
        for i in range(0, len(series_codes), batch_size):
            batch = series_codes[i:i + batch_size]
            tasks = []

            for code in batch:
                indicator_code = series_to_indicator.get(code)
                
                async def fetch_with_indicator(s_code=code, i_code=indicator_code):
                    obs_list = await get_observations(session, s_code, geocodes)
                    for obs in obs_list:
                        obs['indicator'] = i_code
                    return obs_list

                tasks.append(asyncio.ensure_future(fetch_with_indicator()))

            batch_result = await asyncio.gather(*tasks)
            params.extend(batch_result)

    et = time.time()
    elapsed_time = et - st  # get the execution time
    print('Collect data :', elapsed_time/60, 'minutes')

    filename = f"Data/2.UN_observations/observations{goal}.json"
    with open(filename, "w") as outfile:
        json.dump(params, outfile, indent=2)
    print(filename, "created")


Goal 1
Total elements of SI_COV_DISAB: 587 in 24 pages
Total elements of SI_COV_MATNL: 135 in 6 pages
Total elements of SI_COV_SOCINS: 160 in 7 pages
Total elements of SI_COV_CHLD: 277 in 12 pages
Total elements of SI_COV_WKINJRY: 152 in 7 pages
Observations  SI_COV_MATNL 135
Observations  SI_COV_WKINJRY 152
Observations  SI_COV_SOCINS 160
Observations  SI_COV_CHLD 277
Observations  SI_COV_DISAB 587
Total elements of SI_COV_LMKT: 96 in 4 pages
Total elements of SI_COV_SOCAST: 158 in 7 pages
Total elements of SI_COV_UEMP: 595 in 24 pages
Total elements of SI_COV_POOR: 59 in 3 pages
Total elements of SI_COV_PENSN: 518 in 21 pages
Observations  SI_COV_LMKT 96
Observations  SI_COV_POOR 59
Observations  SI_COV_SOCAST 158
Observations  SI_COV_PENSN 518
Observations  SI_COV_UEMP 595
Total elements of DC_ODA_POVG: 0 in 0 pages
Total elements of DC_ODA_POVDLG: 529 in 22 pages
Total elements of DC_ODA_POVLG: 190 in 8 pages
Total elements of SI_COV_BENFTS: 178 in 8 pages
Total elements of SI_COV_

In [19]:
# Record when this cell was run (datetime.now() gives the local time of this machine)
now = datetime.now()
print("API was last accessed at:", now)

API was last accessed at: 2025-09-17 18:12:56.337272


### Import data to the SustainGraph using the JSON file per goal

In [20]:
def write_obs(tx,statement, params):
    """Run ``statement`` with ``params`` bound to ``$parameters`` and return the first row's ``total``.

    Raises ``IndexError`` when the query yields no rows.
    """
    query_parameters = {"parameters": params}
    rows = tx.run(statement, parameters=query_parameters).data()
    return rows[0]['total']

In [None]:
# For every exported row: find the GeoArea and Series, upsert the SeriesMetadata
# node, upsert the Observation behind a keyed HAS_OBSERVATION relationship, set
# its value and link it to the area and the indicator.
statement = """ 
    UNWIND $parameters as row
    MATCH (ga:GeoArea),(s:Series{code:row.s_code})
    WHERE row.geo = ga.M49code
    MATCH (src:Source{name:'UN_SDG'})-[:COMES_FROM]-(i:Indicator)-[:HAS_SERIES]->(s)
    MERGE (s)-[:HAS_METADATA]->(sm:SeriesMetadata{attributesCode:row.att_codes,attributesDescription:row.att_desc,
                            dimensionsCode:row.dim_codes,dimensionsDescription:row.dim_desc,seriesCode:row.s_code})
    MERGE (sm)-[:HAS_OBSERVATION{attributesCode:row.att_codes,dimensionsCode:row.dim_codes,
                                seriesCode:row.s_code,time:date(row.year),
                                geoCode:row.geo}]->(o:Observation{time:date(row.year)})
    SET o.value = toFloat(row.value)
    MERGE (o)-[:REFERS_TO_AREA]->(ga)
    MERGE (i) -[:HAS_OBSERVATIONS]->(ga)
    RETURN COUNT(DISTINCT o) as total
"""

total_start = time.time()

for goal in range(1,18):
    print("Goal ", goal)
    # Portable path (the writer cell uses forward slashes; hard-coded backslashes
    # only resolve on Windows).
    observations_path = os.path.join(os.getcwd(), 'Data', '2.UN_observations', 'observations'+str(goal)+'.json')
    with open(observations_path) as f:  # closed as soon as the file is parsed
        dataToImport = json.load(f)
    print("Start importing to neo4j")
    st = time.time()
    with driver.session(database=database_name) as session:
        # One write transaction per series (each `p` is the list of rows of one series)
        for p in dataToImport:
            results = session.execute_write(write_obs,statement, p)
    # Taken after the loop so it is always defined, even when the file holds no series
    et = time.time()

    elapsed_time = et - st
    print('Import data :', elapsed_time/60, 'minutes')

total_end = time.time()
total_elapsed = total_end - total_start
print('Total import time: ', total_elapsed/60, 'minutes')

Goal  1
Start importing to neo4j


  print('\Total import time: ', total_elapsed/60, 'minutes')


Import data : 0.487278946240743 minutes
Goal  2
Start importing to neo4j
Import data : 0.39811888535817463 minutes
Goal  3
Start importing to neo4j
Import data : 0.6892135620117188 minutes
Goal  4
Start importing to neo4j
Import data : 1.8750510255495707 minutes
Goal  5
Start importing to neo4j
Import data : 0.16781112353007 minutes
Goal  6
Start importing to neo4j
Import data : 0.4870819807052612 minutes
Goal  7
Start importing to neo4j
Import data : 0.37057942946751915 minutes
Goal  8
Start importing to neo4j
Import data : 2.437811227639516 minutes
Goal  9
Start importing to neo4j
Import data : 0.24255380630493165 minutes
Goal  10
Start importing to neo4j
Import data : 0.5163045287132263 minutes
Goal  11
Start importing to neo4j
Import data : 0.4132896264394124 minutes
Goal  12
Start importing to neo4j
Import data : 2.356424033641815 minutes
Goal  13
Start importing to neo4j
Import data : 0.20491911172866822 minutes
Goal  14
Start importing to neo4j
Import data : 0.201362939675649 mi

> Check cypher query

Checking for observations that weren't imported properly to the graph ('mismatches' between graph and json files from UN database)

In [22]:
# Count, for the rows of one series, how many HAS_OBSERVATION relationships with
# exactly these key properties exist in the graph.
statement = """
UNWIND $parameters AS row
MATCH (sm:SeriesMetadata {
    attributesCode: row.att_codes,
    dimensionsCode: row.dim_codes,
    seriesCode: row.s_code
})-[:HAS_OBSERVATION {
    attributesCode: row.att_codes,
    dimensionsCode: row.dim_codes,
    seriesCode: row.s_code,
    geoCode: row.geo,
    time: date(row.year)
}]->(o:Observation)
RETURN count((o)) AS total
"""

check_data_un = []

for goal in range(1, 18):
    # Portable path (hard-coded backslashes only resolve on Windows)
    observations_path = os.path.join(os.getcwd(), 'Data', '2.UN_observations', 'observations'+str(goal)+'.json')
    with open(observations_path) as f:
        dataToImport = json.load(f)

    with driver.session(database=database_name) as session:
        for p in dataToImport:
            expected_count = len(p)
            # The query only reads, so it runs in a read transaction like the other
            # lookups in this notebook. `write_obs` is reused here only as a
            # "run the statement and return `total`" helper.
            results = session.execute_read(write_obs, statement, p)

            if (len(p)!=0): #not empty observations
                # The descriptive columns come from the first row of the series
                first_entry = p[0]
                s_code = first_entry['s_code']
                att_code = first_entry['att_codes']
                dim_code = first_entry['dim_codes']
                indicator_code = first_entry["indicator"]

                check_data_un.append({
                    "indicator": indicator_code,
                    "series": s_code,
                    "dim_code": dim_code,
                    "att_code": att_code,
                    "seriesmetadata": f"{dim_code}|{att_code}",
                    "to_import": expected_count,
                    "imported": results
                })

check_df_un = pd.DataFrame(check_data_un)

In [23]:
# One row per non-empty series: rows expected from the JSON export ("to_import") vs. rows found in the graph ("imported")
check_df_un

Unnamed: 0,indicator,series,dim_code,att_code,seriesmetadata,to_import,imported
0,1.3.1,SI_COV_WKINJRY,15+|BOTHSEX|G,E|PERCENT|A,15+|BOTHSEX|G|E|PERCENT|A,152,152
1,1.3.1,SI_COV_CHLD,<15Y|BOTHSEX|G,E|PERCENT|A,<15Y|BOTHSEX|G|E|PERCENT|A,277,277
2,1.3.1,SI_COV_MATNL,15-49|FEMALE|G,E|PERCENT|A,15-49|FEMALE|G|E|PERCENT|A,135,135
3,1.3.1,SI_COV_SOCINS,G|_T,CA|PERCENT,G|_T|CA|PERCENT,160,160
4,1.3.1,SI_COV_DISAB,BOTHSEX|G,E|PERCENT|A,BOTHSEX|G|E|PERCENT|A,587,587
...,...,...,...,...,...,...,...
688,17.2.1,DC_ODA_SIDS,G,C|CON_USD_M,G|C|CON_USD_M,743,743
689,17.19.1,SG_STT_CAPTY,G,G|CU_USD,G|G|CU_USD,103,103
690,17.19.2,SG_REG_CENSUSN,G,C|NUMBER,G|C|NUMBER,49,49
691,17.19.2,SG_REG_DETH75N,G,C|NUMBER,G|C|NUMBER,358,358


In [None]:
# Grand totals over all series, followed by the series whose two counts disagree
total_observations_imported = check_df_un['imported'].sum()
total_observations_to_import = check_df_un['to_import'].sum()
print("Total UN SDG observations imported in SustainGraph", total_observations_imported, ", expected:",total_observations_to_import)

counts_differ = check_df_un['to_import'] != check_df_un['imported']
mismatches = check_df_un[counts_differ]
if len(mismatches) != 0:
    print("\nMismatches:\n", mismatches)
else:
    print("Mismatches: 0")

Total UN SDG observations imported in SustainGraph  834104 , expected: 834104
Mismatches: 0
