In [1]:
import eurostat
from neo4j import GraphDatabase, basic_auth
import neo4j
import pandas as pd
import numpy as np
import time
import pycountry
import os
from dotenv import load_dotenv
from pathlib import Path
import glob
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

from functions import import_sm_obs,import_happiness_score,preprocessing_happiness,country_code_converter,preprocessing_eurostat_data

In [2]:
# Load the environment variables.
# pathlib does not expand "~" by itself: Path('~/.env') would point at a
# directory literally named "~" relative to the working directory, so the
# .env file in the user's home directory would not be found.
dotenv_path = Path('~/.env').expanduser()
load_dotenv(dotenv_path=dotenv_path)  # This line brings all environment variables from .env into os.environ

# Get variables
SUSTAINGRAPH_URI = os.getenv('SUSTAINGRAPH_URI')
SUSTAINGRAPH_USER = os.getenv('SUSTAINGRAPH_USER')
SUSTAINGRAPH_PASSWORD = os.getenv('SUSTAINGRAPH_PASSWORD')
database_name = os.getenv('DATABASE_NAME')

# Connect to database
driver = GraphDatabase.driver(SUSTAINGRAPH_URI, auth=(SUSTAINGRAPH_USER, SUSTAINGRAPH_PASSWORD))

# Verify connectivity: run a trivial query against the configured database
# and print the message it returns.
with driver.session(database=database_name) as session:
    print(session.run("RETURN 'Connected to ' + $db", db=database_name).single()[0])

Connected to neo4j


### Write batch function

In [3]:
def write_tps_indicators(tx, statement, params_dict):
    """Run one parameterised Cypher statement inside a transaction.

    Used in this notebook as the unit of work handed to
    ``session.execute_write``.

    Args:
        tx: Transaction object exposing a ``run`` method.
        statement: Cypher text to execute.
        params_dict: Mapping forwarded to ``tx.run`` as ``parameters``.

    Returns:
        None. The result of ``tx.run`` is not returned to the caller.
    """
    tx.run(statement, parameters=params_dict)

Since we have populated the graph with the EU and UN SDGs, it's time to enrich the Knowledge Graph with data coming from other sources, such as the World Happiness Report.
> Create TPS Source node

Before importing indicators coming from third party sources, the Source node with property name: "TPS" is created and all these indicators will be connected with this node through the relationship 'COMES_FROM'.

In [4]:
# Make sure the Source node named 'TPS' exists (MERGE matches it or creates it).
records, summary, keys = driver.execute_query("""\
        MERGE (s:Source{name:'TPS'})
        RETURN count(s) as Source_Data
        """)

# Report the number of nodes created by this run and the value of
# summary.result_available_after.
creation_report = "Created {nodes_created} nodes in {time} ms."
print(creation_report.format(
    time=summary.result_available_after,
    nodes_created=summary.counters.nodes_created,
))

Created 1 nodes in 1 ms.


### Happiness Score indicator

#### World Happiness Report

The [World Happiness Report](https://worldhappiness.report) is a publication of the Sustainable Development Solutions Network, powered by the Gallup World Poll data. It uses global survey data to report how people evaluate their own lives in more than 150 countries worldwide.
The global survey data are stored in Excel files, which we are going to import into Neo4j with the help of the official `neo4j` Python driver.

#### Pycountry Library

Before importing the data, the country names must be mapped to their ISO alpha-3 codes. The ISO alpha-3 code is unique for each country, while country names may differ between data sources. [pycountry](https://pypi.org/project/pycountry/) is a Python library that provides the ISO databases for countries, subdivisions of countries, etc.

In [5]:
path = 'Data/4.TPS_Happiness_Score'
# glob.glob() returns its matches in arbitrary (platform-dependent) order, but
# the files are selected by position below, so sort to make positions stable.
# NOTE(review): assumes the file names sort chronologically
# (index 0 -> 2020, 1 -> 2021, 2 -> 2022) — confirm against the data folder.
xls_files = sorted(glob.glob(os.path.join(path, "*.xls")))

# Preprocess each yearly report. The 2022 file labels the dystopia column
# 'Dystopia (1.83) + residual'; rename it to the label used by series_encoding.
df_2022 = preprocessing_happiness(xls_files[2], '2022', driver=driver).rename(
    columns={'Dystopia (1.83) + residual': 'Dystopia + residual'}
)
df_2021 = preprocessing_happiness(xls_files[1], '2021', driver=driver)
df_2020 = preprocessing_happiness(xls_files[0], '2020', driver=driver)

Before update: No codes of countries: ['Kosovo', 'North Cyprus', 'Russia', 'Hong Kong S.A.R. of China', 'Ivory Coast', 'Turkey', 'Palestinian Territories', 'Eswatini, Kingdom of']

After update: No codes of countries: ['Kosovo', 'North Cyprus', 'Turkey', 'Eswatini, Kingdom of']
Length (with NA values): 146
Length (without NA values): 142
Length of df after selecting only countries existing in neo4j SustainGraph: 40
---------------------------------------------------------------------------
Before update: No codes of countries: ['Taiwan Province of China', 'Kosovo', 'North Cyprus', 'Russia', 'Hong Kong S.A.R. of China', 'Congo (Brazzaville)', 'Ivory Coast', 'Turkey', 'Palestinian Territories', 'Swaziland']

After update: No codes of countries: ['Taiwan Province of China', 'Kosovo', 'North Cyprus', 'Turkey', 'Eswatini, Kingdom of']
Length (with NA values): 149
Length (without NA values): 145
Length of df after selecting only countries existing in neo4j SustainGraph: 40
------------------

In [6]:
# Column label in the happiness data -> code passed on to the import.
series_encoding = {
    'Happiness score': 'happiness_score',
    'Dystopia + residual': 'happiness_score_dystopia',
    'Explained by: GDP per capita': 'happiness_score_gdp',
    'Explained by: Log GDP per capita': 'happiness_score_log_gdp',
    'Explained by: Social support': 'happiness_score_social_support',
    'Explained by: Healthy life expectancy': 'happiness_score_life_expectancy',
    'Explained by: Freedom to make life choices': 'happiness_score_life_choices',
    'Explained by: Generosity': 'happiness_score_generosity',
    'Explained by: Perceptions of corruption': 'happiness_score_corruption',
}

# Import the three yearly frames, oldest first.
happiness_frames_by_year = (
    ('2020', df_2020),
    ('2021', df_2021),
    ('2022', df_2022),
)
for happiness_year, happiness_df in happiness_frames_by_year:
    import_happiness_score(
        df=happiness_df,
        year=happiness_year,
        series_encoding=series_encoding,
        batch_size=10000,
        driver=driver,
    )

2020
happiness_score
39 observations: Done! (0.0022367556889851888 minutes)
happiness_score_log_gdp
39 observations: Done! (0.0005583683649698893 minutes)
happiness_score_social_support
39 observations: Done! (0.0002621054649353027 minutes)
happiness_score_life_expectancy
39 observations: Done! (0.00026175578435262047 minutes)
happiness_score_life_choices
39 observations: Done! (0.00033391714096069335 minutes)
happiness_score_generosity
39 observations: Done! (0.000526277224222819 minutes)
happiness_score_corruption
39 observations: Done! (0.0004932959874471029 minutes)
happiness_score_dystopia
39 observations: Done! (0.0005353768666585286 minutes)
2021
happiness_score
40 observations: Done! (0.0005486249923706054 minutes)
happiness_score_log_gdp
40 observations: Done! (0.0005272229512532552 minutes)
happiness_score_social_support
40 observations: Done! (0.0010077277819315591 minutes)
happiness_score_life_expectancy
40 observations: Done! (0.0005302230517069498 minutes)
happiness_score

> Check cypher query

In [7]:
# Sanity check 1: count the distinct happiness observations stored for each
# imported year and print them next to the expected count.
# The year is passed as a query parameter so every check really queries the
# year it reports (the 2021 check used to query 2022 by mistake).
observation_count_query = """\
       MATCH (i:Indicator)--(s:Series)--(sm:SeriesMetadata)--(o:Observation)--(ga:GeoArea)
        where i.code='happiness_score' and o.time.year=$year
        return count(DISTINCT o) as obs
        """
# (year, expected observation count); the expected values are unchanged from
# the original per-year checks.
expected_observations_by_year = (
    (2022, 40*8),
    (2021, 40*8),
    (2020, 39*8),
)
for check_year, expected_obs in expected_observations_by_year:
    # Extra keyword arguments of execute_query are sent as query parameters;
    # routing_="r" marks the query as a read.
    records, summary, keys = driver.execute_query(
        observation_count_query, year=check_year, routing_="r")
    print("{nodes_created} Hapiness Observations(expected {expected} {year}) in {time} ms.".format(
        nodes_created=records[0]['obs'],
        time=summary.result_available_after,
        expected=expected_obs,
        year=check_year,
    ))

# Sanity check 2: count the distinct Series attached to the happiness indicator.
records, summary, keys = driver.execute_query("""\
       MATCH (i:Indicator)--(s:Series)
        where i.code='happiness_score'
        return count(DISTINCT s) as s
        """,routing_="r")
print("{nodes_created} Hapiness Series(expected {expected} 2020) in {time} ms.".format(
    nodes_created=records[0]['s'],
    time=summary.result_available_after,
    expected =9
))

320 Hapiness Observations(expected 320 2022) in 53 ms.
320 Hapiness Observations(expected 320 2021) in 0 ms.
312 Hapiness Observations(expected 312 2020) in 20 ms.
9 Hapiness Series(expected 9 2020) in 28 ms.


### Press Freedom Index
Reporters Without Borders is an international non-profit and non-governmental organization with the stated aim of safeguarding the right to freedom of information. The purpose of the [World Press Freedom Index](https://rsf.org/en/index) is to compare the level of press freedom enjoyed by journalists and media in 180 countries and territories.

#### Read files
The CSV files containing the Press Freedom Index data for 2013 to 2022 are available in the "4.TPS_Press_Freedom_Index" folder. In the script below, we read the files and concatenate them into a single pandas DataFrame.

In [8]:
# Get path of folder containing press freedom index
path = 'Data/4.TPS_Press_Freedom_Index'

# Get csv files; sorted because glob.glob() returns matches in arbitrary order.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

all_df = []
for f in csv_files:
    # os.path.basename uses the path separator of the current platform, so the
    # file name is extracted correctly outside Windows as well.
    file_name = os.path.basename(f)

    # read the csv file
    df = pd.read_csv(f, sep=';')
    print('File Name:', file_name)

    # Different columns in 2022 csv file: align them with the other years.
    if file_name == '2022.csv':
        df = df[['Year (N)', 'ISO', 'Rank', 'Score']].rename(
            columns={'Rank': 'Rank N', 'Score': 'Score N'}
        )
    else:
        df = df[['Year (N)', 'ISO', 'Rank N', 'Score N']]

    all_df.append(df)

# Concatenate all dfs and add the constant attribute/dimension columns.
frame = pd.concat(all_df, axis=0, ignore_index=True)
frame['att_codes'] = 'IND'
frame['att_desc'] = 'Index'
frame['dim_codes'] ='NA'
frame['dim_desc'] = 'Not available'

# Keep only the rows containing values of countries existing in the neo4j LPG model.
records, summary, keys = driver.execute_query("""
        MATCH (r:Region{name:'Europe'})-[:HAS_SUBREGION]->(sr:SubRegion)-[:HAS_AREA]->(a:Area)
        MATCH (eu:EuropeanUnion)<-[:BELONGS_TO]-(eua:Area)
        WITH COLLECT(DISTINCT a.ISOalpha3code)+COLLECT(DISTINCT eua.ISOalpha3code) as geocodes
        UNWIND geocodes as codes
        RETURN COLLECT(DISTINCT codes) as geocodes
        """,routing_="r")
available_neo4j_geocodes = records[0]['geocodes']

# Filter and rename into a new frame instead of mutating a slice in place.
frame = (
    frame.loc[frame['ISO'].isin(available_neo4j_geocodes)]
    .rename(columns={'Year (N)': 'time', 'Score N': 'value', 'ISO': 'geo'})
)
# Swap decimal commas for dots. Assign the result back explicitly: an in-place
# replace on a selected column is not guaranteed to update the frame.
frame['value'] = frame['value'].replace(',', '.', regex=True)

File Name: 2013.csv
File Name: 2014.csv
File Name: 2015.csv
File Name: 2016.csv
File Name: 2017.csv
File Name: 2018.csv
File Name: 2019.csv
File Name: 2020.csv
File Name: 2021.csv
File Name: 2022.csv


In [9]:
# Identification of the indicator/series that is about to be created
code_name = 'press_freedom_index'
code_description= 'Press freedom is defined as the ability of journalists as individuals and collectives to select, produce, and disseminate news in the public interest independent of political, economic, legal, and social interference and in the absence of threats to their physical and mental safety'
url='https://rsf.org/en/index'

# Cypher that builds (Series)<-[:HAS_SERIES]-(Indicator)-[:COMES_FROM]->(Source)
statement_tps_s_sm = """
        MATCH (so:Source{name:'TPS'})
        MERGE (i:Indicator{code:$ind_code,description:$ind_desc})
        MERGE (i)-[:COMES_FROM]->(so)
        MERGE (s:Series{code:$ind_code,dataProviderURL:$url,
        description:$ind_desc})
        MERGE (s)<-[:HAS_SERIES]-(i)        
        """

# Parameters referenced by the statement above
tps_params = {
    'ind_code': code_name,
    'ind_desc': code_description,
    'url': url,
}
with driver.session() as session:
    session.execute_write(
        write_tps_indicators,
        statement=statement_tps_s_sm,
        params_dict=tps_params,
    )

# Load the press freedom rows for the indicator created above.
import_sm_obs(frame, code_name, batch_size=10000, driver=driver, geoEUcode=False)


420 observations: Done! (0.004695018132527669 minutes)


> Check cypher query

In [10]:
# Observations of the press freedom indicator dated 2022
obs_2022_query = """\
       MATCH (i:Indicator)--(s:Series)--(sm:SeriesMetadata)--(o:Observation)--(ga:GeoArea)
        where i.code='press_freedom_index' and o.time.year=2022
        return count(DISTINCT o) as obs
        """

# Observations of the press freedom indicator across all years
all_obs_query = """\
       MATCH (i:Indicator)--(s:Series)--(sm:SeriesMetadata)--(o:Observation)--(ga:GeoArea)
        where i.code='press_freedom_index' 
        return count(DISTINCT o) as obs
        """

# HAS_OBSERVATIONS relationships between the indicator and GeoArea nodes
geo_rel_query = """\
       MATCH (i:Indicator)-[r:HAS_OBSERVATIONS]-(ga:GeoArea)
        where i.code='press_freedom_index' 
        return count(DISTINCT r) as rels
        """

# One entry per check: (query, result column, expected count, message template)
press_freedom_checks = (
    (obs_2022_query, 'obs', 42,
     "{nodes_created} Press freedom Observations(expected {expected} 2022) in {time} ms."),
    (all_obs_query, 'obs', len(frame),
     "{nodes_created} Press freedom Observations(expected {expected}) in {time} ms."),
    (geo_rel_query, 'rels', 42,
     "{nodes_created} Press freedom indicator HAS_OBSERVATIONS-geoArea(expected {expected}) in {time} ms."),
)

for check_query, result_key, expected_count, message in press_freedom_checks:
    records, summary, keys = driver.execute_query(check_query, routing_="r")
    print(message.format(
        nodes_created=records[0][result_key],
        time=summary.result_available_after,
        expected=expected_count,
    ))

42 Press freedom Observations(expected 42 2022) in 7 ms.
420 Press freedom Observations(expected 420) in 7 ms.
42 Press freedom indicator HAS_OBSERVATIONS-geoArea(expected 42) in 4 ms.
