In [1]:
import pandas as pd
from neo4j import GraphDatabase, basic_auth
import os
from dotenv import load_dotenv
from pathlib import Path
from functions import import_sm_obs

# Connect to the SustainGraph

In [2]:
# Load the environment variables from the .env file in the user's home directory.
# pathlib does not expand "~" by itself: without expanduser() the path points at
# a literal "~" directory under the working directory, the file is not found and
# load_dotenv() silently loads nothing.
dotenv_path = Path('~/.env').expanduser()
load_dotenv(dotenv_path=dotenv_path)  # brings the variables from .env into os.environ

# Get variables
SUSTAINGRAPH_URI = os.getenv('SUSTAINGRAPH_URI')
SUSTAINGRAPH_USER = os.getenv('SUSTAINGRAPH_USER')
SUSTAINGRAPH_PASSWORD = os.getenv('SUSTAINGRAPH_PASSWORD')
database_name = os.getenv('DATABASE_NAME')

# Fail early with a readable message instead of an obscure driver error
if not SUSTAINGRAPH_URI:
    raise RuntimeError(
        f"SUSTAINGRAPH_URI is not set (looked in the environment and in {dotenv_path})"
    )

# Connect to database
driver = GraphDatabase.driver(SUSTAINGRAPH_URI, auth=(SUSTAINGRAPH_USER, SUSTAINGRAPH_PASSWORD))

# Verify connectivity by running a trivial query against the target database
with driver.session(database=database_name) as session:
    print(session.run("RETURN 'Connected to ' + $db", db=database_name).single()[0])

Connected to neo4j


# Get data for different indicators

In [3]:
# Only the sheet names are needed here; the context manager closes the
# workbook handle instead of leaving it open for the rest of the session.
with pd.ExcelFile('Data/7.Case_Studies_Indicators/meteorological_florina_noa.xlsx') as workbook:
    sheet_names = workbook.sheet_names

In [4]:
def _read_indicator_sheet(sheet_name):
    """Load one worksheet, name its leading column 'Year' and drop rows without a year.

    Rows whose first column is empty cannot be turned into dates later on, so
    they are removed for every sheet, not only for the temperature one.
    """
    sheet = pd.read_excel('Data/7.Case_Studies_Indicators/meteorological_florina_noa.xlsx',
                          sheet_name=sheet_name)
    sheet = sheet.rename(columns={'Unnamed: 0': 'Year'})
    return sheet[sheet.iloc[:, 0].notna()]


# The sheets are picked by position: 0 temperature, 1 precipitation,
# 2 wind speed, 3 rainfall days.
df_temperature = _read_indicator_sheet(sheet_names[0])

df_precipitation = _read_indicator_sheet(sheet_names[1])

df_windspeed = _read_indicator_sheet(sheet_names[2])

df_rainfall = _read_indicator_sheet(sheet_names[3])

# Functions

In [5]:
def preprocess_cs4_data(df,att_code,dim_code,att_desc,dim_desc,geo='EL533'):
    """Reshape a wide year-by-month table into the long observation layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with a 'Year' column and one column per month, labelled
        with the integers 1 to 12. The frame itself is not modified.
    att_code, att_desc : str
        Attribute code and description, copied to every row as
        'att_codes' and 'att_desc'.
    dim_code, dim_desc : str
        Dimension code and description, copied to every row as
        'dim_codes' and 'dim_desc'.
    geo : str, default 'EL533'
        Geographic area code written to the 'geo' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'time' (first day of the month as a 'YYYY-MM-DD' string),
        'value', 'att_codes', 'att_desc', 'dim_codes', 'dim_desc', 'geo'.
        Month cells without a value are dropped.
    """
    # Rows without a year cannot be dated (and would break the integer cast below)
    df_valid = df[df['Year'].notna()]

    # Step 1: Melt the DataFrame to convert months into rows
    df_melted = pd.melt(df_valid, id_vars=['Year'], value_vars=list(range(1, 13)),
                        var_name='Month', value_name='value')

    # Step 2: Build the first day of each month; the month is zero-padded and the
    # format is given explicitly so parsing does not rely on format inference
    year_str = df_melted['Year'].astype(int).astype(str)
    month_str = df_melted['Month'].astype(int).astype(str).str.zfill(2)
    df_melted['time'] = pd.to_datetime(year_str + '-' + month_str + '-01', format='%Y-%m-%d')

    # Step 3: Keep only the observation columns and discard missing values
    df_final = df_melted[['time', 'value']].dropna().reset_index(drop=True)

    # Step 4: Add Metadata
    df_final['att_codes'] = att_code
    df_final['att_desc'] = att_desc
    df_final['dim_codes'] = dim_code
    df_final['dim_desc'] = dim_desc
    df_final['geo'] = geo

    # Step 5: Serialise the dates as 'YYYY-MM-DD' strings
    df_final['time'] = df_final['time'].dt.strftime('%Y-%m-%d')

    return df_final


In [6]:
def write_tps_indicators(tx, statement, params_dict):
    """Transaction function: run one Cypher statement with its parameter map.

    Intended to be handed to ``session.execute_write``, which supplies ``tx``.
    The query result is not inspected and nothing is returned.
    """
    run_kwargs = {'parameters': params_dict}
    tx.run(statement, **run_kwargs)

# Import mean temperature

In [7]:
# Long-format temperature observations, tagged with their unit and periodicity
df_temperature_import = preprocess_cs4_data(
    df_temperature,
    att_code='DegCel|2m',
    dim_code='M',
    att_desc='Degree Celsius|2 meters above the surface',
    dim_desc='Monthly',
)
df_temperature_import.head(n=1)

Unnamed: 0,time,value,att_codes,att_desc,dim_codes,dim_desc,geo
0,2008-01-01,1.2,DegCel|2m,Degree Celsius|2 meters above the surface,M,Monthly,EL533


In [8]:
# Save the preprocessed temperature observations as 'mean_temp.xlsx' in the
# current working directory.
df_temperature_import.to_excel('mean_temp.xlsx')

In [9]:
# Identity and provenance of the indicator registered by this cell
url = 'https://catalogue.arsinoe-project.eu/dataset/meteorological_florina_noa'
code_name = 'mean_temp'
code_description = 'The mean temperature is measured 2 meters above the surface and provided by the Institute for Environmental Research, National Observatory of Athens (IERSD/NOA) in meteo.gr.'

# Cypher: starting from the existing Source named 'TPS', merge the Indicator,
# its COMES_FROM link, a Series with the same code and the HAS_SERIES link
statement_tps_s_sm = """
        MATCH (so:Source{name:'TPS'})
        MERGE (i:Indicator{code:$ind_code,description:$ind_desc})
        MERGE (i)-[:COMES_FROM]->(so)
        MERGE (s:Series{code:$ind_code,dataProviderURL:$url,
        description:$ind_desc})
        MERGE (s)<-[:HAS_SERIES]-(i)        
        """

indicator_params = {'ind_code': code_name, 'ind_desc': code_description, 'url': url}
with driver.session(database=database_name) as session:
    session.execute_write(
        write_tps_indicators,
        statement=statement_tps_s_sm,
        params_dict=indicator_params,
    )

# Hand the observations to the project helper; presumably it creates the
# SeriesMetadata / Observation nodes for this code (see functions.import_sm_obs)
import_sm_obs(df_temperature_import, code_name, driver=driver, batch_size=10000, geoEUcode=True)


174 observations: Done! (0.0012652039527893066 minutes)


In [10]:
# Check 1: distinct Observation nodes reachable from the indicator through
# Series and SeriesMetadata and linked to a GeoArea, versus the rows submitted
obs_query = """\
       MATCH (i:Indicator)--(s:Series)--(sm:SeriesMetadata)--(o:Observation)--(ga:GeoArea)
        where i.code='mean_temp' 
        return count(DISTINCT o) as obs
        """
records, summary, keys = driver.execute_query(obs_query, routing_="r", database_=database_name)
print("{nodes_created} Observations(expected {expected}) in {time} ms.".format(
    expected=len(df_temperature_import),
    nodes_created=records[0]['obs'],
    time=summary.result_available_after,
))

# Check 2: HAS_OBSERVATIONS relationships between the indicator and GeoArea
# nodes; a single geographic area is expected
rels_query = """\
       MATCH (i:Indicator)-[r:HAS_OBSERVATIONS]-(ga:GeoArea)
        where i.code='mean_temp' 
        return count(DISTINCT r) as rels
        """
records, summary, keys = driver.execute_query(rels_query, routing_="r", database_=database_name)
print("{nodes_created}  HAS_OBSERVATIONS-geoArea(expected {expected}) in {time} ms.".format(
    expected=1,
    nodes_created=records[0]['rels'],
    time=summary.result_available_after,
))

174 Observations(expected 174) in 0 ms.
1  HAS_OBSERVATIONS-geoArea(expected 1) in 0 ms.


# Import precipitation

In [11]:
# Long-format precipitation observations, tagged with their unit and periodicity
df_precipitation_import = preprocess_cs4_data(
    df_precipitation,
    att_code='mm|3h',
    dim_code='M',
    att_desc='Milimetre|3h accumulated precipitation',
    dim_desc='Monthly',
)
df_precipitation_import.head(n=1)

Unnamed: 0,time,value,att_codes,att_desc,dim_codes,dim_desc,geo
0,2008-01-01,3.8,mm|3h,Milimetre|3h accumulated precipitation,M,Monthly,EL533


In [12]:
# Identity and provenance of the indicator registered by this cell
url = 'https://catalogue.arsinoe-project.eu/dataset/meteorological_florina_noa'
code_name = 'mean_precipitation'
code_description = 'Total 3-hr accumulated precipitation in mm provided by the Institute for Environmental Research, National Observatory of Athens (IERSD/NOA) in meteo.gr.'

# Cypher: starting from the existing Source named 'TPS', merge the Indicator,
# its COMES_FROM link, a Series with the same code and the HAS_SERIES link
statement_tps_s_sm = """
        MATCH (so:Source{name:'TPS'})
        MERGE (i:Indicator{code:$ind_code,description:$ind_desc})
        MERGE (i)-[:COMES_FROM]->(so)
        MERGE (s:Series{code:$ind_code,dataProviderURL:$url,
        description:$ind_desc})
        MERGE (s)<-[:HAS_SERIES]-(i)        
        """

indicator_params = {'ind_code': code_name, 'ind_desc': code_description, 'url': url}
with driver.session(database=database_name) as session:
    session.execute_write(
        write_tps_indicators,
        statement=statement_tps_s_sm,
        params_dict=indicator_params,
    )

# Hand the observations to the project helper; presumably it creates the
# SeriesMetadata / Observation nodes for this code (see functions.import_sm_obs)
import_sm_obs(df_precipitation_import, code_name, driver=driver, batch_size=10000, geoEUcode=True)


174 observations: Done! (0.0009817560513814291 minutes)


In [21]:
# Check 1: distinct Observation nodes reachable from the indicator through
# Series and SeriesMetadata and linked to a GeoArea, versus the rows submitted
obs_query = """\
       MATCH (i:Indicator)--(s:Series)--(sm:SeriesMetadata)--(o:Observation)--(ga:GeoArea)
        where i.code='mean_precipitation' 
        return count(DISTINCT o) as obs
        """
records, summary, keys = driver.execute_query(obs_query, routing_="r", database_=database_name)
print("{nodes_created} Observations (expected {expected}) in {time} ms.".format(
    expected=len(df_precipitation_import),
    nodes_created=records[0]['obs'],
    time=summary.result_available_after,
))

# Check 2: HAS_OBSERVATIONS relationships between the indicator and GeoArea
# nodes; a single geographic area is expected
rels_query = """\
       MATCH (i:Indicator)-[r:HAS_OBSERVATIONS]-(ga:GeoArea)
        where i.code='mean_precipitation' 
        return count(DISTINCT r) as rels
        """
records, summary, keys = driver.execute_query(rels_query, routing_="r", database_=database_name)
print("{nodes_created} HAS_OBSERVATIONS-geoArea (expected {expected}) in {time} ms.".format(
    expected=1,
    nodes_created=records[0]['rels'],
    time=summary.result_available_after,
))

174 Observations (expected 174) in 1 ms.
1 HAS_OBSERVATIONS-geoArea (expected 1) in 1 ms.


# Import wind speed

In [14]:
# Long-format wind-speed observations, tagged with their unit and periodicity
df_windspeed_import = preprocess_cs4_data(
    df_windspeed,
    att_code='m per s|10m',
    dim_code='M',
    att_desc='Meters per second|wind at 10 m',
    dim_desc='Monthly',
)
df_windspeed_import.head(n=1)

Unnamed: 0,time,value,att_codes,att_desc,dim_codes,dim_desc,geo
0,2008-01-01,2.9,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533


In [15]:
# Display the whole wind-speed frame to inspect the first and last periods
# (rows come out grouped by month, then by year).
df_windspeed_import

Unnamed: 0,time,value,att_codes,att_desc,dim_codes,dim_desc,geo
0,2008-01-01,2.9,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533
1,2009-01-01,2.4,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533
2,2010-01-01,4.2,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533
3,2011-01-01,2.2,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533
4,2012-01-01,3.3,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533
...,...,...,...,...,...,...,...
169,2017-12-01,4.5,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533
170,2018-12-01,3.3,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533
171,2019-12-01,3.4,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533
172,2020-12-01,2.1,m per s|10m,Meters per second|wind at 10 m,M,Monthly,EL533


In [16]:
# Identity and provenance of the indicator registered by this cell
# NOTE(review): the description mentions Beaufort while the observations are
# tagged 'm per s|10m' during preprocessing - confirm which unit is correct.
url = 'https://catalogue.arsinoe-project.eu/dataset/meteorological_florina_noa'
code_name = 'mean_wind_speed'
code_description = 'Wind at 10 m (Beaufort) provided by the Institute for Environmental Research, National Observatory of Athens (IERSD/NOA) in meteo.gr.'

# Cypher: starting from the existing Source named 'TPS', merge the Indicator,
# its COMES_FROM link, a Series with the same code and the HAS_SERIES link
statement_tps_s_sm = """
        MATCH (so:Source{name:'TPS'})
        MERGE (i:Indicator{code:$ind_code,description:$ind_desc})
        MERGE (i)-[:COMES_FROM]->(so)
        MERGE (s:Series{code:$ind_code,dataProviderURL:$url,
        description:$ind_desc})
        MERGE (s)<-[:HAS_SERIES]-(i)        
        """

indicator_params = {'ind_code': code_name, 'ind_desc': code_description, 'url': url}
with driver.session(database=database_name) as session:
    session.execute_write(
        write_tps_indicators,
        statement=statement_tps_s_sm,
        params_dict=indicator_params,
    )

# Hand the observations to the project helper; presumably it creates the
# SeriesMetadata / Observation nodes for this code (see functions.import_sm_obs)
import_sm_obs(df_windspeed_import, code_name, driver=driver, batch_size=10000, geoEUcode=True)


174 observations: Done! (0.0008516550064086914 minutes)


In [22]:
# Check 1: distinct Observation nodes reachable from the indicator through
# Series and SeriesMetadata and linked to a GeoArea, versus the rows submitted
obs_query = """\
       MATCH (i:Indicator)--(s:Series)--(sm:SeriesMetadata)--(o:Observation)--(ga:GeoArea)
        where i.code='mean_wind_speed' 
        return count(DISTINCT o) as obs
        """
records, summary, keys = driver.execute_query(obs_query, routing_="r", database_=database_name)
print("{nodes_created} Observations (expected {expected}) in {time} ms.".format(
    expected=len(df_windspeed_import),
    nodes_created=records[0]['obs'],
    time=summary.result_available_after,
))

# Check 2: HAS_OBSERVATIONS relationships between the indicator and GeoArea
# nodes; a single geographic area is expected
rels_query = """\
       MATCH (i:Indicator)-[r:HAS_OBSERVATIONS]-(ga:GeoArea)
        where i.code='mean_wind_speed' 
        return count(DISTINCT r) as rels
        """
records, summary, keys = driver.execute_query(rels_query, routing_="r", database_=database_name)
print("{nodes_created} HAS_OBSERVATIONS-geoArea (expected {expected}) in {time} ms.".format(
    expected=1,
    nodes_created=records[0]['rels'],
    time=summary.result_available_after,
))

174 Observations (expected 174) in 1 ms.
1 HAS_OBSERVATIONS-geoArea (expected 1) in 1 ms.


# Import days of rainfall

In [18]:
# Long-format rainfall-day counts, tagged with their unit and periodicity
df_rainfall_import = preprocess_cs4_data(
    df_rainfall,
    att_code='NR',
    dim_code='M',
    att_desc='Number',
    dim_desc='Monthly',
)
df_rainfall_import.head(n=1)

Unnamed: 0,time,value,att_codes,att_desc,dim_codes,dim_desc,geo
0,2008-01-01,4.0,NR,Number,M,Monthly,EL533


In [19]:
# Identity and provenance of the indicator registered by this cell
url = 'https://catalogue.arsinoe-project.eu/dataset/meteorological_florina_noa'
code_name = 'number_of_days_of_rainfall'
code_description = 'Number of days of rainfall provided by the Institute for Environmental Research, National Observatory of Athens (IERSD/NOA) in meteo.gr.'

# Cypher: starting from the existing Source named 'TPS', merge the Indicator,
# its COMES_FROM link, a Series with the same code and the HAS_SERIES link
statement_tps_s_sm = """
        MATCH (so:Source{name:'TPS'})
        MERGE (i:Indicator{code:$ind_code,description:$ind_desc})
        MERGE (i)-[:COMES_FROM]->(so)
        MERGE (s:Series{code:$ind_code,dataProviderURL:$url,
        description:$ind_desc})
        MERGE (s)<-[:HAS_SERIES]-(i)        
        """

indicator_params = {'ind_code': code_name, 'ind_desc': code_description, 'url': url}
with driver.session(database=database_name) as session:
    session.execute_write(
        write_tps_indicators,
        statement=statement_tps_s_sm,
        params_dict=indicator_params,
    )

# Hand the observations to the project helper; presumably it creates the
# SeriesMetadata / Observation nodes for this code (see functions.import_sm_obs)
import_sm_obs(df_rainfall_import, code_name, driver=driver, batch_size=10000, geoEUcode=True)


174 observations: Done! (0.001319583257039388 minutes)


In [23]:
# Check 1: distinct Observation nodes reachable from the indicator through
# Series and SeriesMetadata and linked to a GeoArea, versus the rows submitted
obs_query = """\
       MATCH (i:Indicator)--(s:Series)--(sm:SeriesMetadata)--(o:Observation)--(ga:GeoArea)
        where i.code='number_of_days_of_rainfall' 
        return count(DISTINCT o) as obs
        """
records, summary, keys = driver.execute_query(obs_query, routing_="r", database_=database_name)
print("{nodes_created} Observations (expected {expected}) in {time} ms.".format(
    expected=len(df_rainfall_import),
    nodes_created=records[0]['obs'],
    time=summary.result_available_after,
))

# Check 2: HAS_OBSERVATIONS relationships between the indicator and GeoArea
# nodes; a single geographic area is expected
rels_query = """\
       MATCH (i:Indicator)-[r:HAS_OBSERVATIONS]-(ga:GeoArea)
        where i.code='number_of_days_of_rainfall' 
        return count(DISTINCT r) as rels
        """
records, summary, keys = driver.execute_query(rels_query, routing_="r", database_=database_name)
print("{nodes_created} HAS_OBSERVATIONS-geoArea (expected {expected}) in {time} ms.".format(
    expected=1,
    nodes_created=records[0]['rels'],
    time=summary.result_available_after,
))

174 Observations (expected 174) in 6 ms.
1 HAS_OBSERVATIONS-geoArea (expected 1) in 0 ms.
