# CS6 Datasets 

In [1]:
import pandas as pd
from neo4j import GraphDatabase, basic_auth
import os
from dotenv import load_dotenv
from pathlib import Path

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the environment variables from the .env file in the user's home directory.
# pathlib does not expand '~' by itself, so expanduser() is needed; without it the
# path refers to a literal '~' directory relative to the current working directory.
dotenv_path = Path('~/.env').expanduser()
load_dotenv(dotenv_path=dotenv_path)  # This line brings all environment variables from .env into os.environ

# Get variables
SUSTAINGRAPH_URI = os.getenv('SUSTAINGRAPH_URI')
SUSTAINGRAPH_USER = os.getenv('SUSTAINGRAPH_USER')
SUSTAINGRAPH_PASSWORD = os.getenv('SUSTAINGRAPH_PASSWORD')

database_name = os.getenv('DATABASE_NAME')

# Connect to database
driver = GraphDatabase.driver(SUSTAINGRAPH_URI, auth=(SUSTAINGRAPH_USER, SUSTAINGRAPH_PASSWORD))

# Verify connectivity by running a trivial query against the configured database
with driver.session(database=database_name) as session:
    print(session.run("RETURN 'Connected to ' + $db", db=database_name).single()[0])

Connected to neo4j


## Queries

In [3]:
#Create Indicator-Series
def write_data(tx,statement, params_dict, database_name='neo4j'):
    """Run a write statement inside a managed transaction.

    Args:
        tx: Transaction object, as handed over by ``session.execute_write``.
        statement: Cypher text to execute.
        params_dict: Query parameters referenced by ``statement``.
        database_name: Kept only for backward compatibility and not used.
            The target database is chosen when the session is opened.

    Returns:
        None.
    """
    # Extra keyword arguments of tx.run() are treated as Cypher query parameters,
    # so the former ``database_=...`` argument never selected a database; it only
    # added an unused query parameter.
    tx.run(statement, parameters=params_dict)

# Cypher statement that creates an Indicator and a Series and links them:
#   (Indicator)-[:COMES_FROM]->(Source {name:'TPS'}) and (Indicator)-[:HAS_SERIES]->(Series)
# Expected parameters: $ind_code, $ind_descr, $series_code, $series_descr, $dataProviderURL.
# Each MERGE matches on all listed properties together, so the same code with a
# different description does not match the existing node.
# The statement starts with a MATCH on the Source node; if no Source named 'TPS'
# exists, the MATCH yields no rows and nothing is written.
query_indicators_series = """
MATCH (src:Source{name:'TPS'})
MERGE (i:Indicator{code:$ind_code,description:$ind_descr})
MERGE (s:Series{code:$series_code,description:$series_descr,dataProviderURL:$dataProviderURL})
MERGE (i)-[:COMES_FROM]->(src)
MERGE (i)-[:HAS_SERIES]->(s)
"""

# CS6 Dataset from Arsinoe Catalogue: https://catalogue.arsinoe-project.eu/dataset/my-dataset

Flow rates and water levels of Aliakmonas river measured at the Niseli station during the period from 2018 to 2021

In [4]:
# Read the Niseli station measurements (flow rate and water level) from the workbook
df_cs = pd.read_excel(
    'Data/7.Case_Studies_Indicators/CS6-aliakmonas_niseli_2018_2021.xlsx',
    sheet_name='Aliakmonas_NIseli_2018_2021',
)

In [5]:
df_cs

Unnamed: 0,Date,Time,Aliakmonas_Discharge (m³/s),WATER LEVEL RQ30 (m)
0,2018-01-01 00:00:00,00:00:00,*,0.78
1,2018-01-01 00:00:00,00:30:00,1026853,0.79
2,2018-01-01 00:00:00,01:00:00,1054468,0.8
3,2018-01-01 00:00:00,01:30:00,1057046,0.81
4,2018-01-01 00:00:00,02:00:00,1060175,0.81
...,...,...,...,...
64779,2021-09-09 00:00:00,09:00:00,2857558,1.41
64780,2021-09-09 00:00:00,09:30:00,2810348,1.4
64781,2021-09-09 00:00:00,10:00:00,2781152,1.38
64782,2021-09-09 00:00:00,10:30:00,2736600,1.37


The dataset has a frequency of 30 minutes. Before being introduced into the SustainGraph, the data are aggregated to daily frequency.

For water discharge (flow rate), the daily value is calculated as the average of the 30-minute values of that day.

For water level, the daily value is calculated as the average of the 30-minute values of that day.

In [6]:
# Make sure the measurement columns are numeric; anything that is not a number
# (e.g. the '*' placeholder in the raw data) becomes NaN.
# The columns are addressed by name and replaced as whole columns, so they receive a
# numeric dtype. Assigning through ``df_cs.iloc[:, 2:] = ...`` writes into the existing
# columns (which can stay object-dtype) and silently depends on the column order.
# NOTE(review): the discharge values displayed (around 1e6 in a column labelled m³/s)
# look implausibly large for a river - possibly a lost decimal separator; confirm
# with the data provider before relying on the imported values.
value_columns = ['Aliakmonas_Discharge (m³/s)', 'WATER LEVEL RQ30 (m)']
for value_column in value_columns:
    df_cs[value_column] = pd.to_numeric(df_cs[value_column], errors='coerce')
# Convert column Date to datetime
df_cs['Date'] = pd.to_datetime(df_cs['Date'])
df_cs

Unnamed: 0,Date,Time,Aliakmonas_Discharge (m³/s),WATER LEVEL RQ30 (m)
0,2018-01-01,00:00:00,,0.78
1,2018-01-01,00:30:00,1026853.0,0.79
2,2018-01-01,01:00:00,1054468.0,0.8
3,2018-01-01,01:30:00,1057046.0,0.81
4,2018-01-01,02:00:00,1060175.0,0.81
...,...,...,...,...
64779,2021-09-09,09:00:00,2857558.0,1.41
64780,2021-09-09,09:30:00,2810348.0,1.4
64781,2021-09-09,10:00:00,2781152.0,1.38
64782,2021-09-09,10:30:00,2736600.0,1.37


In [7]:
# Daily mean of both measurement columns: one output row per distinct Date value.
# NOTE(review): the raw table shown earlier ends on 2021-09-09, yet the aggregated table
# ends on 2021-12-07 - possibly some dates are stored day/month-swapped or out of
# order in the sheet; verify against the source file.
daily_value_columns = ['Aliakmonas_Discharge (m³/s)', 'WATER LEVEL RQ30 (m)']
rows_per_day = df_cs.groupby('Date')
df_cs_avg = rows_per_day[daily_value_columns].mean().reset_index()
df_cs_avg

Unnamed: 0,Date,Aliakmonas_Discharge (m³/s),WATER LEVEL RQ30 (m)
0,2018-01-01,870199.702128,0.727292
1,2018-01-02,1610447.6875,1.03
2,2018-01-03,11780701.645833,2.870833
3,2018-01-04,21320410.0,3.800625
4,2018-01-05,3083156.354167,1.523125
...,...,...,...
1343,2021-12-04,2176023.354167,1.260417
1344,2021-12-05,2790266.583333,1.415833
1345,2021-12-06,5847521.083333,2.025417
1346,2021-12-07,2364654.770833,1.315417


In [8]:
# Days for which no daily discharge average is available
discharge_is_missing = df_cs_avg['Aliakmonas_Discharge (m³/s)'].isna()
df_cs_avg[discharge_is_missing]

Unnamed: 0,Date,Aliakmonas_Discharge (m³/s),WATER LEVEL RQ30 (m)
173,2018-06-23,,
174,2018-06-24,,
175,2018-06-25,,
176,2018-06-26,,
177,2018-06-27,,
1277,2021-07-22,,
1278,2021-07-23,,
1279,2021-07-24,,
1280,2021-07-25,,
1281,2021-07-26,,


In [None]:

# Parameters for the two indicators (and their series) of the Niseli station dataset
params_inds = [{
    'ind_code':'aliakmonas_discharge',
    'ind_descr':'Water discharge in m3/s for Aliakmonas river measured at the Niseli station during the period from 2018 to 2021',
    'series_code':'aliakmonas_discharge',
    'series_descr':'Water discharge in m3/s for Aliakmonas river measured at the Niseli station during the period from 2018 to 2021 provided by Institute of Soil and Water Resources',
    'dataProviderURL':'https://catalogue.arsinoe-project.eu/dataset/my-dataset'
    },
    {
    'ind_code':'aliakmonas_water_level',
    'ind_descr':'Water level in m of Aliakmonas river measured at the Niseli station with during the period from 2018 to 2021',
    'series_code':'aliakmonas_water_level',
    'series_descr':'Water level in m of Aliakmonas river measured at the Niseli station during the period from 2018 to 2021 provided by Institute of Soil and Water Resources',
    'dataProviderURL':'https://catalogue.arsinoe-project.eu/dataset/my-dataset'
    }]
# Open the session on the configured database (as the connectivity check does);
# a session opened without ``database=`` targets the server's default database.
with driver.session(database=database_name) as session:
    for params_ind in params_inds:
        session.execute_write(write_data, query_indicators_series, params_ind)

In [None]:
# Build the observation payloads from the daily averages, one dict per day that has a
# value: discharge is rounded to 1 decimal, water level to 2 decimals.
aliakmonas_discharge = [
    {'date': day, 'value': round(discharge, 1)}
    for day, discharge in zip(df_cs_avg['Date'], df_cs_avg['Aliakmonas_Discharge (m³/s)'])
    if not pd.isna(discharge)
]
aliakmonas_water_level = [
    {'date': day, 'value': round(level, 2)}
    for day, level in zip(df_cs_avg['Date'], df_cs_avg['WATER LEVEL RQ30 (m)'])
    if not pd.isna(level)
]

# Metadata (attribute / dimension codes) and observations for each series
discharge_params = {
    'ind_code':'aliakmonas_discharge',
    'series_code':'aliakmonas_discharge',
    'attr_code':'M3_PER_SEC',
    'attr_descr':'Cubic metres per second',
    'dim_code' : 'D_AVG',
    'dim_descr' : 'Daily average (30 min interval)',
    'observations': aliakmonas_discharge
}
water_level_params = {
    'ind_code':'aliakmonas_water_level',
    'series_code':'aliakmonas_water_level',
    'attr_code':'M|RQ30',
    'attr_descr':'Meters|RQ-30 sensor',
    'dim_code' : 'D_AVG',
    'dim_descr' : 'Daily average (30 min interval)',
    'observations':aliakmonas_water_level
}
params_inds = [discharge_params, water_level_params]

In [None]:
#Niseli Station is at Imathia (EUcode: EL521)
# Cypher statement that attaches metadata and observations to an existing series.
# Expected parameters: $ind_code, $series_code, $attr_code, $attr_descr, $dim_code,
# $dim_descr and $observations (a list of maps with keys `date` and `value`).
# Steps:
#   1. MATCH the Indicator, the Series and the GeoArea with EUcode 'EL521';
#      if any of them is missing, no rows are produced and nothing is written.
#   2. MERGE the SeriesMetadata node and link it to the Series.
#   3. For every entry of $observations, MERGE the HAS_OBSERVATION relationship together
#      with its Observation node (keyed by date), then set the observation value.
#   4. Link each Observation and the Indicator to the GeoArea.
#   5. Return the number of distinct Observation nodes touched as `total`.
statement = """ 
    MATCH (i:Indicator{code:$ind_code}), (s:Series{code:$series_code}),(ga:GeoArea{EUcode: 'EL521'})
    MERGE (sm:SeriesMetadata{attributesCode:$attr_code,attributesDescription:$attr_descr,dimensionsCode:$dim_code,
    dimensionsDescription:$dim_descr,seriesCode:$series_code})
    MERGE (s)-[:HAS_METADATA]->(sm)
    WITH i, sm, ga
    UNWIND $observations as obs
    MERGE (sm)-[:HAS_OBSERVATION{attributesCode:$attr_code,dimensionsCode:$dim_code,
                                seriesCode:$series_code,time:date(obs.date),
                                geoCode:'EL521'}]->(o:Observation{time:date(obs.date)})
    SET o.value = obs.value
    MERGE (o)-[:REFERS_TO_AREA]->(ga)
    MERGE (i) -[:HAS_OBSERVATIONS]->(ga)
    RETURN COUNT(DISTINCT o) as total

"""
def write_data_obs(tx,statement, parameters):
    """Execute ``statement`` in the given transaction and return its ``total`` column.

    Args:
        tx: Transaction object, as handed over by ``session.execute_write``.
        statement: Cypher text whose first result row contains a ``total`` field.
        parameters: Query parameters referenced by ``statement``.

    Returns:
        The ``total`` value of the first result row.
    """
    result = tx.run(statement, parameters=parameters)
    first_row = result.data()[0]
    return first_row['total']

In [12]:
print('Data observations to import ', len(aliakmonas_discharge))
print('Data observations to import ', len(aliakmonas_water_level))
# Open the session on the configured database (as the connectivity check does);
# a session opened without ``database=`` targets the server's default database.
with driver.session(database=database_name) as session:
    for params_ind in params_inds:
        # write_data_obs returns the number of distinct observations merged for the series
        obs_data = session.execute_write(write_data_obs, statement, parameters = params_ind)
        print('Imported data: ', obs_data)

Data observations to import  1338
Data observations to import  1338
Imported data:  1338
Imported data:  1338


# CS6 dataset with Aliakmon river daily data

In [4]:
df_cs = pd.read_excel('Data/7.Case_Studies_Indicators/Arsinoe_Aliakmon_DT_daily_data.xlsx', sheet_name='daily_data')
df_cs['Date'] = pd.to_datetime(df_cs['Date'])
print('Length of original dataframe: ', len(df_cs))

# Keep only rows dated on or before the cutoff; rows without a date are dropped as well,
# because a missing date never compares as <= cutoff.
# 'Date' is already parsed above, so it is filtered directly. The former second
# pd.to_datetime(..., dayfirst=True, errors='coerce') pass into a temporary column could
# not change values that were already datetimes.
# NOTE(review): if day-first parsing of text dates was intended, it has to be applied in
# the first pd.to_datetime call - confirm against the sheet's date format.
cutoff = pd.Timestamp('2026-01-15')
df_cs = df_cs[df_cs['Date'] <= cutoff].copy()
print('Length of filtered dataframe: ', len(df_cs))

Length of original dataframe:  386
Length of filtered dataframe:  380


In [5]:
df_cs

Unnamed: 0,Date,ila_in_Sim,ila_in_Obs,ila_out_Obs,IL_elev_Sim,IL_elev_Obs,Pol_in_Sim,Pol_in_Obs,Pol_out_Obs,Pol_elev_Sim,...,C_Q3,C_max,D_std,D_min,D_Q1,D_Q2,D_avg,D_Q3,D_max,D_std2
0,2025-01-01,45.021851,45.020000,87.190002,379.570007,379.570007,93.061806,93.070000,32.060001,282.160004,...,,,,,,,,,,
1,2025-01-02,39.468590,39.459999,76.620003,379.029999,379.029999,83.301712,83.309998,10.950000,282.260010,...,,,,,,,,,,
2,2025-01-03,40.030651,40.049999,81.150002,378.440002,378.440002,89.957710,89.970001,35.810001,282.350006,...,,,,,,,,,,
3,2025-01-04,38.803192,38.810001,87.639999,377.730011,377.730011,97.932526,97.930000,21.200001,282.459991,...,,,,,,,,,,
4,2025-01-05,38.439011,38.459999,80.870003,377.109985,377.109985,90.555107,90.559998,16.370001,282.570007,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2026-01-11,151.213425,238.777130,152.045227,385.922821,390.427673,155.562073,163.372070,34.969723,286.979523,...,,,,,,,,,,
376,2026-01-12,142.262802,298.977478,165.878357,385.799530,391.567413,169.276260,174.355820,63.108017,287.131165,...,,,,,,,,,,
377,2026-01-13,131.921585,222.534515,156.840469,385.555298,392.103790,160.646835,158.074219,64.187180,287.266205,...,,,,,,,,,,
378,2026-01-14,123.324982,,78.662933,385.653961,,83.852821,,7.140000,287.381561,...,,,,,,,,,,


In [6]:
# Check if there are duplicate columns.
# pandas renames repeated headers while reading ('X', 'X.1', 'X.2', ...), so exact
# duplicates can no longer be seen with columns.duplicated() alone. A column whose name
# is '<existing column>.<number>' is therefore reported as a (renamed) duplicate too.
column_names = {str(col) for col in df_cs.columns}
duplicate_columns = df_cs.columns[df_cs.columns.duplicated()].tolist()
for col in df_cs.columns:
    base_name, dot, suffix = str(col).rpartition('.')
    is_renamed_duplicate = bool(dot) and suffix.isdigit() and base_name in column_names
    if is_renamed_duplicate and col not in duplicate_columns:
        duplicate_columns.append(col)
duplicate_columns

[]

In [7]:
# Sanity check of the standard-deviation column names in the data sheet
present_columns = set(df_cs.columns)
c_std_missing = 'C_std' not in present_columns
d_std2_present = 'D_std2' in present_columns
if c_std_missing:
    print('C_std is missing')
if d_std2_present:
    print('D_std2 is a typo')

C_std is missing
D_std2 is a typo


In [8]:
# Rename D_std to C_std and D_std2 to D_std.
# The rename is only applied while the misnamed 'D_std2' column is still present.
# Otherwise re-running this cell would rename the already-corrected 'D_std' column to
# 'C_std' as well, leaving two 'C_std' columns and no 'D_std'.
if 'D_std2' in df_cs.columns:
    df_cs.rename(columns={'D_std':'C_std', 'D_std2':'D_std'}, inplace=True)
df_cs

Unnamed: 0,Date,ila_in_Sim,ila_in_Obs,ila_out_Obs,IL_elev_Sim,IL_elev_Obs,Pol_in_Sim,Pol_in_Obs,Pol_out_Obs,Pol_elev_Sim,...,C_Q3,C_max,C_std,D_min,D_Q1,D_Q2,D_avg,D_Q3,D_max,D_std
0,2025-01-01,45.021851,45.020000,87.190002,379.570007,379.570007,93.061806,93.070000,32.060001,282.160004,...,,,,,,,,,,
1,2025-01-02,39.468590,39.459999,76.620003,379.029999,379.029999,83.301712,83.309998,10.950000,282.260010,...,,,,,,,,,,
2,2025-01-03,40.030651,40.049999,81.150002,378.440002,378.440002,89.957710,89.970001,35.810001,282.350006,...,,,,,,,,,,
3,2025-01-04,38.803192,38.810001,87.639999,377.730011,377.730011,97.932526,97.930000,21.200001,282.459991,...,,,,,,,,,,
4,2025-01-05,38.439011,38.459999,80.870003,377.109985,377.109985,90.555107,90.559998,16.370001,282.570007,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,2026-01-11,151.213425,238.777130,152.045227,385.922821,390.427673,155.562073,163.372070,34.969723,286.979523,...,,,,,,,,,,
376,2026-01-12,142.262802,298.977478,165.878357,385.799530,391.567413,169.276260,174.355820,63.108017,287.131165,...,,,,,,,,,,
377,2026-01-13,131.921585,222.534515,156.840469,385.555298,392.103790,160.646835,158.074219,64.187180,287.266205,...,,,,,,,,,,
378,2026-01-14,123.324982,,78.662933,385.653961,,83.852821,,7.140000,287.381561,...,,,,,,,,,,


In [9]:
# Read the indicator metadata sheet (the first two rows of the sheet are skipped),
# drop the first column (NODE) by position and give the two name columns clearer labels.
metadata_column_labels = {
    "Datasheet_Column_Full Names.1": "Indicator_Codes",
    "Datasheet_Column_Full Names": "Indicator_Descriptions",
}
df_cs_metadata = (
    pd.read_excel(
        'Data/7.Case_Studies_Indicators/Arsinoe_Aliakmon_DT_daily_data.xlsx',
        sheet_name='InfoSheet',
        skiprows=2,
    )
    .iloc[:, 1:]
    .rename(columns=metadata_column_labels)
)
df_cs_metadata

Unnamed: 0,Indicator_Descriptions,Indicator_Codes,Type,Variable,Units,Diagram,Map Pin 1 Name,Map Pin 2 Name (Monitoring Stations separately),Comments
0,Ilarion Reservoir Inflows Simulated,ila_in_Sim,Simulation,flow-rate,m3/s,1,Ilarion Reservoir,,Mean daily values
1,Ilarion Reservoir Inflows Observed,ila_in_Obs,Observation,flow-rate,m3/s,1,Ilarion Reservoir,,Mean daily values
2,Ilarion Reservoir Outflows Observed,ila_out_Obs,Observation,flow-rate,m3/s,1,Ilarion Reservoir,,Mean daily values. Estimated from energy produ...
3,Ilarion Reservoir Pool Elevation Simulated,IL_elev_Sim,Simulation,elevation,m,1,Ilarion Reservoir,,Mean daily values
4,Ilarion Reservoir Pool Elevation Observed,IL_elev_Obs,Observation,elevation,m,1,Ilarion Reservoir,,Mean daily values
5,Polifitos Reservoir Inflows Simulated,Pol_in_Sim,Simulation,flow-rate,m3/s,2,Polyfytos Reservoir,,Mean daily values
6,Polifitos Reservoir Inflows Observed,Pol_in_Obs,Observation,flow-rate,m3/s,2,Polyfytos Reservoir,,Mean daily values
7,Polifitos Reservoir Outflows Observed,Pol_out_Obs,Observation,flow-rate,m3/s,2,Polyfytos Reservoir,,Mean daily values. Estimated from energy produ...
8,Polifitos Reservoir Pool Elevation Simulated,Pol_elev_Sim,Simulation,elevation,m,2,Polyfytos Reservoir,,instant value at 24:00
9,Polifitos Reservoir Pool Elevation Observed,Pol_elev_Obs,Observation,elevation,m,2,Polyfytos Reservoir,,instant value at 24:00


In [10]:
# Check if there are duplicate indicator codes in the Indicator_Codes column and,
# for each duplicated code, show the descriptions of all rows that use it
metadata_codes = df_cs_metadata['Indicator_Codes']
duplicate_indicator_codes = metadata_codes[metadata_codes.duplicated()].tolist()
for code in duplicate_indicator_codes:
    print(code)
    rows_with_code = metadata_codes == code
    print(df_cs_metadata.loc[rows_with_code, 'Indicator_Descriptions'])

D_std
43    RiverMon_C Monitoring Station_stdev
50    RiverMon_D Monitoring Station_stdev
Name: Indicator_Descriptions, dtype: object


In [11]:
# The row described as 'RiverMon_C Monitoring Station_stdev' carries the duplicate code
# 'D_std'; give it the code 'C_std' instead, then repeat the duplicate check.
is_station_c_stdev = df_cs_metadata['Indicator_Descriptions'] == 'RiverMon_C Monitoring Station_stdev'
df_cs_metadata.loc[is_station_c_stdev, 'Indicator_Codes'] = 'C_std'
corrected_codes = df_cs_metadata['Indicator_Codes']
duplicate_indicator_codes = corrected_codes[corrected_codes.duplicated()].tolist()
duplicate_indicator_codes

[]

In [12]:
# Lookup tables translating metadata sheet values into the short codes that are joined
# into the attribute / dimension codes of each series. They are indexed with [...], so
# a sheet value that is missing here raises KeyError when the parameters are built.

# 'Type' column value -> short type code
type_mapping = {
    'Simulation': 'Sim',
    'Observation':'Obs', 
    'Monitoring': 'Mon',
}
# 'Variable' column value -> variable code
variables_mapping = {
    'flow-rate': 'flow_rate',
    'elevation': 'elevation',
    'flow depth': 'flow_depth'
}
# 'Units' column value -> spelled-out unit name (used as the unit part of the attribute code)
units_mapping = {
    'm3/s': 'Cubic metres per second',  
    'm': 'Metres',
}
# 'Comments' column value (full text) -> short comment code
comments_mapping = {
    'Mean daily values':'MDV',
    'Mean daily values. Estimated from energy production':'MDV_Estimated',
    'instant value at 24:00': 'IV_24h',
    'Mean daily values. Estimated from in-route of Channel A0 small hydropower station "Makrochori_1" energy production': 'MDV_Estimated_Makrochori_1',
    'Inflow to Aliakmon River from Almopaios': 'Inflow_Almopaios',
    'This is the same as "24. Agia Varvara Reservoir Outflows to Aliakmon Simulated"': 'Agia_Varvara_Reservoir_Outflows',
    'Junction of Almopaios and Aliakmon downstream of Agia Varvara': 'Junction_Almopaios_Aliakmon',
    "Monitoring stations' data include 7 daily stats (min, Q1, Q2, mean, Q3, max, stdev) of flow depth, derived from 10min interval measured data": 'Monitoring_Stations_7_Daily_Stats',
}


# 'Map Pin 1 Name' column value -> pin code (spaces replaced by underscores)
pin_mapping = {
    'Ilarion Reservoir': 'Ilarion_Reservoir',
 'Polyfytos Reservoir': 'Polyfytos_Reservoir',
 'Sfikia Reservoir': 'Sfikia_Reservoir',
 'Asomata Reservoir': 'Asomata_Reservoir',
 'Agia Varvara Reservoir': 'Agia_Varvara_Reservoir',
 'Almopaios R. Discharge':  'Almopaios_R_Discharge',
 'Agia Varvara Downstream': 'Agia_Varvara_Downstream',
 'Kouloura Junction':  'Kouloura_Junction',
 'Rapsomaniki Junction':  'Rapsomaniki_Junction',
 'Niselli Bridge Discharge': 'Niselli_Bridge_Discharge'
}


In [13]:
df_cs_metadata

Unnamed: 0,Indicator_Descriptions,Indicator_Codes,Type,Variable,Units,Diagram,Map Pin 1 Name,Map Pin 2 Name (Monitoring Stations separately),Comments
0,Ilarion Reservoir Inflows Simulated,ila_in_Sim,Simulation,flow-rate,m3/s,1,Ilarion Reservoir,,Mean daily values
1,Ilarion Reservoir Inflows Observed,ila_in_Obs,Observation,flow-rate,m3/s,1,Ilarion Reservoir,,Mean daily values
2,Ilarion Reservoir Outflows Observed,ila_out_Obs,Observation,flow-rate,m3/s,1,Ilarion Reservoir,,Mean daily values. Estimated from energy produ...
3,Ilarion Reservoir Pool Elevation Simulated,IL_elev_Sim,Simulation,elevation,m,1,Ilarion Reservoir,,Mean daily values
4,Ilarion Reservoir Pool Elevation Observed,IL_elev_Obs,Observation,elevation,m,1,Ilarion Reservoir,,Mean daily values
5,Polifitos Reservoir Inflows Simulated,Pol_in_Sim,Simulation,flow-rate,m3/s,2,Polyfytos Reservoir,,Mean daily values
6,Polifitos Reservoir Inflows Observed,Pol_in_Obs,Observation,flow-rate,m3/s,2,Polyfytos Reservoir,,Mean daily values
7,Polifitos Reservoir Outflows Observed,Pol_out_Obs,Observation,flow-rate,m3/s,2,Polyfytos Reservoir,,Mean daily values. Estimated from energy produ...
8,Polifitos Reservoir Pool Elevation Simulated,Pol_elev_Sim,Simulation,elevation,m,2,Polyfytos Reservoir,,instant value at 24:00
9,Polifitos Reservoir Pool Elevation Observed,Pol_elev_Obs,Observation,elevation,m,2,Polyfytos Reservoir,,instant value at 24:00


In [14]:
# Build, for every data column except 'Date':
#   * the parameters for creating its Indicator/Series nodes (params_ind_series_list)
#   * the parameters for its metadata and observations (params_metadata_obs_list)
params_ind_series_list = []
params_metadata_obs_list = []
indicator_codes = [col for col in df_cs.columns if col != 'Date']
for code in indicator_codes:
    print(code)
    # Look the metadata row up once per indicator; the first matching row is used.
    code_rows = df_cs_metadata[df_cs_metadata['Indicator_Codes'] == code]
    if code_rows.empty:
        print(f"Warning: Indicator code '{code}' not found in metadata. Skipping...")
        continue
    meta = code_rows.iloc[0]
    description = meta['Indicator_Descriptions']

    # Parameters for creating the Indicator and Series nodes
    params_ind_series = {
        'ind_code': code,
        'ind_descr': description,
        'series_code': code,
        'series_descr': description,
        'dataProviderURL': 'ARSINOE Case Study 6',
    }
    params_ind_series_list.append(params_ind_series)

    # Translate the metadata values into codes. The mappings are indexed directly, so an
    # unmapped (or missing) sheet value raises KeyError instead of being imported silently.
    # The local name is ``ind_type`` so that the builtin ``type`` is not shadowed for the
    # rest of the notebook.
    ind_type = meta['Type']
    type_code = type_mapping[ind_type]
    variable = meta['Variable']
    variable_code = variables_mapping[variable]
    unit = meta['Units']
    unit_code = units_mapping[unit]
    map_pin_1 = meta['Map Pin 1 Name']
    map_pin_1_code = pin_mapping[map_pin_1]
    # The second pin name is used as-is (no mapping) in both code and description
    map_pin_2 = meta['Map Pin 2 Name (Monitoring Stations separately)']
    comment = meta['Comments']
    comment_code = comments_mapping[comment]

    # Join only non-NaN, non-None values
    attr_code = "|".join(str(x) for x in [type_code, variable_code, unit_code] if pd.notna(x))
    attr_descr = "|".join(str(x) for x in [ind_type, variable, unit] if pd.notna(x))
    dim_code = "|".join(str(x) for x in [comment_code, map_pin_1_code, map_pin_2] if pd.notna(x))
    dim_descr = "|".join(str(x) for x in [comment, map_pin_1, map_pin_2] if pd.notna(x))

    # One observation per row that has a value for this indicator, rounded to 3 decimals.
    # Iterating the two columns directly avoids building a Series per row with iterrows().
    observations = [
        {'date': obs_date, 'value': round(obs_value, 3)}
        for obs_date, obs_value in zip(df_cs['Date'], df_cs[code])
        if not pd.isna(obs_value)
    ]

    params_metadata_obs = {
        'ind_code': code,
        'series_code': code,
        'attr_code': attr_code,
        'attr_descr': attr_descr,
        'dim_code': dim_code,
        'dim_descr': dim_descr,
        'observations': observations
    }
    params_metadata_obs_list.append(params_metadata_obs)

ila_in_Sim
ila_in_Obs
ila_out_Obs
IL_elev_Sim
IL_elev_Obs
Pol_in_Sim
Pol_in_Obs
Pol_out_Obs
Pol_elev_Sim
Pol_elev_Obs
Sfi_elev_Sim
Sfi_elev_Obs
Aso_elev_Sim
Aso_elev_Obs
Var_out_A0_Obs
Var_out_Aliak_Sim
Almo
Var_down_Aliak
Koul
Rap_in_A0
Rap_out_Aliak
Rap_out_SKG
Nis_Sim
A_min
A_Q1
A_Q2
A_avg
A_Q3
A_max
A_std
B_min
B_Q1
B_Q2
B_avg
B_Q3
B_max
B_std
C_min
C_Q1
C_Q2
C_avg
C_Q3
C_max
C_std
D_min
D_Q1
D_Q2
D_avg
D_Q3
D_max
D_std


In [15]:
# Introduce each indicator and its series into the graph, one query per indicator
for params_ind in params_ind_series_list:
    records = driver.execute_query(
        query_indicators_series,
        parameters_=params_ind,
        database_=database_name,
    ).records
    

In [16]:
# Fetch the codes of all indicators linked to the 'TPS' source and compare them with
# the codes that were just sent for import.
query_validate = """
    MATCH (i:Indicator)-[:COMES_FROM]->(:Source {name: 'TPS'})
    RETURN i.code AS code
"""

records = driver.execute_query(query_validate, database_=database_name).records

imported_codes = set()
for record in records:
    imported_codes.add(record["code"])

expected_codes = set()
for item in params_ind_series_list:
    expected_codes.add(item["ind_code"])

# Codes that were expected but are not present in the database
missing = expected_codes.difference(imported_codes)
print(f"Imported: {expected_codes & imported_codes}")
print(f"Missing: {missing}")


Imported: {'C_std', 'Rap_out_Aliak', 'Almo', 'B_Q2', 'D_min', 'A_avg', 'Rap_out_SKG', 'Pol_in_Obs', 'Pol_out_Obs', 'D_Q1', 'C_Q1', 'Pol_in_Sim', 'D_Q3', 'A_min', 'ila_in_Sim', 'Aso_elev_Obs', 'IL_elev_Obs', 'C_min', 'C_max', 'ila_out_Obs', 'B_avg', 'A_max', 'B_Q3', 'A_Q2', 'Pol_elev_Sim', 'A_std', 'D_max', 'C_avg', 'Var_out_Aliak_Sim', 'B_max', 'C_Q2', 'Sfi_elev_Sim', 'B_Q1', 'Pol_elev_Obs', 'Nis_Sim', 'B_min', 'Aso_elev_Sim', 'B_std', 'Koul', 'C_Q3', 'A_Q3', 'D_Q2', 'D_std', 'A_Q1', 'Var_down_Aliak', 'Rap_in_A0', 'IL_elev_Sim', 'ila_in_Obs', 'Sfi_elev_Obs', 'Var_out_A0_Obs', 'D_avg'}
Missing: set()


In [17]:
#All the Pins are located at Imathia (EUcode: EL521)
# Cypher statement that attaches metadata and observations to an existing series.
# Expected parameters: $ind_code, $series_code, $attr_code, $attr_descr, $dim_code,
# $dim_descr and $observations (a list of maps with keys `date` and `value`).
# Steps:
#   1. MATCH the Indicator, the Series and the GeoArea with EUcode 'EL521';
#      if any of them is missing, no rows are produced and nothing is written.
#   2. MERGE the SeriesMetadata node and link it to the Series.
#   3. For every entry of $observations, MERGE the HAS_OBSERVATION relationship together
#      with its Observation node (keyed by date), then set the observation value.
#      An empty $observations list produces no rows, so the returned total is 0.
#   4. Link each Observation and the Indicator to the GeoArea.
#   5. Return the number of distinct Observation nodes touched as `total`.
statement = """ 
    MATCH (i:Indicator{code:$ind_code}), (s:Series{code:$series_code}),(ga:GeoArea{EUcode: 'EL521'})
    MERGE (sm:SeriesMetadata{attributesCode:$attr_code,attributesDescription:$attr_descr,dimensionsCode:$dim_code,
    dimensionsDescription:$dim_descr,seriesCode:$series_code})
    MERGE (s)-[:HAS_METADATA]->(sm)
    WITH i, sm, ga
    UNWIND $observations as obs
    MERGE (sm)-[:HAS_OBSERVATION{attributesCode:$attr_code,dimensionsCode:$dim_code,
                                seriesCode:$series_code,time:date(obs.date),
                                geoCode:'EL521'}]->(o:Observation{time:date(obs.date)})
    SET o.value = obs.value
    MERGE (o)-[:REFERS_TO_AREA]->(ga)
    MERGE (i) -[:HAS_OBSERVATIONS]->(ga)
    RETURN COUNT(DISTINCT o) as total

"""
# Transaction-function variant, kept commented out (not used by the cells of this section):
# def write_data_obs(tx,statement, parameters):
#     records = tx.run(statement, parameters= parameters)
#     return records.data()[0]['total']


In [18]:
# List the indicator codes queued for the metadata/observation import, one per line
for params_ind in params_metadata_obs_list:
    queued_code = params_ind['ind_code']
    print(queued_code)

ila_in_Sim
ila_in_Obs
ila_out_Obs
IL_elev_Sim
IL_elev_Obs
Pol_in_Sim
Pol_in_Obs
Pol_out_Obs
Pol_elev_Sim
Pol_elev_Obs
Sfi_elev_Sim
Sfi_elev_Obs
Aso_elev_Sim
Aso_elev_Obs
Var_out_A0_Obs
Var_out_Aliak_Sim
Almo
Var_down_Aliak
Koul
Rap_in_A0
Rap_out_Aliak
Rap_out_SKG
Nis_Sim
A_min
A_Q1
A_Q2
A_avg
A_Q3
A_max
A_std
B_min
B_Q1
B_Q2
B_avg
B_Q3
B_max
B_std
C_min
C_Q1
C_Q2
C_avg
C_Q3
C_max
C_std
D_min
D_Q1
D_Q2
D_avg
D_Q3
D_max
D_std


In [19]:
# Import metadata and observations for every indicator.
# driver.execute_query() manages its own session and transaction, so no explicit
# session is opened here (the former ``with driver.session(...)`` block was never used).
for params_ind in params_metadata_obs_list:
    result = driver.execute_query(
        statement,
        params_ind,
        database_=database_name
    )
    # The result bundles (records, summary, keys) and is therefore always truthy;
    # the emptiness check has to be made on the record list itself.
    records = result.records
    obs_data = records[0]['total'] if records else 0
    print('Imported data: ', obs_data)

Imported data:  380
Imported data:  378
Imported data:  380
Imported data:  380
Imported data:  378
Imported data:  380
Imported data:  378
Imported data:  380
Imported data:  380
Imported data:  378
Imported data:  380
Imported data:  378
Imported data:  380
Imported data:  378
Imported data:  380
Imported data:  380
Imported data:  380
Imported data:  380
Imported data:  380
Imported data:  380
Imported data:  380
Imported data:  380
Imported data:  380
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  18
Imported data:  18
Imported data:  18
Imported data:  18
Imported data:  18
Imported data:  18
Imported data:  18
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0
Imported data:  0


In [20]:
indicator_codes

['ila_in_Sim',
 'ila_in_Obs',
 'ila_out_Obs',
 'IL_elev_Sim',
 'IL_elev_Obs',
 'Pol_in_Sim',
 'Pol_in_Obs',
 'Pol_out_Obs',
 'Pol_elev_Sim',
 'Pol_elev_Obs',
 'Sfi_elev_Sim',
 'Sfi_elev_Obs',
 'Aso_elev_Sim',
 'Aso_elev_Obs',
 'Var_out_A0_Obs',
 'Var_out_Aliak_Sim',
 'Almo',
 'Var_down_Aliak',
 'Koul',
 'Rap_in_A0',
 'Rap_out_Aliak',
 'Rap_out_SKG',
 'Nis_Sim',
 'A_min',
 'A_Q1',
 'A_Q2',
 'A_avg',
 'A_Q3',
 'A_max',
 'A_std',
 'B_min',
 'B_Q1',
 'B_Q2',
 'B_avg',
 'B_Q3',
 'B_max',
 'B_std',
 'C_min',
 'C_Q1',
 'C_Q2',
 'C_avg',
 'C_Q3',
 'C_max',
 'C_std',
 'D_min',
 'D_Q1',
 'D_Q2',
 'D_avg',
 'D_Q3',
 'D_max',
 'D_std']

In [21]:
# Compare, per indicator, the number of observations stored in the graph with the number
# of non-missing values in the DataFrame, and report every mismatch.
# The query does not depend on the loop variable, so it is defined once.
query_check = '''
    MATCH (i:Indicator {code: $ind_code})-[:HAS_SERIES]->(s:Series {code: $series_code})-[:HAS_METADATA]->(sm:SeriesMetadata)
    MATCH (sm)-[:HAS_OBSERVATION]->(o:Observation)
    RETURN COUNT(o) AS total
    '''

for ind_code in indicator_codes:
    result = driver.execute_query(
        query_check,
        {'ind_code': ind_code, 'series_code': ind_code},
        database_=database_name
    )

    # The result bundles (records, summary, keys) and is therefore always truthy;
    # the emptiness check has to be made on the record list itself.
    check_records = result.records
    count = check_records[0]['total'] if check_records else 0

    expected_count = len(df_cs[ind_code].dropna())
    if count != expected_count:
        print(f'Mismatch for indicator {ind_code}: {count} in DB vs {expected_count} in DataFrame')
