In [2]:
from neo4j import GraphDatabase, basic_auth
import pandas as pd
import numpy as np
import time
import os
from dotenv import load_dotenv
from pathlib import Path

In [5]:
#load the environment variables
dotenv_path = Path('/Users/bunyasit/dev/sustaingraph/.env')
load_dotenv(dotenv_path=dotenv_path)  # This line brings all environment variables from .env into os.environ

# Get variables
SUSTAINGRAPH_URI = os.getenv('SUSTAINGRAPH_URI')
SUSTAINGRAPH_USER = os.getenv('SUSTAINGRAPH_USER')
SUSTAINGRAPH_PASSWORD = os.getenv('SUSTAINGRAPH_PASSWORD')
database_name = os.getenv('DATABASE_NAME')

# Connect to database
driver = GraphDatabase.driver(SUSTAINGRAPH_URI, auth=(SUSTAINGRAPH_USER, SUSTAINGRAPH_PASSWORD))

# Verify connectivity
with driver.session(database=database_name) as session:
    print(session.run("RETURN 'Connected to ' + $db", db=database_name).single()[0])

Connected to neo4j


### Constraints

Before importing any data we add a Unique node property constraint on the properties _M49code_ , _ISOalpha2code_, _ISOalpha3code_ , _EUcode_ of _GeoArea_ (and therefore an index too) for data integrity and better query performance. A type constrant is also added on properties _name_ and the aforementioned codes. 


In [6]:
def create_constraint(tx,statement):
    tx.run(statement)

constraints = [
    """CREATE CONSTRAINT geoArea_M49code IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.M49code IS UNIQUE""",
    """CREATE CONSTRAINT geoArea_ISO2code IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.ISOalpha2code IS UNIQUE""",
    """CREATE CONSTRAINT geoArea_ISO3code IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.ISOalpha3code IS UNIQUE""",
    """CREATE CONSTRAINT geoArea_EUcode IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.EUcode IS UNIQUE""",
    """CREATE CONSTRAINT geoArea_M49code_type IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.M49code IS :: STRING""",
    """CREATE CONSTRAINT geoArea_ISO2code_type IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.ISOalpha2code IS :: STRING""",
    """CREATE CONSTRAINT geoArea_ISO3code_type IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.ISOalpha3code IS :: STRING""",
    """CREATE CONSTRAINT geoArea_EUcode_type IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.EUcode IS :: STRING""",
    """CREATE CONSTRAINT geoArea_name_type IF NOT EXISTS FOR (ga:GeoArea) REQUIRE ga.name IS :: STRING"""
]

with driver.session(database=database_name) as session:
    for statement_constraint in constraints:
        session.execute_write(create_constraint, statement_constraint)

### Write batch function

In [7]:
def write_batch(tx,statement, params_list):
    tx.run(statement, parameters={"parameters": params_list})

### Import GeoAreas to Neo4j from United Nations (Regions,SubRegions,Areas) and Eurostat (Areas)

The [United Nations publication "Standard Country or Area Codes for Statistical Use"](https://unstats.un.org/unsd/methodology/m49/) offers a list of countries or areas contains the names of countries or areas in alphabetical order, their three-digit numerical codes used for statistical processing purposes by the Statistics Division of the United Nations Secretariat, and their two- and three-digit alphabetical codes assigned by the International Organization for Standardization ( ISO 3166 alpha-2 and alpha-3). 


In [8]:
# read csv 
df = pd.read_csv('Data/1.GeoArea_UN.csv',sep=';',usecols=['Global Code', 'Global Name', 'Region Code', 'Region Name',
       'Sub-region Code', 'Sub-region Name', 'Country or Area', 'M49 Code',
       'ISO-alpha2 Code', 'ISO-alpha3 Code'])
df.dropna(inplace=True)

# Query statement to import the data in batches of 10000 rows to avoid memory issues
statement_geo = """
    UNWIND $parameters as row
    WITH row 
    MERGE (world:GeoArea{name:row.world_name,M49:row.world_code})
    MERGE (world)-[:HAS_REGION]->(reg:GeoArea{name:row.reg_name,M49code:row.reg_code})
    MERGE (reg)-[:HAS_SUBREGION]->(subreg:GeoArea{name:row.sub_name,M49code:row.sub_code})
    MERGE (subreg)-[:HAS_AREA]->(area:GeoArea{name:row.area_name,M49code:row.m49, ISOalpha2code:row.iso2, ISOalpha3code:row.iso3})
    """ 

params=[]
batch_size = 10000
batch_i = 1
with driver.session(database=database_name) as session:
    for index, row in df.iterrows():
        st = time.time()
        params_dict = {
            'world_name': str(row['Global Name']), 
            'world_code': str(int(row['Global Code'])),
            'reg_name': str(row['Region Name']),
            'reg_code':str(int(row['Region Code'])),
            'sub_name':str(row['Sub-region Name']),
            'sub_code': str(int(row['Sub-region Code'])),
            'area_name': str(row['Country or Area']),
            'm49':str(row['M49 Code']),
            'iso2':str(row['ISO-alpha2 Code']),
            'iso3':str(row['ISO-alpha3 Code'])
        }
        params.append(params_dict)
        if index % batch_size == 0 and index > 0:
            st = time.time()
            session.execute_write(write_batch, params_list = params,statement = statement_geo)
            # driver.execute_query(statement,parameters=params)
            et = time.time()
            # get the execution time
            elapsed_time = et - st            
            print('Batch {} with {} data : Done! ({} minutes)'.format(batch_i,len(params),elapsed_time/60))
            params = []            
            batch_i +=1
        
    if params:
            st = time.time()  # Record start time for the last batch
            session.execute_write(write_batch, params_list = params,statement = statement_geo)
            et = time.time()
            elapsed_time = et - st
            print('{} observations: Done! ({} minutes)'.format(len(params), elapsed_time/60))

246 observations: Done! (0.012616884708404542 minutes)


> Check cypher query 

In [9]:
def check_regions(tx,statement):
    result = tx.run(statement)
    data = result.data()[0]
    return data['continents'],data['subregions'], data['areas']

statement_check = """ 
MATCH (ga:GeoArea)-[:HAS_REGION]->(r)-[:HAS_SUBREGION]->(sb)-[:HAS_AREA]->(a)
RETURN COUNT(DISTINCT r) as continents,COUNT(DISTINCT sb) as subregions,COUNT(DISTINCT a) as areas
""" 

with driver.session(database=database_name) as session:
    continents, subregions, areas = session.execute_write(check_regions, statement_check)
    print('Continents:', continents)
    print('SubRegions:', subregions)
    print('Areas:', areas)

Continents: 5
SubRegions: 17
Areas: 246


The Eurostat database has assigned to Member States of the European Union (EU) and other countries a two-letter country code, always written in capital letters, and often used as an abbreviation in statistical analyses, tables, figures or maps. In the _Data/1.GeoArea_EU.xlsx_ file there is the mapping for countries with their EUcodes based on https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Country_codes

Several country names where replaced to match the name given from the   United Nations:
- EU_name --> UN_name 
- Moldova -> Republic of Moldova
- Türkiye -> Turkey
- Palestine -> State of Palestine
- Syria -> Syrian Arab Republic
- Hong Kong -> China, Hong Kong Special Administrative Region
- Russia -> Russian Federation
- South Korea -> Republic of Korea
- United Kingdom -> United Kingdom of Great Britain and Northern Ireland
- United States -> United States of America

Kosovo and Taiwan mentioned in EU dataset are not present in the UN dataset.

In [10]:
eu_df = pd.read_excel('Data/1.GeoArea_EU.xlsx')
eu_df.replace(['Moldova', 'Türkiye', 'Palestine','Syria','Hong Kong','Russia','South Korea','United Kingdom','United States'],
                 ['Republic of Moldova', 'Turkey', 'State of Palestine','Syrian Arab Republic',
                  'China, Hong Kong Special Administrative Region','Russian Federation','Republic of Korea',
                  'United Kingdom of Great Britain and Northern Ireland',' United States of America'],inplace=True)

eu_codes = []

def write_eu(tx,statement, eu_codes):
    result = tx.run(statement, parameters={"eu_codes": eu_codes})
    total = result.data()[0]['total']
    return total

statement_eu = """
    UNWIND $eu_codes as country
    WITH country 
    MATCH (area:GeoArea{name:country.area_name})
    SET area.EUcode = country.eu_code
    RETURN COUNT(area) as total
    """ 
with driver.session(database=database_name) as session:
    for index, row in eu_df.iterrows():
        params_dict = {
            'area_name': str(row['Name']).strip(),
            'eu_code':str(row['Code']),
        }
        eu_codes.append(params_dict)
    total_updates = session.execute_write(write_eu,statement_eu, eu_codes)
    print("Countries with EU codes:",len(eu_df['Name']) - 2) # Kosovo and Taiwan
    print("Countries updated with  EU codes:",total_updates) # Kosovo and Taiwan
    

Countries with EU codes: 70
Countries updated with  EU codes: 70


### European Union
Create a node for the European Union where all the 27-countries belong to. 

In [11]:

def write_eu(tx,statement, eu_members):
    result = tx.run(statement, parameters={"eu_members": eu_members})
    total = result.data()[0]['total']
    return total

statement_eu_mem = """
    MERGE (eu:EuropeanUnion{description:'The European Union, abbreviated as EU, is an economic and political union of European countries with 27 member states.'})
    WITH eu
    UNWIND $eu_members as code
    MATCH (a:GeoArea{EUcode:code})
    MERGE (a)-[b:BELONGS_TO]->(eu)
    RETURN COUNT(b) as total
    """ 
eu_members = []
with driver.session(database=database_name) as session:
    for index, row in eu_df.iterrows():
        if row['Classification'] == 'European Union':
            eu_members.append(row['Code'])
    total_updates = session.execute_write(write_eu,statement_eu_mem, eu_members)
    print("Countries member of EU added:",total_updates) # Kosovo and Taiwan
    

Countries member of EU added: 27


### Import GeoAreas to Neo4j from Eurostat (NUTS)

Nomenclature of Territorial Units for Statistics or NUTS is a geocode standard for referencing the subdivisions of countries for statistical purposes. For each EU member country, a hierarchy of three NUTS levels is established by **Eurostat** in agreement with each member state. In the folder Data, there is a excel file(1.GeoArea_EU_NUTS.xlsx), containing information about the NUTS levels, that we are going to import into our graph. Data were collected from : https://ec.europa.eu/eurostat/web/nuts/overview (version NUTS 2021 classification)

Before importing the data into neo4j, the excel file is transformed into the desired format. The data transformation follows the below processing:

- Drop unnecessary columns and rows (Data cleaning)
- Unpivot pandas dataframe
- Add extra column about the higher NUTS level

In [12]:
def split(word,n):
    chunks = [word[i:i+n] for i in range(0, len(word), n)]
    return chunks

# Read excel file
df_nuts = pd.read_excel('Data/1.GeoArea_EU_NUTS.xlsx',sheet_name='NUTS2021')

# Drop unnecessary columns & rows
df_nuts.drop(['Country order', 'Region order'], axis=1, inplace=True)
df_nuts.drop(df_nuts.loc[df_nuts['Code 2021'].str.endswith('Z')].index, inplace=True)

# Create new column Length
df_nuts["Length"]= df_nuts["Code 2021"].str.len()

# Unpivot dataframe
df = df_nuts.melt(id_vars=['Code 2021','NUTS level','Length'],var_name='Level', value_name='Name')
## Drop NA & reset index
df = df.dropna(subset=['Name'])
df.reset_index(drop=True,inplace=True)

# Add new Column Country
df['SuperClass'] = df.apply(lambda x: split(x['Code 2021'],x['Length']-1)[0], axis=1) 
# Drop countries
df.drop(df.loc[df['Length'] == 2].index, inplace=True)

df.reset_index(drop=True,inplace=True)

In [13]:
def nuts_creation(df1,name_of_rel):
      
    records, summary, keys = driver.execute_query("""
        MATCH (ga:GeoArea)
        WHERE ga.EUcode IS NOT NULL
        RETURN COLLECT(ga.EUcode) as geocodes
        """,routing_="r",database_=database_name)
    available_neo4j_geocodes = records[0]['geocodes']

    # Keep only the rows containing values of countries existing in the neo4j LPG model.
    df1 = df1.loc[df1['SuperClass'].isin(available_neo4j_geocodes)]
    
    if name_of_rel == 'HAS_NUTS1':
        statement_nuts = """
        UNWIND $parameters as row
        MATCH (super_class:GeoArea)
        WHERE row.super_code = super_class.EUcode
        MERGE (super_class)-[:HAS_NUTS1]->(ga:GeoArea{name:row.nuts_name,EUcode:row.nuts_code})
        """    
    if name_of_rel == 'HAS_NUTS2':
        statement_nuts = """
        UNWIND $parameters as row
        MATCH (super_class:GeoArea)
        WHERE row.super_code = super_class.EUcode
        MERGE (super_class)-[:HAS_NUTS2]->(ga:GeoArea{name:row.nuts_name,EUcode:row.nuts_code})
        """ 
    if name_of_rel == 'HAS_NUTS3':
        statement_nuts = """
        UNWIND $parameters as row
        MATCH (super_class:GeoArea)
        WHERE row.super_code = super_class.EUcode
        MERGE (super_class)-[:HAS_NUTS3]->(ga:GeoArea{name:row.nuts_name,EUcode:row.nuts_code})
        """ 
        
    # Begin a new auto-commit GraphTransaction.
    with driver.session(database=database_name) as session:
        params=[]
        for index, row in df1.iterrows():
            params_dict = {
                'super_code': str(row['SuperClass']), 
                'nuts_name': str(row['Name']).strip(),
                'nuts_code': str(row['Code 2021'])
            }
            params.append(params_dict)
        st = time.time()
        session.execute_write(write_batch, params_list = params,statement = statement_nuts)
        elapsed_time = time.time() - st
        print('{} observations: Done! ({} minutes)'.format(len(params), elapsed_time/60))

In [14]:
# Add sequentially each NUTS level 
nuts_creation(df1 = df.loc[df.Length==3],name_of_rel= 'HAS_NUTS1')
nuts_creation(df1 = df.loc[df.Length==4],name_of_rel= 'HAS_NUTS2')
nuts_creation(df1 = df.loc[df.Length==5],name_of_rel= 'HAS_NUTS3')

125 observations: Done! (0.0013754129409790038 minutes)
334 observations: Done! (0.0018534143765767416 minutes)
1514 observations: Done! (0.00373154878616333 minutes)


By executing the following cypher query, we check the size of imported data (in this way duplication of data can be avoided.)
> Check cypher query 

In [15]:
def count_nuts(tx,statement):
    result = tx.run(statement)
    data = result.data()[0]
    return data['nuts1'],data['nuts2'], data['nuts3']

statement_nuts = """ 
MATCH (ga:GeoArea)-[:HAS_NUTS1]->(r)-[:HAS_NUTS2]->(sb)-[:HAS_NUTS3]->(a)
RETURN COUNT (DISTINCT r) as nuts1 ,COUNT(DISTINCT sb) as nuts2 ,count(DISTINCT a) as nuts3
""" 
with driver.session(database=database_name) as session:
    nuts1,nuts2,nuts3 = session.execute_read(count_nuts, statement_nuts)
    print('NUTS1:', nuts1)
    print('NUTS2:', nuts2)
    print('NUTS3:', nuts3)

NUTS1: 125
NUTS2: 334
NUTS3: 1514


Now we have imported the geoAreas, we add a second label, indicating if this geoArea is a region,subregion,etc.

> Set second GeoArea label

In [16]:
def add_labels(tx,statement):
    tx.run(statement)
    return 

statement_labels = [
""" MATCH (g1:GeoArea)-[r:HAS_AREA]->(g2:GeoArea)
SET g2:Area""",
"""MATCH (g1:GeoArea)-[r:HAS_SUBREGION]->(g2:GeoArea)
SET g2:SubRegion""",
"""MATCH (g1:GeoArea)-[r:HAS_REGION]->(g2:GeoArea)
SET g2:Region""",
"""MATCH (g1:GeoArea)-[r:HAS_NUTS3]->(g2:GeoArea)
SET g2:NUTS3""",
"""MATCH (g1:GeoArea)-[r:HAS_NUTS2]->(g2:GeoArea)
SET g2:NUTS2""",
"""MATCH (g1:GeoArea)-[r:HAS_NUTS1]->(g2:GeoArea)
SET g2:NUTS1"""
] 

with driver.session(database=database_name) as session:
    for statement_label in statement_labels:
        session.execute_write(add_labels, statement_label)

### Import Eurostat Typology of NUTS3 Regions

#### Constraints

Each NUTS3 is classified according to a NUTS Typology provided by Eurostat.

In [17]:
statement_constraint = """ 
CREATE CONSTRAINT typology IF NOT EXISTS FOR ( top:Typology ) REQUIRE (top.categoryCode, top.categoryLabel) IS NODE KEY
""" 

with driver.session(database=database_name) as session:
    session.execute_write(create_constraint, statement_constraint)

#### Import Eurostat typology

In [18]:
# Read csv 
cols = ['UrbanRural','Metropolitan','Coastal',
        'Mountain','Border','Island','UrbanRuralRemoteness']
df_typology = pd.read_excel('Data/1.GeoArea_EU_NUTS.xlsx',sheet_name = cols )

In [19]:
def write_batch(tx,statement, params_list):
    tx.run(statement, parameters={"parameters": params_list})

In [20]:
statement_top = """
    UNWIND $parameters as row
    MATCH (ga:NUTS3{EUcode:row.nuts_code})
    MERGE (top:Typology{categoryCode:row.category,categoryLabel:row.label})
    MERGE (ga)-[:HAS_TYPOLOGY]-> (top)
    WITH top, row
    CALL apoc.create.addLabels( top, [ row.name] )
    YIELD node
    RETURN node
    """    

for col in cols:
    
    df_col = df_typology[col] 
    
    with driver.session(database=database_name) as session:
        params=[]
        for index, row in df_col.iterrows():        
            params_dict = {
                'nuts_code':str(row['Nuts']),
                'category':str(row['Category']).title(),
                'label':str(row['Label']).title(),
                'name':str(col)
            }
            params.append(params_dict)
        st = time.time()
        session.execute_write(write_batch, params_list = params,statement = statement_top)
        elapsed_time = time.time() - st
        print('{} observations: Done! ({} minutes)'.format(len(params), elapsed_time/60))

1514 observations: Done! (0.012930019696553548 minutes)
1514 observations: Done! (0.008372116088867187 minutes)
1514 observations: Done! (0.0077974836031595865 minutes)
1512 observations: Done! (0.00818260113398234 minutes)
1512 observations: Done! (0.007384232680002848 minutes)
73 observations: Done! (0.001057298978169759 minutes)
1387 observations: Done! (0.006750647226969401 minutes)


> Check cypher query 

In [21]:
def count_typology(tx,statement):
    result = tx.run(statement)
    data = result.data()[0]
    return data['total']

statement_typo= """ 
MATCH (n:Typology) RETURN count(distinct n) as total
""" 
statement_typo_rel = """
MATCH (ga:NUTS3)-[r:HAS_TYPOLOGY]->(:UrbanRural) RETURN count(r) as total
"""

unique_labels = 0 
for sheet_name, df in df_typology.items():
    unique_labels += len(df['Label'].unique())

df_NUTS3 = pd.read_excel('Data/1.GeoArea_EU_NUTS.xlsx',sheet_name = 'NUTS2021') 
nuts3_codes = df_NUTS3[df_NUTS3['NUTS level'] == 3]['Code 2021'].dropna().unique().tolist()
df_urbanrural = pd.read_excel('Data/1.GeoArea_EU_NUTS.xlsx', sheet_name='UrbanRural')
nuts3_labels = df_urbanrural[df_urbanrural['Nuts'].isin(nuts3_codes)]['Label']
labels3_list = nuts3_labels.dropna().tolist()

with driver.session(database=database_name) as session:
    total = session.execute_read(count_typology, statement_typo)
    total_rel = session.execute_read(count_typology, statement_typo_rel)
    print('Total typologies:', total, ', expected:', unique_labels) 
    print('Total urban_rural nuts3:', total_rel, ', expected:', len(labels3_list))

Total typologies: 9026 , expected: 20
Total urban_rural nuts3: 1514 , expected: 1514


### Import Eurostat LAU,Cities,FUA, Degree of Urbanization

To meet the demand for statistics at a local level, Eurostat maintains a system of Local Administrative Units (LAUs) compatible with NUTS. These LAUs are the building blocks of the NUTS, and comprise the municipalities and communes of the European Union. As defined by Eurostat:

* A City is a local administrative unit (LAU) where the majority of the population lives in an urban centre of at least 50 000 inhabitants.
* The Functional Urban Area (FUA) consists of a city and its commuting zone.
* The Degree of urbanization (DEGURBA) is a classification that indicates the character of an area: Cities (densely populated areas) - Towns and suburbs (intermediate density areas) - Rural areas (thinly populated areas)

![SustainGraph-Local_level__2_](https://gitlab.com/netmode/sustaingraph/-/wikis/uploads/c726fa43617fa6b5ef9046da9303e653/SustainGraph-Local_level.jpg)

In [22]:
# degree of urbanization
# https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Degree_of_urbanisation 
degurb = {1:'Cities',
          2:'TownsAndSuburbs',
          3:'RuralAreas'}

In [23]:
#Some LAUs may not belong to neither a City nor a FUA
statement_local = """
    UNWIND $parameters as row
    WITH row 
    MATCH (nuts3:NUTS3{EUcode:row.nuts_code})
    MERGE (lau:LAU{EUcode:row.lau_code,name:row.lau_name})
    ON CREATE
        SET lau:GeoArea
    MERGE (nuts3)-[:HAS_LAU]-> (lau)
    FOREACH(_ IN CASE WHEN row.deg_code IS NOT NULL THEN [1] ELSE [] END | 
    MERGE (degurba:UrbanizationDegree{categoryCode:row.deg_code,categoryLabel:row.deg_name})
    ON CREATE
        SET degurba:Typology
    MERGE (lau) -[:HAS_DEGURBA]-> (degurba)
    )
    FOREACH(_ IN CASE WHEN row.city_code IS NOT NULL THEN [1] ELSE [] END | 
    MERGE (city:City{EUcode:row.city_code,name:row.city_name})
       ON CREATE
         SET city:GeoArea
    MERGE (city) -[:CONSISTS_OF]-> (lau)
    )
    FOREACH(_ IN CASE WHEN row.fua_code IS  NOT NULL THEN [1] ELSE [] END | 
    MERGE (fua:FUA{EUcode:row.fua_code,name:row.fua_name})
    ON CREATE
        SET fua:GeoArea
    MERGE (fua) -[:CONSISTS_OF]-> (lau)
    )
    FOREACH(_ IN CASE WHEN row.fua_code IS  NOT NULL AND row.city_code IS NOT NULL THEN [1] ELSE [] END | 
    MERGE(fua:FUA{EUcode:row.fua_code})
    MERGE(city:City{EUcode:row.city_code})
    MERGE (fua) -[:CONSISTS_OF]-> (city)
    )
    """  

In [None]:
df_lau_all = pd.read_excel('Data/1.GeoArea_EU-27-LAU-2021-NUTS-2021.xlsx',sheet_name=None,dtype='object')
for name, sheet in df_lau_all.items():
    if(name == 'Overview POP_2021' ):
        continue
    print(name)
    #REPLACE 9 DEGREE of urbanization with None
    sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)
    sheet['DEGURBA_NAME'] = sheet['DEGURBA'].map(degurb)
    #--------------------------fix Eurostat errors----------------------------------------
    if(name == 'EL'):
        sheet['CITY_NAME'] = sheet['CITY_NAME'].replace('Narva', 'Athens (Greater City)')
    elif (name =='ES'):
        sheet['FUA_NAME'] = sheet['FUA_NAME'].replace('Ponteverda', 'Pontevedra')
    
    df_lau = sheet[['NUTS 3 CODE','LAU CODE', 'LAU NAME LATIN', 'DEGURBA','DEGURBA_NAME','CITY_ID','CITY_NAME','FUA_ID','FUA_NAME']]
    # -------------------------import data-------------------------------------------
    with driver.session(database=database_name) as session:
        params=[]
        for index, row in df_lau.iterrows(): 
            # country code added in front of lau code to make it unique for each country        
            params_dict = {
                'nuts_code':str(row['NUTS 3 CODE']),
                'lau_code':f"{name}_{(str(row['LAU CODE']))}",
                'lau_name':str(row['LAU NAME LATIN']).strip(),
                'deg_code':str(int(row['DEGURBA'])) if not pd.isna(row['DEGURBA']) else None,  # Keep it as None if NaN,
                'deg_name':str(row['DEGURBA_NAME']),
                'city_code':str(row['CITY_ID']) if not pd.isna(row['CITY_ID']) else None,  # Keep it as None if NaN, ,
                'city_name':str(row['CITY_NAME']).strip() if not pd.isna(row['CITY_NAME']) else None,  # Keep it as None if NaN, ,
                'fua_code':str(row['FUA_ID']) if not pd.isna(row['FUA_ID']) else None,  # Keep it as None if NaN, ,
                'fua_name':str(row['FUA_NAME']).strip() if not pd.isna(row['FUA_NAME']) else None,  # Keep it as None if NaN, ,
            }
            params.append(params_dict)
        
        st = time.time()
        session.execute_write(write_batch, params_list = params,statement = statement_local)
        elapsed_time = time.time() - st
        print('{} observations: Done! ({} minutes)'.format(len(params), elapsed_time/60))

BE


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


581 observations: Done! (0.012862447897593181 minutes)
BG
265 observations: Done! (0.0030786673227945964 minutes)
CZ


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)
  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


6258 observations: Done! (0.12043026685714722 minutes)
DK


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


99 observations: Done! (0.013086044788360595 minutes)
DE


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


11002 observations: Done! (0.5505887667338053 minutes)
IE


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


166 observations: Done! (0.01391056776046753 minutes)
EE


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


79 observations: Done! (0.007532250881195068 minutes)
EL


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


6137 observations: Done! (0.48239006598790485 minutes)
ES


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


8131 observations: Done! (0.9087620337804159 minutes)
FR


  sheet['DEGURBA'] = sheet['DEGURBA'] .replace([9], np.nan)


### Dataframe with the number of LAUs,FUAs and Cities per country to compare with the cypher check query

In [None]:
df_lau_all = pd.read_excel('Data/1.GeoArea_EU-27-LAU-2021-NUTS-2021.xlsx',sheet_name=None,dtype='object')
df_check = []
columns = ['country','totalLAU','totalCities','totalFUA']
for name, sheet in df_lau_all.items():
    if(name == 'Overview POP_2021' ):
        continue
    df_check.append([name,sheet['LAU CODE'].nunique(),sheet['CITY_ID'].nunique(),sheet['FUA_ID'].nunique()])
df = pd.DataFrame(df_check, columns=columns)
df= df.sort_values('country')

In [None]:
df

Unnamed: 0,country,totalLAU,totalCities,totalFUA
19,AT,2095,6,6
0,BE,581,15,14
1,BG,265,18,17
12,CY,615,3,3
2,CZ,6258,18,15
4,DE,11002,127,98
3,DK,99,4,4
6,EE,79,3,3
7,EL,6137,14,14
8,ES,8131,98,81


*** Note: Greece (EL) has one less LAU code in the cypher query check than in the table above because Aghio Oros LAU has not matched any NUTS3 code in the SustainGraph therefore is not imported 

In [None]:
recordsLAU, summary, keys = driver.execute_query("""
    MATCH (l:LAU)
    RETURN LEFT(l.EUcode, 2) AS country_code, count(l) AS LAUCount
    ORDER BY country_code
    """,routing_="r",database_=database_name)
countries = []
laus = []
for record in recordsLAU:
    countries.append(record['country_code'])
    laus.append(record['LAUCount'])

recordsCity, summary, keys = driver.execute_query("""
    MATCH (c:City)
    RETURN LEFT(c.EUcode, 2) AS country_code, count(c) AS CityCount
    ORDER BY country_code
    """,routing_="r",database_=database_name)
cities = []
for record in recordsCity:
    cities.append(record['CityCount'])

recordsFUA, summary, keys = driver.execute_query("""
    MATCH (f:FUA)
    RETURN LEFT(f.EUcode, 2) AS country_code, count(f) AS FUACount
    ORDER BY country_code
    """,routing_="r",database_=database_name)
fua = []
for record in recordsFUA:
    fua.append(record['FUACount'])

check_df = pd.DataFrame(
    {'country': countries,
     'totalLAU': laus,
     'totalCities': cities,
     'totalFUA':fua
    })
merged_df = pd.merge(df, check_df, on='country', suffixes=('_1', '_2'))


In [None]:
merged_df

Unnamed: 0,country,totalLAU_1,totalCities_1,totalFUA_1,totalLAU_2,totalCities_2,totalFUA_2
0,AT,2095,6,6,2095,6,6
1,BE,581,15,14,581,15,14
2,BG,265,18,17,265,18,17
3,CY,615,3,3,615,3,3
4,CZ,6258,18,15,6258,18,15
5,DE,11002,127,98,11002,127,98
6,DK,99,4,4,99,4,4
7,EE,79,3,3,79,3,3
8,EL,6137,14,14,6136,14,14
9,ES,8131,98,81,8131,98,81


In [None]:
# Connect FUA to their countries
def create_rel(tx, statement):
    records = tx.run(statement)
    return records.data()[0]['total_fua']

statement_country_fua = """ 
MATCH (a:Area)-[:HAS_NUTS1]-(:NUTS1)-[:HAS_NUTS2]-(:NUTS2)-[HAS_NUTS3]-(:NUTS3)-[:HAS_LAU]-(:LAU)-[:CONSISTS_OF]-(f:FUA)
MERGE (a)-[h:HAS_FUA]->(f)
RETURN COUNT(DISTINCT h) as total_fua
"""

with driver.session(database=database_name) as session:
    total_fua = session.execute_write(create_rel,statement_country_fua)
    print('Total relationships created: ', total_fua)

Total relationships created:  620


### Import Eurostat PostalCodes

According to the [mapping of postal codes with NUTS3 regions](https://gisco-services.ec.europa.eu/tercet/flat-files) of Eurostat, we introduce the zip codes only for Greece in the SustainGraph. In the folder Data/1.GeoArea_EU_PostalCodes, there is a csv file for Greece, containing information about the mapping of NUTS3 regions with the postal codes, that we are going to import into our graph.

! Import only Greece data due to large number of postal codes !

In [None]:
statement_post = """
    UNWIND $parameters as row
    WITH row 
    MATCH (ga:NUTS3{EUcode:toString(row.nuts_code)})
    MERGE (post:PostalCode{EUcode:toString(row.postal_code)})
    ON CREATE
        SET post:GeoArea
    MERGE (ga)-[:HAS_POSTAL_CODE]-> (post)
    """

In [None]:
# Get the current working directory (where the .ipynb file is located)
current_directory = os.getcwd()

# Construct the path to the 'Data' folder
oecd_data_path = os.path.join(current_directory, 'Data/1.GeoArea_EU_PostalCodes')

# List all CSV files in the 'OECD_Data' folder
csv_files = [f for f in os.listdir(oecd_data_path) if f.endswith('.csv')]

dict_of_dfs = {}
batch_size=5000

for csv_file in csv_files:
    csv_path = os.path.join(oecd_data_path, csv_file)
    # Example: Read CSV content using pandas

    # Original df
    df = pd.read_csv(csv_path.replace('\\','/'),delimiter=';')
    
    # delete ''
    df = df.applymap(lambda x: x.strip("'") if isinstance(x, str) else x)
    
    dict_of_dfs[csv_file.split('_')[1]] = df

  df = df.applymap(lambda x: x.strip("'") if isinstance(x, str) else x)


In [None]:
batch_size=5000
for key,df in dict_of_dfs.items():
    print('Country {} data import'.format(key))
    # Begin a new auto-commit GraphTransaction.
    params=[]
    batch_i = 1
    with driver.session(database=database_name) as session:
        for index, row in df.iterrows():
            # get the start time
            st = time.time()
            params_dict = {
                'nuts_code': str(row['NUTS3']), 
                'postal_code': str(row['CODE']),
            }
            params.append(params_dict)
            if index % batch_size == 0 and index > 0:
                st = time.time()
                session.execute_write(write_batch, params_list = params,statement = statement_post)
                # get the end time
                et = time.time()
                # get the execution time
                elapsed_time = et - st            
                print('Batch {} with {} observations : Done! ({} minutes)'.format(batch_i,len(params),elapsed_time/60))
                params = []            
                batch_i +=1
        if params:
            st = time.time()  # Record start time for the last batch
            session.execute_write(write_batch, params_list = params,statement = statement_post)
            et = time.time()
            elapsed_time = time.time() - st
            print('Expected Length of data: ', len(df))
            print('{} observations: Done! ({} minutes)'.format(len(params), elapsed_time/60))

Country EL data import
Expected Length of data:  1041
1041 observations: Done! (0.012144529819488525 minutes)


> Check cypher query

In [None]:
records, summary, keys = driver.execute_query("""
   MATCH (ga:NUTS3)-[:HAS_POSTAL_CODE]->(n:PostalCode) RETURN ga.name,COUNT(n) as codes
    """,routing_="r",database_=database_name)
for record in records:
    print(record['ga.name'], 'with', record['codes'],' postal codes')

Notios Tomeas Athinon with 30  postal codes
Kalymnos, Karpathos, Kasos, Kos, Rodos with 21  postal codes
Evros with 15  postal codes
Kilkis with 7  postal codes
Kastoria with 12  postal codes
Magnisia, Sporades with 22  postal codes
Achaia with 31  postal codes
Argolida, Arkadia with 28  postal codes
Voreios Tomeas Athinon with 38  postal codes
Dytikos Tomeas Athinon with 20  postal codes
Kentrikos Tomeas Athinon with 115  postal codes
Anatoliki Attiki with 36  postal codes
Dytiki Attiki with 13  postal codes
Peiraias, Nisoi with 41  postal codes
Lesvos, Limnos with 13  postal codes
Ikaria, Samos with 9  postal codes
Chios with 6  postal codes
Andros, Thira, Kea, Milos, Mykonos, Naxos, Paros,  Syros, Tinos with 23  postal codes
Irakleio with 29  postal codes
Lasithi with 10  postal codes
Rethymni with 9  postal codes
Chania with 22  postal codes
Xanthi with 6  postal codes
Rodopi with 4  postal codes
Drama with 8  postal codes
Thasos, Kavala with 16  postal codes
Imathia with 8  postal

In [None]:
def add_postal(tx,statement):
    tx.run(statement)
    return 

statement_postal= """
MATCH (n3:NUTS3 {EUcode: 'EL306'}) 
MERGE (n3)-[:HAS_POSTAL_CODE]->(z:PostalCode{EUcode:'13345'}) 
ON CREATE SET z:GeoArea 
 """

with driver.session(database=database_name) as session:
    session.execute_write(add_postal, statement_postal)

In [None]:
def add_postal_area(tx,statement):
    tx.run(statement)
    return 

statement_postal_area= """
MATCH (n3:NUTS3)-[:HAS_POSTAL_CODE]->(post:PostalCode)
MATCH (ga:Area)--(n1:NUTS1)--(n2:NUTS2)--(n3)
MERGE (ga)-[r:HAS_POSTAL_CODE]->(post)
RETURN count(distinct r)
 """

with driver.session(database=database_name) as session:
    session.execute_write(add_postal, statement_postal_area)