In [None]:
# select the python kernel where `geoenricher` is installed.
# make sure the imports work.

# env vars:
# set JAVA_HOME, SPARK_HOME and HADOOP_HOME to point to their respective installation directories,
# and add "$JAVA_HOME$/bin", "$SPARK_HOME$/bin" and "$HADOOP_HOME$/bin" to the system PATH

import geoenricher 
# from geoenricher import Enricher, EnrichOverlay


In [None]:

from geoenricher import Enricher

'''
Load the datasets from their source files.
This takes a while, because the data is cleaned and some essential transformations are applied.
It is a one-time operation, though: `parquet_all()` saves all datasets to the disk,
preserving any transformations applied.
From then on, you can load them directly with `load_from_parquets()` to save time.

'''

# root directory that holds all the data; every path below is built from it
# (plain string: there is nothing to interpolate here)
data_dir = "./data"

# individual file paths:
# EU-wide layers
path_com_EU = f"{data_dir}/data_EU/comuni_shp/"
path_contr = f"{data_dir}/data_EU/countries_shp/"
path_grids = f"{data_dir}/data_EU/census_grid_EU/grids_OG_corrected.parquet"
path_grids_new = f"{data_dir}/data_EU/census_grid_EU/grids_new.gpkg"
# Italian administrative layers
path_reg = f"{data_dir}/data_Italy/regioni/"
path_prov = f"{data_dir}/data_Italy/provinci"
path_com = f"{data_dir}/data_Italy/comuni/"
# services and their accessibility grids
path_hlth = f"{data_dir}/data_EU/services/healthcare_dropna.gpkg"
path_edu = f"{data_dir}/data_EU/services/education_dropna.gpkg"
path_acc_health = f"{data_dir}/data_EU/accessibility/healthcare/grid_accessibility_health.geoparquet"
path_acc_edu = f"{data_dir}/data_EU/accessibility/education/grid_accessibility_educ.geoparquet"
# files located directly under the data directory
path_NUTS = f"{data_dir}/NUTS.shp"
path_LAU = f"{data_dir}/LAU.shp"
path_DGURBA = f"{data_dir}/DGURBA"

# Registry of the datasets to load, keyed by dataset name.
# Each value is a (path, file_format) tuple.
# Un-comment a line to include that dataset in the load.
datasets: dict[str, tuple[str, str]] = {}

# datasets["comuni_EU"] = (path_com_EU, "shapefile")
# datasets["countries"] = (path_contr, "shapefile")
# datasets["pop_grids"] = (path_grids, "geoparquet")
# # datasets["pop_grids_new"] = (path_grids_new, "geopackage")
# datasets["regions_IT"] = (path_reg, "shapefile")
# datasets["provinces_IT"] = (path_prov, "shapefile")
# datasets["comuni_IT"] = (path_com, "shapefile")
# datasets["healthcare"] = (path_hlth, "geopackage")
# datasets["education"] = (path_edu, "geopackage")
# datasets["acc_health"] = (path_acc_health, "geoparquet")
# datasets["acc_edu"] = (path_acc_edu, "geoparquet")
# datasets["nuts"] = (path_NUTS, "shapefile")
# datasets["lau"] = (path_LAU, "shapefile")
datasets["dg_urban"] = (path_DGURBA, "shapefile")

# Create the Enricher with the target CRS.
obj = Enricher(crs="EPSG:3035")

# Configure the cluster backend: "sedona" (or "wherobots").
# `ex_mem` and `dr_mem` are the executor and driver memory, in GB.
cluster_options = dict(
    data_dir=data_dir,
    which="sedona",
    ex_mem=26,  # change this
    dr_mem=24,  # change this
    log_level="ERROR",
)
obj.setup_cluster(**cluster_options)

# Load all the datasets in {data_dir}, following the paths and
# file formats registered in `datasets`.
obj.load(datasets, silent=True)

# Optional: run "fix_geometries()" to repair invalid geometries, if any.
# Dataframes whose names are listed in `skip` are not checked.
geometry_check_skip = ['pop_grids', 'pop_grids_new']
obj.fix_geometries(skip=geometry_check_skip)

# Optional: inspect the partitions and data skew.
# obj.inspect_partitions()

# Force the dataframes to be repartitioned to the number of available cores;
# dataframes whose names are listed in `skip` are left out.
repartition_skip = ['pop_grids']
obj.force_repartition(skip=repartition_skip)

# obj.inspect_partitions()

# Transform the CRS of the loaded datasets to the CRS passed to the Enricher constructor.
# With lazy=True the dataframes would not be cached.
obj.transform_CRS(lazy=False)

# Pickle all the dataframes to disk for quick access later.
# Default directory: "./{data_dir}/pickle_parquets/dfs_list".
# To save them elsewhere, pass the directory in "parquet_dir" (relative to {data_dir}).
obj.parquet_all(preserve_partitions=True)


In [3]:

from geoenricher import Enricher

'''
Load the datasets from the pickled parquets.

'''

# start from a clean slate: drop any Enricher object left over from a previous cell
if 'obj' in globals():
    del obj

# provide the data directory
data_dir = "./data"

obj = Enricher(crs="EPSG:3035")

# setup the Enricher object with "sedona" (or, "wherobots")
# "ex_mem" and "dr_mem" are the executor and driver memory in GB
obj.setup_cluster(
    data_dir=data_dir,
    which="sedona",
    ex_mem=26,  # change this
    dr_mem=24,  # change this
    # must be a valid log level name (e.g. "INFO", "WARN", "ERROR");
    # the previous value "INsdFO" was a typo
    log_level="INFO",
)

# loads all the datasets from pickled parquets in the default directory: "./{data_dir}/pickle_parquets/dfs_list"
# optionally, to load from a different directory, pass the directory path in "parquet_dir"; relative to the {data_dir}
obj.load_from_parquets()
# obj.inspect_partitions()


25/03/13 11:40:39 WARN Utils: Your hostname, marvin resolves to a loopback address: 127.0.1.1; using 172.20.27.4 instead (on interface eth0)
25/03/13 11:40:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/data/homes_data/sudheer/benchmark_data/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /data/homes_data/sudheer/.ivy2/cache
The jars for the packages stored in: /data/homes_data/sudheer/.ivy2/jars
org.apache.sedona#sedona-spark-shaded-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-51a8baf1-8270-469e-a223-06b612d9944c;1.0
	confs: [default]
	found org.apache.sedona#sedona-spark-shaded-3.5_2.12;1.7.0 in central
	found org.datasyslab#geotools-wrapper;1.7.0-28.5 in central
:: resolution report :: resolve 223ms :: artifacts dl 8ms
	:: modules in use:
	org.apache.sedona#sedona-spark-shaded-3.5_2.12;1.7.0 from central in [default]
	org.datasyslab#geotools-wrapper;1.7.0-28.5 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	------------------------------------------

sedona initialized with 10 cores for parellelism.



                                                                                

Loaded dataframe 'hospitals'
Loaded dataframe 'com_X_pop_accssblty_hosps'
Loaded dataframe 'lau'
Loaded dataframe 'dg_urban'
Loaded dataframe 'countries'
Loaded dataframe 'pop_grids_full'
Loaded dataframe 'comuni_EU'
Loaded dataframe 'nuts'


In [None]:

'''
Enrich by Spatial Join

'''

from pyspark.sql import functions as F

# Spatially join the population grids with the countries layer, carrying over
# the country id and name columns, then keep only the rows for Italy.
# NOTE(review): the parquet load output recorded in this notebook lists
# 'pop_grids_full', not 'pop_grids' -- confirm the dataset name before running.
countries_df = obj.dfs_list['countries']
grids_enriched_df = obj.enrich_sjoin(
    df1="pop_grids",
    df2=countries_df,
    enr_cols=["CNTR_ID", "CNTR_NAME"],
)
grids_IT_df = grids_enriched_df.filter(F.col('CNTR_ID').isin("IT"))

# Save the Italian grids as parquet, timing the export.
with obj.get_time("exporting"):
    obj.parquet_this("grids_IT", grids_IT_df, preserve_partitions=True)

# Same for the municipalities: keep the Italian rows of the EU-wide layer.
comuni_EU_df = obj.dfs_list['comuni_EU']
comuni_IT_df = comuni_EU_df.filter(F.col('CNTR_ID') == 'IT')

with obj.get_time("exporting"):
    obj.parquet_this("comuni_IT", comuni_IT_df, preserve_partitions=True)


In [None]:

from geoenricher import EnricherGUIOverlay

'''
# GUI for Enrich by Overlay

'''
# Build the overlay GUI: pass the `Enricher` object (already loaded with the datasets)
# to the EnricherGUIOverlay constructor. The resulting widget is kept in `obj_ui`.
obj_ui = EnricherGUIOverlay(obj)


VBox(children=(HTML(value='<h1>Enrich with Overlay & Aggregation</h1>'), HTML(value="<div style='height: 5px;'…

In [None]:

'''
Visualize the datasets

'''

# imported here as well, so that this cell does not depend on the spatial-join cell having been run
from pyspark.sql import functions as F

# pass a list of: names of the loaded datasets, or the Spark dataframes in memory directly.
# make sure the geometry column is named "geometry".
# if not, pass the individual dataframes together with their geometry column via the argument `geom_col`
obj.plot_this(
    df=[ # str | SparkDataFrame | list[str | SparkDataFrame]
        # the dataset key must name a loaded dataframe (an empty key '' matches none);
        # 'comuni_EU' is filtered on 'CNTR_ID' the same way in the spatial-join cell
        obj.dfs_list['comuni_EU'].filter(F.col('CNTR_ID') == 'IT'),
        # obj.dfs_list["dg_urban"].filter(F.col('CNTR_CODE') == 'IT'),
        # temp.filter(F.col('CNTR_ID') == 'IT'),
        ],
    new_map=True,  # if it is False, the dataset will be added to the old map (if it exists, or else, makes a new one)
    save_html=False,
    )

# you can plot the map in another cell, by running `<obj>.map`
