In [1]:
from sedona.spark import *
from contextlib import contextmanager
from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import *
from pyspark.sql.functions import col, first, expr
from pyspark.sql import functions as F
import time
import ipywidgets as widgets
from IPython.display import display, clear_output
import sys
import io
import pandas as pd

class Enricher:

    def __init__(self, crs=3035, which="wherobots"):
        self.crs = crs
        self.cores = None
        self.res = None
        self.sedona = None
        self.res_agr = None
        self.df1 = None
        self.df2 = None
        self.dfs_list = {}
        if which == "wherobots":
            self._setup()
            
    def _setup(self, which="wherobots", ex_mem=26, dr_mem=24, log_level=None):
        if which == "wherobots":
            config = SedonaContext.builder().getOrCreate()
            self.sedona = SedonaContext.create(config)
            
            self.cores = self.sedona.sparkContext.defaultParallelism
            print(f"Wherobots setup started with {self.cores} cores for parellelism.")
        elif which == "sedona":
            config = SedonaContext.builder() .\
                config("spark.executor.memory", f"{ex_mem}g") .\
                config("spark.driver.memory", f"{dr_mem}g") .\
                config('spark.jars.packages',
                    'org.apache.sedona:sedona-spark-shaded-3.5_2.12:1.7.0,'
                    'org.datasyslab:geotools-wrapper:1.7.0-28.5'). \
                getOrCreate()

            self.sedona = SedonaContext.create(config)
            
            if log_level in ["OFF", "ERROR", "WARN", "INFO", "DEBUG"]:
                self.sedona.sparkContext.setLogLevel(log_level)
                
            self.cores = self.sedona.sparkContext.defaultParallelism
            print(f"Sedona initialized with {self.cores} cores for parellelism.")
        else:
            raise ValueError("Invalid 'which'. Choose either 'wherobots' or 'sedona'")
        
    @contextmanager
    def get_time(self, task_name):
        start = time.time()
        yield
        elapsed = time.time() - start
    
        print(f"{task_name}... DONE in {(elapsed/60):.2f} min" \
              if elapsed >= 60 else f"{task_name}... DONE in {elapsed:.2f} sec")

    
    def load(self, datasets, schema=True):
        print(f"Make sure the geometry column is named \"geometry\" in the datasets")
    
        for name, (path, fformat) in datasets.items():
            self.dfs_list[name] = self.sedona.read.format(fformat).load(path)
        
        print(f"{len(self.dfs_list)} datasets loaded. \n")
        if schema:    
            for name, df in self.dfs_list.items():
                print(f"\n Dataset: \"{name}\", count: {df.count()}")
                df.printSchema()

    def _make_cache(self, *dfs):
        cached_dfs = []
        for df in dfs:
            if df.storageLevel != StorageLevel.NONE:
                df.unpersist()
            df.cache()
            print(f"Dataset cached. {df.count()} rows.")
            cached_dfs.append(df)
        return cached_dfs
    
    def clear_memory(self, *keep):
        if self.res.storageLevel != StorageLevel.NONE:
            self.res.unpersist()
        self.res = None

    def join_chey_new(self, selected_aggs, group_by=None, pred="ST_Intersects", rel_str="2********", make_geom=True, ratio=True, madre=False, cache=True, grid_area=None):
        
        join_expr = f"{pred}(df1.geometry, df2.geometry)"
        if pred == "ST_Relate":
            join_expr = f"{pred}(df1.geometry, df2.geometry, '{rel_str}')"
    
        self.res = self.df1.alias("df1").join(
            self.df2.alias("df2"), expr(join_expr)
        ).select(
            expr("df1.geometry").alias("df1_geom"),
            expr("df2.geometry").alias("df2_geom"),
            *[f"df1.{c}" for c in self.df1.columns if c != "geometry"],
            *[f"df2.{c}" for c in self.df2.columns if c != "geometry" and c not in self.df1.columns]
        )
    
        if make_geom:
            self.res = self.res.withColumn("intr_geom", expr("ST_Intersection(df1_geom, df2_geom)"))
            if ratio:
                if grid_area > 0:
                    self.res = self.res.withColumn("intr_ratio", expr(f"ST_Area(intr_geom) / {grid_area}"))
                else:
                    self.res = self.res.withColumn("intr_ratio", expr("ST_Area(intr_geom) / ST_Area(df2_geom)"))
    
        if selected_aggs:
            if not ratio:
                agg_exprs = []
                for col_name, agg_func in selected_aggs.items():
                    if agg_func == "sum":
                        agg_exprs.append(F.sum(col(col_name)).alias(f"{col_name}_agr"))
                    elif agg_func == "mean":
                        agg_exprs.append(F.mean(col(col_name)).alias(f"{col_name}_agr"))
                    elif agg_func == "min":
                        agg_exprs.append(F.min(col(col_name)).alias(f"{col_name}_agr"))
                    elif agg_func == "max":
                        agg_exprs.append(F.max(col(col_name)).alias(f"{col_name}_agr"))
                    elif agg_func == "count":
                        agg_exprs.append(F.count(col(col_name)).alias(f"{col_name}_agr"))
                    else:
                        raise ValueError(f"Unsupported aggregation function: {agg_func}")
        
                group_by_cols = [first(col(c)).alias(f"{c}_noAgr") for c in self.res.columns if c not in selected_aggs and c != group_by]
            
            else:
                if "intr_ratio" not in self.res.columns:
                    raise ValueError("ratio column not found. Run 'make_int_ratio()' first")
                
                # agg_exprs = [expr(f"ceil(sum({c} * intr_ratio))").alias(f"{c}_agr") for c in selected_aggs.keys()]
                
                agg_exprs = []
                for col_name, agg_func in selected_aggs.items():
                    if agg_func == "sum":
                        temp = F.sum(col(col_name) * col("intr_ratio")).alias(f"{col_name}_agr")
                        agg_exprs.append(temp)
                    elif agg_func == "mean":
                        agg_exprs.append(F.mean(col(col_name) * col("intr_ratio")).alias(f"{col_name}_agr"))
                    elif agg_func == "min":
                        agg_exprs.append(F.min(col(col_name)).alias(f"{col_name}_agr"))
                    elif agg_func == "max":
                        agg_exprs.append(F.max(col(col_name)).alias(f"{col_name}_agr"))
                    elif agg_func == "count":
                        agg_exprs.append(F.count(col(col_name)).alias(f"{col_name}_agr"))
                    else:
                        raise ValueError(f"Unsupported aggregation function: {agg_func}")
                
                group_by_cols = [first(col(c)).alias(f"{c}_noAgr") for c in self.res.columns if c not in selected_aggs and c != group_by]
    
            self.res_agr = self.res.groupBy(group_by).agg(*group_by_cols, *agg_exprs)
    
            if madre:
                self.res = self.res.join(self.res_agr, on=group_by, how="left")
    
        if cache:
            if selected_aggs:
                self._make_cache(self.res_agr)
            self._make_cache(self.res)
    
        return self.res

    
    def join_chey(self, *cols, group_by=None, pred="ST_Intersects", rel_str="2********", make_geom=True, ratio=True, aggr=True, madre=True, cache=True, grid_area=1e6):
        join_expr = f"{pred}(df1.geometry, df2.geometry)"
        if pred == "ST_Relate":
            join_expr = f"{pred}(df1.geometry, df2.geometry, '{rel_str}')"

        self.res = self.df1.alias("df1").join(
            self.df2.alias("df2"), expr(join_expr)
        ).select(
            expr("df1.geometry").alias("df1_geom"),
            expr("df2.geometry").alias("df2_geom"),
            *[f"df1.{c}" for c in self.df1.columns if c != "geometry"],
            *[f"df2.{c}" for c in self.df2.columns if c != "geometry" and c not in self.df1.columns]
        )

        if make_geom:
            self.res = self.res.withColumn("intr_geom", expr("ST_Intersection(df1_geom, df2_geom)"))
            if ratio:
                if grid_area > 0:
                    self.res = self.res.withColumn("intr_ratio", expr(f"ST_Area(intr_geom) / {grid_area}"))
                else:
                    self.res = self.res.withColumn("intr_ratio", expr("ST_Area(intr_geom) / ST_Area(df2_geom)"))
        
        if aggr:
            if not ratio:
                self.res_agr = self.res.groupBy(group_by).\
                    agg(*([first(col(c)).alias(c) for c in self.res.columns if c not in cols and c != group_by]\
                        + [_sum(col(c)).alias(f"agr_{c}") for c in cols]))
            else:
                if "intr_ratio" not in self.res.columns:
                    raise ValueError("ratio column not found. run 'make_int_ratio()' first")
                self.res_agr = self.res.groupBy(group_by).\
                    agg(*([first(col(c)).alias(c) for c in self.res.columns if c not in cols and c != group_by]\
                    + [ceil(_sum(col(c) * col("intr_ratio"))).alias(f"agr_{c}") for c in cols])
                )
            if madre:
                self.res = self.res.join(self.res_agr, on=group_by, how="left")
            
        if cache:
            if aggr:
                self._make_cache(self.res_agr)
            self._make_cache(self.res)

        return self.res

    def export(self, df="madre", path="outputs", name="unnamed", how="repartition", num=None, clear=False):
        if num is None:
            num = self.cores
        if how == "repartition":
            self.res = self.res.repartition(num)
        elif how == "coalesce":
            self.res = self.res.coalesce(num)
        else:
            raise ValueError("Invalid 'how'. Choose either 'repartition' or 'coalesce'")

        if df == "madre":
            self.res.write.mode("overwrite").format("geoparquet").save(f"./{path}/" + f"/{name}")
        else:
            self.res_agr.write.mode("overwrite").format("geoparquet").save(f"./{path}/" + f"/{name}_agr")
        
        if clear:
            self.clear_memory()














import ipywidgets as widgets
from ipywidgets import Button, Layout
from IPython.display import *
from io import StringIO
import sys

class EnricherUI:
    def __init__(self, enricher):
        self.enricher = enricher
        self.loaded_dataframes = {}  # Track dataframes loaded into memory
        self.selected_cols = []  # Track selected columns for aggregation
        self.group_by_col = None  # Track the selected group_by column
        self._init_ui()
        self.loaded_dataframes = self.list_dataframes_in_memory()
        self.df1_dropdown.options = list(self.loaded_dataframes.keys())
        self.df2_dropdown.options = list(self.loaded_dataframes.keys())
        self.df1_dropdown.disabled = False
        self.df2_dropdown.disabled = False        
        self.selected_aggs = {}
        self.agg_options = ["sum", "count", "mean", "min", "max"]

    def list_dataframes_in_memory(self):
        # return {name: obj for name, obj in globals().items() if isinstance(obj, pd.DataFrame)}
        return {name: df for name, df in self.enricher.dfs_list.items()}
    
    def _init_ui(self):
        # Heading: Enrich with Overlay
        self.heading = widgets.HTML(value="<h1>Enrich with Overlay</h1>")

        # First line: Enrich <df1> with <df2>
        self.df1_dropdown = widgets.Dropdown(options=[], description="df1:", disabled=True, style={'description_width': 'initial'}, layout=widgets.Layout(margin="5px 20px", width="150px"))
        self.df2_dropdown = widgets.Dropdown(options=[], description="df2:", disabled=True, style={'description_width': 'initial'}, layout=widgets.Layout(margin="5px 20px", width="150px"))
        self.load_button = widgets.Button(description="Load", disabled=True, layout=widgets.Layout(margin="5px 0px", width="100px"))
        self.load_status = widgets.HTML(value="<small>Status: No dataframes loaded.</small>")

        # Second line: with attributes: <cols>
        self.cols_dropdown = widgets.SelectMultiple(options=[], description="aggr cols:", disabled=True, style={'description_width': 'initial'})
        self.agg_table_output = widgets.Output()        
        
        # Third line: unique id: <col>
        self.group_by_dropdown = widgets.Dropdown(options=[], description="unique id:", disabled=True, style={'description_width': 'initial'}, layout=widgets.Layout(margin="5px 20px", width="250px"))
        
        self.agg_status = widgets.HTML(value="<small><i>Aggregating with: select cols, grouping by: select col</i></small>")

        # Advanced options
        self.advanced_checkbox = widgets.Checkbox(value=False, description="Advanced options", style={'description_width': 'initial'})
        self.preserve_geoms_checkbox = widgets.Checkbox(value=False, description="Preserve geoms from both layers", disabled=True, layout=Layout(visibility='hidden'))
        self.intersection_ratio_checkbox = widgets.Checkbox(value=True, description="Consider intersection ratio of geoms when aggregating", disabled=True, layout=Layout(visibility='hidden', width='auto'))
        self.grid_area_text = widgets.FloatText(value=1e6, description="B grid area:", disabled=True, layout=Layout(visibility='hidden'))
        self.custom_predicate_checkbox = widgets.Checkbox(value=False, description="Custom ST_Relate predicate string:", disabled=True, layout=Layout(visibility='hidden', width='auto'))
        self.custom_predicate_text = widgets.Text(value="2********", disabled=True, layout=Layout(visibility='hidden'))
        
        self.go_button = widgets.Button(description="Go", disabled=True, layout=widgets.Layout(margin="5px 0px", width="100px"))

        # Console output text box
        self.console_output = widgets.Textarea(value="", description="Console:", layout=widgets.Layout(width="100%", height="200px"))
        self.clear_console_button = widgets.Button(description="Clear Console")

        # Layout
        self._setup_layout()
        self._setup_event_handlers()

    def _setup_layout(self):
        # First line: Enrich <df1> with <df2>
        df_selection_line = widgets.HBox([
            widgets.HTML(value="<h2 style='display: inline; margin-right: 10px;'>Enrich </h2>"),
            self.df1_dropdown,
            widgets.HTML(value="<h2 style='display: inline; margin-right: 10px;'> with </h2>"),
            self.df2_dropdown,
            self.load_button
        ])

        # Second line: with attributes: <cols>
        cols_selection_line = widgets.HBox([
            widgets.HTML(value="<h2 style='display: inline; margin-right: 10px;'> with attributes: </h2>"),
            self.cols_dropdown,
            self.agg_table_output,
        ])

        # Third line: unique id: <col>
        grp_by_selection_line = widgets.HBox([
            widgets.HTML(value=f"<h2 style='display: inline; margin-right: 10px;'><span id='unique_id_text'>unique identifier:</span> </h2>"),
            self.group_by_dropdown,
            self.go_button
        ])

        # Advanced options
        self.advanced_options = widgets.VBox([
            self.preserve_geoms_checkbox,
            widgets.HBox([self.intersection_ratio_checkbox, self.grid_area_text]),
            widgets.HBox([self.custom_predicate_checkbox, self.custom_predicate_text]),
        ], disabled=True)

        # Main layout
        self.main_layout = widgets.VBox([
            self.heading,
            widgets.HTML(value="<div style='height: 5px;'></div>"),
            df_selection_line,
            self.load_status,
            cols_selection_line,
            widgets.HTML(value="<div style='height: 4px;'></div>"),
            grp_by_selection_line,
            self.agg_status,
            widgets.HTML(value="<div style='height: 3px;'></div>"),
            self.advanced_checkbox,
            self.advanced_options,
            # self.console_output,
            # self.clear_console_button
        ])

        # Display everything
        display(self.main_layout)

    def _setup_event_handlers(self):
        # Enable/disable load button based on dataframe selection
        def on_df_selection_change(change):
            if self.df1_dropdown.value and self.df2_dropdown.value:
                self.load_button.disabled = False
            else:
                self.load_button.disabled = True
        self.df1_dropdown.observe(on_df_selection_change, names='value')
        self.df2_dropdown.observe(on_df_selection_change, names='value')

        # Handle load button click
        def on_load_button_click(b):
            try:
                df1_name = self.df1_dropdown.value
                df2_name = self.df2_dropdown.value

                if df1_name not in self.loaded_dataframes or df2_name not in self.loaded_dataframes:
                    raise ValueError("Selected dataframes are not loaded in memory.")

                # Set the selected dataframes in the Enricher
                self.enricher.df1 = self.loaded_dataframes[df1_name]
                self.enricher.df2 = self.loaded_dataframes[df2_name]

                # Update column dropdowns
                self.cols_dropdown.options = self.enricher.df2.columns
                self.group_by_dropdown.options = self.enricher.df1.columns
                self.cols_dropdown.disabled = False
                self.group_by_dropdown.disabled = False

                self.load_status.value = f"<small>Status: Loaded {df1_name} and {df2_name}.</small>"
                self.main_layout.children[6].children[0].value = f"<h2 style='display: inline; margin-right: 10px;'>{df1_name}'s unique identifier: </h2>"
            except Exception as e:
                self.load_status.value = f"<small>Error: {str(e)}</small>"

        self.load_button.on_click(on_load_button_click)

        # Handle column selection
        def on_cols_change(change):
            for col in change["new"]:
                if col not in self.selected_cols:
                    self.selected_cols.append(col)
            
            self.cols_dropdown.options = [col for col in self.enricher.df2.columns if col not in self.selected_cols]
            
            # Preserve previously selected operations, default to "sum" for new columns
            for col in self.selected_cols:
                if col not in self.selected_aggs:
                    self.selected_aggs[col] = "sum"            
            
            def generate_agg_table():
                headers = ["Column", "Operation", ""]
                
                cell_style = widgets.Layout(
                    border="1px solid black", 
                    padding="0px 2px",
                    align_items="center", 
                    justify_content="center", 
                    width="125px"
                )
                
                clr_style = widgets.Layout(padding="0px 2px",align_items="center", justify_content="center", width="80px")
                
                header_row = [
                    widgets.HTML(f"<b>{headers[0]}</b>", layout=cell_style),
                    widgets.HTML(f"<b>{headers[1]}</b>", layout=cell_style),
                    widgets.HTML("", layout=clr_style)
                ]
            
                rows = []
                for col in self.selected_aggs:
                    dropdown = widgets.Dropdown(
                        options=self.agg_options, 
                        value=self.selected_aggs[col], 
                        layout=cell_style
                    )
                    dropdown.observe(lambda change, col=col: self.selected_aggs.update({col: change["new"]}), names="value")
            
                    clear_button = widgets.Button(description="Clear", layout=clr_style)

                    def on_clear(btn, col=col):
                        self.selected_cols.remove(col)
                        self.agg_status.value = f"Status: Aggregating with: {', '.join([f'<b>{col}</b>' for col in self.selected_cols]) if self.selected_cols else '<i>select cols</i>'}, grouping by: {f'<b>{self.group_by_col}</b>' if self.group_by_col else '<i>select cols</i>'}"
                        del self.selected_aggs[col]
                        self.cols_dropdown.options = [col for col in self.enricher.df2.columns if col not in self.selected_cols]
                        
                        with self.agg_table_output:
                            self.agg_table_output.clear_output()
                            display(generate_agg_table())
                        
                    clear_button.on_click(on_clear)
                        
                    rows.extend([
                        widgets.HTML(col, layout=cell_style),
                        dropdown,
                        clear_button
                    ])
                
                scrollable_container = widgets.VBox([
                    widgets.GridBox(
                        children=header_row + rows,
                        layout=widgets.Layout(
                            grid_template_columns="150px 150px 90px",
                            grid_template_rows="auto",
                            border="1px solid black",
                            padding="1px",
                            width="max-content",
                        )
                    )
                ], layout=widgets.Layout(
                    max_height="150px",
                    overflow_y="auto",
                    border="1px solid black"
                ))
            
                return scrollable_container


            with self.agg_table_output:
                self.agg_table_output.clear_output()
                display(generate_agg_table())

            self.agg_status.value = f"Status: Aggregating with: {', '.join([f'<b>{col}</b>' for col in self.selected_cols]) if self.selected_cols else '<i>select cols</i>'}, grouping by: {f'<b>{self.group_by_col}</b>' if self.group_by_col else '<i>select cols</i>'}"
            if self.group_by_col and self.selected_cols:
                self.go_button.disabled = False            
        self.cols_dropdown.observe(on_cols_change, names='value')

        
        def on_group_by_change(change):
            self.group_by_col = change["new"]
            self.agg_status.value = f"Status: Aggregating with: {', '.join([f'<b>{col}</b>' for col in self.selected_cols]) if self.selected_cols else '<i>select col</i>'}, grouping by: {f'<b>{self.group_by_col}</b>' if self.group_by_col else '<i>select col</i>'}"
            if self.group_by_col and self.selected_cols:
                self.go_button.disabled = False                
                

        self.group_by_dropdown.observe(on_group_by_change, names='value')


        def on_advanced_checkbox_change(change):
            if change["new"]:
                self.advanced_options.disabled = False
                self.preserve_geoms_checkbox.disabled = False
                self.intersection_ratio_checkbox.disabled = False
                self.custom_predicate_checkbox.disabled = False
                self.preserve_geoms_checkbox.layout.visibility = 'visible'
                self.intersection_ratio_checkbox.layout.visibility = 'visible'
                self.custom_predicate_checkbox.layout.visibility = 'visible'
                self.grid_area_text.layout.visibility = 'visible'
                self.custom_predicate_text.layout.visibility = 'visible'

            else:
                self.advanced_options.disabled = True
                self.preserve_geoms_checkbox.disabled = True
                self.intersection_ratio_checkbox.disabled = True
                self.custom_predicate_checkbox.disabled = True
                self.preserve_geoms_checkbox.layout.visibility = 'hidden'
                self.intersection_ratio_checkbox.layout.visibility = 'hidden'
                self.custom_predicate_checkbox.layout.visibility = 'hidden'
                self.grid_area_text.layout.visibility = 'hidden'
                self.custom_predicate_text.layout.visibility = 'hidden'

        self.advanced_checkbox.observe(on_advanced_checkbox_change, names='value')

        def on_intersection_ratio_change(change):
            self.grid_area_text.disabled = not change["new"]
        self.intersection_ratio_checkbox.observe(on_intersection_ratio_change, names='value')

        def on_custom_predicate_change(change):
            self.custom_predicate_text.disabled = not change["new"]
        self.custom_predicate_checkbox.observe(on_custom_predicate_change, names='value')

        def on_go_button_click(b):
            try:
                print("Performing operation. This may take a while. Check logs for Spark logs and completion status.")
                self.enricher.join_chey_new(
                    selected_aggs=self.selected_aggs,  # Ensure this is passed as a keyword argument
                    group_by=self.group_by_col,
                    pred="ST_Relate" if self.custom_predicate_checkbox.value else "ST_Intersects",
                    rel_str=self.custom_predicate_text.value if self.custom_predicate_checkbox.value else "2********",
                    make_geom=True,
                    ratio=self.intersection_ratio_checkbox.value,
                    madre=self.preserve_geoms_checkbox.value,
                    cache=True,
                    grid_area=float(self.grid_area_text.value) if self.intersection_ratio_checkbox.value else 1e6
                )                

                print("Join operation completed.")
            except Exception as e:
                print(f"Error: {str(e)}")
        
        def on_go_button_click_old(b):
            old_stdout = sys.stdout
            sys.stdout = captured_output = StringIO()
            try:
                print("Performing operation. This may take a while. Check logs for Spark logs and completion status.")
                self.enricher.join_chey_new(
                    selected_aggs=self.selected_aggs,
                    group_by=self.group_by_col,
                    pred="ST_Relate" if self.custom_predicate_checkbox.value else "ST_Intersects",
                    rel_str=self.custom_predicate_text.value if self.custom_predicate_checkbox.value else "2********",
                    make_geom=True,
                    ratio=self.intersection_ratio_checkbox.value,
                    madre=self.preserve_geoms_checkbox.value,
                    cache=True,
                    grid_area=float(self.grid_area_text.value) if self.intersection_ratio_checkbox.value else 1e6
                )                

                print("Join operation completed.")
            except Exception as e:
                print(f"Error: {str(e)}")
            finally:
                sys.stdout = old_stdout
                self.console_output.value += captured_output.getvalue()
        self.go_button.on_click(on_go_button_click)

        def on_clear_console_button_click(b):
            self.console_output.value = ""
        self.clear_console_button.on_click(on_clear_console_button_click)

    def add_dataframe(self, name, dataframe):
        self.loaded_dataframes[name] = dataframe
        self.df1_dropdown.options = list(self.loaded_dataframes.keys())
        self.df2_dropdown.options = list(self.loaded_dataframes.keys())
        self.df1_dropdown.disabled = False
        self.df2_dropdown.disabled = False

In [2]:
# Setup

obj = Enricher()
obj.setup(which="sedona", ex_mem=26, dr_mem=24)
path1="./data_EU/countries_shp/"
path2="./data_EU/census_grid_EU/grids.parquet"

obj.load_datsets(which="sedona", path1=path1, path2=path2)


25/01/29 14:07:19 WARN Utils: Your hostname, marvin resolves to a loopback address: 127.0.1.1; using 172.20.27.4 instead (on interface eth0)
25/01/29 14:07:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/data/homes_data/sudheer/benchmark_data/sedona_venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /data/homes_data/sudheer/.ivy2/cache
The jars for the packages stored in: /data/homes_data/sudheer/.ivy2/jars
org.apache.sedona#sedona-spark-shaded-3.5_2.12 added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4a142982-c8ba-44fc-b4fa-1313450d77e5;1.0
	confs: [default]
	found org.apache.sedona#sedona-spark-shaded-3.5_2.12;1.7.0 in central
	found org.datasyslab#geotools-wrapper;1.7.0-28.5 in central
:: resolution report :: resolve 242ms :: artifacts dl 10ms
	:: modules in use:
	org.apache.sedona#sedona-spark-shaded-3.5_2.12;1.7.0 from central in [default]
	org.datasyslab#geotools-wrapper;1.7.0-28.5 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-----------------------------------------

Sedona initialized with 10 cores for parellelism.
Make sure the geometry column is named "geometry" in the datasets


[Stage 4:>                                                          (0 + 1) / 1]

Loaded. 
 A.cols: ['geometry', 'COMM_ID', 'CNTR_ID', 'CNTR_CODE', 'COMM_NAME', 'NAME_ASCI', 'TRUE_FLAG', 'NSI_CODE', 'NAME_NSI', 'NAME_LATN', 'NUTS_CODE', 'FID'] 
 
 B.cols: ['GRD_ID', 'T', 'M', 'F', 'Y_LT15', 'Y_1564', 'Y_GE65', 'EMP', 'NAT', 'EU_OTH', 'OTH', 'SAME', 'CHG_IN', 'CHG_OUT', 'LAND_SURFACE', 'POPULATED', 'CONFIDENTIALSTATUS', 'geometry']



                                                                                

In [3]:
# GUI

obj_ui = EnricherUI(obj)


VBox(children=(HTML(value='<h1>Enrich with Overlay</h1>'), HBox(children=(HTML(value="<h2 style='display: inli…

In [7]:
obj_ui.custom_predicate_text.value

'2********'