In [17]:
# IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
import geopandas as gpd
import numpy as np
from shapely.geometry import box
from typing import Tuple, Dict, Any, List
import panel as pn
from matplotlib.figure import Figure
from tkinter import Tk
from tkinter import simpledialog
from tkinter.filedialog import askopenfilenames
from tkinter import Tk, Toplevel, Button, Checkbutton, IntVar, Label, Frame, filedialog
import cupy as cp
import easygui
import ipywidgets as widgets
from matplotlib.widgets import Slider
from IPython.display import display

from joblib import Parallel, delayed\




In [None]:
#SELECT FILES

# Function to interactively select files
def select_geojson_files():
    """Ask the user to pick GeoJSON files through a file-open dialog.

    A Tk root window is created only to host the dialog: it is withdrawn
    (hidden), flagged "-topmost" so the dialog opens in front of other
    windows, and destroyed once the dialog has returned.

    Returns:
        list: The dialog's selection converted to a list of paths.
    """
    hidden_root = Tk()
    hidden_root.withdraw()
    hidden_root.attributes("-topmost", True)

    accepted_types = [("GeoJSON files", "*.geojson"), ("All files", "*.*")]
    picked = filedialog.askopenfilenames(
        title="Select GeoJSON Files",
        filetypes=accepted_types,
    )

    hidden_root.destroy()
    return [path for path in picked]

# Run the picker. `geojson_files` is a notebook-level variable that the
# column-selection cell and the batch-processing call further down both read.
geojson_files = select_geojson_files()

# Echo the selection so the user can verify it before continuing
print(f"Selected GeoJSON files: {geojson_files}")


In [None]:
# Accumulates the user's choices as (geojson_file, column_name) tuples.
# Re-running this cell resets the list.
features_to_process = []

def select_columns(geojson_file):
    """Show a column picker and a submit button for one GeoJSON file.

    The file is read with GeoPandas to discover its columns. The geometry
    column is left out of the options because it is not an attribute that can
    be rasterised as a category. Clicking "Submit Selection" records the
    chosen columns in the notebook-level `features_to_process` list as
    (geojson_file, column_name) tuples.

    Submitting again for the same file REPLACES that file's earlier entries
    instead of appending duplicates, so the button can be clicked repeatedly
    while the user changes their mind.

    Args:
        geojson_file: Path of the GeoJSON file whose columns are offered.
    """
    gdf = gpd.read_file(geojson_file)

    # getattr guards against objects without an active geometry column
    geometry_name = getattr(getattr(gdf, "geometry", None), "name", None)
    columns = [col for col in gdf.columns.tolist() if col != geometry_name]

    # Multiple-selection widget listing the attribute columns
    selection = widgets.SelectMultiple(
        options=columns,
        description=f'Select columns for {os.path.basename(geojson_file)}:',
        rows=10
    )
    display(selection)

    def on_button_click(b):
        """Record the current widget selection for this file."""
        selected_columns = [(geojson_file, col) for col in selection.value]
        # Mutate the shared list in place (other cells hold a reference to it):
        # drop this file's previous submission, then add the new one.
        features_to_process[:] = [
            pair for pair in features_to_process if pair[0] != geojson_file
        ] + selected_columns
        print(f'Selected columns from {geojson_file}: {selected_columns}')

    button = widgets.Button(description="Submit Selection")
    button.on_click(on_button_click)
    display(button)

# Show one picker + submit button per selected file. The submit callbacks only
# run when the user clicks, so `features_to_process` is still empty when this
# cell finishes executing; make the selections before running the cells below.
for file in geojson_files:
    select_columns(file)

In [None]:
# Inspect the (file, column) pairs submitted so far
print(features_to_process)

In [36]:
# PARALLEL PROCESSING FUNCTION - CPU - HYBRID STRATEGY - ALL LAYERS

def process_cell(idx, cell, gdf, sindex, feature_column, category_to_int, filename_prefix, n_cols=None):
    """
    Pick the category code for one grid cell from the features overlapping it.

    Strategy ("hybrid"):
      * more than 5 overlapping features -> sum the overlap area per category
        and take the category with the largest total;
      * otherwise -> take the category of the single largest overlap.

    Args:
        idx: Flat row-major index of the cell; decoded as divmod(idx, n_cols).
        cell: Cell geometry. Its `.bounds` are used to query `sindex`, and the
            geometry itself is intersected with the candidate features.
        gdf: GeoDataFrame holding the features and `feature_column`.
        sindex: Spatial index of `gdf`; the values it returns are used as
            positions with `gdf.iloc`.
        feature_column: Name of the column whose value labels each feature.
        category_to_int: Mapping "<filename_prefix>_<category>" -> integer code.
        filename_prefix: Prefix used to build the keys of `category_to_int`.
        n_cols: Number of grid columns. When omitted, the notebook-level
            `grid_size[1]` is used, which is what the original code always did.

    Returns:
        (row, column, value): value is the integer code of the winning
        category, or NaN when no feature overlaps the cell with positive area.
    """
    if n_cols is None:
        n_cols = grid_size[1]  # backward-compatible fallback to the notebook global
    i, j = divmod(idx, n_cols)

    # Cheap bounding-box pre-filter through the spatial index
    candidate_positions = list(sindex.intersection(cell.bounds))
    if not candidate_positions:
        return i, j, np.nan  # No intersecting features, NaN marks "no data"

    candidates = gdf.iloc[candidate_positions]
    if candidates.empty:
        return i, j, np.nan

    # Work on plain positional arrays so every area stays paired with the
    # category of the SAME feature. The previous code filtered the overlaps and
    # then looked categories up by position in the unfiltered frame, which
    # credited areas to the wrong categories whenever a candidate was dropped.
    overlap_areas = np.asarray(candidates.geometry.intersection(cell).area, dtype=float)
    categories = candidates[feature_column].to_numpy()

    has_overlap = overlap_areas > 0  # NaN areas compare False and are dropped too
    if not has_overlap.any():
        return i, j, np.nan
    overlap_areas = overlap_areas[has_overlap]
    categories = categories[has_overlap]

    if len(overlap_areas) > 5:  # Threshold for choosing the strategy
        # Many small polygons: accumulate overlap area per category
        areas_per_category = {}
        for category, area in zip(categories, overlap_areas):
            category_key = f"{filename_prefix}_{category}"
            areas_per_category[category_key] = areas_per_category.get(category_key, 0) + area
        max_category = max(areas_per_category, key=areas_per_category.get)
        return i, j, category_to_int[max_category]

    # Few large polygons: the single largest overlap wins (first one on ties)
    category = categories[np.argmax(overlap_areas)]
    return i, j, category_to_int[f"{filename_prefix}_{category}"]

def process_feature_column(geojson_file, feature_column, grid_size, target_crs, filename_prefix, x, y):
    """
    Rasterise one attribute column of a GeoJSON file onto a regular grid.

    Args:
        geojson_file: Path of the GeoJSON file (re-read here).
        feature_column: Column whose values become the raster categories.
        grid_size: (rows, columns) of the output grid.
        target_crs: CRS the features are reprojected to before gridding.
        filename_prefix: Prefix for the layer name and the category keys.
        x: Cell edge coordinates along x; needs grid_size[1] + 1 entries.
        y: Cell edge coordinates along y; needs grid_size[0] + 1 entries.
            Row i of the grid spans y[i]..y[i + 1].

    Returns:
        ("<filename_prefix>_<feature_column>", grid, category_to_int) where
        grid is a float array of integer codes with NaN for "no data".
    """
    gdf = gpd.read_file(geojson_file)
    gdf = gdf.to_crs(target_crs)

    # Map every distinct value of the column to an integer code
    unique_categories = gdf[feature_column].unique()
    category_to_int = {f"{filename_prefix}_{cat}": i for i, cat in enumerate(unique_categories)}

    # NaN-filled grid: cells that no feature overlaps stay NaN
    grid = np.full(grid_size, np.nan)
    sindex = gdf.sindex

    n_rows, n_cols = grid_size[0], grid_size[1]

    # Row-major list of cell rectangles; the flat position is the `idx`
    # that process_cell decodes back into (row, column)
    cells = [box(x[j], y[i], x[j + 1], y[i + 1])
             for i in range(n_rows)
             for j in range(n_cols)]

    # Pass this call's column count explicitly so the result does not depend
    # on whatever `grid_size` happens to be defined at notebook level
    results = Parallel(n_jobs=-1)(delayed(process_cell)(
        idx, cell, gdf, sindex, feature_column, category_to_int, filename_prefix, n_cols
    ) for idx, cell in enumerate(cells))

    for i, j, value in results:
        grid[i, j] = value

    return (f"{filename_prefix}_{feature_column}", grid, category_to_int)

def geojson_to_numpy_grid_3d_batch(
    grid_size: Tuple[int, int],  # Grid size for the output array
    target_crs: str = "EPSG:3857"  # Web Mercator projection
) -> Tuple[np.ndarray, Dict[str, np.ndarray], Dict[str, Dict[Any, int]], List[Dict[str, Any]]]:
    """
    Let the user pick GeoJSON files and rasterise every attribute column of each.

    Each file is reprojected to `target_crs` and gridded over ITS OWN total
    bounds, so layers that come from different files generally cover different
    extents even though they share the same array shape.

    Args:
        grid_size: (rows, columns) of every output layer.
        target_crs: CRS the files are reprojected to before gridding.

    Returns:
        grid_3d: All layers stacked along axis 0.
        all_feature_grids: "<file>_<column>" -> 2-D grid.
        all_feature_mappings: "<file>_<column>" -> {category key: integer code}.
        geospatial_info_list: One dict per file with 'transform'
            (minx, miny, maxx, maxy), 'crs' and 'file_name'.

    Raises:
        ValueError: If no file is selected or no layer is produced.
    """
    # Hidden, always-on-top root window that only hosts the file dialog
    root = Tk()
    root.withdraw()
    root.attributes("-topmost", True)
    try:
        geojson_files = askopenfilenames(
            title="Select GeoJSON Files",
            filetypes=[("GeoJSON files", "*.geojson"), ("All files", "*.*")]
        )
    finally:
        # Previously the root was never destroyed, leaving a Tk window per call
        root.destroy()

    if not geojson_files:
        raise ValueError("No GeoJSON files were selected.")

    all_feature_grids = {}
    all_feature_mappings = {}
    geospatial_info_list = []

    results = []

    for geojson_file in geojson_files:
        # Read the file once here just to get bounds and column names
        gdf = gpd.read_file(geojson_file)
        gdf = gdf.to_crs(target_crs)
        minx, miny, maxx, maxy = gdf.total_bounds

        # Cell edges: one more edge than cells along each axis
        x = np.linspace(minx, maxx, grid_size[1] + 1)
        y = np.linspace(miny, maxy, grid_size[0] + 1)

        # Every column except the geometry column becomes a layer
        feature_columns = [col for col in gdf.columns if col != gdf.geometry.name]

        # File name without extension, used as prefix for layer names
        filename_prefix = os.path.splitext(os.path.basename(geojson_file))[0]

        geospatial_info_list.append({
            'transform': (minx, miny, maxx, maxy),
            'crs': target_crs,
            'file_name': filename_prefix
        })

        # One joblib task per feature column
        results.extend(Parallel(n_jobs=-1)(delayed(process_feature_column)(
            geojson_file, feature_column, grid_size, target_crs, filename_prefix, x, y
        ) for feature_column in feature_columns))

    for feature_name, grid, category_to_int in results:
        all_feature_grids[feature_name] = grid
        all_feature_mappings[feature_name] = category_to_int

    if not all_feature_grids:
        raise ValueError("The selected files contain no feature columns to rasterise.")

    # Layer order follows the insertion order of all_feature_grids
    grid_3d = np.stack(list(all_feature_grids.values()), axis=0)

    return grid_3d, all_feature_grids, all_feature_mappings, geospatial_info_list


In [22]:
# PARALLEL PROCESSING FUNCTION - CPU - HYBRID STRATEGY - MANUAL INPUTS

# Function to process each cell in the grid
def process_cell(idx, cell, gdf, sindex, feature_column, category_to_int, filename_prefix, n_cols=None):
    """
    Pick the category code for one grid cell from the features overlapping it.

    NOTE: this notebook defines `process_cell` in two cells; whichever cell was
    executed last is the definition in effect.

    Strategy: with more than 5 overlapping features the overlap area is summed
    per category and the largest total wins; otherwise the category of the
    single largest overlap wins.

    Args:
        idx: Flat row-major index of the cell; decoded as divmod(idx, n_cols).
        cell: Cell geometry. Its `.bounds` are used to query `sindex`, and the
            geometry itself is intersected with the candidate features.
        gdf: GeoDataFrame holding the features and `feature_column`.
        sindex: Spatial index of `gdf`; the values it returns are used as
            positions with `gdf.iloc`.
        feature_column: Name of the column whose value labels each feature.
        category_to_int: Mapping "<filename_prefix>_<category>" -> integer code.
        filename_prefix: Prefix used to build the keys of `category_to_int`.
        n_cols: Number of grid columns. When omitted, the notebook-level
            `grid_size[1]` is used, which is what the original code always did.

    Returns:
        (row, column, value): value is the integer code of the winning
        category, or NaN when no feature overlaps the cell with positive area.
    """
    if n_cols is None:
        n_cols = grid_size[1]  # backward-compatible fallback to the notebook global
    i, j = divmod(idx, n_cols)

    # Cheap bounding-box pre-filter through the spatial index
    candidate_positions = list(sindex.intersection(cell.bounds))
    if not candidate_positions:
        return i, j, np.nan

    candidates = gdf.iloc[candidate_positions]
    if candidates.empty:
        return i, j, np.nan

    # Positional arrays keep each area paired with the category of the SAME
    # feature. The previous code filtered the overlaps and then looked the
    # categories up by position in the unfiltered frame, which credited areas
    # to the wrong categories whenever a candidate had been filtered out.
    overlap_areas = np.asarray(candidates.geometry.intersection(cell).area, dtype=float)
    categories = candidates[feature_column].to_numpy()

    has_overlap = overlap_areas > 0  # NaN areas compare False and are dropped too
    if not has_overlap.any():
        return i, j, np.nan
    overlap_areas = overlap_areas[has_overlap]
    categories = categories[has_overlap]

    if len(overlap_areas) > 5:
        # Many small polygons: accumulate overlap area per category
        areas_per_category = {}
        for category, area in zip(categories, overlap_areas):
            category_key = f"{filename_prefix}_{category}"
            areas_per_category[category_key] = areas_per_category.get(category_key, 0) + area
        max_category = max(areas_per_category, key=areas_per_category.get)
        return i, j, category_to_int[max_category]

    # Few large polygons: the single largest overlap wins (first one on ties)
    category = categories[np.argmax(overlap_areas)]
    return i, j, category_to_int[f"{filename_prefix}_{category}"]

# Function to process each feature column
def process_feature_column(geojson_file, feature_column, grid_size, target_crs, filename_prefix, x, y):
    """
    Rasterise one attribute column of a GeoJSON file onto a regular grid.

    Progress and the discovered categories are printed.

    Args:
        geojson_file: Path of the GeoJSON file (re-read here).
        feature_column: Column whose values become the raster categories.
        grid_size: (rows, columns) of the output grid.
        target_crs: CRS the features are reprojected to before gridding.
        filename_prefix: Prefix for the layer name and the category keys.
        x: Cell edge coordinates along x; needs grid_size[1] + 1 entries.
        y: Cell edge coordinates along y; needs grid_size[0] + 1 entries.
            Row i of the grid spans y[i]..y[i + 1].

    Returns:
        ("<filename_prefix>_<feature_column>", grid, category_to_int), or None
        when the file has no features or the grid has no cells. Callers must
        be prepared for the None case.
    """
    gdf = gpd.read_file(geojson_file)
    print(f"Processing feature column: {feature_column} from file: {geojson_file}")

    gdf = gdf.to_crs(target_crs)
    if gdf.empty:
        print(f"GeoDataFrame for {geojson_file} is empty after reprojecting. Skipping column: {feature_column}")
        return None

    # Map every distinct value of the column to an integer code
    unique_categories = gdf[feature_column].unique()
    print(f"Unique categories in {feature_column}: {unique_categories}")
    category_to_int = {f"{filename_prefix}_{cat}": i for i, cat in enumerate(unique_categories)}

    # NaN-filled grid: cells that no feature overlaps stay NaN
    grid = np.full(grid_size, np.nan)
    sindex = gdf.sindex

    n_rows, n_cols = grid_size[0], grid_size[1]

    # Row-major list of cell rectangles; the flat position is the `idx`
    # that process_cell decodes back into (row, column)
    cells = [box(x[j], y[i], x[j + 1], y[i + 1])
             for i in range(n_rows)
             for j in range(n_cols)]

    # Pass this call's column count explicitly so the result does not depend
    # on whatever `grid_size` happens to be defined at notebook level
    results = Parallel(n_jobs=-1)(delayed(process_cell)(
        idx, cell, gdf, sindex, feature_column, category_to_int, filename_prefix, n_cols
    ) for idx, cell in enumerate(cells))

    if not results:
        print(f"No results were generated for feature column: {feature_column} from file: {geojson_file}")
        return None

    for i, j, value in results:
        grid[i, j] = value

    return (f"{filename_prefix}_{feature_column}", grid, category_to_int)

# Batch processing function
def geojson_to_numpy_grid_3d_batch(
    grid_size: Tuple[int, int],  # Grid size for the output array
    geojson_files: List[str],  # List of GeoJSON files
    features_to_process: List[Tuple[str, str]],  # List of (file, feature) tuples to process
    target_crs: str = "EPSG:3857"  # Web Mercator projection
) -> Tuple[np.ndarray, Dict[str, np.ndarray], Dict[str, Dict[Any, int]], List[Dict[str, Any]]]:
    """
    Rasterise the selected attribute columns of the given GeoJSON files.

    NOTE: this notebook defines `geojson_to_numpy_grid_3d_batch` in two cells
    with different signatures; whichever cell ran last is the one in effect.

    Each file is reprojected to `target_crs` and gridded over ITS OWN total
    bounds, so layers that come from different files generally cover different
    extents even though they share the same array shape.

    Args:
        grid_size: (rows, columns) of every output layer.
        geojson_files: Paths of the files to process.
        features_to_process: (file, column) pairs; a pair applies to a file
            only when its path string equals the entry in `geojson_files`.
        target_crs: CRS the files are reprojected to before gridding.

    Returns:
        grid_3d: All layers stacked along axis 0.
        all_feature_grids: "<file>_<column>" -> 2-D grid.
        all_feature_mappings: "<file>_<column>" -> {category key: integer code}.
        geospatial_info_list: One dict per file with 'transform'
            (minx, miny, maxx, maxy), 'crs' and 'file_name'.

    Raises:
        ValueError: If no layer was produced (nothing selected, or every
            column was skipped).
    """
    all_feature_grids = {}
    all_feature_mappings = {}
    geospatial_info_list = []

    results = []

    for geojson_file in geojson_files:
        # File name without extension, used as prefix for layer names
        filename_prefix = os.path.splitext(os.path.basename(geojson_file))[0]

        # Read the file here just to get its bounds in the target CRS
        gdf = gpd.read_file(geojson_file)
        gdf = gdf.to_crs(target_crs)
        minx, miny, maxx, maxy = gdf.total_bounds

        # Cell edges: one more edge than cells along each axis
        x = np.linspace(minx, maxx, grid_size[1] + 1)
        y = np.linspace(miny, maxy, grid_size[0] + 1)

        # Columns the user picked for this particular file
        file_features = [feature for file, feature in features_to_process if file == geojson_file]

        geospatial_info_list.append({
            'transform': (minx, miny, maxx, maxy),
            'crs': target_crs,
            'file_name': filename_prefix
        })

        # One joblib task per selected column
        results.extend(Parallel(n_jobs=-1)(delayed(process_feature_column)(
            geojson_file, feature_column, grid_size, target_crs, filename_prefix, x, y
        ) for feature_column in file_features))

    for result in results:
        # process_feature_column returns None for columns it skipped;
        # unpacking that used to raise a TypeError
        if result is None:
            continue
        feature_name, grid, category_to_int = result
        all_feature_grids[feature_name] = grid
        all_feature_mappings[feature_name] = category_to_int

    if not all_feature_grids:
        raise ValueError(
            "No feature layers were produced: check that features_to_process "
            "is not empty and that its file paths match geojson_files."
        )

    # Layer order follows the insertion order of all_feature_grids
    grid_3d = np.stack(list(all_feature_grids.values()), axis=0)

    return grid_3d, all_feature_grids, all_feature_mappings, geospatial_info_list


In [None]:
# COMPUTE GRID SIZE

def compute_grid_size(geojson_file: str, short_edge_cells: int = 1200) -> Tuple[int, int]:
    """
    Derive a grid size that matches the aspect ratio of a file's bounding box.

    The shorter side of the bounding box gets `short_edge_cells` cells and the
    longer side gets `int(short_edge_cells * long / short)` cells (truncated).
    The bounds are taken in the file's own CRS; no reprojection happens here.

    Args:
        geojson_file: Path of the file whose total bounds define the extent.
        short_edge_cells: Number of cells along the shorter side (>= 1).

    Returns:
        (cells along x, cells along y). Callers that need (rows, columns)
        must reverse the tuple.

    Raises:
        ValueError: If `short_edge_cells` is below 1, or if the bounding box
            has no positive, finite width and height (for example an empty
            file or a single point), which would otherwise surface as an
            obscure error from int() on an infinite or NaN value.
    """
    if short_edge_cells < 1:
        raise ValueError(f"short_edge_cells must be at least 1, got {short_edge_cells}")

    gdf = gpd.read_file(geojson_file)

    # Bounding box of the masking region
    minx, miny, maxx, maxy = gdf.total_bounds
    width = maxx - minx
    height = maxy - miny

    # The chained comparison is False for NaN, zero, negative and infinite sizes
    infinity = float("inf")
    if not (0 < width < infinity and 0 < height < infinity):
        raise ValueError(
            f"Bounding box of {geojson_file} must have positive, finite width and height; "
            f"got width={width}, height={height}"
        )

    if width < height:
        # Portrait: x is the short side
        return (short_edge_cells, int(short_edge_cells * (height / width)))

    # Landscape (or square): y is the short side
    return (int(short_edge_cells * (width / height)), short_edge_cells)

# NOTE: machine-specific absolute path; adjust before running elsewhere
mask_file = r"C:\Users\TyHow\Documents\3. Work\GIS Stuff\ML_pilot_data\MASK.geojson"
# compute_grid_size returns (cells along x, cells along y); [::-1] reverses it
# to the (rows, columns) order the gridding functions index with
# (grid_size[0] pairs with y, grid_size[1] with x). `grid_size` is a
# notebook-level variable read by the processing cells.
grid_size = compute_grid_size(mask_file, short_edge_cells=20)[::-1]
print(f"Calculated grid size: {grid_size}")


In [None]:
# COMPUTE GRID SIZE

def compute_grid_size(geojson_file: str, short_edge_cells: int = 1200) -> Tuple[int, int]:
    """
    Derive a grid size that matches the aspect ratio of a file's bounding box.

    NOTE: `compute_grid_size` is also defined in the preceding cell; whichever
    cell was executed last is the definition in effect.

    The shorter side of the bounding box gets `short_edge_cells` cells and the
    longer side gets `int(short_edge_cells * long / short)` cells (truncated).
    The bounds are taken in the file's own CRS; no reprojection happens here.

    Args:
        geojson_file: Path of the file whose total bounds define the extent.
        short_edge_cells: Number of cells along the shorter side (>= 1).

    Returns:
        (cells along x, cells along y). Callers that need (rows, columns)
        must reverse the tuple.

    Raises:
        ValueError: If `short_edge_cells` is below 1, or if the bounding box
            has no positive, finite width and height (for example an empty
            file or a single point), which would otherwise surface as an
            obscure error from int() on an infinite or NaN value.
    """
    if short_edge_cells < 1:
        raise ValueError(f"short_edge_cells must be at least 1, got {short_edge_cells}")

    gdf = gpd.read_file(geojson_file)

    # Bounding box of the masking region
    minx, miny, maxx, maxy = gdf.total_bounds
    width = maxx - minx
    height = maxy - miny

    # The chained comparison is False for NaN, zero, negative and infinite sizes
    infinity = float("inf")
    if not (0 < width < infinity and 0 < height < infinity):
        raise ValueError(
            f"Bounding box of {geojson_file} must have positive, finite width and height; "
            f"got width={width}, height={height}"
        )

    if width < height:
        # Portrait: x is the short side
        return (short_edge_cells, int(short_edge_cells * (height / width)))

    # Landscape (or square): y is the short side
    return (int(short_edge_cells * (width / height)), short_edge_cells)


# Ask for the short-edge cell count through a Tk dialog. The root window only
# hosts the dialog and is hidden straight away.
root = Tk()
root.withdraw()  # Hide the root window

# askinteger rejects values below minvalue=1; the result is None when the
# dialog is dismissed without a value (handled below)
short_edge_cells = simpledialog.askinteger("Input", "Enter the number of cells for the short edge:", minvalue=1)

root.destroy()  # Close the tkinter root window

if short_edge_cells is None:
    raise ValueError("You must enter a valid number for the short edge size.")


# NOTE: machine-specific absolute path; adjust before running elsewhere
mask_file = r"C:\Users\TyHow\Documents\3. Work\GIS Stuff\ML_pilot_data\MASK.geojson"
# compute_grid_size returns (cells along x, cells along y); [::-1] reverses it
# to the (rows, columns) order the gridding functions index with. This
# overwrites the `grid_size` set by the fixed-size cell above.
grid_size = compute_grid_size(mask_file, short_edge_cells=short_edge_cells)[::-1]
print(f"Calculated grid size: {grid_size}")

In [None]:
# RUN FUNCTION
# Needs `grid_size`, `geojson_files` and `features_to_process` from the cells above.

#grid_size = (1200, 1550)  # Define the grid size

# Call the function.
# The three-argument call below requires the "MANUAL INPUTS" definition of
# geojson_to_numpy_grid_3d_batch to be the one executed last; the
# "ALL LAYERS" cell defines the same name with a (grid_size, target_crs)
# signature, which the commented-out call matches.
#grid_3d, feature_grids, feature_mappings, geospatial_info_list = geojson_to_numpy_grid_3d_batch(grid_size)
grid_3d, feature_grids, feature_mappings, geospatial_info_list = geojson_to_numpy_grid_3d_batch(grid_size, geojson_files, features_to_process)

# Print a summary of the results
print("Shape of the 3D grid array:", grid_3d.shape)
print("Feature grids:", feature_grids.keys())
print("Feature mappings:", feature_mappings)
print("Geospatial information for each file:", geospatial_info_list)


In [26]:
# Persist the outputs of the run (machine-specific absolute paths).
# grid_3d is a plain array. The other three are Python dicts / a list of dicts,
# which np.save can only store as pickled object arrays, so reading them back
# needs np.load(..., allow_pickle=True).
np.save(r"C:\Users\TyHow\Documents\3. Work\ML_test_area\exports\output_array_deposits", grid_3d)
np.save(r"C:\Users\TyHow\Documents\3. Work\ML_test_area\exports\output_feature_grid_deposits", feature_grids)
np.save(r"C:\Users\TyHow\Documents\3. Work\ML_test_area\exports\output_feature_mappings_deposits", feature_mappings)
np.save(r"C:\Users\TyHow\Documents\3. Work\ML_test_area\exports\output_geospatial_info_deposits", geospatial_info_list)


In [None]:
# Show category mappings stored on disk. allow_pickle=True is required because
# the files hold pickled Python objects; only use it on files you trust.
# NOTE(review): these file names differ from the ones written by the save cell
# above, so they presumably come from earlier runs or another notebook - confirm.
print("VECTOR MAPPINGS:", np.load(r"C:\Users\TyHow\Documents\3. Work\ML_test_area\exports\output_feature_mappings_600.npy", allow_pickle=True))
print("RASTER MAPPINGS:", np.load(r"C:\Users\TyHow\Documents\3. Work\ML_test_area\exports\output_6_rasters_layer_mappings.npy", allow_pickle=True))

In [None]:
# PLOT output array: browse the layers of `grid_3d` with a slider

# Panel must be initialised before its widgets can render in the notebook
pn.extension()


def plot_layer_bokeh(layer_index):
    """Draw one layer of `grid_3d` and wrap the figure in a Panel pane.

    Despite its name, the function renders with Matplotlib. It reads the
    notebook-level `grid_3d` (layers on axis 0) and `feature_grids` (whose key
    order supplies the layer titles).

    Args:
        layer_index: Zero-based index of the layer to draw.

    Returns:
        A `pn.pane.Matplotlib` pane holding the figure.
    """
    figure = Figure(figsize=(3, 4))
    axes = figure.add_subplot(111)

    # Categorical colour map, no smoothing between cells
    image = axes.imshow(
        grid_3d[layer_index],
        cmap='tab20',
        interpolation='nearest',
        aspect='auto',
    )

    layer_name = list(feature_grids.keys())[layer_index]
    axes.set_title(f"Layer {layer_index + 1}: {layer_name}")
    figure.colorbar(image, ax=axes, label='Classes')
    axes.set_xlabel('X Coordinate')
    axes.set_ylabel('Y Coordinate')

    return pn.pane.Matplotlib(figure, tight=True)


# Slider covering every layer index of grid_3d
layer_slider = pn.widgets.IntSlider(
    name='Layer Index',
    start=0,
    end=grid_3d.shape[0] - 1,
    step=1,
    value=0,
)

# Re-run the plotting function whenever the slider value changes
panel = pn.bind(plot_layer_bokeh, layer_index=layer_slider)

# Slider on top, plot underneath
pn.Column(layer_slider, panel).servable()


In [None]:
# Load rasters in, combine with other arrays of same shape

def load_and_combine_npy_files():
    """Let the user pick .npy files and concatenate them along the layer axis.

    Every array is expected to be shaped (layers, rows, columns). A 2-D array
    is treated as a single layer and promoted to (1, rows, columns); without
    this, 2-D inputs would be silently concatenated row-wise.

    Returns:
        np.ndarray: The arrays concatenated along axis 0, in the order the
        dialog returned the files.

    Raises:
        ValueError: If no file is selected, or if the arrays do not share the
            same shape after the first axis.
    """
    # Hidden, always-on-top root window that only hosts the file dialog
    root = Tk()
    root.withdraw()
    root.attributes("-topmost", True)
    try:
        npy_files = filedialog.askopenfilenames(
            title="Select NPY Files",
            filetypes=[("NumPy array files", "*.npy"), ("All files", "*.*")]
        )
    finally:
        # Release the Tk window even if the dialog raises
        root.destroy()

    if not npy_files:
        # Otherwise np.concatenate fails with "need at least one array"
        raise ValueError("No NPY files were selected.")

    arrays = []
    for npy_file in npy_files:
        arr = np.load(npy_file)
        if arr.ndim == 2:
            arr = arr[np.newaxis, ...]  # single layer -> (1, rows, columns)
        arrays.append(arr)

    # Everything after the layer axis must match for the concatenation
    shapes = [arr.shape[1:] for arr in arrays]
    if any(shape != shapes[0] for shape in shapes):
        raise ValueError("All arrays must have the same spatial dimensions to combine. Found spatial shapes: {}".format(shapes))

    return np.concatenate(arrays, axis=0)

# Open the file dialog and build the combined stack. `combined_array` is a
# notebook-level variable read by the plotting and save cells below.
combined_array = load_and_combine_npy_files()

# Report the shape of the combined array
print(f"Combined array shape: {combined_array.shape}")



In [None]:

# Panel must be initialised before its widgets can render in the notebook
pn.extension()

# `combined_array` is expected to exist already (built by the cell above)
# combined_array = np.load("combined_array.npy")  # Uncomment if needed


def plot_layer_bokeh(layer_index):
    """Draw one layer of `combined_array` and wrap the figure in a Panel pane.

    Despite its name, the function renders with Matplotlib. It redefines the
    `plot_layer_bokeh` of the earlier plotting cell, this time reading the
    notebook-level `combined_array` (layers on axis 0).

    Args:
        layer_index: Zero-based index of the layer to draw.

    Returns:
        A `pn.pane.Matplotlib` pane holding the figure.
    """
    figure = Figure(figsize=(3, 4))
    axes = figure.add_subplot(111)
    axes.imshow(
        combined_array[layer_index],
        cmap='viridis',
        interpolation='nearest',
        aspect='auto',
    )
    axes.set_title(f"Layer {layer_index + 1}")
    axes.set_xlabel('X Coordinate')
    axes.set_ylabel('Y Coordinate')

    return pn.pane.Matplotlib(figure, tight=True)


# Slider covering every layer index of combined_array
layer_slider = pn.widgets.IntSlider(
    name='Layer Index',
    start=0,
    end=combined_array.shape[0] - 1,
    step=1,
    value=0,
)

# Re-run the plotting function whenever the slider value changes
panel = pn.bind(plot_layer_bokeh, layer_index=layer_slider)

# Slider on top, plot underneath
pn.Column(layer_slider, panel).servable()


In [16]:
# Optional: save the combined array to a new npy file.
# The path is a machine-specific absolute path and has no extension;
# np.save appends ".npy" when it is missing.
np.save(r"C:\Users\TyHow\Documents\3. Work\ML_test_area\exports\combined_array_600_2", combined_array)
