# Bulk download

In [None]:
import ee
import geemap

# Initialise the Earth Engine client before any ee.* object is built
ee.Initialize()

# Export settings shared by the cells below
export_folder = "satellite-image-predictor/training-data"  # Drive folder for exports
input_filename, target_filename = "taiwan_input_stack", "taiwan_rgb_target"
tile_scale = 50  # meters per pixel

In [None]:
# Define ROI: combined geometry of the China, Taiwan and Japan country polygons
# NOTE(review): the export file names only mention Taiwan - confirm that China
# is meant to be part of the region.
roi_country_names = ["China", "Taiwan", "Japan"]
countries = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017").filter(
    ee.Filter.inList("country_na", roi_country_names)
)
roi = countries.geometry()

# Interactive preview of the region
Map = geemap.Map()
Map.addLayer(roi, {}, "ROI")
Map.centerObject(roi, zoom=5)

Map

In [None]:
# Load DEM (Copernicus Global 30m): mosaic the collection, keep the "DEM" band
# and reproject it to EPSG:3857 at the export resolution
dem = ee.ImageCollection("COPERNICUS/DEM/GLO30").mosaic().select("DEM")
dem = dem.reproject(crs="EPSG:3857", scale=tile_scale)

terrain = ee.Terrain.products(dem)

# Start one Drive export task per component geometry of the ROI
roi_parts = roi.geometries()
for i in range(roi_parts.size().getInfo()):
    sub_roi = ee.Geometry(roi_parts.get(i))
    part_name = f"dem-part-{i}"  # used as both task description and file prefix

    export_task = ee.batch.Export.image.toDrive(
        image=dem,
        description=part_name,
        folder=export_folder,
        fileNamePrefix=part_name,
        region=sub_roi,
        scale=tile_scale,
        fileFormat="GeoTIFF",
        maxPixels=1e13,
    )
    export_task.start()

In [None]:
# Terrain features, each divided by its maximum value and renamed
slope = terrain.select("slope").divide(90).rename("slope")  # degrees / 90
aspect = terrain.select("aspect").divide(360).rename("aspect")  # degrees / 360
hillshade = (
    ee.Terrain.hillshade(dem).divide(255).rename("hillshade")  # 0-255 -> [0, 1]
)

# Load Land Cover (ESA WorldCover 2020), "Map" band only
# NOTE(review): reprojected to EPSG:4326 at 100 m, whereas the DEM above uses
# EPSG:3857 at tile_scale - confirm the mismatch is intended.
worldcover = ee.Image("ESA/WorldCover/v100/2020").select("Map")
landcover = worldcover.reproject(crs="EPSG:4326", scale=100)

# ESA class codes and the unified values they map to (paired by position)
esa_original = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100]
esa_remapped = [4, 5, 6, 8, 9, 3, 2, 1, 7, 7, 10]
landcover = landcover.remap(esa_original, esa_remapped).toFloat()

# Input stack, band order: DEM, LC, slope, aspect, hillshade
lc_band = landcover.rename("LC").toFloat()
input_stack = dem.rename("DEM").addBands(lc_band)
input_stack = input_stack.addBands(slope).addBands(aspect).addBands(hillshade)

# Load Sentinel-2 RGB composite: per-pixel median of all scenes over the ROI
# acquired from 2020-01-01 up to (but not including) 2021-12-31 whose
# CLOUDY_PIXEL_PERCENTAGE is below 10.
# NOTE(review): filterDate's end date is exclusive, so 2021-12-31 itself is
# left out - use "2022-01-01" if the full two years are wanted.
sentinel = (
    ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
    .filterBounds(roi)
    .filterDate("2020-01-01", "2021-12-31")
    # filterMetadata() is deprecated; filter() + ee.Filter.lt() is equivalent
    .filter(ee.Filter.lt("CLOUDY_PIXEL_PERCENTAGE", 10))
    .median()
    .select(["B4", "B3", "B2"])
    .clip(roi)
    .reproject(crs="EPSG:4326", scale=100)
)

# Cap values at 3000, rescale to [0, 255] and convert to 8-bit
sentinel_rgb = sentinel.min(3000).divide(3000).multiply(255).uint8()

# --------------------------------------------
# Export Input Stack (DEM, LC, slope, aspect, hillshade)
# maxPixels is set explicitly: without it Earth Engine rejects exports larger
# than its default pixel limit, which a country-scale ROI at tile_scale exceeds.
# The value matches the one used for the per-part DEM exports.
task_input = ee.batch.Export.image.toDrive(
    image=input_stack,
    description="export_input_stack",
    folder=export_folder,
    fileNamePrefix=input_filename,
    region=roi,
    scale=tile_scale,
    fileFormat="GeoTIFF",
    maxPixels=1e13,
)
task_input.start()

# Export Sentinel RGB Target (same region, scale and pixel limit as the input)
task_target = ee.batch.Export.image.toDrive(
    image=sentinel_rgb,
    description="export_target_rgb",
    folder=export_folder,
    fileNamePrefix=target_filename,
    region=roi,
    scale=tile_scale,
    fileFormat="GeoTIFF",
    maxPixels=1e13,
)
task_target.start()

print("Export started: input and target will be saved to Google Drive.")

In [None]:
import rasterio
from rasterio.windows import Window
import numpy as np
from PIL import Image
import os

input_path = "/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input_stack.tif"
target_path = "/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-target_stack.tif"
tile_size = 256

output_input_dir = "/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input_tiles/"
output_target_dir = "/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-target_tiles/"
os.makedirs(output_input_dir, exist_ok=True)
os.makedirs(output_target_dir, exist_ok=True)

# Zero-based band positions inside the input raster.
# Assumes the band order of the exported input stack
# (DEM, LC, slope, aspect, hillshade) - TODO confirm for this file.
DEM_BAND, LC_BAND, SLOPE_BAND = 0, 1, 2

# Consecutive tiles start every tile_size / 1.5 pixels, so neighbours overlap
stride = round(tile_size / 1.5)

with rasterio.open(input_path) as input_src, rasterio.open(target_path) as target_src:
    width, height = input_src.width, input_src.height

    patch_id = 0
    # The ranges stop early so that every window fits fully inside the raster
    for top in range(0, height - tile_size + 1, stride):
        for left in range(0, width - tile_size + 1, stride):
            window = Window(left, top, tile_size, tile_size)

            # Read all input bands for this window: [bands, tile_size, tile_size]
            input_tile = input_src.read(window=window)

            # Band names carry a suffix so the Earth Engine `dem` / `slope`
            # objects defined earlier in the notebook are not overwritten.
            dem_band = input_tile[DEM_BAND]
            lc_band = input_tile[LC_BAND]
            slope_band = input_tile[SLOPE_BAND]

            # Replace NaN values in DEM and LC
            dem_band[np.isnan(dem_band)] = 0
            lc_band[np.isnan(lc_band)] = 1

            # Only keep 3 channels: DEM, LC, Slope -> [3, tile_size, tile_size]
            input_tile = np.stack([dem_band, lc_band, slope_band], axis=0)

            # Slope NaNs are not filled, so tiles containing any are dropped
            if np.isnan(input_tile).any():
                continue

            # Skip if too much water (LC == 1) or the DEM is zero everywhere
            water_ratio = np.mean(lc_band == 1)
            is_flat = np.all(dem_band == 0)
            if water_ratio > 0.9 or is_flat:
                continue

            # Read target (RGB) and move channels last for PIL: [H, W, C]
            target_tile = target_src.read(window=window)
            target_tile = np.transpose(target_tile, (1, 2, 0))

            # Input is saved as-is (assumed already normalized)
            np.save(
                os.path.join(output_input_dir, f"input_{patch_id+1:04d}.npy"),
                input_tile,
            )

            # Save target as PNG
            img = Image.fromarray(target_tile.astype(np.uint8))
            img.save(os.path.join(output_target_dir, f"target_{patch_id+1:04d}.png"))

            patch_id += 1

print(f"Saved {patch_id} training samples ({tile_size}x{tile_size})")

In [None]:
import rasterio
from rasterio.windows import Window
import numpy as np
from PIL import Image
import os


def create_tiles(input_raw_path, target_raw_path, tile_size=256, in_channels=2):
    """Cut a multi-band input raster and its RGB target into paired tiles.

    Square windows of ``tile_size`` pixels are taken every
    ``round(tile_size / 1.5)`` pixels, so neighbouring tiles overlap. A window
    is skipped when more than 90 % of its land-cover pixels equal 1 (water),
    when its DEM is zero everywhere, when the stacked input still contains
    NaN, or when more than 20 % of the target values are 0 (black).

    Assumes the input raster's first two bands are DEM and land cover and its
    fifth band is hillshade - TODO confirm against the exported stack.

    Args:
        input_raw_path: Path to the multi-band input GeoTIFF.
        target_raw_path: Path to the RGB target GeoTIFF.
        tile_size: Edge length of each tile in pixels.
        in_channels: 2 keeps [DEM, LC]; 3 keeps [DEM, LC, hillshade].

    Returns:
        ``(input_folder, target_folder)``: the folders, created next to the
        source rasters, that hold ``input_NNNN.npy`` and ``target_NNNN.png``.

    Raises:
        ValueError: If ``in_channels`` is neither 2 nor 3. This is checked
            before any folder is created or any file is opened.
    """
    # Validate first so a bad argument never leaves empty folders behind and
    # is reported even if every tile would have been skipped.
    if in_channels not in (2, 3):
        raise ValueError(
            f"Unsupported number of input channels: {in_channels}. Use 2 or 3."
        )

    input_raw_folder = os.path.dirname(input_raw_path)
    input_raw_filename = os.path.basename(input_raw_path)
    target_raw_folder = os.path.dirname(target_raw_path)
    target_raw_filename = os.path.basename(target_raw_path)

    # Folder names: raster name without "_stack"/".tif", plus tile/channel tag
    input_file = f"{input_raw_filename.replace('_stack', '').replace('.tif', '')}-T{tile_size}C{in_channels}"
    target_file = (
        f"{target_raw_filename.replace('_stack', '').replace('.tif', '')}-T{tile_size}"
    )
    input_folder = os.path.join(input_raw_folder, f"{input_file}/")
    target_folder = os.path.join(target_raw_folder, f"{target_file}/")
    os.makedirs(input_folder, exist_ok=True)
    os.makedirs(target_folder, exist_ok=True)

    stride = round(tile_size / 1.5)

    with rasterio.open(input_raw_path) as input_src, rasterio.open(
        target_raw_path
    ) as target_src:
        width, height = input_src.width, input_src.height

        patch_id = 0
        # The ranges stop early so every window fits fully inside the raster
        for top in range(0, height - tile_size + 1, stride):
            for left in range(0, width - tile_size + 1, stride):
                window = Window(left, top, tile_size, tile_size)

                # All input bands for this window: [bands, tile_size, tile_size]
                input_tile = input_src.read(window=window)

                # Replace NaN values in DEM and LC
                dem = input_tile[0]
                lc = input_tile[1]
                dem[np.isnan(dem)] = 0
                lc[np.isnan(lc)] = 1

                # Skip if too much water or the DEM is zero everywhere
                water_ratio = np.mean(lc == 1)
                is_flat = np.all(dem == 0)
                if water_ratio > 0.9 or is_flat:
                    continue

                channels = [dem, lc]
                if in_channels == 3:
                    hs = input_tile[4]
                    hs[np.isnan(hs)] = 1 / np.sqrt(2)
                    channels.append(hs)
                input_tile = np.stack(channels, axis=0)

                if np.isnan(input_tile).any():
                    continue

                # Read target (RGB) and move channels last for PIL: [H, W, C]
                target_tile = target_src.read(window=window)
                target_tile = np.transpose(target_tile, (1, 2, 0))

                # Skip if target has too many black (0) values
                if np.mean(target_tile == 0) > 0.2:
                    continue

                # Input is saved as-is (assumed already normalized)
                np.save(
                    os.path.join(input_folder, f"input_{patch_id+1:04d}.npy"),
                    input_tile,
                )

                # Save target as PNG
                img = Image.fromarray(target_tile.astype(np.uint8))
                img.save(os.path.join(target_folder, f"target_{patch_id+1:04d}.png"))

                patch_id += 1

    # Report the tile size actually used rather than a fixed 256x256
    print(f"Saved {patch_id} training samples ({tile_size}x{tile_size})")

    return input_folder, target_folder


# Source rasters to be tiled
input_path = "/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input.tif"
target_path = "/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-target.tif"

# Tile both rasters into 256-pixel patches, keeping the DEM and LC channels
input_folder, target_folder = create_tiles(
    input_raw_path=input_path,
    target_raw_path=target_path,
    in_channels=2,
    tile_size=256,
)

In [None]:
import os
import rasterio
from rasterio.merge import merge
from rasterio.io import MemoryFile


def merge_geotiffs(input_folder, output_path):
    """Mosaic every ``.tif`` file in a folder into one GeoTIFF.

    Files are merged in sorted name order so the result does not depend on
    the directory listing order. Metadata other than size and transform is
    copied from the first file.

    Args:
        input_folder: Folder containing the GeoTIFF tiles to merge.
        output_path: Path of the merged GeoTIFF to write.

    Raises:
        ValueError: If ``input_folder`` contains no ``.tif`` file.
    """
    from contextlib import ExitStack

    # Collect all the input .tif files (sorted for a reproducible merge order)
    tif_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith(".tif")
    )

    if not tif_files:
        raise ValueError("No .tif files found in the folder!")

    # ExitStack closes every opened tile even if opening or merging fails
    with ExitStack() as stack:
        src_files_to_mosaic = [
            stack.enter_context(rasterio.open(f)) for f in tif_files
        ]

        print(f"Found {len(src_files_to_mosaic)} tiles to merge.")
        print(f"Merging {len(src_files_to_mosaic)} tiles...")

        # Merge into one mosaic
        mosaic, out_transform = merge(src_files_to_mosaic)

        # Copy metadata from the first tile while it is still open
        out_meta = src_files_to_mosaic[0].meta.copy()

    print("Source files closed.")

    out_meta.update(
        {
            "driver": "GTiff",
            "height": mosaic.shape[1],
            "width": mosaic.shape[2],
            "transform": out_transform,
        }
    )

    # Save the merged file
    with rasterio.open(output_path, "w", **out_meta) as dest:
        dest.write(mosaic)

    print(f"Merged {len(tif_files)} tiles into {output_path}")

# Merge the unmerged tiles (first argument: folder) into one file (second argument)
merge_geotiffs(
    "/Users/williameclee/geodata/projects/satellite-image-predictor/unmerged/",
    "/Users/williameclee/geodata/projects/satellite-image-predictor/dem_merged.tif",
)

In [None]:
import os
import rasterio
import numpy as np


def extract_and_save_individual_channels(input_folder, output_base_dir):
    """Split each multi-band ``.tif`` in a folder into single-band GeoTIFFs.

    For every ``.tif`` file, bands 1, 2 and 5 are written to the ``dem``,
    ``lc`` and ``hillshade`` subfolders of ``output_base_dir`` as
    ``<name>_dem.tif``, ``<name>_lc.tif`` and ``<name>_hillshade.tif``.
    NaN values are replaced before the data is converted to the output type.

    Args:
        input_folder: Folder holding the multi-band GeoTIFFs.
        output_base_dir: Folder under which the per-channel subfolders are
            created.
    """
    # One row per exported channel:
    # (subfolder and file suffix, 1-based band, NaN fill, output dtype, conversion)
    channel_specs = (
        ("dem", 1, 0, "float32", lambda band: band.astype(np.float32)),
        ("lc", 2, 1, "uint8", lambda band: band.astype(np.uint8)),
        (
            "hillshade",
            5,
            1 / np.sqrt(2),
            "uint8",
            # stored in [0, 1]; rescaled to 0-255 for the 8-bit output
            lambda band: (band * 255).clip(0, 255).astype(np.uint8),
        ),
    )

    os.makedirs(output_base_dir, exist_ok=True)
    channel_dirs = {
        name: os.path.join(output_base_dir, name) for name, *_ in channel_specs
    }
    for channel_dir in channel_dirs.values():
        os.makedirs(channel_dir, exist_ok=True)

    for fname in os.listdir(input_folder):
        if fname.endswith(".tif"):
            stem = os.path.splitext(fname)[0]

            with rasterio.open(os.path.join(input_folder, fname)) as src:
                base_profile = src.profile.copy()

                for name, band_index, nan_fill, dtype_name, convert in channel_specs:
                    band = src.read(band_index)
                    band[np.isnan(band)] = nan_fill
                    band = convert(band)

                    # Same georeferencing as the source, but a single band
                    out_profile = base_profile.copy()
                    out_profile.update(count=1, dtype=dtype_name)

                    out_path = os.path.join(channel_dirs[name], f"{stem}_{name}.tif")
                    with rasterio.open(out_path, "w", **out_profile) as dst:
                        dst.write(band, 1)

    print("✅ Finished exporting individual channels.")


def merge_single_band_folder(input_folder, output_file):
    """Mosaic the single-band ``.tif`` files of a folder into one raster.

    Files are merged in sorted name order so the result does not depend on
    the directory listing order. Metadata other than size, transform and
    band count is copied from the first file.

    Args:
        input_folder: Folder containing the single-band GeoTIFFs.
        output_file: Path of the merged raster to write.

    Raises:
        ValueError: If ``input_folder`` contains no ``.tif`` file.
    """
    from contextlib import ExitStack

    tif_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith(".tif")
    )
    if not tif_files:
        raise ValueError("No files to merge.")

    # Imported after the cheap folder check so an empty folder fails fast
    import rasterio
    from rasterio.merge import merge

    # ExitStack closes every source dataset, also when merge() raises
    with ExitStack() as stack:
        src_files = [stack.enter_context(rasterio.open(fp)) for fp in tif_files]
        mosaic, transform = merge(src_files)
        # Copy the metadata while the first dataset is still open
        meta = src_files[0].meta.copy()

    meta.update(
        {
            "height": mosaic.shape[1],
            "width": mosaic.shape[2],
            "transform": transform,
            "count": 1,
        }
    )

    with rasterio.open(output_file, "w", **meta) as dst:
        dst.write(mosaic)

    print(f"Merged into {output_file}")


# Step 1: write the DEM, land-cover and hillshade bands of every unmerged
# input tile into per-channel subfolders
extract_and_save_individual_channels(
    input_folder="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-unmerged/",
    output_base_dir="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-merged/",
)

# Step 2: mosaic each per-channel subfolder into a single raster
# DEM
merge_single_band_folder(
    input_folder="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-merged/dem",
    output_file="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-dem.tif",
)
# Land cover
merge_single_band_folder(
    input_folder="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-merged/lc",
    output_file="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-lc.tif",
)
# Hillshade
merge_single_band_folder(
    input_folder="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-merged/hillshade",
    output_file="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-hillshade.tif",
)

In [None]:
from rasterio.windows import Window
import rasterio
import numpy as np
from PIL import Image
import os


def create_tiles_separate_inputs(
    dem_path,
    lc_path,
    hillshade_path,
    target_rgb_path,
    tile_size=256,
    in_channels=3,
    force_reload=True,
):
    """Tile separate DEM / land-cover / hillshade rasters and an RGB target.

    Square windows of ``tile_size`` pixels are taken every
    ``round(tile_size / 1.5)`` pixels, so neighbouring tiles overlap. A window
    is skipped when the stacked input contains NaN, when more than 90 % of
    its land-cover pixels equal 1 (water), when its DEM is zero everywhere,
    or when more than 20 % of the target values are 0 (black).

    Args:
        dem_path: Path to the single-band DEM raster.
        lc_path: Path to the single-band land-cover raster.
        hillshade_path: Path to the single-band hillshade raster; values are
            divided by 255 before being stacked.
        target_rgb_path: Path to the RGB target raster.
        tile_size: Edge length of each tile in pixels.
        in_channels: 2 keeps [DEM, LC]; 3 keeps [DEM, LC, hillshade].
        force_reload: When False and both output folders already exist, they
            are returned without regenerating any tile.

    Returns:
        ``(input_folder, target_folder)``: folders next to the DEM and target
        rasters holding ``input_NNNN.npy`` and ``target_NNNN.png``.

    Raises:
        ValueError: If ``in_channels`` is neither 2 nor 3.
        AssertionError: If the rasters do not all share the DEM's size.
    """
    # Reject unsupported channel counts up front; otherwise the tiling loop
    # would hit an undefined (or stale) input tile.
    if in_channels not in (2, 3):
        raise ValueError(
            f"Unsupported number of input channels: {in_channels}. Use 2 or 3."
        )

    # Output folder naming
    input_base = os.path.splitext(os.path.basename(dem_path))[0]
    target_base = os.path.splitext(os.path.basename(target_rgb_path))[0]
    input_folder = os.path.join(
        os.path.dirname(dem_path), f"{input_base.replace('-dem', '')}-T{tile_size}C{in_channels}"
    )
    target_folder = os.path.join(
        os.path.dirname(target_rgb_path), f"{target_base}-T{tile_size}"
    )

    if (
        not force_reload
        and os.path.exists(input_folder)
        and os.path.exists(target_folder)
    ):
        print(f"Using existing tiles in {input_folder} and {target_folder}")
        return input_folder, target_folder

    os.makedirs(input_folder, exist_ok=True)
    os.makedirs(target_folder, exist_ok=True)

    stride = round(tile_size / 1.5)

    # Open all input rasters
    with rasterio.open(dem_path) as dem_src, rasterio.open(
        lc_path
    ) as lc_src, rasterio.open(hillshade_path) as hs_src, rasterio.open(
        target_rgb_path
    ) as target_src:

        width, height = dem_src.width, dem_src.height
        assert (
            lc_src.width == width and lc_src.height == height
        ), "Mismatch in input dimensions"
        assert (
            hs_src.width == width and hs_src.height == height
        ), "Hillshade raster size differs from DEM"
        assert (
            target_src.width == width and target_src.height == height
        ), "Target raster size differs from DEM"

        patch_id = 0

        # The ranges stop early so every window fits fully inside the raster
        for top in range(0, height - tile_size + 1, stride):
            for left in range(0, width - tile_size + 1, stride):
                window = Window(left, top, tile_size, tile_size)

                # Read and clean input channels
                dem = dem_src.read(1, window=window).astype(np.float32)
                dem[np.isnan(dem)] = 0

                # Fill NaN before the uint8 cast: the cast would otherwise
                # turn NaN into an arbitrary integer the fill cannot detect.
                lc = lc_src.read(1, window=window)
                lc[np.isnan(lc)] = 1
                lc = lc.astype(np.uint8)

                channels = [dem, lc]
                if in_channels == 3:
                    hs = hs_src.read(1, window=window).astype(np.float32) / 255.0
                    hs[np.isnan(hs)] = 1 / np.sqrt(2)
                    channels.append(hs)
                input_tile = np.stack(channels, axis=0)

                if np.isnan(input_tile).any():
                    continue

                # Skip uninformative patches
                water_ratio = np.mean(lc == 1)
                is_flat = np.all(dem == 0)
                if water_ratio > 0.9 or is_flat:
                    continue

                # Read target image and move channels last for PIL: [H, W, 3]
                target_tile = target_src.read(window=window)
                target_tile = np.transpose(target_tile, (1, 2, 0))

                # Skip mostly black targets
                if np.mean(target_tile == 0) > 0.2:
                    continue

                # Save input and target
                np.save(
                    os.path.join(input_folder, f"input_{patch_id+1:04d}.npy"),
                    input_tile,
                )

                img = Image.fromarray(target_tile.astype(np.uint8))
                img.save(os.path.join(target_folder, f"target_{patch_id+1:04d}.png"))

                patch_id += 1

    # Report the tile size actually used rather than a fixed 256x256
    print(f"Saved {patch_id} training samples ({tile_size}x{tile_size})")
    return input_folder, target_folder


# Build 256-pixel training tiles with three input channels (DEM, LC, hillshade)
# from the merged single-band rasters and the RGB target
create_tiles_separate_inputs(
    tile_size=256,
    in_channels=3,
    target_rgb_path="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-target.tif",
    dem_path="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-dem.tif",
    lc_path="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-lc.tif",
    hillshade_path="/Users/williameclee/Documents/college/MATH/2025_1-MATH496T/satellite-image-predictor/training-data/unet-input-hillshade.tif",
)