# Data Processing Script

This notebook handles two tasks:
1. Clipping global phytoplankton NetCDF files to the Nile Delta region.
2. Exporting turbidity maps (NDTI) from Landsat 7 imagery using Google Earth Engine.

It prepares datasets for use in CNN training and prediction steps.

In [1]:
"""
process_data.py

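1. Clips global phytoplankton NetCDF data to a fixed Nile Delta bounding box (lat 31-34, lon 28-33).
2. Intended: save daily normalized Diatom (Diat) maps as PNGs (the output folder is created, but the PNG export itself is not implemented in this script yet).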
3. Initiates Earth Engine exports of daily NDTI turbidity GeoTIFFs using Landsat 7.

Run this script from the root directory of the project (where nile_polygon.geojson is located).
"""

import os
import json
import xarray as xr
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import mapping

# Data locations: raw inputs and the output folders (created below if missing)
RAW_PHYTO_DIR = "/home/user/MSc/Clean_Codebase/data/raw/phytoplankton/"
PROC_PHYTO_DIR = "/home/user/MSc/Clean_Codebase/data/processed/phytoplankton/"
DIAT_IMG_DIR = "/home/user/MSc/Clean_Codebase/data/processed/diatom_pngs/"
for _output_dir in (PROC_PHYTO_DIR, DIAT_IMG_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Region of interest: the first geometry stored in the GeoJSON file
polygon_path = "nile_polygon.geojson"
gdf = gpd.read_file(polygon_path)
polygon = gdf.geometry.iloc[0]
# Fall back to EPSG:4326 when the file does not declare a CRS
crs = gdf.crs if gdf.crs else "EPSG:4326"


def clip_phytoplankton():
    """Clip every raw phytoplankton NetCDF file to a fixed lat/lon box.

    Each ``*.nc`` file found in ``RAW_PHYTO_DIR`` is subset to latitude
    31.0-34.0 and longitude 28.0-33.0 and written to ``PROC_PHYTO_DIR`` as
    ``Nile_<original name>``. Files are processed in sorted name order.

    A failure on one file is printed and does not stop the remaining files
    from being processed. Nothing is returned.
    """
    # Bounding box is defined manually (the GeoJSON polygon is not used here)
    lat_min, lat_max = 31.0, 34.0
    lon_min, lon_max = 28.0, 33.0

    for fname in sorted(os.listdir(RAW_PHYTO_DIR)):
        if not fname.endswith(".nc"):
            continue

        in_path = os.path.join(RAW_PHYTO_DIR, fname)
        out_path = os.path.join(PROC_PHYTO_DIR, f"Nile_{fname}")
        try:
            # The context manager closes the file even when slicing or
            # writing fails, so a bad file cannot leak an open handle.
            with xr.open_dataset(in_path) as ds:
                lat_vals = ds.lat.values
                lon_vals = ds.lon.values

                # Label-based slices must follow the coordinate's own order,
                # otherwise the selection comes back empty.
                if lat_vals[0] > lat_vals[-1]:  # descending
                    lat_slice = slice(lat_max, lat_min)
                else:
                    lat_slice = slice(lat_min, lat_max)

                if lon_vals[0] > lon_vals[-1]:  # descending
                    lon_slice = slice(lon_max, lon_min)
                else:
                    lon_slice = slice(lon_min, lon_max)

                subset = ds.sel(lat=lat_slice, lon=lon_slice)
                subset.to_netcdf(out_path)

            print(f"Clipped and saved: {out_path}")

        except Exception as e:
            # Deliberate best-effort: report the file and carry on.
            print(f"Error processing {fname}: {e}")


# 2. GEE Landsat NDTI Export
def _load_polygon_coordinates(path):
    """Return the ``coordinates`` array of the geometry stored at *path*.

    Accepts a bare GeoJSON geometry (top-level ``coordinates``), a Feature,
    or a FeatureCollection (its first feature is used).

    Raises:
        ValueError: if no geometry with ``coordinates`` can be found.
    """
    with open(path, "r") as f:
        geojson_data = json.load(f)

    geometry = geojson_data
    if isinstance(geometry, dict) and "coordinates" not in geometry:
        if geometry.get("type") == "FeatureCollection":
            features = geometry.get("features") or []
            geometry = features[0] if features else None
        if isinstance(geometry, dict) and geometry.get("type") == "Feature":
            geometry = geometry.get("geometry")

    if not isinstance(geometry, dict) or "coordinates" not in geometry:
        raise ValueError("Invalid GeoJSON format.")
    return geometry["coordinates"]


def export_ndti_gee():
    """Start one Earth Engine Drive export of water-masked NDTI per day.

    For each day from 2023-01-01 up to (not including) 2023-02-01, Landsat 7
    Collection 2 Level-2 scenes that intersect the polygon in
    ``polygon_path`` and have ``CLOUD_COVER`` below 10 are combined with a
    median. NDTI is computed, masked to pixels where NDWI > 0, clipped to the
    polygon and exported to Google Drive as a GeoTIFF (30 m, EPSG:4326).
    Days with no matching scene are reported and skipped.

    Export tasks are only started, not awaited. Nothing is returned.

    Raises:
        ValueError: if the GeoJSON file contains no usable geometry.
    """
    import ee
    from datetime import datetime, timedelta

    ee.Initialize(project='earthproject145')

    # Load GeoJSON polygon for Earth Engine
    user_polygon = ee.Geometry.Polygon(
        _load_polygon_coordinates(polygon_path), geodesic=False
    )

    # Landsat 7 ETM+ band roles. These differ from Landsat 8/9, where
    # green/red/NIR are B3/B4/B5; using those numbers on Landsat 7 would
    # compute (NIR - Red), i.e. NDVI, instead of NDTI.
    green, red, nir = "SR_B2", "SR_B3", "SR_B4"

    def apply_scale_factors(image):
        # Convert the stored integers to surface reflectance using the
        # scale and offset applied to every optical band.
        optical = image.select(
            ['SR_B1', 'SR_B2', 'SR_B3', 'SR_B4', 'SR_B5', 'SR_B7']
        ).multiply(0.0000275).add(-0.2)
        return image.addBands(optical, None, True)

    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 2, 1)

    day = start_date
    while day < end_date:
        next_day = day + timedelta(days=1)
        day_label = day.strftime('%Y-%m-%d')

        landsat = (
            ee.ImageCollection("LANDSAT/LE07/C02/T1_L2")
            .filterBounds(user_polygon)
            .filterDate(day_label, next_day.strftime('%Y-%m-%d'))
            .filter(ee.Filter.lt("CLOUD_COVER", 10))
            .map(apply_scale_factors)
        )

        # getInfo() is a blocking round trip; it avoids starting empty exports.
        if landsat.size().getInfo() > 0:
            image = landsat.median()
            # NDTI = (Red - Green) / (Red + Green)
            ndti = image.normalizedDifference([red, green]).rename("NDTI")
            # NDWI = (Green - NIR) / (Green + NIR); positive values ~ water
            ndwi = image.normalizedDifference([green, nir]).rename("NDWI")
            water_mask = ndwi.gt(0).clip(user_polygon)
            ndti_masked = ndti.updateMask(water_mask).clip(user_polygon)

            task = ee.batch.Export.image.toDrive(
                image=ndti_masked,
                description=f"Nile_Turbidity_{day_label}",
                folder="Nile_Turbidity_Raw_GeoTIFF2",
                fileNamePrefix=f"nile_turbidity_{day.strftime('%Y_%m_%d')}",
                region=user_polygon,
                scale=30,
                crs='EPSG:4326',
                maxPixels=1e9,
                fileFormat='GeoTIFF'
            )
            task.start()
            print(f"Started export for {day_label}")
        else:
            print(f"No image on {day_label}")

        day = next_day

# Script entry point: run each pipeline step in order, announcing it first
if __name__ == "__main__":
    for banner, step in (
        ("\n--- Clipping Phytoplankton NetCDFs and Generating Diatom Maps ---",
         clip_phytoplankton),
        ("\n--- Exporting NDTI GeoTIFFs from GEE ---",
         export_ndti_gee),
    ):
        print(banner)
        step()



--- Clipping Phytoplankton NetCDFs and Generating Diatom Maps ---
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AIGD-PFT-2023-01-01.nc
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AIGD-PFT-2023-01-02.nc
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AIGD-PFT-2023-01-03.nc
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AIGD-PFT-2023-01-04.nc
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AIGD-PFT-2023-01-05.nc
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AIGD-PFT-2023-01-06.nc
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AIGD-PFT-2023-01-07.nc
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AIGD-PFT-2023-01-08.nc
Clipped and saved: /home/user/MSc/Clean_Codebase/data/processed/phytoplankton/Nile_AI

KeyboardInterrupt: 

Notes: 

The phytoplankton NetCDF data is subset to the specified lat/lon range and saved as smaller NetCDF files for efficient access. We rely on xarray for convenient slicing and NetCDF I/O (see https://docs.xarray.dev/en/stable/user-guide/io.html#:~:text=netCDF).

For turbidity, we assume the raw GeoTIFFs are already focused on the Nile Delta region (e.g., exported from Google Earth Engine). If further clipping is needed, the rasters can be clipped with rasterio; that step is not included in the code above. GeoTIFF is a common format for gridded raster data like satellite-derived turbidity (see https://rasterio.readthedocs.io/#:~:text=Geographic%20information%20systems%20use%20GeoTIFF,satellite%20imagery%20and%20terrain%20models).