In [74]:
"""
Reference: https://www.drivendata.co/blog/predict-pm25-benchmark/

pyhdf appears to be more powerful than gdal, so it may be worth adopting some of the 
methods used here for working with hdf files.

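Additionally, the tutorial shows how to make a masked numpy array, which lets us flag
invalid or missing pixels so that computations skip them. (A masked array is not a sparse
array: it still stores every element, plus a boolean mask marking which ones to ignore.)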

Finally, the tutorial explains how to align AOD data with coordinates. This could let us
make some useful model features, like local weather conditions, etc.

"""

import pandas as pd
from datetime import datetime
from osgeo import gdal
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
import keras.backend as backend
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from dateutil import parser
import matplotlib.pyplot as plt
import os



# from pathlib import Path
# import random
# from typing import Dict, List, Union

# from cloudpathlib import S3Path
import geopandas as gpd
# import rasterio

# DATA_PATH = Path.cwd().parent / "data"
# RAW = DATA_PATH / "raw"
# INTERIM = DATA_PATH / "interim"

In [2]:
# Load the two metadata tables; both use their first CSV column as the index.
pm_md = pd.read_csv(
    "pm25_satellite_metadata.csv", index_col=0, parse_dates=["time_start", "time_end"]
)
grid_md = pd.read_csv("grid_metadata.csv", index_col=0)

# Keep only the MAIAC rows that belong to the training split.
maiac_md = pm_md.loc[pm_md["product"].eq("maiac") & pm_md["split"].eq("train")].copy()
maiac_md.shape
# Look up one granule's start time by its row label (the granule file name).
print(maiac_md.loc['20180201T191000_maiac_la_0.hdf', 'time_start'])
pm_md.columns

2018-02-01 17:25:00+00:00


Index(['time_start', 'time_end', 'product', 'location', 'split', 'us_url',
       'eu_url', 'as_url', 'cksum', 'granule_size'],
      dtype='object')

In [35]:
#Playing around with datasets

from pyhdf.SD import SD, SDC, SDS
import pyproj


# Open one training granule and look at what it contains.
hdf = SD('train/20180201T191000_maiac_la_0.hdf')
# Prints a 2-tuple, (13, 8) for this granule -- presumably (number of datasets,
# number of file-level attributes); confirm against the pyhdf docs.
print(hdf.info())
# Prints a dict keyed by dataset name; each value holds the dimension names,
# the shape, and two integer codes.
print(hdf.datasets())


(13, 8)
{'Optical_Depth_047': (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 0), 'Optical_Depth_055': (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 1), 'AOD_Uncertainty': (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 2), 'FineModeFraction': (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 3), 'Column_WV': (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 4), 'AOD_QA': (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 23, 5), 'AOD_MODEL': (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 21, 6), 'Injection_Height': (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 5, 7), 'cosSZA': (('Orbits:grid5km', 'YDim:grid5km', 'XDim:grid5km'), (4, 240, 240), 22, 8), 'cosVZA': (('Orbits:grid5km', 'YDim:grid5km', 'XDim:grid5km'), (4, 240, 240), 22, 9), 'RelAZ': (('Orbits:grid5km', 'YDim:grid5km', 'XDim:grid5km'

In [38]:
# Select the 0.47 micron AOD dataset and summarise its layout.
# `shape` ([orbits, height, width]) is reused by later cells.
blue_band_AOD = hdf.select("Optical_Depth_047")
name, num_dim, shape, types, num_attr = blue_band_AOD.info()
print(
f"""Dataset name: {name}
Number of dimensions: {num_dim}
Shape: {shape}
Data type: {types}
Number of attributes: {num_attr}"""
)

Dataset name: Optical_Depth_047
Number of dimensions: 3
Shape: [4, 1200, 1200]
Data type: 22
Number of attributes: 6


In [39]:
# Raw, uncalibrated values; -28672 is this dataset's `_FillValue` attribute.
blue_band_AOD.get()

array([[[-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        ...,
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672]],

       [[-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        ...,
        [    21,     21,     21, ..., -28672, -28672, -28672],
        [    22,     21,     21, ..., -28672, -28672, -28672],
        [    24,     22,     21, ..., -28672, -28672, -28672]],

       [[-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        ...,
        [   

In [40]:
# Dataset attributes; `valid_range`, `_FillValue`, `add_offset` and
# `scale_factor` are the keys that calibrate_data() reads.
calibration_dict = blue_band_AOD.attributes()
calibration_dict

{'long_name': 'AOD at 0.47 micron',
 'scale_factor': 0.001,
 'add_offset': 0.0,
 'unit': 'none',
 '_FillValue': -28672,
 'valid_range': [-100, 5000]}

In [55]:
import ast

# "StructMetadata.0" is a text attribute made of whitespace-separated KEY=VALUE tokens.
raw_attr = hdf.attributes()["StructMetadata.0"]
# Keep only the text that precedes the first "END_GROUP=GRID_1" marker.
group_1 = raw_attr.split("END_GROUP=GRID_1")[0]

# Split each token on its first "=" only, so a value that itself contains "="
# cannot produce a 3-item list and break dict(). Keys that occur more than once
# (e.g. OBJECT, DataFieldName) keep only their last value.
hdf_metadata = dict(
    token.split("=", 1) for token in group_1.split() if "=" in token
)

# Turn values that are Python literals (numbers, quoted strings, tuples) into
# real objects. literal_eval never executes code and never resolves names, so a
# bare word such as GCTP_SNSOID cannot pick up a notebook variable or builtin.
for key, val in hdf_metadata.items():
    try:
        hdf_metadata[key] = ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError):
        # Not a literal: leave the original text in place.
        pass

hdf_metadata

{'GROUP': 'MergedFields',
 'END_GROUP': 'MergedFields',
 'GridName': 'grid1km',
 'XDim': 1200,
 'YDim': 1200,
 'UpperLeftPointMtrs': (-11119505.196667, 4447802.078667),
 'LowerRightMtrs': (-10007554.677, 3335851.559),
 'Projection': 'GCTP_SNSOID',
 'ProjParams': (6371007.181, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 'SphereCode': -1,
 'GridOrigin': 'HDFE_GD_UL',
 'OBJECT': 'DataField_8',
 'DimensionName': 'Orbits',
 'Size': 4,
 'END_OBJECT': 'DataField_8',
 'DataFieldName': 'Injection_Height',
 'DataType': 'DFNT_FLOAT32',
 'DimList': ('Orbits', 'YDim', 'XDim')}

In [56]:
# The two corner points are expressed in meters, not degrees.
alignment_dict = dict(
    upper_left=hdf_metadata["UpperLeftPointMtrs"],
    lower_right=hdf_metadata["LowerRightMtrs"],
    crs=hdf_metadata["Projection"],
    crs_params=hdf_metadata["ProjParams"],
)
alignment_dict

{'upper_left': (-11119505.196667, 4447802.078667),
 'lower_right': (-10007554.677, 3335851.559),
 'crs': 'GCTP_SNSOID',
 'crs_params': (6371007.181, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)}

In [None]:
"""
DATA PROCESSING

"""

In [59]:
# Loop over orbits to apply the attributes
def calibrate_data(dataset: SDS, shape: list[int], calibration_dict: dict):
    """Convert raw MAIAC values to calibrated values, one orbit at a time.

    A pixel is invalid when it lies outside `valid_range` or equals
    `_FillValue`. Invalid pixels are set to nan and masked; every other
    pixel becomes `(raw - add_offset) * scale_factor`.

    Args:
        dataset (SDS): dataset in SDS format (e.g. blue band AOD). It is
            read with `dataset[orbit, :, :]`.
        shape (List[int]): dataset shape as a list of [orbits, height, width].
        calibration_dict (Dict): dictionary containing, at a minimum,
            `valid_range` (list or tuple; items 0 and 1 are the lower and
            upper bound), `_FillValue` (int or float), `add_offset` (float),
            and `scale_factor` (float).

    Returns:
        calibrated (np.ma.MaskedArray): array of the given `shape` and dtype
            double, with invalid pixels masked and a fill value of nan.
    """

    def _calibrate_orbit(counts):
        """Scale a single orbit and mask the pixels that are not usable."""
        unusable = (
            (counts < calibration_dict["valid_range"][0])
            | (counts > calibration_dict["valid_range"][1])
            | (counts == calibration_dict["_FillValue"])
        )
        cleaned = np.where(unusable, np.nan, counts)
        offset = calibration_dict["add_offset"]
        scale = calibration_dict["scale_factor"]
        scaled = (cleaned - offset) * scale
        # The mask is taken after scaling, so any nan in the result is masked.
        return np.ma.masked_array(scaled, mask=np.isnan(scaled))

    calibrated = np.ma.empty(shape, dtype=np.double)
    for orbit_idx in range(shape[0]):
        raw_orbit = dataset[orbit_idx, :, :].astype(np.double)
        calibrated[orbit_idx, :] = _calibrate_orbit(raw_orbit)
    calibrated.fill_value = np.nan
    return calibrated



In [60]:
# Calibrate every orbit of the blue-band dataset; masked pixels display as "--".
corrected_AOD = calibrate_data(blue_band_AOD, shape, calibration_dict)
corrected_AOD

masked_array(
  data=[[[--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         ...,
         [--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --]],

        [[--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         ...,
         [0.021, 0.021, 0.021, ..., --, --, --],
         [0.022, 0.021, 0.021, ..., --, --, --],
         [0.024, 0.022, 0.021, ..., --, --, --]],

        [[--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         ...,
         [0.073, 0.065, 0.053, ..., --, --, --],
         [0.062, 0.057, 0.051000000000000004, ..., --, --, --],
         [0.056, 0.048, 0.041, ..., --, --, --]],

        [[--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         [--, --, --, ..., --, --, --],
         ...,
         [--, -

In [61]:
# Summary statistics across all orbits. Masked pixels do not contribute:
# the count below is 1,203,568 out of 4 * 1200 * 1200 = 5,760,000 pixels.
pd.DataFrame(corrected_AOD.ravel(), columns=['AOD']).describe()

Unnamed: 0,AOD
count,1203568.0
mean,0.09204344
std,0.06629422
min,0.0
25%,0.045
50%,0.072
75%,0.122
max,0.594


In [66]:
"""

Aligning AOD data with real world coordinates


"""


def create_meshgrid(alignment_dict: dict, shape: list[int], pixel_centers: bool = False):
    """Build 2D x/y coordinate grids for an image between two corner points.

    The grids are in the same units and coordinate system as the corner
    points; nothing is reprojected here.

    Args:
        alignment_dict (Dict): dictionary containing, at a minimum,
            `upper_left` and `lower_right`, each an (x, y) pair. Any other
            keys are ignored by this function.
        shape (List[int]): dataset shape as a list of
            [orbits, height, width]; only height and width are used.
        pixel_centers (bool): if False (default, the original behaviour),
            sample `width` by `height` points evenly from corner to corner
            with both corners included. If True, treat the corners as the
            outer edges of the image and return the centre of every pixel,
            so points are exactly one pixel apart and inset by half a pixel.

    Returns:
        xv (np.ndarray): x coordinate of every pixel, shape (height, width).
        yv (np.ndarray): y coordinate of every pixel, shape (height, width).
    """
    # NOTE(review): for this notebook's granule the corner extent is
    # 1,111,950.52 m across 1200 pixels (926.625 m per pixel), which suggests
    # the corners are outer grid edges. If so, pixel_centers=True is the more
    # accurate choice -- confirm against the HDF-EOS grid documentation before
    # changing the default.
    x0, y0 = alignment_dict["upper_left"]
    x1, y1 = alignment_dict["lower_right"]
    height, width = shape[1], shape[2]

    if pixel_centers:
        # n pixels have n + 1 edges; a pixel centre is the midpoint of its two edges.
        x_edges = np.linspace(x0, x1, width + 1, endpoint=True)
        y_edges = np.linspace(y0, y1, height + 1, endpoint=True)
        x = (x_edges[:-1] + x_edges[1:]) / 2
        y = (y_edges[:-1] + y_edges[1:]) / 2
    else:
        # Interpolate points between corners, inclusive of bounds
        x = np.linspace(x0, x1, width, endpoint=True)
        y = np.linspace(y0, y1, height, endpoint=True)

    # Return two 2D arrays representing X & Y coordinates of all points
    xv, yv = np.meshgrid(x, y)
    return xv, yv

# One x grid and one y grid (in meters) matching the height and width in `shape`.
xv, yv = create_meshgrid(alignment_dict, shape)

In [70]:
from pyproj import CRS, Proj
from typing import Union

# MODIS sinusoidal CRS built from a PROJ string.
# Source: https://spatialreference.org/ref/sr-org/modis-sinusoidal/proj4js/
# +R is filled from the first entry of the granule's ProjParams (6371007.181 here).
sinu_crs = Proj(f"+proj=sinu +R={alignment_dict['crs_params'][0]} +nadgrids=@null +wktext").crs
# EPSG:4326, used as the destination CRS when the grid is reprojected.
wgs84_crs = CRS.from_epsg("4326")

def transform_arrays(
    xv: Union[np.ndarray, float],
    yv: Union[np.ndarray, float],
    crs_from: CRS,
    crs_to: CRS
):
    """Reproject coordinates from one CRS into another.

    Args:
        xv (np.ndarray or float): x coordinate(s) expressed in `crs_from`.
        yv (np.ndarray or float): y coordinate(s) expressed in `crs_from`.
        crs_from (CRS): source coordinate reference system.
        crs_to (CRS): destination coordinate reference system.

    Returns:
        tuple: (x, y) coordinate(s) expressed in `crs_to`. The transformer
            is created with `always_xy=True`, so x comes first (longitude
            first when `crs_to` is geographic).
    """
    reproject = pyproj.Transformer.from_crs(crs_from, crs_to, always_xy=True).transform
    x_out, y_out = reproject(xv, yv)
    return x_out, y_out



# Reproject the sinusoidal grid (meters) onto WGS84; transform_arrays() uses
# always_xy=True, so the first array returned is longitude and the second latitude.
lon, lat = transform_arrays(xv, yv, sinu_crs, wgs84_crs)

In [77]:
def convert_array_to_df(
    corrected_arr: np.ma.MaskedArray,
    lat: np.ndarray,
    lon: np.ndarray,
    granule_id: str,
    crs: CRS,
    total_bounds: Union[np.ndarray, None] = None
):
    """Align data values with latitude and longitude coordinates
    and return a GeoDataFrame with one row per usable pixel per orbit.

    Masked pixels and nan values are dropped. The mask itself is honoured,
    so a masked pixel is dropped even when a real number is stored under
    the mask.

    Args:
        corrected_arr (np.ma.MaskedArray): data values for each pixel,
            shaped [orbits, height, width].
        lat (np.ndarray): latitude for each pixel, shaped [height, width].
        lon (np.ndarray): longitude for each pixel, shaped [height, width].
        granule_id (str): granule name, repeated on every row.
        crs (CRS): coordinate reference system assigned to the geometry.
        total_bounds (np.ndarray, optional): If provided,
            will filter out points that fall outside of these bounds
            (bounds themselves are kept).
            Composed of xmin, ymin, xmax, ymax.

    Returns:
        gpd.GeoDataFrame: columns `granule_id`, `orbit`, `geometry`
            (a point built from lon, lat) and `value`, with a fresh
            0..n-1 index. Empty when there are no orbits or no usable
            pixels.
    """
    masked = np.ma.asarray(corrected_arr)
    if not np.issubdtype(masked.dtype, np.floating):
        # nan can only be stored in a floating-point array.
        masked = masked.astype(np.double)

    lats = np.asarray(lat).ravel()
    lons = np.asarray(lon).ravel()
    n_orbits = masked.shape[0]
    size = lats.size

    values = {
        # filled(nan) applies the mask instead of trusting what is stored under it.
        "value": masked.filled(np.nan).ravel(),
        "lat": np.tile(lats, n_orbits),
        "lon": np.tile(lons, n_orbits),
        "orbit": np.arange(n_orbits).repeat(size),
        "granule_id": [granule_id] * (size * n_orbits),
    }

    df = pd.DataFrame(values).dropna()
    if total_bounds is not None:
        x_min, y_min, x_max, y_max = total_bounds
        df = df[df["lon"].between(x_min, x_max) & df["lat"].between(y_min, y_max)]

    # Build geometry and CRS in the constructor rather than mutating afterwards.
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["lon"], df["lat"]),
        crs=crs,
    )
    return gdf[["granule_id", "orbit", "geometry", "value"]].reset_index(drop=True)

# Flatten the calibrated granule into a table of points; rows with missing values are dropped.
gdf = convert_array_to_df(corrected_AOD, lat, lon, '20180201T191000_maiac_la_0.hdf', wgs84_crs)
print(gdf.shape)
gdf.head(3)

(1203568, 4)


Unnamed: 0,granule_id,orbit,geometry,value
0,20180201T191000_maiac_la_0.hdf,0,POINT (-110.79078 35.36280),0.11
1,20180201T191000_maiac_la_0.hdf,0,POINT (-110.78956 35.35446),0.076
2,20180201T191000_maiac_la_0.hdf,0,POINT (-110.28947 35.28774),0.112


In [78]:
"""

Will derive features from this and train a model on Tuesday.


"""

'\n\nWill derive features from this and train a model tomorrow\n\n\n'