In [1]:
"""
Reference: https://www.drivendata.co/blog/predict-pm25-benchmark/

pyhdf appears to be more powerful than gdal, so it may be worth adopting some of the 
methods used here for working with hdf files.

Additionally, the tutorial shows how to make a masked numpy array, which allows us to work
with sparse arrays? (I'm not sure how this works yet.)

Finally, the tutorial explains how to align AOD data with coordinates. This could let us
make some useful model features, like local weather conditions, etc.

"""
import os
import pandas as pd
from datetime import datetime
from osgeo import gdal
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
import keras.backend as backend
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from dateutil import parser
import matplotlib.pyplot as plt
from pyhdf.SD import SD, SDC, SDS
import pyproj
from pyproj import CRS, Proj
from typing import Union
from shapely.geometry import Point, Polygon
import geopandas as gpd
from datetime import datetime, timedelta
import pickle
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


Using TensorFlow backend.


In [8]:
pm_md = pd.read_csv("pm25_satellite_metadata.csv", parse_dates=["time_start", "time_end"], index_col=0)
grid_md = pd.read_csv("grid_metadata.csv", index_col=0)

maiac_md = pm_md[(pm_md["product"] == "maiac") & (pm_md["split"] == "train")].copy()

In [47]:
# Inspect one dataset
hdf = SD(os.path.join('train', '20180201T191000_maiac_la_0.hdf'))

for field, data in hdf.datasets().items():
    print(field, "->", data)
    
print("")

blue_band_AOD = hdf.select("Optical_Depth_047")
name, num_dim, shape, types, num_attr = blue_band_AOD.info()
print(
f"""
Dataset name: {name}
Number of dimensions: {num_dim}
Shape: {shape}
Data type: {types}
Number of attributes: {num_attr}
"""
)

print("")

print(blue_band_AOD.get())

print("")

calibration_dict = blue_band_AOD.attributes()
print("Blue Band AOD Attributes:")
for key, value in calibration_dict.items():
    print(key, '->', value)

Optical_Depth_047 -> (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 0)
Optical_Depth_055 -> (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 1)
AOD_Uncertainty -> (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 2)
FineModeFraction -> (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 3)
Column_WV -> (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 22, 4)
AOD_QA -> (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 23, 5)
AOD_MODEL -> (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 21, 6)
Injection_Height -> (('Orbits:grid1km', 'YDim:grid1km', 'XDim:grid1km'), (4, 1200, 1200), 5, 7)
cosSZA -> (('Orbits:grid5km', 'YDim:grid5km', 'XDim:grid5km'), (4, 240, 240), 22, 8)
cosVZA -> (('Orbits:grid5km', 'YDim:grid5km', 'XDim:grid5km'), (4, 240, 240), 22, 9)
RelAZ -> (('Orbits:grid5km', 'YDim:grid5km', 'XDim:grid5km'), (4, 240, 240), 2

In [45]:
raw_attr = hdf.attributes()["StructMetadata.0"] # Look at Metadata
# print(raw_attr)
group_1 = raw_attr.split("END_GROUP=GRID_1")[0] # Pick out only 5km field in the metadata
# print(group_1)

hdf_metadata = dict([x.split("=") for x in group_1.split() if "=" in x])

# Parse expressions still wrapped in apostrophes
for key, val in hdf_metadata.items():
    try:
        hdf_metadata[key] = eval(val)
    except (NameError, ValueError, SyntaxError):
        hdf_metadata[key] = val

for key, value in hdf_metadata.items():
    print(key, '->', value)

print("")
    
# Note that coordinates are provided in meters
# We construct an alignment dictionary parameter
alignment_dict = {
    "upper_left": hdf_metadata["UpperLeftPointMtrs"],
    "lower_right": hdf_metadata["LowerRightMtrs"],
    "crs": hdf_metadata["Projection"],
    "crs_params": hdf_metadata["ProjParams"]
}

for key, value in alignment_dict.items():
    print(key, '->', value)


GROUP -> MergedFields
END_GROUP -> MergedFields
GridName -> grid1km
XDim -> 1200
YDim -> 1200
UpperLeftPointMtrs -> (-11119505.196667, 4447802.078667)
LowerRightMtrs -> (-10007554.677, 3335851.559)
Projection -> GCTP_SNSOID
ProjParams -> (6371007.181, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
SphereCode -> -1
GridOrigin -> HDFE_GD_UL
OBJECT -> DataField_8
DimensionName -> Orbits
Size -> 4
END_OBJECT -> DataField_8
DataFieldName -> Injection_Height
DataType -> DFNT_FLOAT32
DimList -> ('Orbits', 'YDim', 'XDim')

upper_left -> (-11119505.196667, 4447802.078667)
lower_right -> (-10007554.677, 3335851.559)
crs -> GCTP_SNSOID
crs_params -> (6371007.181, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)


In [52]:
##################
#DATA PROCESSING #
##################
# Loop over orbits to apply the attributes
def calibrate_data(dataset, shape, calibration_dict):
    """
    Given a MAIAC dataset and calibration parameters, return a masked
    array of calibrated data.
    
    Args:
        dataset (SDS): dataset in SDS format (e.g. blue band AOD).
        shape (List[int]): dataset shape as a list of [orbits, height, width].
        calibration_dict (Dict): dictionary containing, at a minimum,
            `valid_range` (list or tuple), `_FillValue` (int or float),
            `add_offset` (float), and `scale_factor` (float).
    
    Returns:
        corrected_AOD (np.ma.MaskedArray): masked array of calibrated data
            with a fill value of nan.
    """
    corrected_AOD = np.ma.empty(shape, dtype=np.double)
    for orbit in range(shape[0]):
        data = dataset[orbit, :, :].astype(np.double)
        invalid_condition = (
            (data < calibration_dict["valid_range"][0]) |
            (data > calibration_dict["valid_range"][1]) |
            (data == calibration_dict["_FillValue"])
        )
        data[invalid_condition] = np.nan
        data = (
            (data - calibration_dict["add_offset"]) *
            calibration_dict["scale_factor"]
        )
        data = np.ma.masked_array(data, np.isnan(data))
        corrected_AOD[orbit, : :] = data
    corrected_AOD.fill_value = np.nan
    return corrected_AOD

In [55]:
# Test calibrate_data function
corrected_AOD = calibrate_data(blue_band_AOD, shape, calibration_dict)
print(corrected_AOD)
print(pd.DataFrame(corrected_AOD.ravel(), columns=['AOD']).describe())

[[[-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  ...
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]]

 [[-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  ...
  [0.021 0.021 0.021 ... -- -- --]
  [0.022 0.021 0.021 ... -- -- --]
  [0.024 0.022 0.021 ... -- -- --]]

 [[-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  ...
  [0.073 0.065 0.053 ... -- -- --]
  [0.062 0.057 0.051000000000000004 ... -- -- --]
  [0.056 0.048 0.041 ... -- -- --]]

 [[-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  ...
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]]]
                AOD
count  1.203568e+06
mean   9.204344e-02
std    6.629422e-02
min    0.000000e+00
25%    4.500000e-02
50%    7.200000e-02
75%    1.220000e-01
max    5.940000e-01


In [58]:
# Aligning AOD data with real world coordinates
def create_meshgrid(alignment_dict, shape):
    """Given an image shape, create a meshgrid of points
    between bounding coordinates.
    
    Args:
        alignment_dict (Dict): dictionary containing, at a minimum,
            `upper_left` (tuple), `lower_right` (tuple), `crs` (str),
            and `crs_params` (tuple).
        shape (List[int]): dataset shape as a list of
            [orbits, height, width].
    
    Returns:
        xv (np.array): x (longitude) coordinates.
        yv (np.array): y (latitude) coordinates.
    """
    # Determine grid bounds using two coordinates
    x0, y0 = alignment_dict["upper_left"]
    x1, y1 = alignment_dict["lower_right"]
    
    # Interpolate points between corners, inclusive of bounds
    x = np.linspace(x0, x1, shape[2], endpoint=True)
    y = np.linspace(y0, y1, shape[1], endpoint=True)
    
    # Return two 2D arrays representing X & Y coordinates of all points
    xv, yv = np.meshgrid(x, y)
    return xv, yv

xv, yv = create_meshgrid(alignment_dict, shape)
print(xv, yv)

[[-11119505.196667   -11118577.79840206 -11117650.40013711 ...
  -10009409.47352989 -10008482.07526494 -10007554.677     ]
 [-11119505.196667   -11118577.79840206 -11117650.40013711 ...
  -10009409.47352989 -10008482.07526494 -10007554.677     ]
 [-11119505.196667   -11118577.79840206 -11117650.40013711 ...
  -10009409.47352989 -10008482.07526494 -10007554.677     ]
 ...
 [-11119505.196667   -11118577.79840206 -11117650.40013711 ...
  -10009409.47352989 -10008482.07526494 -10007554.677     ]
 [-11119505.196667   -11118577.79840206 -11117650.40013711 ...
  -10009409.47352989 -10008482.07526494 -10007554.677     ]
 [-11119505.196667   -11118577.79840206 -11117650.40013711 ...
  -10009409.47352989 -10008482.07526494 -10007554.677     ]] [[4447802.078667   4447802.078667   4447802.078667   ... 4447802.078667
  4447802.078667   4447802.078667  ]
 [4446874.68040206 4446874.68040206 4446874.68040206 ... 4446874.68040206
  4446874.68040206 4446874.68040206]
 [4445947.28213711 4445947.28213711 

In [59]:
# Source: https://spatialreference.org/ref/sr-org/modis-sinusoidal/proj4js/
sinu_crs = Proj(f"+proj=sinu +R={alignment_dict['crs_params'][0]} +nadgrids=@null +wktext").crs
wgs84_crs = CRS.from_epsg("4326")

def transform_arrays(
    xv: Union[np.array, float],
    yv: Union[np.array, float],
    crs_from: CRS,
    crs_to: CRS
):
    """Transform points or arrays from one CRS to another CRS.
    
    Args:
        xv (np.array or float): x (longitude) coordinates or value.
        yv (np.array or float): y (latitude) coordinates or value.
        crs_from (CRS): source coordinate reference system.
        crs_to (CRS): destination coordinate reference system.
    
    Returns:
        lon, lat (tuple): x coordinate(s), y coordinate(s)
    """
    transformer = pyproj.Transformer.from_crs(
        crs_from,
        crs_to,
        always_xy=True,
    )
    lon, lat = transformer.transform(xv, yv)
    return lon, lat

# Project sinu grid onto wgs84 grid
lon, lat = transform_arrays(xv, yv, sinu_crs, wgs84_crs)
print(lon, lat)

[[-130.54072891 -130.52984145 -130.51895398 ... -117.50843096
  -117.49754349 -117.48665602]
 [-130.52478749 -130.51390136 -130.50301522 ... -117.49408102
  -117.48319488 -117.47230874]
 [-130.50885273 -130.49796792 -130.48708311 ... -117.47973707
  -117.46885227 -117.45796746]
 ...
 [-115.4894707  -115.47983855 -115.4702064  ... -103.95978793
  -103.95015578 -103.94052363]
 [-115.47976022 -115.47012888 -115.46049755 ... -103.95104688
  -103.94141554 -103.9317842 ]
 [-115.47005382 -115.46042329 -115.45079276 ... -103.9423095
  -103.93267897 -103.92304844]] [[40.         40.         40.         ... 40.         40.
  40.        ]
 [39.99165971 39.99165971 39.99165971 ... 39.99165971 39.99165971
  39.99165971]
 [39.98331943 39.98331943 39.98331943 ... 39.98331943 39.98331943
  39.98331943]
 ...
 [30.01668056 30.01668056 30.01668056 ... 30.01668056 30.01668056
  30.01668056]
 [30.00834028 30.00834028 30.00834028 ... 30.00834028 30.00834028
  30.00834028]
 [30.         30.         30.      

In [60]:
def convert_array_to_df(
    corrected_arr,
    lat,
    lon,
    granule_id,
    crs,
    total_bounds = None
):
    """Align data values with latitude and longitude coordinates
    and return a GeoDataFrame.
    
    Args:
        corrected_arr (np.ma.MaskedArray): data values for each pixel.
        lat (np.ndarray): latitude for each pixel.
        lon (np.ndarray): longitude for each pixel.
        granule_id (str): granule name.
        crs (CRS): coordinate reference system
        total_bounds (np.ndarray, optional): If provided,
            will filter out points that fall outside of these bounds.
            Composed of xmin, ymin, xmax, ymax.
    """
    lats = lat.ravel()
    lons = lon.ravel()
    n_orbits = len(corrected_arr)
    size = lats.size
    values = {
        "value": np.concatenate([d.data.ravel() for d in corrected_arr]),
        "lat": np.tile(lats, n_orbits),
        "lon": np.tile(lons, n_orbits),
        "orbit": np.arange(n_orbits).repeat(size),
        "granule_id": [granule_id] * size * n_orbits
        
    }
    
    df = pd.DataFrame(values).dropna()
    if total_bounds is not None:
        x_min, y_min, x_max, y_max = total_bounds
        df = df[df.lon.between(x_min, x_max) & df.lat.between(y_min, y_max)]
    
    gdf = gpd.GeoDataFrame(df)
    gdf["geometry"] = gpd.points_from_xy(gdf.lon, gdf.lat)
    gdf.crs = crs
    return gdf[["granule_id", "orbit", "geometry", "value"]].reset_index(drop=True)

gdf = convert_array_to_df(corrected_AOD, lat, lon, '20180201T191000_maiac_la_0.hdf', wgs84_crs)
print(gdf.shape)
gdf.head(3)

(1203568, 4)


Unnamed: 0,granule_id,orbit,geometry,value
0,20180201T191000_maiac_la_0.hdf,0,POINT (-110.79078 35.36280),0.11
1,20180201T191000_maiac_la_0.hdf,0,POINT (-110.78956 35.35446),0.076
2,20180201T191000_maiac_la_0.hdf,0,POINT (-110.28947 35.28774),0.112


In [61]:
# Some more helpful functions from the tutorial
def create_calibration_dict(data):
    """Define calibration dictionary given a SDS dataset,
    which contains:
        - name
        - scale factor
        - offset
        - unit
        - fill value
        - valid range
    
    Args:
        data (SDS): dataset in the SDS format.
    
    Returns:
        calibration_dict (Dict): dict of calibration parameters.
    """
    return data.attributes()


def create_alignment_dict(hdf):
    """Define alignment dictionary given a SD data file, 
    which contains:
        - upper left coordinates
        - lower right coordinates
        - coordinate reference system (CRS)
        - CRS parameters
    
    Args:
        hdf (SD): hdf data object
    
    Returns:
        alignment_dict (Dict): dict of alignment parameters.
    """
    group_1 = hdf.attributes()["StructMetadata.0"].split("END_GROUP=GRID_1")[0]
    hdf_metadata = dict([x.split("=") for x in group_1.split() if "=" in x])
    alignment_dict = {
        "upper_left": eval(hdf_metadata["UpperLeftPointMtrs"]),
        "lower_right": eval(hdf_metadata["LowerRightMtrs"]),
        "crs": hdf_metadata["Projection"],
        "crs_params": hdf_metadata["ProjParams"]
    }
    return alignment_dict

In [63]:
# Importing HDF data and creating a set of raw hdf files.
train_labels = pd.read_csv("train_labels_100.csv") # Smallest subset
grid_metadata = pd.read_csv("grid_metadata.csv")
satellite_metadata = pd.read_csv("pm25_satellite_metadata.csv")
satellite_metadata = satellite_metadata[satellite_metadata.granule_id.str.endswith('f')]
satellite_metadata = satellite_metadata[satellite_metadata['split'] == 'train']
# print(satellite_metadata)
raw_hdf_set = set(satellite_metadata['granule_id'])

In [128]:
list(grid_md.location.keys())

['1X116',
 '1Z2W7',
 '3S31A',
 '6EIL6',
 '7334C',
 '78V83',
 '7F1D1',
 '8KNI6',
 '90BZ1',
 '90S79',
 '9Q6TA',
 'A2FBI',
 'A7UCQ',
 'AZJ0Z',
 'C7PGV',
 'CPR0W',
 'D72OT',
 'D7S1G',
 'DHO4M',
 'DJN0F',
 'E2AUK',
 'E5P9N',
 'FRITQ',
 'GAC6R',
 'GJLB2',
 'GVQXS',
 'H96P6',
 'HANW9',
 'HM74A',
 'IUMEZ',
 'KW43U',
 'KZ9W9',
 'NE7BV',
 'P8JA5',
 'PG3MI',
 'PJNW1',
 'PW0JT',
 'QJHW4',
 'S77YN',
 'SZLMT',
 'UC74Z',
 'VBLD0',
 'VR4WG',
 'VXNN3',
 'VYH7U',
 'WT52R',
 'WZNCR',
 'X5DKW',
 'XJF9O',
 'XNLVD',
 'YHOPV',
 'ZF3ZW',
 'ZP1FZ',
 'ZZ8JF']

In [146]:


"""

Everything here is original code that uses the functions from the tutorial.

within(): taken from https://automating-gis-processes.github.io/2017/lessons/L3/point-in-polygon.html

Make_Submatrix(): A function which takes raw AOD matrix and a Grid ID of interest as input and outputs a submatrix 
of AOD values which are inside this grid point (5km by 5km).

Currently, Make_Submatrix() only returns the number of pixels in the AOD matrix are within the location determined
by Grid ID.

The rest of the code in this cell runs extremely slowly, but this is because we are running it for all possible 
combinations of HDF file and Grid ID. When we actually use these functions to run a model on a given Grid ID and 
datetime, we will first filter the set of HDF files such that we only search through HDF files with matching city 
and matching datetime.

"""

# poly_dict = {}

# for gridID in list(grid_md.location.keys()):
#     poly = Make_Poly(grid_md['wkt'][gridID])
#     poly_dict[gridID] = poly

# print(poly_dict[gridID].bounds)

#Helper function
def Make_Poly(polyString):
    poly_coords = []
    for string in polyString.split(','):
        split_string = string.split(' ')
        if split_string[0] == 'POLYGON':
            split_string = split_string[1:]
            split_string[0] = str(split_string[0])[2:]
    #         print(tuple(float(x) for x in split_string))
        elif split_string[0] == '':
            split_string = split_string[1:]
        if split_string[1][-2] == ')':
            split_string[1] = split_string[1][0:-2]
        poly_coords.append(tuple(float(x) for x in split_string))

    return Polygon(poly_coords)



#Main function
def Make_Submatrix(corrected_AOD, lon, lat, alignment_dict, grid_md, gridID):
#     xv, yv = create_meshgrid(alignment_dict, shape)
#     lon, lat = transform_arrays(xv, yv, sinu_crs, wgs84_crs)
    
    poly = Make_Poly(grid_md['wkt'][gridID])
    return_list = []
    for band in range(len(corrected_AOD)):
        print('here 1')
    
        
    #     counter = 0
        triples_array = []
        counter_i = 0
        for i in range(len(corrected_AOD[0])):
            if lat[i,0] > poly.bounds[3]:
                continue
                
            if lat[i,0] < poly.bounds[1]:
                continue
    
            for j in range(len(corrected_AOD[0][0])):
                if lon[i,j] > poly.bounds[2]:
                    continue
                p1 = Point(lon[i,j], lat[i,j]) 
    #             print(p1, poly)
                if(p1.within(poly)):
                    triples_array.append((i, j, corrected_AOD[band,i,j]))
                        
                
                
        print('here 2')
                
        if len(triples_array) == 0:
            continue

        temp_array = np.zeros((10,10))
        temp_array = np.ma.masked_array(temp_array, mask=np.ones((10,10)))

        i_array = [x[0] for x in triples_array]
        j_array = [x[1] for x in triples_array]
        min_i = min(i_array)
        min_j = min(j_array)

        

        for triple in triples_array:
            if triple[2] is np.ma.masked:
                pass
    #             return_array[triple[0]-min_i, triple[1]-min_j] = 'masked!!!'
            else:
                temp_array[triple[0]-min_i, triple[1]-min_j] = triple[2]
        return_list.append(temp_array)
        print('here 3')

#     number_of_bands_returned = len(return_list)
    return return_list 
    
#     if counter > 0:
#         print(counter)
            

    
    #     print(tuple(float(x) for x in split_string))
#     print(triples_array)
#     for i in range(3):
#         plt.imshow(corrected_AOD[i])
#         plt.show()
    
    
# for hdf_filename in raw_hdf_set:
#     print(hdf_filename)
#     filepath = 'train/' + hdf_filename
#     raw_hdf = SD(filepath)
    
#     alignment_dict = create_alignment_dict(raw_hdf)
# #     print(alignment_dict['upper_left'], alignment_dict['lower_right'])
    
#     blue_band_AOD = raw_hdf.select("Optical_Depth_047")
#     name, num_dim, shape, types, num_attr = blue_band_AOD.info()
#     calibration_dict = create_calibration_dict(blue_band_AOD)
#     corrected_AOD = calibrate_data(blue_band_AOD, shape, calibration_dict)
    
#     xv, yv = create_meshgrid(alignment_dict, shape)
#     lon, lat = transform_arrays(xv, yv, sinu_crs, wgs84_crs)
    
    
#     for gridID in grid_md['wkt'].keys():
#         Make_Submatrix(corrected_AOD, lon, lat, alignment_dict, grid_md, gridID)
    

In [140]:
#Note that each AOD_array in array_of_AOD_arrays should be a precalculated subarray corresponding to grid id
def collect_features(array_of_AOD_arrays, area_per_subarray):
    
#     print('checkpoint 1')
    
    total_values = len(array_of_AOD_arrays)*area_per_subarray
#     print(total_values, 'values')
    
    all_values = np.zeros((total_values))
    all_values = np.ma.masked_array(all_values, mask=np.ones((total_values)))
    
#     print('checkpoint 2')
    
    counter = 0
    for AOD_array in array_of_AOD_arrays:
#         print('checkpoint 2.5')
        for row in AOD_array:
            for value in row:
#                 print('checkpoint 2.6', value)
                if not (value is np.ma.masked):
#                     print('checkpoint 2.7')
                    all_values[counter] = value
                    counter+=1
#                     print(all_values)
            
#     print('checkpoint 3')
    
    mean = np.ma.mean(all_values)
    minimum = np.ma.min(all_values)
    maximum = np.ma.max(all_values)
    std = np.ma.std(all_values)
    summ = np.ma.sum(all_values)
    
    print('done')
    return(mean, minimum, maximum, std, summ)
    
    

In [20]:
print(train_labels.keys())
print(satellite_metadata['location'].unique())
print(grid_metadata['location'].unique())
print(alignment_dict)
print(calibration_dict)
print(satellite_metadata)
print(grid_metadata)

Index(['datetime', 'grid_id', 'value'], dtype='object')
['la' 'tpe' 'dl']
['Taipei' 'Delhi' 'Los Angeles (SoCAB)']
{'upper_left': (-11119505.196667, 4447802.078667), 'lower_right': (-10007554.677, 3335851.559), 'crs': 'GCTP_SNSOID', 'crs_params': (6371007.181, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)}
{'long_name': 'AOD at 0.47 micron', 'scale_factor': 0.001, 'add_offset': 0.0, 'unit': 'none', '_FillValue': -28672, 'valid_range': [-100, 5000]}
                          granule_id                time_start  \
0     20180201T191000_maiac_la_0.hdf  2018-02-01T17:25:00.000Z   
1     20180202T195000_maiac_la_0.hdf  2018-02-02T18:05:00.000Z   
2     20180203T203000_maiac_la_0.hdf  2018-02-03T17:10:00.000Z   
3     20180204T194000_maiac_la_0.hdf  2018-02-04T17:55:00.000Z   
4     20180205T202000_maiac_la_0.hdf  2018-02-05T17:00:00.000Z   
...                              ...                       ...   
4255  20201227T071500_maiac_dl_0.hdf  2020-12-27T05:25:00.000Z   
4256  20201228T062000_maiac_d

In [147]:



features = []
train_labels = pd.read_csv("train_labels.csv") # Smallest subset
grid_metadata = pd.read_csv("grid_metadata.csv")
satellite_metadata = pd.read_csv("pm25_satellite_metadata.csv")
satellite_metadata = satellite_metadata[satellite_metadata.granule_id.str.endswith('f')]
satellite_metadata = satellite_metadata[satellite_metadata['split'] == 'train'].copy()


grid_id_list = list(grid_metadata['grid_id'])
for i in range(len(train_labels)):
    
    satellite_metadata_cut = satellite_metadata
    
    grid_id = train_labels['grid_id'][i]
    j = grid_id_list.index(grid_id)
    location = grid_metadata['location'][j]
    tz = grid_metadata['tz'][j]
    datetime = pd.to_datetime(
        train_labels['datetime'][j],
        format="%Y%m%dT%H:%M:%S",
        utc=True
    )
    polygon = grid_metadata['wkt'][j]
#     print(grid_id, datetime, location, timezone, polygon)
    
    if location == 'Delhi':
        satellite_metadata_cut = satellite_metadata[satellite_metadata['location'] == 'dl'].copy()
    elif location == 'Los Angeles (SoCAB)':
        satellite_metadata_cut = satellite_metadata[satellite_metadata['location'] == 'la'].copy()
    elif location == 'Taipei':
        satellite_metadata_cut = satellite_metadata[satellite_metadata['location'] == 'tpe'].copy()
        
#     print(location, tz)
#     print(len(satellite_metadata_cut))
#     print(len(satellite_metadata))
        
    valid_datetime = [None]*len(satellite_metadata_cut)
    satellite_metadata_cut.reset_index(drop=True, inplace=True) # ensure indexes pair with number of rows
#     count = 0
    for index, row in satellite_metadata_cut.iterrows():
        
        datetime1 = pd.to_datetime((row['time_start']), format="%Y%m%dT%H:%M:%S", 
                                utc=True) #.tz_localize(None).tz_localize(tz)
        datetime2 = pd.to_datetime(row['time_end'], format="%Y%m%dT%H:%M:%S", 
                                            utc=True) #.tz_localize(None).tz_localize(tz)
        
#         print(index, datetime1.tzinfo)
#         print(datetime2.tzinfo)

#         print(datetime1, datetime, datetime2)
        
        truth1 = (datetime <= datetime2)
        truth2 = datetime2 <= datetime + timedelta(hours=24)
#         count+=1
#         print(count)
        valid_datetime[index] = (truth1 & truth2)
#         if valid_datetime[index] == True:
#             print('Yay!')
    
    
    
    satellite_metadata_cut['valid_datetime'] = valid_datetime
    satellite_metadata_cut = satellite_metadata_cut[satellite_metadata_cut['valid_datetime'] == True]
#     print(len(satellite_metadata_cut), location)
#     print(satellite_metadata_cut['granule_id'])
    raw_hdf_set = list(satellite_metadata_cut['granule_id']) #Note for now we are only including the first HDF;
                                                          #Should make a function later which chooses the best HDF
                                                          #File and the best band.
    
    
    print(raw_hdf_set, location)
    print('next:')
    
# """
    list_of_all_AOD_arrays = []
    for hdf_filename in raw_hdf_set:
#         print(hdf_filename)
        filepath = 'train/' + hdf_filename
#         print(filepath)
        raw_hdf = SD(filepath)

        alignment_dict = create_alignment_dict(raw_hdf)
    #     print(alignment_dict['upper_left'], alignment_dict['lower_right'])

        blue_band_AOD = raw_hdf.select("Optical_Depth_047")
        name, num_dim, shape, types, num_attr = blue_band_AOD.info()
        calibration_dict = create_calibration_dict(blue_band_AOD)
        corrected_AOD = calibrate_data(blue_band_AOD, shape, calibration_dict)

        xv, yv = create_meshgrid(alignment_dict, shape)
        lon, lat = transform_arrays(xv, yv, sinu_crs, wgs84_crs)
        
        print('made it here1')

        temp = Make_Submatrix(corrected_AOD, lon, lat, alignment_dict, grid_md, grid_id)
        print('made it here2')
        if len(temp) == 0:
            continue
        for AOD_array in temp:
            list_of_all_AOD_arrays.append(AOD_array)
            
    if list_of_all_AOD_arrays == 0:
        features.append(np.array((np.nan, np.nan, np.nan, np.nan, np.nan)))
    else:
        features.append(np.array(collect_features(list_of_all_AOD_arrays, 100)))
        print('made it here3')
    
# """
print(features)

# Etc/GMT+8
    
    

# print(train_labels[train_labels['grid_id'] == '1X116']) demonstration of the fact that grids are repeated at 
#different datetimes.

    

['20180201T191000_maiac_la_0.hdf'] Los Angeles (SoCAB)
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180201T191000_maiac_la_0.hdf'] Los Angeles (SoCAB)
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180201T191000_maiac_la_0.hdf'] Los Angeles (SoCAB)
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180201T191000_maiac_la_0.hdf'] Los Angeles (SoCAB)
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180201T191000_maiac_la_0.hdf'] Los Angeles (SoCAB)
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180201T191000_maiac_la_0.hdf'] Los A

  features.append(np.array(collect_features(list_of_all_AOD_arrays, 100)))


['20180202T032500_maiac_tpe_0.hdf', '20180202T032500_maiac_tpe_1.hdf'] Taipei
next:
made it here1
here 1
here 2
here 1
here 2
here 1
here 2
here 1
here 2
made it here2
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3


  features.append(np.array(collect_features(list_of_all_AOD_arrays, 100)))


['20180202T032500_maiac_tpe_0.hdf', '20180202T032500_maiac_tpe_1.hdf'] Taipei
next:
made it here1
here 1
here 2
here 1
here 2
here 1
here 2
here 1
here 2
made it here2
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3


  features.append(np.array(collect_features(list_of_all_AOD_arrays, 100)))


['20180202T032500_maiac_tpe_0.hdf', '20180202T032500_maiac_tpe_1.hdf'] Taipei
next:
made it here1
here 1
here 2
here 1
here 2
here 1
here 2
here 1
here 2
made it here2
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3


  features.append(np.array(collect_features(list_of_all_AOD_arrays, 100)))


['20180203T041000_maiac_tpe_0.hdf', '20180203T023000_maiac_tpe_0.hdf'] Taipei
next:
made it here1
here 1
here 2
here 1
here 2
here 1
here 2
here 1
here 2
made it here2
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3


  features.append(np.array(collect_features(list_of_all_AOD_arrays, 100)))


['20180203T041000_maiac_tpe_0.hdf', '20180203T023000_maiac_tpe_0.hdf'] Taipei
next:
made it here1
here 1
here 2
here 1
here 2
here 1
here 2
here 1
here 2
made it here2
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3


  features.append(np.array(collect_features(list_of_all_AOD_arrays, 100)))


['20180204T031500_maiac_tpe_0.hdf', '20180204T031500_maiac_tpe_1.hdf'] Taipei
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3


  features.append(np.array(collect_features(list_of_all_AOD_arrays, 100)))


['20180202T064500_maiac_dl_0.hdf'] Delhi
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180202T064500_maiac_dl_0.hdf'] Delhi
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180202T064500_maiac_dl_0.hdf'] Delhi
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180202T064500_maiac_dl_0.hdf'] Delhi
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180202T064500_maiac_dl_0.hdf'] Delhi
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
here 1
here 2
here 3
made it here2
done
made it here3
['20180202T064500_maiac_dl_0.hdf'] Delhi
next:
made it here1
here 1
here 2
here 3
here 1
here 2
here 3
here 1


KeyboardInterrupt: 

In [22]:

    
pickle.dump( features, open( "save1.p", "wb" ) )
print(features)



[array([0.08363158, 0.063     , 0.107     , 0.01132631, 3.178     ]), array([0.08105263, 0.059     , 0.106     , 0.01330205, 3.08      ]), array([0.07797297, 0.036     , 0.133     , 0.03143289, 2.885     ]), array([0.10215385, 0.052     , 0.161     , 0.02714778, 3.984     ]), array([0.12309091, 0.093     , 0.163     , 0.02256991, 5.416     ]), array([0.13273913, 0.109     , 0.166     , 0.01513768, 3.053     ]), array([0.12613158, 0.077     , 0.192     , 0.03005361, 4.793     ]), array([0.099     , 0.025     , 0.134     , 0.02732947, 1.98      ]), array([0.223     , 0.139     , 0.46      , 0.09415147, 3.568     ]), array([0.17009091, 0.096     , 0.226     , 0.03558972, 7.484     ]), array([0.09357895, 0.047     , 0.124     , 0.01710591, 3.556     ]), array([0.0768    , 0.        , 0.139     , 0.03463081, 2.304     ]), array([nan, nan, nan, nan, nan]), array([nan, nan, nan, nan, nan]), array([nan, nan, nan, nan, nan]), array([nan, nan, nan, nan, nan]), array([nan, nan, nan, nan, nan]), a

In [82]:
#I stopped the above code after about 200 iterations, so I will need to obtain a subarray for labels




features_cut = features.copy()

        


labels_array = np.array(train_labels.value)
cut_labels_array = list(labels_array[0:227].copy())

i = 0
while( i < len(features_cut)):
    print(features_cut[i][0])
    if np.isnan(features_cut[i][0]):
        features_cut.pop(i)
        cut_labels_array.pop(i)
    else:
        i+=1
        
print(len(features_cut))
print(len(cut_labels_array))

def make_model():

    model = Sequential()
    model.add(Dense(13, input_dim=5, kernel_initializer='normal', activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

model = make_model()
model.summary()
model.fit(X, Y, epochs=1000, batch_size=10)

# estimator = KerasRegressor(build_fn=make_model, nb_epoch=100, batch_size=5, verbose=0)

# X = np.array(features_cut)
# Y = np.array(cut_labels_array)

# kfold = KFold(n_splits=10)
# results = cross_val_score(estimator, X, Y, n_jobs=1)
# print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))


0.08363157894736843
0.08105263157894738
0.07797297297297297
0.10215384615384615
0.1230909090909091
0.1327391304347826
0.1261315789473684
0.099
0.22300000000000003
0.1700909090909091
0.09357894736842105
0.07680000000000001
nan
nan
nan
nan
nan
nan
nan
0.8204347826086957
0.6804117647058824
0.7850000000000001
0.5046904761904761
0.7068333333333333
0.572375
0.5551489361702128
0.688741935483871
0.746741935483871
0.29331578947368425
0.34060714285714283
0.2911666666666667
nan
nan
nan
nan
nan
nan
nan
0.8204347826086957
0.6804117647058824
0.7850000000000001
0.5046904761904761
0.7068333333333333
0.5551489361702128
0.688741935483871
0.746741935483871
0.29331578947368425
0.34060714285714283
0.2911666666666667
0.08363157894736843
0.08105263157894738
0.07797297297297297
0.10215384615384615
0.1230909090909091
0.1327391304347826
0.1261315789473684
0.099
0.22300000000000003
0.1700909090909091
0.09357894736842105
0.07680000000000001
nan
nan
nan
nan
nan
nan
nan
0.8204347826086957
0.6804117647058824
0.78500

Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1000
Epoch 152/1000
Epoch 153/1000
Epoch 154/1000
Epoch 155/1000
Epoch 156/1000
Epoch 157/1000
Epoch 158/1000
Epoch 159/1000
Epoch 160/1000
Epoch 161/1000
Epoch 162/1000
Epoch 163/1000
Epoch 164/1000
Epoch 165/1000
Epoch 166/1000
Epoch 167/1000
Epoch 168/1000
Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000
Epoch 173/1000
Epoch 174/1000
Epoch 175/1000
Epoch 176/1000
Epoch 177/1000
Epoch 178/1000
Epoch 179/1000
Epoch 180/1000
Epoch 181/1000
Epoch 182/1000
Epoch 183/1000
Epoch 184/1000
Epoch 185/1000
Epoch 186/1000
Epoch 187/1000
Epoch 188/1000
Epoch 189/1000
Epoch 190/1000
Epoch 191/1000
Epoch 192/1000
Epoch 193/1000
Epoch 194/1000
Epoch 195/1000
Epoch 196/1000
Epoch 197/1000
Epoch 198/1000
Epoch 199/1000
Epoch 200/1000
Epoch 201/1000
Epoch 202/1000
Epoch 203/1000
Epoch 204/1000
Epoch 205/1000
Epoch 206/

Epoch 326/1000
Epoch 327/1000
Epoch 328/1000
Epoch 329/1000
Epoch 330/1000
Epoch 331/1000
Epoch 332/1000
Epoch 333/1000
Epoch 334/1000
Epoch 335/1000
Epoch 336/1000
Epoch 337/1000
Epoch 338/1000
Epoch 339/1000
Epoch 340/1000
Epoch 341/1000
Epoch 342/1000
Epoch 343/1000
Epoch 344/1000
Epoch 345/1000
Epoch 346/1000
Epoch 347/1000
Epoch 348/1000
Epoch 349/1000
Epoch 350/1000
Epoch 351/1000
Epoch 352/1000
Epoch 353/1000
Epoch 354/1000
Epoch 355/1000
Epoch 356/1000
Epoch 357/1000
Epoch 358/1000
Epoch 359/1000
Epoch 360/1000
Epoch 361/1000
Epoch 362/1000
Epoch 363/1000
Epoch 364/1000
Epoch 365/1000
Epoch 366/1000
Epoch 367/1000
Epoch 368/1000
Epoch 369/1000
Epoch 370/1000
Epoch 371/1000
Epoch 372/1000
Epoch 373/1000
Epoch 374/1000
Epoch 375/1000
Epoch 376/1000
Epoch 377/1000
Epoch 378/1000
Epoch 379/1000
Epoch 380/1000
Epoch 381/1000
Epoch 382/1000
Epoch 383/1000
Epoch 384/1000
Epoch 385/1000
Epoch 386/1000
Epoch 387/1000
Epoch 388/1000
Epoch 389/1000
Epoch 390/1000
Epoch 391/1000
Epoch 392/

Epoch 512/1000
Epoch 513/1000
Epoch 514/1000
Epoch 515/1000
Epoch 516/1000
Epoch 517/1000
Epoch 518/1000
Epoch 519/1000
Epoch 520/1000
Epoch 521/1000
Epoch 522/1000
Epoch 523/1000
Epoch 524/1000
Epoch 525/1000
Epoch 526/1000
Epoch 527/1000
Epoch 528/1000
Epoch 529/1000
Epoch 530/1000
Epoch 531/1000
Epoch 532/1000
Epoch 533/1000
Epoch 534/1000
Epoch 535/1000
Epoch 536/1000
Epoch 537/1000
Epoch 538/1000
Epoch 539/1000
Epoch 540/1000
Epoch 541/1000
Epoch 542/1000
Epoch 543/1000
Epoch 544/1000
Epoch 545/1000
Epoch 546/1000
Epoch 547/1000
Epoch 548/1000
Epoch 549/1000
Epoch 550/1000
Epoch 551/1000
Epoch 552/1000
Epoch 553/1000
Epoch 554/1000
Epoch 555/1000
Epoch 556/1000
Epoch 557/1000
Epoch 558/1000
Epoch 559/1000
Epoch 560/1000
Epoch 561/1000
Epoch 562/1000
Epoch 563/1000
Epoch 564/1000
Epoch 565/1000
Epoch 566/1000
Epoch 567/1000
Epoch 568/1000
Epoch 569/1000
Epoch 570/1000
Epoch 571/1000
Epoch 572/1000
Epoch 573/1000
Epoch 574/1000
Epoch 575/1000
Epoch 576/1000
Epoch 577/1000
Epoch 578/

Epoch 697/1000
Epoch 698/1000
Epoch 699/1000
Epoch 700/1000
Epoch 701/1000
Epoch 702/1000
Epoch 703/1000
Epoch 704/1000
Epoch 705/1000
Epoch 706/1000
Epoch 707/1000
Epoch 708/1000
Epoch 709/1000
Epoch 710/1000
Epoch 711/1000
Epoch 712/1000
Epoch 713/1000
Epoch 714/1000
Epoch 715/1000
Epoch 716/1000
Epoch 717/1000
Epoch 718/1000
Epoch 719/1000
Epoch 720/1000
Epoch 721/1000
Epoch 722/1000
Epoch 723/1000
Epoch 724/1000
Epoch 725/1000
Epoch 726/1000
Epoch 727/1000
Epoch 728/1000
Epoch 729/1000
Epoch 730/1000
Epoch 731/1000
Epoch 732/1000
Epoch 733/1000
Epoch 734/1000
Epoch 735/1000
Epoch 736/1000
Epoch 737/1000
Epoch 738/1000
Epoch 739/1000
Epoch 740/1000
Epoch 741/1000
Epoch 742/1000
Epoch 743/1000
Epoch 744/1000
Epoch 745/1000
Epoch 746/1000
Epoch 747/1000
Epoch 748/1000
Epoch 749/1000
Epoch 750/1000
Epoch 751/1000
Epoch 752/1000
Epoch 753/1000
Epoch 754/1000
Epoch 755/1000
Epoch 756/1000
Epoch 757/1000
Epoch 758/1000
Epoch 759/1000
Epoch 760/1000
Epoch 761/1000
Epoch 762/1000
Epoch 763/

Epoch 791/1000
Epoch 792/1000
Epoch 793/1000
Epoch 794/1000
Epoch 795/1000
Epoch 796/1000
Epoch 797/1000
Epoch 798/1000
Epoch 799/1000
Epoch 800/1000
Epoch 801/1000
Epoch 802/1000
Epoch 803/1000
Epoch 804/1000
Epoch 805/1000
Epoch 806/1000
Epoch 807/1000
Epoch 808/1000
Epoch 809/1000
Epoch 810/1000
Epoch 811/1000
Epoch 812/1000
Epoch 813/1000
Epoch 814/1000
Epoch 815/1000
Epoch 816/1000
Epoch 817/1000
Epoch 818/1000
Epoch 819/1000
Epoch 820/1000
Epoch 821/1000
Epoch 822/1000
Epoch 823/1000
Epoch 824/1000
Epoch 825/1000
Epoch 826/1000
Epoch 827/1000
Epoch 828/1000
Epoch 829/1000
Epoch 830/1000
Epoch 831/1000
Epoch 832/1000
Epoch 833/1000
Epoch 834/1000
Epoch 835/1000
Epoch 836/1000
Epoch 837/1000
Epoch 838/1000
Epoch 839/1000
Epoch 840/1000
Epoch 841/1000
Epoch 842/1000
Epoch 843/1000
Epoch 844/1000
Epoch 845/1000
Epoch 846/1000
Epoch 847/1000
Epoch 848/1000
Epoch 849/1000
Epoch 850/1000
Epoch 851/1000
Epoch 852/1000
Epoch 853/1000
Epoch 854/1000
Epoch 855/1000
Epoch 856/1000
Epoch 857/

Epoch 977/1000
Epoch 978/1000
Epoch 979/1000
Epoch 980/1000
Epoch 981/1000
Epoch 982/1000
Epoch 983/1000
Epoch 984/1000
Epoch 985/1000
Epoch 986/1000
Epoch 987/1000
Epoch 988/1000
Epoch 989/1000
Epoch 990/1000
Epoch 991/1000
Epoch 992/1000
Epoch 993/1000
Epoch 994/1000
Epoch 995/1000
Epoch 996/1000
Epoch 997/1000
Epoch 998/1000
Epoch 999/1000
Epoch 1000/1000


<keras.callbacks.History at 0x2c4f5b4c0>

In [117]:


predicted = model.predict(np.array(features_cut))
print(mean_squared_error(predicted, Y))
r2_score(predicted, Y)


1154.1287177313234


0.5873456021965002

In [118]:
pickle.dump( model, open( "model1.p", "wb" ) )

INFO:tensorflow:Assets written to: ram://41937174-8843-4691-a700-08b99f437415/assets
