In [None]:
Data downloading and cleansing focus mostly on the TMY 2020 data provided at https://nrel-pds-nsrdb.s3.amazonaws.com/v3/tmy/nsrdb_tmy-2020.h5
Not being familiar with the HDF5 data format, we started by downloading the single file to the local storage of an Azure VM and analyzed its applicability to our ML algorithms; the result was positive.
However, each weather field of the dataset is composed of hourly data with about 4.7 billion values - 8760 (1 hr * 24 * 365 days) * 546K (US coordinates at 4km x 4km resolution), one value per hour per coordinate - so aggregating it on a 16 GB VM turned out not to be feasible due to memory limitations.
We therefore decided to switch to PySpark on AWS EMR. By leveraging multi-node distributed computing, processing time was significantly reduced, and we were able to extract the annual mean values - ghi, wind_speed and so on - for all 546K US coordinates.

# install dependencies
# !pip install h5pyd
# !pip install h5py
# !pip install pandas
# !pip install matplotlib
# !pip install netCDF4 
# !pip install scipy
# !pip install s3fs
# !pip install boto3

In [2]:
import h5pyd
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from scipy.spatial import cKDTree
import csv
import s3fs

# s3 = s3fs.S3FileSystem()

# filename = 's3://nrel-pds-nsrdb/v3/tmy/nsrdb_tmy-2020.h5'
# with s3.open(filename, 'rb') as f:
#     nc_bytes = f.read()
    
# root = Dataset(f'inmemory.nc', memory=nc_bytes)    

# Open the TMY 2020 NSRDB file from the local 'dataset/' directory, read-only.
# NOTE(review): the earlier remark about a server endpoint, username and
# password being read from a config file appears to describe remote access via
# h5pyd; this call uses h5py on a local path and needs no credentials.
# The handle `f` is left open because later cells read datasets from it.
f = h5py.File('dataset/tmy2020.h5', 'r')
# Display the names of the top-level entries in the file.
list(f.keys())


['air_temperature',
 'alpha',
 'aod',
 'asymmetry',
 'cld_opd_dcomp',
 'cld_reff_dcomp',
 'clearsky_dhi',
 'clearsky_dni',
 'clearsky_ghi',
 'cloud_press_acha',
 'cloud_type',
 'coordinates',
 'dew_point',
 'dhi',
 'dni',
 'fill_flag',
 'ghi',
 'meta',
 'ozone',
 'relative_humidity',
 'solar_zenith_angle',
 'ssa',
 'surface_albedo',
 'surface_pressure',
 'time_index',
 'tmy_year',
 'tmy_year_short',
 'total_precipitable_water',
 'wind_direction',
 'wind_speed']

In [3]:
# Get coordinates in USA only (2018392 items -> 546219 items)
meta = pd.DataFrame(f['meta'][...])

# Note: string fields read from the .h5 file come back as bytes objects,
# so the comparison value must be a bytes literal (b'...').
USA = meta.loc[meta['country'] == b'United States']

# Keep 'elevation' and 'state' as well. They may be useful later.
df_coord_usa = USA[['latitude', 'longitude', 'elevation', 'state']].copy()

# Preview the selection. Only the last expression of a cell is rendered, so
# this is the single display statement of the cell.
df_coord_usa.head(100)

# dset = f['ghi']
# data = dset[0][USA.index]  # full-resolution subset

Unnamed: 0,latitude,longitude,elevation,state
14294,18.930000,-155.660004,66.263161,b'Hawaii'
14327,18.969999,-155.740005,46.846153,b'Hawaii'
14328,18.969999,-155.699997,64.631577,b'Hawaii'
14329,18.969999,-155.660004,211.360001,b'Hawaii'
14330,18.969999,-155.619995,93.428574,b'Hawaii'
...,...,...,...,...
14700,19.290001,-155.179993,320.359985,b'Hawaii'
14701,19.290001,-155.139999,234.440002,b'Hawaii'
14702,19.290001,-155.100006,103.250000,b'Hawaii'
14740,19.330000,-155.860001,416.299988,b'Hawaii'


In [None]:
# Helper functions for converting an h5 dataset into csv for USA coordinates only

# meta = pd.DataFrame(f['meta'][...])
# dset = f['ghi']

# # Get coordinates in USA only (2018392 items -> 546219 items)
# USA = meta.loc[meta['country'] == b'United States'] # Note .h5 saves strings as byte strings
# #USA.head()
# #USA.index.tolist()

# %time df = dset[0, USA.index.tolist()]
# pd_df = pd.DataFrame(df)

# # %time data = dset[...][USA.index]  # full-resolution subset
# # df['ghi'] = data / dset.attrs['psm_scale_factor']
# # df.shape

# pd_df.head()
# df.to_csv('dataset/ghi_all_us_coords.csv')

# def save_to_csv_usa_only(dset_name):
#     print(dset_name)
#     df = pd.DataFrame(dset)

#     return df

# df = save_to_csv_usa_only('ghi')
# df.to_csv('dataset/us_powerplants_new_solar.csv')

#header, table, df = data_wrangling('dataset/us_powerplants_orig_wind.csv')
#df.to_csv('dataset/us_powerplants_new_wind.csv')


In [4]:
# Locate the United States pixels in the NSRDB metadata table.
# The `ca_` prefix is kept so that any other cell using these names keeps
# working, but the filter below selects the whole USA, not California.
meta = pd.DataFrame(f['meta'][...])
ca_meta = meta.loc[meta['country'] == b'United States']

# ca_pos  : index labels of the selected rows (copied so the in-place
#           subtraction below does not touch the DataFrame's index).
# ca_slice: one contiguous slice running from the first to the last of them.
# After the subtraction, ca_pos holds offsets relative to the start of ca_slice.
ca_pos = ca_meta.index.values.copy()
ca_slice = slice(ca_pos[0], ca_pos[-1] + 1)
ca_pos -= ca_pos[0]

# Size estimate, only consumed by the commented-out print below.
# NOTE(review): 17520 looks like a half-hourly step count (2 * 8760) and the
# factor 2 like bytes per value - confirm; the TMY data is described as hourly.
down_size = 17520 * len(ca_pos) * 2 * 10**-6

# The count printed here is for the country filter above (United States).
print('Number of NSRDB pixels in USA = {}'.format(len(ca_meta)))
#print('Download size per year = {:.4f} MB'.format(down_size))

ca_df = ca_meta[['latitude', 'longitude']].copy()

# Read all rows of 'ghi' for the columns covered by ca_slice.
# NOTE(review): ca_slice is contiguous, so it also covers any columns that lie
# between the first and last selected pixel but were not selected themselves;
# `ghi[:, ca_pos]` would keep only the selected columns. Confirm what later
# cells expect before changing the shape of `ghi`.
ghi = f['ghi'][:, ca_slice]

# %time arr = f['ghi'][:, ca_slice] / f['ghi'].attrs['psm_scale_factor']
#%time np.savetxt('dataset/ghi.csv', ghi, delimiter=",")

Number of NSRDB pixels in CA = 546219
