In [1]:
import h5pyd
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from scipy.spatial import cKDTree
import csv


In [2]:
# Open the desired year of NSRDB data (here the local file dataset/tmy2020.h5), read-only.
# NOTE(review): the earlier remark about server endpoint / username / password coming
# from a config file would apply to a remote h5pyd connection; this cell uses h5py on
# a local path — confirm which is intended.

f = h5py.File('dataset/tmy2020.h5', 'r')
# Last expression of the cell: shows the top-level keys stored in the file.
list(f.keys())


['air_temperature',
 'alpha',
 'aod',
 'asymmetry',
 'cld_opd_dcomp',
 'cld_reff_dcomp',
 'clearsky_dhi',
 'clearsky_dni',
 'clearsky_ghi',
 'cloud_press_acha',
 'cloud_type',
 'coordinates',
 'dew_point',
 'dhi',
 'dni',
 'fill_flag',
 'ghi',
 'meta',
 'ozone',
 'relative_humidity',
 'solar_zenith_angle',
 'ssa',
 'surface_albedo',
 'surface_pressure',
 'time_index',
 'tmy_year',
 'tmy_year_short',
 'total_precipitable_water',
 'wind_direction',
 'wind_speed']

In [3]:
# Helper functions for calculating wind power density

# Constants read by CalculateAirDensity (whose docstring cites
# https://www.gribble.org/cycling/air_density.html as their source).

# Divisor of the water-vapour pressure term in the density formula
# (presumably the specific gas constant of water vapour, J/(kg*K) — confirm).
Rv = 461.4964
# Divisor of the dry-air pressure term in the density formula
# (presumably the specific gas constant of dry air, J/(kg*K) — confirm).
Rd = 287.0531
# Numerator of the water-vapour pressure expression Eso / p**8.
Eso = 6.1078
# c0..c9: coefficients of the degree-9 polynomial p(dew_point); c0 is the
# constant term and c9 multiplies dew_point**9.
c0 = 0.99999683
c1 = -0.90826951e-2
c2 = 0.78736169e-4
c3 = -0.61117958e-6
c4 = 0.43884187e-8
c5 = -0.29883885e-10
c6 = 0.21874425e-12
c7 = -0.17892321e-14
c8 = 0.11112018e-16
c9 = -0.30994571e-19


def CalculateAirDensity(temp, press, dew_point):
    """
    Density of moist air from temperature, pressure and dew point.

    The water-vapour pressure is derived from the dew point through a
    degree-9 polynomial (coefficients c0..c9), the rest of the ambient
    pressure is attributed to dry air, and the two partial densities are
    summed.

    :param temp: ambient temperature in degrees Celsius
    :param press: ambient air pressure in hPa
    :param dew_point: dew point in degrees Celsius
    :return: air density in kg/m3
    Reference: All constants and formulas were taken from
               https://www.gribble.org/cycling/air_density.html
    """
    # Horner evaluation of the polynomial, starting from the innermost term.
    poly = c9
    for coeff in (c8, c7, c6, c5, c4, c3, c2, c1, c0):
        poly = coeff + dew_point * poly

    # Partial pressures of water vapour and of dry air
    vapor_pressure = Eso / (poly ** 8)
    dry_pressure = press - vapor_pressure

    # Celsius -> Kelvin
    kelvin = temp + 273.15

    dry_term = dry_pressure / (Rd * kelvin)
    vapor_term = vapor_pressure / (Rv * kelvin)

    # Pressures are given in hPa; the factor 100 converts them to Pa.
    return (dry_term + vapor_term) * 100


def WindPowerDensity(wind_speed, temp, press, dew_point):
    """
    Theoretical wind power density, 0.5 * air density * wind_speed**3, according to:
    [12] H. Cetinay, F. A. Kuipers, and A. N. Guven, “Optimal siting and sizing of wind farms,”
         Renewable Energy, 101, 51-58, 2017.
    :param wind_speed: wind speed
    :param temp: ambient temperature in degrees Celsius
    :param press: ambient air pressure in hPa
    :param dew_point: dew point in degrees Celsius
    :return: half the air density (from CalculateAirDensity) times the cube of wind_speed
    """
    air_density = CalculateAirDensity(temp, press, dew_point)
    speed_cubed = wind_speed ** 3
    return 0.5 * air_density * speed_cubed

In [4]:
# Upper (exclusive) edges of buckets 0..4 for each data type; values at or
# above the last edge fall into bucket 5.
_BUCKET_EDGES = {
    'wind': (5.0, 10.0, 15.0, 20.0, 25.0),
    'temp': (0.0, 10.0, 20.0, 30.0, 40.0),
}


def which_bucket(value, data_type):
    """
    Helper function to determine which bucket a given wind speed or temperature falls into.

    Buckets are half-open intervals [lower, upper): a value equal to an edge
    belongs to the higher bucket.

    :param value: the value to test
    :param data_type: whether the value is 'wind' or 'temp'
    :return: the list index (0..5) the value falls into, or None when the
             value is NaN (it compares false against every edge)
    :raises ValueError: if data_type is neither 'wind' nor 'temp'
    """
    try:
        edges = _BUCKET_EDGES[data_type]
    except KeyError:
        # Previously an unknown type fell through and returned None, which
        # only surfaced later as a confusing list-index TypeError.
        raise ValueError(
            "data_type must be 'wind' or 'temp', got %r" % (data_type,)
        ) from None

    # NaN is the only value that is not equal to itself.
    if value != value:
        return None

    for index, upper_edge in enumerate(edges):
        if value < upper_edge:
            return index
    return len(edges)
    

In [None]:
# Generate a CSV file with MEAN values for all USA coordinates (data points decreased from 2018392 items -> 546219 items)
# CAVEAT: Running this cell takes a long time, approx. 4 hrs
# Datasets to summarise: every key listed for the HDF5 file except
# 'coordinates', 'fill_flag', 'meta', 'time_index', 'tmy_year' and 'tmy_year_short'.
dset_names = \
['air_temperature',
 'alpha',
 'aod',
 'asymmetry',
 'cld_opd_dcomp',
 'cld_reff_dcomp',
 'clearsky_dhi',
 'clearsky_dni',
 'clearsky_ghi',
 'cloud_press_acha',
 'cloud_type',
 'dew_point',
 'dhi',
 'dni',
 'ghi',
 'ozone',
 'relative_humidity',
 'solar_zenith_angle',
 'ssa',
 'surface_albedo',
 'surface_pressure',
 'total_precipitable_water',
 'wind_direction',
 'wind_speed']

def get_yearly_variables(dset_name, chunk_size=20000):
    """
    Per-column mean over axis 0 of one dataset of the open HDF5 file ``f``.

    The dataset is read in slabs of ``chunk_size`` columns rather than all
    at once, so peak memory depends on the slab size instead of on the full
    two-dimensional dataset.

    :param dset_name: key of a 2-D dataset in ``f``; axis 0 is averaged away,
                      axis 1 (indexed per site by the callers) is kept
    :param chunk_size: number of columns loaded per read; must be >= 1
    :return: 1-D numpy array with one mean per column, divided by the
             dataset's 'psm_scale_factor' attribute
    :raises ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1, got %r' % (chunk_size,))

    print ('processing... ' + dset_name)
    dset = f[dset_name]
    # Extract scale factor
    scale_factor = dset.attrs['psm_scale_factor']

    n_columns = dset.shape[1]
    slab_means = []
    # max(..., 1) guarantees one (possibly empty) read so that a dataset
    # with zero columns still yields an empty result instead of an error.
    for start in range(0, max(n_columns, 1), chunk_size):
        slab = dset[:, start:start + chunk_size]
        slab_means.append(np.mean(slab, axis=0))

    # Stitch the per-slab results together and un-scale
    return np.concatenate(slab_means) / scale_factor

meta = pd.DataFrame(f['meta'][...])

USA = meta.loc[meta['country'] == b'United States'] # Note .h5 saves strings as bit-strings
USA.head()

# Select US coordinates only
# Keep 'elevation' and 'state' as well. They may be useful later.
df_coord_usa = USA[['latitude', 'longitude', 'elevation', 'state']].copy()
df_coord_usa.shape
original_us_indice = df_coord_usa.index.copy()

#print(df_coord_usa.head(50))
total_no_str = str(len(df_coord_usa.index))
print (total_no_str + ' coordinates in the US')

# Get a series of mean values of all 2M coordinates, then filter out for US coordinates only
df = None
for dset_name in dset_names:
    %time mean_dset = get_yearly_variables(dset_name)
    mean_dset_us = mean_dset[original_us_indice]
    #std_dset_us = np.take(std_dset, df_coord_usa.index)
    df_mean_dset_us = pd.DataFrame(mean_dset_us, columns=['avg_' + dset_name])
    
    if df is None:
        df = pd.concat([df_coord_usa.reset_index(), df_mean_dset_us], axis=1)
    else:
        df = pd.concat([df, df_mean_dset_us], axis=1)
        
# Save into csv
df.to_csv('dataset/yearly_variables_us_coordinates_mean.csv')


546219 coordinates in the US
processing... air_temperature


In [5]:
# Generate a CSV file with STD values for all USA coordinates (data points decreased from 2018392 items -> 546219 items)
# CAVEAT: Running this cell takes a long time, approx. 4 hrs
# Datasets to summarise: every key listed for the HDF5 file except
# 'coordinates', 'fill_flag', 'meta', 'time_index', 'tmy_year' and 'tmy_year_short'.
# NOTE(review): this repeats the list defined in the MEAN cell.
dset_names = \
['air_temperature',
 'alpha',
 'aod',
 'asymmetry',
 'cld_opd_dcomp',
 'cld_reff_dcomp',
 'clearsky_dhi',
 'clearsky_dni',
 'clearsky_ghi',
 'cloud_press_acha',
 'cloud_type',
 'dew_point',
 'dhi',
 'dni',
 'ghi',
 'ozone',
 'relative_humidity',
 'solar_zenith_angle',
 'ssa',
 'surface_albedo',
 'surface_pressure',
 'total_precipitable_water',
 'wind_direction',
 'wind_speed']

def get_yearly_variables(dset_name, chunk_size=20000):
    """
    Per-column standard deviation over axis 0 of one dataset of the open
    HDF5 file ``f``.

    NOTE: this definition replaces the same-named function from the MEAN
    cell once this cell has run.

    The dataset is read in slabs of ``chunk_size`` columns rather than all
    at once, so peak memory depends on the slab size instead of on the full
    two-dimensional dataset.

    :param dset_name: key of a 2-D dataset in ``f``; axis 0 is reduced,
                      axis 1 (indexed per site by the callers) is kept
    :param chunk_size: number of columns loaded per read; must be >= 1
    :return: 1-D numpy array with one standard deviation per column, divided
             by the dataset's 'psm_scale_factor' attribute
    :raises ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1, got %r' % (chunk_size,))

    print ('processing... ' + dset_name)
    dset = f[dset_name]
    # Extract scale factor
    scale_factor = dset.attrs['psm_scale_factor']

    n_columns = dset.shape[1]
    slab_stds = []
    # max(..., 1) guarantees one (possibly empty) read so that a dataset
    # with zero columns still yields an empty result instead of an error.
    for start in range(0, max(n_columns, 1), chunk_size):
        slab = dset[:, start:start + chunk_size]
        slab_stds.append(np.std(slab, axis=0))

    # Stitch the per-slab results together and un-scale
    return np.concatenate(slab_stds) / scale_factor

meta = pd.DataFrame(f['meta'][...])

USA = meta.loc[meta['country'] == b'United States'] # Note .h5 saves strings as bit-strings
USA.head()

# Select US coordinates only
# Keep 'elevation' and 'state' as well. They may be useful later.
df_coord_usa = USA[['latitude', 'longitude', 'elevation', 'state']].copy()
df_coord_usa.shape
original_us_indice = df_coord_usa.index.copy()

#print(df_coord_usa.head(50))
total_no_str = str(len(df_coord_usa.index))
print (total_no_str + ' coordinates in the US')

# Get a series of std values of all 2M coordinates, then filter out for US coordinates only
df = None
for dset_name in dset_names:
    %time std_dset = get_yearly_variables(dset_name)
    std_dset_us = std_dset[original_us_indice]
    df_std_dset_us = pd.DataFrame(std_dset_us, columns=['std_' + dset_name])
    
    if df is None:
        df = pd.concat([df_coord_usa.reset_index(), df_std_dset_us], axis=1)
    else:
        df = pd.concat([df, df_std_dset_us], axis=1)
        
# Save into csv
df.to_csv('dataset/yearly_variables_us_coordinates_std.csv')

546219 coordinates in the US
processing... air_temperature
CPU times: user 31.3 s, sys: 54.3 s, total: 1min 25s
Wall time: 3min 23s
processing... alpha
CPU times: user 30.7 s, sys: 23.6 s, total: 54.3 s
Wall time: 4min 26s
processing... aod
CPU times: user 32.1 s, sys: 58.5 s, total: 1min 30s
Wall time: 8min 1s
processing... asymmetry
CPU times: user 30.5 s, sys: 22.1 s, total: 52.6 s
Wall time: 3min 32s
processing... cld_opd_dcomp
CPU times: user 32.4 s, sys: 1min 1s, total: 1min 33s
Wall time: 8min 52s
processing... cld_reff_dcomp
CPU times: user 32.8 s, sys: 53.2 s, total: 1min 26s
Wall time: 8min 47s
processing... clearsky_dhi
CPU times: user 31.7 s, sys: 51.4 s, total: 1min 23s
Wall time: 8min 33s
processing... clearsky_dni
CPU times: user 32.1 s, sys: 51.3 s, total: 1min 23s
Wall time: 8min 38s
processing... clearsky_ghi
CPU times: user 32.7 s, sys: 52.4 s, total: 1min 25s
Wall time: 8min 40s
processing... cloud_press_acha
CPU times: user 32.5 s, sys: 44.2 s, total: 1min 16s
Wall

In [None]:
# Generate a CSV file with OTHER values for all USA coordinates (data points decreased from 2018392 items -> 546219 items)
# CAVEAT: Running this cell takes a long time - approx. 12 hrs

def _load_site_series(dset_name, ploc_idx):
    # Read one site's full time series from the open HDF5 file and divide
    # out the dataset's 'psm_scale_factor'.
    dset = f[dset_name]
    return dset[:, ploc_idx] / dset.attrs['psm_scale_factor']


def _band_fractions(values, data_type, ploc_idx):
    # Fraction of time steps that which_bucket() assigns to each of its six
    # bands ('wind' or 'temp') for one site.
    counts = [0.0] * 6
    for value in values:
        counts[which_bucket(value, data_type)] += 1

    total_points = sum(counts)

    # Report series whose length is not the 8760 hours of a non-leap year
    if total_points != 8760:
        print('ploc', ploc_idx, ' total_' + data_type + '_points: ', total_points)

    return [count / total_points for count in counts]


def get_variables_on_idx(ploc_idx):
    """
    Helper function to calculate wind and temperature bands and wind power density for a single site.

    :param ploc_idx: integer column index of the site in the HDF5 datasets
    :return: single-row pandas dataframe holding the fraction of time steps
             in each wind band (wind_*), in each temperature band (temp_*),
             and the time-averaged 'wind_power_density'
    """
    print('processing... ' + str(ploc_idx))

    # Load and un-scale the four time series needed for this site.
    # (The full 'coordinates' dataset is intentionally not read here: it was
    # never used and would be re-loaded for every single site.)
    np_temp = _load_site_series('air_temperature', ploc_idx)
    np_press = _load_site_series('surface_pressure', ploc_idx)
    np_dew_point = _load_site_series('dew_point', ploc_idx)
    np_wind_speed = _load_site_series('wind_speed', ploc_idx)

    wind_buckets = _band_fractions(np_wind_speed, 'wind', ploc_idx)
    temp_buckets = _band_fractions(np_temp, 'temp', ploc_idx)

    # Assemble the result; insertion order fixes the output column order
    dict_result = {}

    wind_keys = ('wind_u5', 'wind_5_10', 'wind_10_15',
                 'wind_15_20', 'wind_20_25', 'wind_o25')
    for key, fraction in zip(wind_keys, wind_buckets):
        dict_result[key] = fraction

    temp_keys = ('temp_u0', 'temp_0_10', 'temp_10_20',
                 'temp_20_30', 'temp_30_40', 'temp_o40')
    for key, fraction in zip(temp_keys, temp_buckets):
        dict_result[key] = fraction

    # Mean wind power density over all time steps, evaluated on whole arrays
    # instead of one scalar call per hour.
    dict_result['wind_power_density'] = np.mean(
        WindPowerDensity(np_wind_speed, np_temp, np_press, np_dew_point)
    )

    # Convert all the results to a pandas dataframe
    return pd.DataFrame([dict_result])


def data_wrangling3(filepath):
    """
    Function to calculate wind and temperature bands and wind power density for all rows in a csv file.

    The first column of every data row is taken as the integer site index
    passed to get_variables_on_idx(); the original columns are kept as read
    (strings) and the computed columns are appended to their right.

    Rows are collected in lists and the dataframes are built once at the
    end, so the run time grows linearly with the number of rows. The
    returned frame has a 0..n-1 row index.

    :param filepath: path to file as string
    :return: pandas dataframe containing the original columns plus the wind
             and temperature bands and wind power density of all rows; an
             empty dataframe if the file has no rows at all, or a frame with
             only the header's columns if it has no data rows
    """
    # Use a name other than ``f`` so the global HDF5 handle is not shadowed
    with open(filepath, 'r', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file)

        # First row is the header; None means the file is empty
        header = next(reader, None)
        if header is None:
            return pd.DataFrame()

        # Original rows (not edited) and the newly calculated records
        orig_rows = []
        agg_records = []

        for row in reader:
            orig_rows.append(row)
            site_frame = get_variables_on_idx(int(row[0]))
            # Keep plain dict records instead of many one-row dataframes
            agg_records.extend(site_frame.to_dict('records'))

    df_orig = pd.DataFrame(orig_rows, columns=header)
    if not agg_records:
        return df_orig

    df_agg = pd.DataFrame(agg_records)

    # Concat original cols and agg columns horizontally
    return pd.concat([df_orig, df_agg], axis=1)

# Tempoerarily save US coordinates into a CSV to reuse previous APIs
meta = pd.DataFrame(f['meta'][...])

USA = meta.loc[meta['country'] == b'United States'] # Note .h5 saves strings as bit-strings
USA.head()

# Select US coordinates only
# Keep 'elevation' and 'state' as well. They may be useful later.
df_coord_usa = USA[['latitude', 'longitude', 'elevation', 'state']].copy()
df_coord_usa.shape
original_us_indice = df_coord_usa.index.copy()
df_coord_usa.to_csv('dataset/us_points.csv')
print('US coordinates saved as a csv file')

# Get wind and temperature band data and wind power density for solar plants
%time df = data_wrangling3('dataset/us_points.csv')
print('processed us points')

# Save solar data to csv
df.to_csv('dataset/yearly_variables_us_coordinates_others.csv')
print('a new csv file is generated with other variables')