May 6, 2020

This notebook will loop through a given system folder and look for hdf files
from the HLS dataset and convert them to geotiffs with masking applied.

Notes:
- Water is not masked, but the code to do so is merely commented out (easily reinstated) 
- For L30 data, the hdr file is REQUIRED
- There is a threshold option for excluding images with too high percentage masked
- A list of images excluded from converstion is generated 

In [2]:
import os, re, sys, datetime
import numpy as np
import numpy.ma as ma
from pyhdf.SD import SD, SDC
import gdal
import rasterio

In [3]:
# Location of HLS data to be converted 
system_folder = 'filepath to HLS data folder'

#Percent of image masked for which an image in the folder 
# is skipped and not convered to geotiff
masking_exclusion_threshold = .5  

In [4]:
def read_hdf_data(file_obj, band_name, scaled=True):
    sds_obj = file_obj.select(band_name)
    data = sds_obj.get()
    attributes = sds_obj.attributes()
    if scaled:
        add_offset = float(attributes['add_offset'])
        scale_factor = float(attributes['scale_factor'])
        data = (data-add_offset)*scale_factor
    return data

def QA_mask(file, product_id):
    if product_id == 'L30':
        blue = read_hdf_data(file, 'band02')
        green = read_hdf_data(file, 'band03')
        red = read_hdf_data(file, 'band04')
        nir = read_hdf_data(file, 'band05')
    else:
        blue = read_hdf_data(file, 'B02')
        green = read_hdf_data(file, 'B03')
        red = read_hdf_data(file, 'B04')
        nir = read_hdf_data(file, 'B8A')
    
    # Remove pixels covered by cirrus, clouds, cloud shadows, snow/ice, water and thick aerosol.
    qa = read_hdf_data(file, 'QA', scaled=False)
    cirrus = np.right_shift(np.bitwise_and(qa, int('00000001', 2)), 0)
    cloud = np.right_shift(np.bitwise_and(qa, int('00000010', 2)), 1)
    cloud_shadow = np.right_shift(np.bitwise_and(qa, int('00001000', 2)), 3)
    snow_ice = np.right_shift(np.bitwise_and(qa, int('00010000', 2)), 4)
    # water = np.right_shift(np.bitwise_and(qa, int('00100000', 2)), 5)  Commented this out because I use water
    aerosol = np.right_shift(np.bitwise_and(qa, int('11000000', 2)), 6)
    mask1 = (cirrus==1)|(cloud==1)|(cloud_shadow==1)|(snow_ice==1)|(aerosol==3)# |(water==1)

    # Remove pixels with a reflectance out of the range 0.0-1.0.
    mask2 = (blue<=0.0)|(blue>=1.0)|(green<=0.0)|(green>=1.0)|(red<=0.0)|(red>=1.0)|(nir<=0.0)|(nir>=1.0)

    # Make the final mask.
    mask = mask1|mask2
    mask = ~mask1

    del mask1, mask2, qa, cirrus, cloud, cloud_shadow, snow_ice, aerosol, # water    
   
    return mask

def read_hdf_to_arr(hdf_path, band, datatype=np.int16):
    """
    Functionalizing the process of reading HDF files into arrays
    read a single band out of the hdf and load it into a numpy array
    """
    if os.path.isfile(hdf_path):
        src = gdal.Open(hdf_path)
        band_ds = gdal.Open(src.GetSubDatasets()[band][0], gdal.GA_ReadOnly)
        band_array = band_ds.ReadAsArray().astype(datatype)
        print(band_array)
        del src
        return band_array
    else:
        print("That file does not exist")
        return

In [5]:
#Prepare a list to populate with images that are 50% masked or greater
high_masking = []

for dirpath, dirnames, filenames in os.walk(system_folder):
    if not filenames:
        continue
    for f in filenames:
        if f.endswith(".hdf"):
            pathname = os.path.join(dirpath, f)
            outname = pathname[:-4]+".tif"
            
            print("Reading the metadata of a hdf file.")
    
            product_id = os.path.basename(pathname).split(".")[1]
            print("This is an " + product_id + ' scene.')
            print('    File name:' + f)

            try:
                file = SD(pathname, SDC.READ)
            except:
                raise IOError("Cannot open the hdf file: %s" %pathname)

            metadata = file.attributes()

            # acquisition time
            # if product_id == 'L30':
            #    tm = datetime.datetime.strptime(metadata['SENSING_TIME'], '%Y-%m-%dT%H:%M:%S.%f0Z')
            #    year = str(tm.timetuple().tm_year)
            #    doy = tm.timetuple().tm_yday
            #    print("    Acqusition time (YYYY-MM-DD hh:mm:ss): %s" %tm)

            # else:
            #    tm = datetime.datetime.strptime(metadata['SENSING_TIME'], '%Y-%m-%dT%H:%M:%S.%fZ')
            #    year = str(tm.timetuple().tm_year)
            #    doy = tm.timetuple().tm_yday
            #    print("    Acqusition time (YYYY-MM-DD hh:mm:ss): %s" %tm)
                
            # get cloud cover percent
            print('    Cloud coverage: ' + str(metadata['cloud_coverage']))

            # upper-left corner map coordinates
            ulx      = float(metadata['ULX'])
            uly      = float(metadata['ULY'])
            print("    Upper-left corner map coordinates (X, Y in m): %s, %s" %(ulx, uly))

            # pixel size
            p_size   = float(metadata['SPATIAL_RESOLUTION'])
            print("    Pixel size (in m): %s" %p_size)

            # image rows and columns
            nrows    = int(metadata['NROWS'])
            ncols    = int(metadata['NCOLS'])
            print("    Image rows and columns: %s, %s" %(nrows, ncols))

            # utm zone
            utm_zone = int(re.search('ZONE \d+', metadata['HORIZONTAL_CS_NAME'].upper()).group()[-2:])
            print("    UTM zone: %s" %utm_zone)

            # Get EPSG code from S30 image; L30 does not have this option
            if product_id == 'S30':
                epsg = metadata['HORIZONTAL_CS_CODE'].split(':')
                epsg = (int(epsg[1]))
                print('    EPSG code is: ' + str(epsg))
            
            if product_id == 'L30':
                bands = [1,2,3,4]   
            else:
                bands = [1,2,3,8] # b g r n (index 8 is the 8A band using zero-indexing)
                
            # Get Mask
            qa_mask = QA_mask(file, product_id)  # turn the array into a binary mask
                     
            # What percent of the image is masked?
            mask_vals = np.unique(qa_mask, return_counts=True)
            percent_masked = (mask_vals[1][0])/(sum(mask_vals[1]))
            print('    Percent of image masked: ' + str(percent_masked))
                                    
            # If image is over a given threshold percent of masking, put it in a list
            if percent_masked >= masking_exclusion_threshold:
                high_masking.append(f)
            
            else:
                qa_mask = np.invert(qa_mask) #Inverstion is necessary to work with ma.masked_array
                negatives = np.zeros_like(qa_mask, dtype=bool, subok=False)  # get the dimensions of the image from the QA mask
                arr = np.zeros((len(bands), np.shape(qa_mask)[0], np.shape(qa_mask)[1]))  # pre-allocate
                
                # apply QA mask and remove all negative values
                for count, b in enumerate(bands):
                    tmp_band = read_hdf_to_arr(pathname, b)
                    tmp = ma.masked_array(tmp_band, qa_mask)
                    arr[count,:,:] = ma.filled(tmp, 0)  # fill mask using nodata value. add to 'arr' to create a 4-band image
                    # find all the negative pixels
                    negatives = negatives + (tmp_band<0)  # True where there are negative values. False elsewhere.
                arr = ma.masked_array(arr, mask=np.broadcast_to(negatives[np.newaxis,:,:], arr.shape))
                arr = ma.filled(arr, 0)

                # Write data to a geotiff
                with rasterio.open(pathname) as src:
                    kwds = src.profile
                    kwds['nodata'] = 0
                    kwds['driver'] = 'GTiff'
                    kwds['dtype'] = rasterio.int16
                    kwds['width'] = arr.shape[2]
                    kwds['height'] = arr.shape[1]
                    kwds['count'] = arr.shape[0]
                    kwds['transform'] = rasterio.transform.from_origin(ulx, uly, 30., 30.) 
                    
                    #Get crs info
                    if product_id == 'S30':
                        kwds['crs'] = rasterio.crs.CRS.from_epsg(epsg) 
                    else: 
                        #for some reason, L30 data is incomplete in the hdf file.  
                        # Need to use hdr.
                        hdr_fp = f + '.hdr'
                        hdr = os.path.join(dirpath, hdr_fp)
                        info = open(hdr, 'r')
                        guts = info.read()
                        pieces = guts.split('\n')
                        crs = pieces[11].split('=')
                        kwds['crs'] = (crs[1][2:-1])

                    with rasterio.open(outname, 'w', **kwds) as dst:
                         dst.write(np.array(arr).astype(rasterio.int16))

Reading the metadata of a hdf file.
This is an S30 scene.
    File name:HLS.S30.T12SUC.2020148.v1.4.hdf
    Cloud coverage: 1
    Upper-left corner map coordinates (X, Y in m): 300000.0, 3800040.0
    Pixel size (in m): 30.0
    Image rows and columns: 3660, 3660
    UTM zone: 12
    EPSG code is: 32612
    Percent of image masked: 0.01225058974588671
[[ 706  563  593 ...  655  656  651]
 [ 762  611  592 ...  645  655  645]
 [ 673  607  601 ...  658  658  659]
 ...
 [1091 1116 1131 ... 1972 1740 1548]
 [1128 1139 1127 ... 1826 1724 1715]
 [1039  999  984 ... 1438 1832 1833]]
[[ 971  797  841 ...  899  882  895]
 [1041  846  822 ...  877  888  879]
 [ 942  858  841 ...  901  892  909]
 ...
 [1510 1532 1558 ... 2395 2174 2006]
 [1570 1576 1545 ... 2283 2107 2113]
 [1437 1382 1373 ... 1746 2252 2326]]
[[1258 1016 1111 ... 1246 1246 1259]
 [1342 1093 1058 ... 1218 1243 1242]
 [1245 1126 1093 ... 1224 1239 1268]
 ...
 [2089 2126 2141 ... 2859 2697 2435]
 [2163 2177 2152 ... 2728 2609 2599]


  s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)
