# Burned Area Data Processing Code

This script processes burned area data from NOAA GOES-16 for a given region of interest (ROI) in the Amazon, computes total and average burned area, and saves the results to a CSV file.

1. Get Files Function
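The `get_files` function collects the names of the satellite files available within a specified year, day, and hour range. Hour directories that are missing from the bucket are reported and skipped by a `try-except` block, so a gap in the archive does not stop the run.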

2. Get Indexes and Matrix Function
`get_indexes_v3` creates a 0.5°x0.5° grid within the ROI and calculates the corresponding FEER coefficient for each grid cell, generating a matrix and a mask for valid data points.

3. Process Data Area Function
The `process_data_area` function processes each file by:
- Extracting the date and time.
- Calculating the total and average burned area for the ROI.
- Handling missing data: NaN pixels are excluded from the statistics, and the average is set to -9999 when no valid pixel remains in the ROI.
The results are saved in a CSV file with relevant details.

4. Main Execution Flow
- Latitude and longitude data are retrieved.
- Grid indexes and FEER coefficients are computed.
- A list of files is generated and processed.
- The results are saved in a CSV file.


In [None]:
# Import necessary libraries
# os: Provides functions to interact with the operating system, such as file and directory operations
import os
# BytesIO: Enables reading and writing of binary data in memory as if it were a file
from io import BytesIO
# s3fs: A Pythonic interface to Amazon S3, allowing for easy file operations on S3 buckets
import s3fs
# xarray: A powerful library for working with multi-dimensional arrays, particularly for geospatial and time-series data
import xarray as xr
# numpy: Provides support for large, multi-dimensional arrays and matrices, along with a collection of mathematical functions
import numpy as np
# glob: Used for finding all pathnames matching a specified pattern, useful for file pattern matching in directories
import glob
# pyproj: Provides tools for working with projections and coordinate transformations, such as converting between latitude/longitude and projected coordinates
from pyproj import Proj
# pandas: A data analysis library that provides data structures like DataFrame for handling structured data, useful for working with time-series and tabular data
import pandas as pd
# warnings: Used to issue warnings to the user, often to alert about potential issues or deprecated features
import warnings

In [None]:
# Define input and output directories and files

# datadir: directory where the input data (the FEER coefficient CSV files) is located
datadir = '/home/jovyan/Article_review/Data/'

# Collect, in sorted order, every CSV in the input directory whose name starts with 'FEER'
data = sorted(glob.glob(os.path.join(datadir, 'FEER*.csv')))

# Fail with an explicit message instead of an opaque IndexError on data[0]
if not data:
    raise FileNotFoundError(f"No 'FEER*.csv' file found in {datadir}")

# Read the first matching CSV file (assumed to be the FEER data)
feer_data = pd.read_csv(data[0])

# Output directory, year to process, and output file name.
# It is recommended to process data one year at a time.
outdir = datadir  # Results are written next to the input data
Year = 2022  # Year for processing the data (modify as needed)
# NOTE: the '_150_350' suffix is fixed text; it is not derived from the day range used when listing files
outfile = outdir + 'goes_area_amazon_definitive_box_' + str(Year) + '_150_350.csv'

# Region of Interest (ROI) in degrees (longitude and latitude).
# The default box represents the Amazon region.
minlon, maxlon, minlat, maxlat = -72, -48, -11, -3  # Amazon ROI
# Alternative boxes (commented out examples)
# minlon, maxlon, minlat, maxlat = -57.5, -56.5, -17.5, -16.5  # Cerrado large box
# minlon, maxlon, minlat, maxlat = -72, -48, -9, -6  # Example Amazon box

# Header for the output file
aux1 = 'sat,year,julian,hhmm,code,sum_area,mean_area\n'  # CSV header for output file
header = aux1  # Set the header
outstring = ''  # Placeholder for data that will be written

# Create (or truncate) the output file and write the header; the context
# manager closes the handle even if the write fails
with open(outfile, 'w') as outfn:
    outfn.write(header)

# Initialize S3 file system to access data stored on Amazon S3
fs = s3fs.S3FileSystem(anon=True)  # Anonymous access to S3

In [None]:
# Initialize geometric variables

# This function extracts the geometric values (latitude and longitude) from one file,
# which contains satellite coordinates, and calculates the corresponding matrix of latitudes and longitudes.
# These values will be used for geospatial referencing in the satellite data.

def get_lat_lon(file_system, year=2022, day=200, hour=15):
    """Build latitude and longitude matrices for the ABI-L2-FDCF pixel grid.

    One file from the requested hour directory of the 'noaa-goes16' bucket is
    opened only to read its projection metadata and its x/y coordinate
    vectors; the burned-area content of that file is not used.

    Parameters
    ----------
    file_system : object exposing ``ls(path)`` and ``open(path, mode)``
        File system used both to list the directory and to open the file
        (for example an ``s3fs.S3FileSystem``).
    year, day, hour : int, optional
        Year, day of year and hour of the directory to probe. The defaults
        (2022, 200, 15) reproduce the previously hard-coded directory.

    Returns
    -------
    rlat, rlon : numpy.ndarray
        2-D latitude and longitude matrices, one value per (y, x) pixel.

    Raises
    ------
    FileNotFoundError
        If the directory listing comes back empty.
    """
    hour_dir = ('noaa-goes16/ABI-L2-FDCF/' + str(year) + '/'
                + str(day).zfill(3) + '/' + str(hour).zfill(2) + '/')
    files = file_system.ls(hour_dir)
    if not files:
        raise FileNotFoundError(f'No files listed under {hour_dir}')

    # Use the file system that was passed in (not a module-level global) so
    # that listing and opening always go through the same object.
    with file_system.open(files[0], 'rb') as f:
        ds0 = xr.open_dataset(BytesIO(f.read()), engine='h5netcdf')

    # Projection parameters stored in the file's 'goes_imager_projection' variable
    projection = ds0.goes_imager_projection
    sat_h = projection.perspective_point_height  # Satellite height
    sat_lon = projection.longitude_of_projection_origin  # Longitude of the projection origin
    sat_sweep = projection.sweep_angle_axis  # Sweep angle axis

    # Geostationary projection used to convert (x, y) into (longitude, latitude)
    p = Proj(proj='geos', h=sat_h, lon_0=sat_lon, sweep=sat_sweep)

    # The file's x and y coordinates are multiplied by the satellite height
    # before being handed to the projection
    X = np.array(ds0.x) * sat_h
    Y = np.array(ds0.y) * sat_h

    # Expand the two coordinate vectors into full 2-D grids
    XX, YY = np.meshgrid(X, Y)

    # Inverse projection: projected coordinates -> longitude, latitude
    rlon, rlat = p(XX, YY, inverse=True)

    return rlat, rlon

In [None]:
# Function to collect and save file names in a list for the given period of interest with error handling
# This function iterates over the specified year, day, and hour ranges and collects the corresponding file names
# from the NOAA GOES-16 directory structure, with added error handling for missing files.

def get_files(s_year, e_year, s_day, e_day, s_hour, e_hour):
    """List the ABI-L2-FDCF file names available for a period of interest.

    Every year/day/hour directory in the requested ranges is listed through
    the module-level ``fs`` file system. A directory that does not exist is
    reported and skipped.

    Parameters
    ----------
    s_year, e_year : int
        First and last year to scan (``e_year`` is included).
    s_day, e_day : int
        Day-of-year range; ``e_day`` is NOT included.
    s_hour, e_hour : int
        Hour range; ``e_hour`` is NOT included (use 24 for a full day).

    Returns
    -------
    list of str
        File paths in the order they were listed. The list is empty when
        nothing was found.
    """
    print('Getting file names')
    found = []  # Accumulated file names
    for y in range(s_year, e_year + 1):
        for d in range(s_day, e_day):
            for j in range(s_hour, e_hour):
                # One directory per hour of each day
                hour_dir = ('noaa-goes16/ABI-L2-FDCF/' + str(y) + '/'
                            + str(d).zfill(3) + '/' + str(j).zfill(2) + '/')
                try:
                    # extend() grows the list in place; the earlier
                    # np.append() re-copied every collected name per hour
                    found.extend(fs.ls(hour_dir))
                except FileNotFoundError as e:
                    # Missing hour directory: report it and move on
                    print(f"FileNotFoundError file {hour_dir}: {e}. Skipping this file.")
    return found

In [None]:
# Function to create a 0.5°x0.5° grid over a specified area and calculate the corresponding FEER coefficient for each grid element.
# This function processes the latitude and longitude coordinates within a defined Region of Interest (ROI) and assigns
# the FEER coefficients to a grid with two different grid resolutions (1°x1° and 0.5°x0.5°). 

def get_indexes_v3(min_lon, max_lon, min_lat, max_lat, rlat, rlon, dados_feer):
    """Build the 0.5°x0.5° FEER grid of the ROI and the pixel indexes of each cell.

    Parameters
    ----------
    min_lon, max_lon, min_lat, max_lat : number
        Bounds of the region of interest, in degrees.
    rlat, rlon : numpy.ndarray
        Latitude and longitude of every pixel of the satellite grid.
    dados_feer : pandas.DataFrame
        Table with 'Latitude', 'Longitude' and 'Ce_850' columns.
        NOTE(review): rows are matched by exact equality on 'Latitude', so the
        table is presumably sampled on the 1° cell centres — confirm.

    Returns
    -------
    index_list : list
        ``np.where`` results: element 0 selects every pixel inside the ROI,
        element ``k + 1`` selects the pixels inside row ``k`` of ``matrix``.
    matrix : numpy.ndarray
        One row per 0.5° cell: latitude centre, longitude centre, coefficient.
    """
    lon_span = int(max_lon - min_lon)
    lat_span = int(max_lat - min_lat)

    # Cell centres of the 0.5° grid and of the 1° grid
    fine_lons = np.linspace(min_lon + 0.25, max_lon - 0.25, num=lon_span * 2)
    fine_lats = np.linspace(min_lat + 0.25, max_lat - 0.25, num=lat_span * 2)
    coarse_lons = np.linspace(min_lon + 0.5, max_lon - 0.5, num=lon_span)
    coarse_lats = np.linspace(min_lat + 0.5, max_lat - 0.5, num=lat_span)

    # 1° grid: (lat, lon) pairs row by row, plus the coefficients read from
    # the table for each latitude band
    lon_ok = (dados_feer['Longitude'] <= max_lon) & (dados_feer['Longitude'] >= min_lon)
    coarse_coeffs = []
    coarse_bands = []
    for band_lat in coarse_lats:
        band_coeffs = dados_feer.loc[(dados_feer['Latitude'] == band_lat) & lon_ok,
                                     'Ce_850'].to_numpy()
        coarse_coeffs = np.append(coarse_coeffs, band_coeffs)
        band_lat_col = np.full(len(coarse_lons), band_lat)
        coarse_bands.append(np.column_stack((band_lat_col, coarse_lons)))
    coarse_grid = np.column_stack((np.vstack(coarse_bands), coarse_coeffs))

    # 0.5° grid: (lat, lon) pairs row by row
    fine_bands = [np.column_stack((np.full(len(fine_lons), band_lat), fine_lons))
                  for band_lat in fine_lats]
    fine_grid = np.vstack(fine_bands)

    # Give every 0.5° cell the coefficient of the 1° cell whose centre lies
    # exactly 0.25° away in both latitude and longitude
    inherited = []
    for fine_lat, fine_lon in fine_grid:
        for coarse_lat, coarse_lon, coeff in coarse_grid:
            lat_match = (fine_lat == coarse_lat + 0.25) or (fine_lat == coarse_lat - 0.25)
            lon_match = (fine_lon == coarse_lon + 0.25) or (fine_lon == coarse_lon - 0.25)
            if lat_match and lon_match:
                inherited.append(coeff)

    matrix = np.column_stack((fine_grid, inherited))

    # First entry: all pixels inside the ROI
    inside_roi = (rlat >= min_lat) & (rlat <= max_lat) & (rlon >= min_lon) & (rlon <= max_lon)
    index_list = [np.where(inside_roi)]

    # Following entries: pixels inside each 0.5° cell, in matrix row order
    for cell_lat, cell_lon, _ in matrix:
        inside_cell = ((rlat >= cell_lat - 0.25) & (rlat <= cell_lat + 0.25) &
                       (rlon >= cell_lon - 0.25) & (rlon <= cell_lon + 0.25))
        index_list.append(np.where(inside_cell))

    return index_list, matrix

In [None]:
# Main function to process netCDF files, extract burned area data, and save results in a CSV file.
# The function reads each file, retrieves spatial and temporal information, computes burned area statistics,
# and stores the results in a CSV file for further analysis.
def _fractional_day(julian, hhmm):
    """Return the day of year plus the fraction of the day elapsed at ``hhmm``."""
    # divmod splits 'hhmm' into whole hours and minutes; the former true
    # division (int(hhmm) / 100) leaked the minutes into the hour term and
    # counted them twice.
    hours, minutes = divmod(int(hhmm), 100)
    return int(julian) + hours / 24 + minutes / 60 / 24


def process_data_area(files, indexes):
    """Compute ROI burned-area statistics for each file and write them to the CSV.

    For every file the 'Area' variable is read, restricted to the ROI pixels
    given by ``indexes[0]``, and its non-NaN values are summed and averaged.
    One CSV line is written per file: the start-time stamp taken from the
    file name, year, day of year, hhmm, fractional day, sum and mean.

    Relies on the module-level names ``fs``, ``outdir``, ``outfile`` and
    ``header``.

    Parameters
    ----------
    files : sequence of str
        Paths of the netCDF files, laid out as
        'bucket/product/year/day/hour/file_name'.
    indexes : sequence
        Index list whose first element selects the ROI pixels of the matrix.

    Returns
    -------
    None
        Results are written to ``outfile``; 'Done' is printed at the end.
    """
    # Kept for compatibility: the working directory becomes the output directory
    os.chdir(outdir)

    # 'w' truncates the file, so the header is written again here; without
    # this the header line written when the file was created would be lost.
    with open(outfile, 'w') as outfn:
        outfn.write(header)

        for path in files:
            # Only the download/read step is best-effort: an unreadable file
            # is reported and skipped.
            try:
                with fs.open(path, 'rb') as f:
                    payload = BytesIO(f.read())
                with xr.open_dataset(payload, engine='h5netcdf') as ds:
                    A = np.array(ds.Area)  # Burned-area matrix
            except OSError as error:
                print(error)
                continue

            # The file name is the sixth path component; its first 23
            # characters are the product prefix, and the start-time stamp
            # runs from there to the next underscore.
            prodbase = path.split('/')[5][:23]
            starttime = path.split(prodbase)[1].split('_')[0]
            year, julian, hhmm = starttime[:4], starttime[4:7], starttime[7:11]
            print(f'Processing year: {year}, day: {julian}, hour: {hhmm}', end='\r')

            code = _fractional_day(julian, hhmm)

            # Restrict to the ROI and drop NaN pixels
            A_box_amazon = A[indexes[0]]
            valid_area = A_box_amazon[~np.isnan(A_box_amazon)]

            sum_area = np.sum(valid_area)

            # The mean of an empty selection is NaN and emits a RuntimeWarning
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=RuntimeWarning)
                mean_area = np.mean(valid_area)

            # -9999 marks "no valid pixel in the ROI"
            if np.isnan(mean_area):
                mean_area = -9999

            outfn.write(
                f'{starttime},{year},{julian},{hhmm},{code},{sum_area},{mean_area}\n'
            )

    print('Done')

In [None]:
# Step 1: latitude/longitude matrices of the satellite grid
rlat, rlon = get_lat_lon(fs)

# Step 2: pixel indexes of the ROI and the FEER coefficient matrix
Indexes, M = get_indexes_v3(
    minlon, maxlon, minlat, maxlat,
    rlat, rlon,
    feer_data,
)
print('Got indexes and matrix')

# Step 3: period to process. get_files() excludes the end day and the end
# hour, so 150..350 scans days 150-349 and 0..24 scans hours 0-23.
start_year = end_year = Year  # A single year per run
start_day, end_day = 150, 350
start_hour, end_hour = 0, 24

# Step 4: list the files available for that period
data_list = get_files(
    start_year, end_year,
    start_day, end_day,
    start_hour, end_hour,
)
print('Data listed')

# Step 5: compute the ROI statistics and write them to the CSV file
print('Starting process data')
process_data_area(data_list, Indexes)