In [None]:
import os
import rasterio
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import traceback
import dask.dataframe as dd
from dask import delayed
import re
from datetime import datetime
import numpy as np

# Column names for the raster bands, in band order (band 1 first).
# The list is grouped by name prefix purely for readability; the order is
# what matters because it is applied positionally to the band axis.
band_names = [
    # BC*
    "BCANGSTR", "BCCMASS", "BCEXTTAU", "BCFLUXU", "BCFLUXV", "BCSCATAU",
    "BCSMASS",
    # DMS*
    "DMSCMASS", "DMSSMASS",
    # DU*
    "DUANGSTR", "DUCMASS25", "DUCMASS", "DUEXTT25", "DUEXTTAU", "DUFLUXU",
    "DUFLUXV", "DUSCAT25", "DUSCATAU", "DUSMASS25", "DUSMASS",
    # OC*
    "OCANGSTR", "OCCMASS", "OCEXTTAU", "OCFLUXU", "OCFLUXV", "OCSCATAU",
    "OCSMASS",
    # SO2* / SO4*
    "SO2CMASS", "SO2SMASS",
    "SO4CMASS", "SO4SMASS",
    # SS*
    "SSANGSTR", "SSCMASS25", "SSCMASS", "SSEXTT25", "SSEXTTAU", "SSFLUXU",
    "SSFLUXV", "SSSCAT25", "SSSCATAU", "SSSMASS25", "SSSMASS",
    # SU*
    "SUANGSTR", "SUEXTTAU", "SUFLUXU", "SUFLUXV", "SUSCATAU",
    # TOT*
    "TOTANGSTR", "TOTEXTTAU", "TOTSCATAU",
]

# Function to save the DataFrame to a Parquet file
def save_to_parquet(df, date_time, base_dir="Z:\\Thesis\\Data\\GEE\\MERRA2_aer\\MERRA2_num_data"):
    """Write one hourly DataFrame to a date-partitioned Parquet file.

    The file is placed in ``<base_dir>/<YYYY>/<MM>/<DD>/`` and named
    ``merra2_numerical_data_<YYYY>_<MM>_<DD>_<HH>00.parquet``. Missing
    directories are created.

    Args:
        df: pandas DataFrame to persist.
        date_time: datetime whose year, month, day and hour select the
            output directory and file name.
        base_dir: Root directory of the Parquet tree. Defaults to the
            location this function previously had hard-coded, so existing
            callers are unaffected.

    Returns:
        The path of the written file, or ``None`` if anything failed. Errors
        are reported on stdout with a traceback instead of being raised, so
        callers must check for ``None``.
    """
    try:
        # Zero-padded path components; only whole hours are used, so the
        # minute part of the file name is always "00".
        year_str = str(date_time.year)
        month_str = f"{date_time.month:02d}"
        day_str = f"{date_time.day:02d}"
        hour_str = f"{date_time.hour:02d}"
        minute_str = "00"

        # One directory per day: <base_dir>/<year>/<month>/<day>
        parquet_dir = os.path.join(base_dir, year_str, month_str, day_str)
        os.makedirs(parquet_dir, exist_ok=True)

        parquet_file_path = os.path.join(
            parquet_dir,
            f"merra2_numerical_data_{year_str}_{month_str}_{day_str}_{hour_str}{minute_str}.parquet",
        )

        # Convert DataFrame to PyArrow Table and write to Parquet
        table = pa.Table.from_pandas(df)
        pq.write_table(table, parquet_file_path)

        print(f"Data successfully saved to Parquet file: {parquet_file_path}")
        return parquet_file_path

    except Exception as e:
        # Best-effort by design: one bad file must not abort a multi-year run.
        print(f"Error saving data to Parquet: {e}")
        traceback.print_exc()
        return None

# Build the per-pixel DataFrame for one window of raster data
def _window_to_frame(band_data, transform, date_time):
    """Flatten one window of band data into a one-row-per-pixel DataFrame.

    Args:
        band_data: Array of shape (bands, height, width) as returned by
            ``dataset.read(window=...)``.
        transform: Affine transform of that window (from
            ``dataset.window_transform``).
        date_time: datetime stamped onto every row.

    Returns:
        DataFrame with one column per entry of ``band_names`` plus ``lat``,
        ``lon``, ``time`` and ``hour``.
    """
    n_bands, height, width = band_data.shape

    # Row indices span the height and column indices span the width. With
    # indexing='ij' both grids have shape (height, width), so ravel() visits
    # pixels in the same row-major order as the reshape of band_data below.
    rows, cols = np.meshgrid(np.arange(height), np.arange(width), indexing='ij')

    # xy() returns (x, y); for this data x is used as longitude and y as latitude.
    xs, ys = rasterio.transform.xy(transform, rows.ravel(), cols.ravel(), offset='center')

    # (bands, height*width) -> (height*width, bands): one row per pixel
    chunk_df = pd.DataFrame(band_data.reshape(n_bands, -1).T, columns=band_names)

    # Append latitude, longitude, time, and hour columns
    chunk_df['lat'] = np.asarray(ys).ravel()
    chunk_df['lon'] = np.asarray(xs).ravel()
    chunk_df['time'] = pd.Timestamp(date_time)
    chunk_df['hour'] = date_time.hour

    return chunk_df


# Optimized function to process the TIFF file using Dask and extract spatial info
def process_local_tiff_and_save_parquet(local_file_path):
    """Convert one multi-band TIFF into a per-pixel Parquet file.

    The timestamp is parsed from the file name, every block window of the
    raster is turned into a DataFrame (band values plus pixel-centre
    coordinates), the pieces are combined through Dask, and the result is
    handed to ``save_to_parquet``.

    Args:
        local_file_path: Path of the TIFF file to process.

    Returns:
        None. Progress and errors are printed; failures while reading or
        converting are caught and reported rather than raised so a batch run
        can continue with the next file.
    """
    import threading  # local import: only needed for the read lock below

    file_name = os.path.basename(local_file_path)
    print(f"Processing file: {file_name}")

    # Extract datetime from the filename
    date_time = extract_datetime_from_filename(file_name)
    print(f"Extracted datetime: {date_time}")  # Debugging output

    # Without a timestamp there is no output path, so skip before any I/O.
    if date_time is None:
        print(f"Skipping {file_name}: no datetime could be extracted from the filename")
        return

    try:
        # Open the TIFF file from the local path
        with rasterio.open(local_file_path) as dataset:
            # Fail early with a clear message instead of a pandas shape error.
            if dataset.count != len(band_names):
                raise ValueError(
                    f"{file_name} has {dataset.count} bands but "
                    f"{len(band_names)} band names are defined"
                )

            # Dask may run tasks on several threads; reads on the single
            # shared dataset handle are serialized through this lock.
            read_lock = threading.Lock()

            def load_chunk(window, transform):
                with read_lock:
                    band_data = dataset.read(window=window)
                return _window_to_frame(band_data, transform, date_time)

            # One delayed task per block window of band 1
            tasks = [
                delayed(load_chunk)(window, dataset.window_transform(window))
                for _, window in dataset.block_windows(1)
            ]

            # Combine all the delayed DataFrames into a single Dask DataFrame
            full_ddf = dd.from_delayed(tasks)

            # Compute while the dataset is still open: the tasks read from it.
            final_df = full_ddf.compute()

        # Save the complete DataFrame to Parquet
        parquet_file_path = save_to_parquet(final_df, date_time)

        # save_to_parquet reports its own errors and returns None on failure.
        if parquet_file_path is None:
            print(f"Data for {file_name} was processed but could not be saved")
        else:
            print(f"Data processing complete for {file_name} and saved to {parquet_file_path}")

    except Exception as e:
        print(f"Error processing TIFF file {file_name}: {e}")
        traceback.print_exc()

# Parse the YYYYMMDDHH timestamp embedded in a file name
def extract_datetime_from_filename(file_name):
    """Return the datetime encoded in ``file_name``, or ``None`` on failure.

    The first run of ten consecutive digits is read as YYYYMMDDHH. Any
    problem (no such run, an impossible date, a non-string argument) is
    printed and reported by returning ``None`` instead of raising.
    """
    try:
        found = re.search(r'(\d{4})(\d{2})(\d{2})(\d{2})', file_name)
        if not found:
            raise ValueError(f"No valid datetime found in filename: {file_name}")

        year, month, day, hour = (int(part) for part in found.groups())
        # Debugging print statements
        print(f"Extracted Year: {year}, Month: {month}, Day: {day}, Hour: {hour}")

        # datetime() validates the components and raises on e.g. month 13
        return datetime(year, month, day, hour)
    except Exception as e:
        print(f"Error extracting datetime from filename: {e}")
        return None

# Walk the per-year input folders and convert every TIFF found in them
def process_aod_ducmass_data(start_year, end_year):
    """Process all ``.tif`` files for each year from start_year to end_year.

    Both bounds are inclusive. Each year is expected in its own sub-folder
    of the input root; a missing folder is reported and skipped. Files are
    handled in sorted name order.
    """
    root = "Z:\\Thesis\\Data\\GEE\\MERRA2_aer\\AOD_ducmass"

    for yr in range(start_year, end_year + 1):
        folder = os.path.join(root, str(yr))

        if not os.path.exists(folder):
            print(f"Year directory {folder} not found.")
            continue

        # Keep only TIFF files, in sorted order
        tiff_names = [entry for entry in sorted(os.listdir(folder)) if entry.endswith(".tif")]
        for entry in tiff_names:
            process_local_tiff_and_save_parquet(os.path.join(folder, entry))

# Process data for AOD DUCMASS from 2007 to 2023 (both years inclusive).
# This runs as soon as the cell/module is executed.
process_aod_ducmass_data(2007, 2023)

print("Processing complete for all years!")


Processing file: 2007010100.tif
Extracted Year: 2007, Month: 1, Day: 1, Hour: 0
Extracted datetime: 2007-01-01 00:00:00
Data successfully saved to Parquet file: Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\2007\01\01\merra2_numerical_data_2007_01_01_0000.parquet
Data processing complete for 2007010100.tif and saved to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\2007\01\01\merra2_numerical_data_2007_01_01_0000.parquet
Processing file: 2007010101.tif
Extracted Year: 2007, Month: 1, Day: 1, Hour: 1
Extracted datetime: 2007-01-01 01:00:00
Data successfully saved to Parquet file: Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\2007\01\01\merra2_numerical_data_2007_01_01_0100.parquet
Data processing complete for 2007010101.tif and saved to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\2007\01\01\merra2_numerical_data_2007_01_01_0100.parquet
Processing file: 2007010102.tif
Extracted Year: 2007, Month: 1, Day: 1, Hour: 2
Extracted datetime: 2007-01-01 02:00:00
Data successfully saved to Parqu