In [3]:
import os
import pandas as pd
from pathlib import Path
import glob
from tqdm import tqdm

# Source and destination directories for the 2017 MERRA2 run.
# Raw strings keep the Windows backslashes literal.
MERRA2_INPUT_PATH = r"Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\2017"
MERRA2_OUTPUT_PATH = r"Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2017"

# Variables that receive monthly min/max/mean statistics.
AOD_COLUMNS = [
    "DUSMASS",
    "DUSMASS25",
    "DUCMASS",
    "DUCMASS25",
    "DUFLUXU",
    "DUFLUXV",
]

def process_merra2_files(input_path, output_path):
    """Compute monthly per-grid-cell statistics from MERRA2 parquet files.

    Every ``*.parquet`` file found recursively under ``input_path`` is read,
    reduced to ``time``, ``lat``, ``lon`` plus the ``AOD_COLUMNS`` variables,
    and concatenated. The combined rows are grouped by calendar month
    (labelled with the month-end timestamp), ``lat`` and ``lon``; the min, max
    and mean of each ``AOD_COLUMNS`` variable are written to
    ``MERRA2_2017_monthly_stats.parquet`` inside ``output_path``. The output
    file name is fixed and does not depend on ``input_path``.

    Args:
        input_path: Directory searched recursively for parquet files.
        output_path: Directory receiving the result; created if missing.

    Returns:
        None. The result is written to disk. When no parquet files are found
        a message is printed and no file is written.

    Raises:
        KeyError: If a file lacks ``time`` or any other required column.
    """
    Path(output_path).mkdir(parents=True, exist_ok=True)

    # Find all parquet files in the specified input directory
    parquet_files = sorted(glob.glob(os.path.join(input_path, "**", "*.parquet"), recursive=True))

    # pd.concat raises an opaque ValueError on an empty list, so bail out early.
    if not parquet_files:
        print(f"No parquet files found in {input_path}. Skipping...")
        return

    # Loop-invariant: the columns every input file must provide.
    columns_to_keep = ['time', 'lat', 'lon'] + AOD_COLUMNS

    all_data = []

    print("Reading parquet files and concatenating data...")
    for file in tqdm(parquet_files, desc="Processing files"):
        df = pd.read_parquet(file)

        # Validate up front so the error names the offending file.
        if 'time' not in df.columns:
            raise KeyError(f"'time' column not found in file {file}")
        missing = [col for col in columns_to_keep if col not in df.columns]
        if missing:
            raise KeyError(f"Columns {missing} not found in file {file}")

        # Keep only the required columns and coerce 'time' to datetime without
        # mutating the frame that was just loaded.
        all_data.append(df[columns_to_keep].assign(time=pd.to_datetime(df['time'])))

    # Combine all data into one DataFrame
    combined_df = pd.concat(all_data, ignore_index=True)

    print("Grouping data by month and calculating statistics...")
    # 'ME' (month end) replaces the deprecated 'M' alias; the bins are the same.
    grouped = combined_df.groupby([pd.Grouper(key='time', freq='ME'), 'lat', 'lon'])
    monthly_stats = grouped[AOD_COLUMNS].agg(['min', 'max', 'mean']).reset_index()

    # Flatten the (variable, statistic) column pairs to "variable_statistic";
    # the group keys have an empty second level and keep their plain name.
    monthly_stats.columns = [
        f"{name}_{stat}" if stat else name for name, stat in monthly_stats.columns
    ]

    # Save the output as a parquet file
    output_file = os.path.join(output_path, "MERRA2_2017_monthly_stats.parquet")
    monthly_stats.to_parquet(output_file, index=False)

    print(f"Saved MERRA2 monthly statistics to {output_file}")

if __name__ == "__main__":
    # Run the 2017 aggregation with the paths configured above.
    process_merra2_files(
        input_path=MERRA2_INPUT_PATH,
        output_path=MERRA2_OUTPUT_PATH,
    )


Reading parquet files and concatenating data...


Processing files: 100%|█████████████████████████| 9100/9100 [06:53<00:00, 21.99it/s]


Grouping data by month and calculating statistics...


  grouped = combined_df.groupby([pd.Grouper(key='time', freq='M'), 'lat', 'lon'])


Saved MERRA2 monthly statistics to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2017\MERRA2_2017_monthly_stats.parquet


In [1]:
import os
import pandas as pd
from pathlib import Path
import glob
from tqdm import tqdm

# Root directories; the per-year subfolder is appended by the processing code.
# Raw strings keep the Windows backslashes literal.
MERRA2_BASE_INPUT_PATH = r"Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data"
MERRA2_BASE_OUTPUT_PATH = r"Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats"

# Variables that receive monthly min/max/mean statistics.
AOD_COLUMNS = [
    "DUSMASS",
    "DUSMASS25",
    "DUCMASS",
    "DUCMASS25",
    "DUFLUXU",
    "DUFLUXV",
]

def process_merra2_files_for_year(year, input_base_path, output_base_path):
    """
    Process MERRA2 parquet files for a specific year to calculate monthly statistics.

    Parquet files are searched recursively under ``<input_base_path>/<year>``.
    Each file is reduced to ``time``, ``lat``, ``lon`` plus the ``AOD_COLUMNS``
    variables; all rows are then grouped by calendar month (month-end label),
    ``lat`` and ``lon``, and the min, max and mean of every ``AOD_COLUMNS``
    variable are written to
    ``<output_base_path>/<year>/MERRA2_<year>_monthly_stats.parquet``.

    Args:
        year: Year to process; used as the sub-directory name and in the
            output file name.
        input_base_path: Directory that contains one sub-directory per year.
        output_base_path: Directory under which the per-year output
            directory is created.

    Returns:
        None. The result is written to disk. When no parquet files are found
        a message is printed and no file is written (the output directory is
        still created).

    Raises:
        KeyError: If a file lacks ``time`` or any other required column.
    """
    input_path = os.path.join(input_base_path, str(year))
    output_path = os.path.join(output_base_path, str(year))

    Path(output_path).mkdir(parents=True, exist_ok=True)

    # Find all parquet files in the specified input directory
    parquet_files = sorted(glob.glob(os.path.join(input_path, "**", "*.parquet"), recursive=True))

    if not parquet_files:
        print(f"No parquet files found for the year {year}. Skipping...")
        return

    # Loop-invariant: the columns every input file must provide.
    columns_to_keep = ['time', 'lat', 'lon'] + AOD_COLUMNS

    all_data = []

    print(f"Processing files for the year {year}...")
    for file in tqdm(parquet_files, desc=f"Processing files for {year}"):
        df = pd.read_parquet(file)

        # Validate up front so the error names the offending file.
        if 'time' not in df.columns:
            raise KeyError(f"'time' column not found in file {file}")
        missing = [col for col in columns_to_keep if col not in df.columns]
        if missing:
            raise KeyError(f"Columns {missing} not found in file {file}")

        # Keep only the required columns and coerce 'time' to datetime without
        # mutating the frame that was just loaded.
        all_data.append(df[columns_to_keep].assign(time=pd.to_datetime(df['time'])))

    # Combine all data into one DataFrame
    combined_df = pd.concat(all_data, ignore_index=True)

    print(f"Grouping data by month and calculating statistics for the year {year}...")
    # Group by month-end bins and grid cell, then aggregate each variable
    grouped = combined_df.groupby([pd.Grouper(key='time', freq='ME'), 'lat', 'lon'])
    monthly_stats = grouped[AOD_COLUMNS].agg(['min', 'max', 'mean']).reset_index()

    # Flatten the (variable, statistic) column pairs to "variable_statistic";
    # the group keys have an empty second level and keep their plain name.
    monthly_stats.columns = [
        f"{name}_{stat}" if stat else name for name, stat in monthly_stats.columns
    ]

    # Save the output as a parquet file
    output_file = os.path.join(output_path, f"MERRA2_{year}_monthly_stats.parquet")
    monthly_stats.to_parquet(output_file, index=False)

    print(f"Saved MERRA2 monthly statistics for {year} to {output_file}")

def process_merra2_files_for_years(start_year, end_year, input_base_path, output_base_path):
    """
    Run the per-year monthly-statistics job for every year in an inclusive range.

    Each year from ``start_year`` through ``end_year`` is handed to
    ``process_merra2_files_for_year`` together with the two base paths. An
    exception raised for one year is printed and the loop moves on, so a
    single failing year does not stop the remaining ones.
    """
    stop_year = end_year + 1  # range() excludes its upper bound
    for year in range(start_year, stop_year):
        try:
            process_merra2_files_for_year(year, input_base_path, output_base_path)
        except Exception as e:
            # Best-effort batch run: report the failure and keep going.
            print(f"An error occurred while processing the year {year}: {e}")
            continue

if __name__ == "__main__":
    # Inclusive range of years to aggregate.
    START_YEAR, END_YEAR = 1980, 2023
    process_merra2_files_for_years(
        start_year=START_YEAR,
        end_year=END_YEAR,
        input_base_path=MERRA2_BASE_INPUT_PATH,
        output_base_path=MERRA2_BASE_OUTPUT_PATH,
    )


Processing files for the year 1980...


Processing files for 1980: 100%|████████████████| 9150/9150 [05:45<00:00, 26.48it/s]


Grouping data by month and calculating statistics for the year 1980...
Saved MERRA2 monthly statistics for 1980 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1980\MERRA2_1980_monthly_stats.parquet
Processing files for the year 1981...


Processing files for 1981: 100%|████████████████| 9125/9125 [05:46<00:00, 26.37it/s]


Grouping data by month and calculating statistics for the year 1981...
Saved MERRA2 monthly statistics for 1981 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1981\MERRA2_1981_monthly_stats.parquet
Processing files for the year 1982...


Processing files for 1982: 100%|████████████████| 9125/9125 [05:45<00:00, 26.44it/s]


Grouping data by month and calculating statistics for the year 1982...
Saved MERRA2 monthly statistics for 1982 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1982\MERRA2_1982_monthly_stats.parquet
Processing files for the year 1983...


Processing files for 1983: 100%|████████████████| 9125/9125 [06:01<00:00, 25.23it/s]


Grouping data by month and calculating statistics for the year 1983...
Saved MERRA2 monthly statistics for 1983 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1983\MERRA2_1983_monthly_stats.parquet
Processing files for the year 1984...


Processing files for 1984: 100%|████████████████| 9150/9150 [06:08<00:00, 24.80it/s]


Grouping data by month and calculating statistics for the year 1984...
Saved MERRA2 monthly statistics for 1984 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1984\MERRA2_1984_monthly_stats.parquet
Processing files for the year 1985...


Processing files for 1985: 100%|████████████████| 9125/9125 [06:18<00:00, 24.10it/s]


Grouping data by month and calculating statistics for the year 1985...
Saved MERRA2 monthly statistics for 1985 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1985\MERRA2_1985_monthly_stats.parquet
Processing files for the year 1986...


Processing files for 1986: 100%|████████████████| 9125/9125 [06:02<00:00, 25.15it/s]


Grouping data by month and calculating statistics for the year 1986...
Saved MERRA2 monthly statistics for 1986 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1986\MERRA2_1986_monthly_stats.parquet
Processing files for the year 1987...


Processing files for 1987: 100%|████████████████| 9125/9125 [05:47<00:00, 26.27it/s]


Grouping data by month and calculating statistics for the year 1987...
Saved MERRA2 monthly statistics for 1987 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1987\MERRA2_1987_monthly_stats.parquet
Processing files for the year 1988...


Processing files for 1988: 100%|████████████████| 9150/9150 [06:14<00:00, 24.44it/s]


Grouping data by month and calculating statistics for the year 1988...
Saved MERRA2 monthly statistics for 1988 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1988\MERRA2_1988_monthly_stats.parquet
Processing files for the year 1989...


Processing files for 1989: 100%|████████████████| 9124/9124 [05:52<00:00, 25.90it/s]


Grouping data by month and calculating statistics for the year 1989...
Saved MERRA2 monthly statistics for 1989 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1989\MERRA2_1989_monthly_stats.parquet
Processing files for the year 1990...


Processing files for 1990: 100%|████████████████| 9125/9125 [05:51<00:00, 25.97it/s]


Grouping data by month and calculating statistics for the year 1990...
Saved MERRA2 monthly statistics for 1990 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1990\MERRA2_1990_monthly_stats.parquet
Processing files for the year 1991...


Processing files for 1991: 100%|████████████████| 9125/9125 [06:06<00:00, 24.91it/s]


Grouping data by month and calculating statistics for the year 1991...
Saved MERRA2 monthly statistics for 1991 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1991\MERRA2_1991_monthly_stats.parquet
Processing files for the year 1992...


Processing files for 1992: 100%|████████████████| 9150/9150 [06:00<00:00, 25.35it/s]


Grouping data by month and calculating statistics for the year 1992...
Saved MERRA2 monthly statistics for 1992 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1992\MERRA2_1992_monthly_stats.parquet
Processing files for the year 1993...


Processing files for 1993: 100%|████████████████| 9125/9125 [06:17<00:00, 24.16it/s]


Grouping data by month and calculating statistics for the year 1993...
Saved MERRA2 monthly statistics for 1993 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1993\MERRA2_1993_monthly_stats.parquet
Processing files for the year 1994...


Processing files for 1994: 100%|████████████████| 9125/9125 [06:15<00:00, 24.29it/s]


Grouping data by month and calculating statistics for the year 1994...
Saved MERRA2 monthly statistics for 1994 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1994\MERRA2_1994_monthly_stats.parquet
Processing files for the year 1995...


Processing files for 1995: 100%|████████████████| 9125/9125 [06:33<00:00, 23.22it/s]


Grouping data by month and calculating statistics for the year 1995...
Saved MERRA2 monthly statistics for 1995 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1995\MERRA2_1995_monthly_stats.parquet
Processing files for the year 1996...


Processing files for 1996: 100%|████████████████| 9150/9150 [06:19<00:00, 24.09it/s]


Grouping data by month and calculating statistics for the year 1996...
Saved MERRA2 monthly statistics for 1996 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1996\MERRA2_1996_monthly_stats.parquet
Processing files for the year 1997...


Processing files for 1997: 100%|████████████████| 9125/9125 [06:08<00:00, 24.79it/s]


Grouping data by month and calculating statistics for the year 1997...
Saved MERRA2 monthly statistics for 1997 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1997\MERRA2_1997_monthly_stats.parquet
Processing files for the year 1998...


Processing files for 1998: 100%|████████████████| 9124/9124 [06:07<00:00, 24.82it/s]


Grouping data by month and calculating statistics for the year 1998...
Saved MERRA2 monthly statistics for 1998 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1998\MERRA2_1998_monthly_stats.parquet
Processing files for the year 1999...


Processing files for 1999: 100%|████████████████| 9125/9125 [06:14<00:00, 24.35it/s]


Grouping data by month and calculating statistics for the year 1999...
Saved MERRA2 monthly statistics for 1999 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\1999\MERRA2_1999_monthly_stats.parquet
Processing files for the year 2000...


Processing files for 2000: 100%|████████████████| 9150/9150 [06:01<00:00, 25.29it/s]


Grouping data by month and calculating statistics for the year 2000...
Saved MERRA2 monthly statistics for 2000 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2000\MERRA2_2000_monthly_stats.parquet
Processing files for the year 2001...


Processing files for 2001: 100%|████████████████| 9124/9124 [06:09<00:00, 24.67it/s]


Grouping data by month and calculating statistics for the year 2001...
Saved MERRA2 monthly statistics for 2001 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2001\MERRA2_2001_monthly_stats.parquet
Processing files for the year 2002...


Processing files for 2002: 100%|████████████████| 9122/9122 [06:20<00:00, 23.98it/s]


Grouping data by month and calculating statistics for the year 2002...
Saved MERRA2 monthly statistics for 2002 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2002\MERRA2_2002_monthly_stats.parquet
Processing files for the year 2003...


Processing files for 2003: 100%|████████████████| 9125/9125 [06:17<00:00, 24.18it/s]


Grouping data by month and calculating statistics for the year 2003...
Saved MERRA2 monthly statistics for 2003 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2003\MERRA2_2003_monthly_stats.parquet
Processing files for the year 2004...


Processing files for 2004: 100%|████████████████| 9150/9150 [06:27<00:00, 23.64it/s]


Grouping data by month and calculating statistics for the year 2004...
Saved MERRA2 monthly statistics for 2004 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2004\MERRA2_2004_monthly_stats.parquet
Processing files for the year 2005...


Processing files for 2005: 100%|████████████████| 9125/9125 [06:16<00:00, 24.24it/s]


Grouping data by month and calculating statistics for the year 2005...
Saved MERRA2 monthly statistics for 2005 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2005\MERRA2_2005_monthly_stats.parquet
Processing files for the year 2006...


Processing files for 2006: 100%|████████████████| 9125/9125 [05:43<00:00, 26.55it/s]


Grouping data by month and calculating statistics for the year 2006...
Saved MERRA2 monthly statistics for 2006 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2006\MERRA2_2006_monthly_stats.parquet
Processing files for the year 2007...


Processing files for 2007: 100%|████████████████| 9125/9125 [05:55<00:00, 25.68it/s]


Grouping data by month and calculating statistics for the year 2007...
Saved MERRA2 monthly statistics for 2007 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2007\MERRA2_2007_monthly_stats.parquet
Processing files for the year 2008...


Processing files for 2008: 100%|████████████████| 9150/9150 [05:46<00:00, 26.39it/s]


Grouping data by month and calculating statistics for the year 2008...
Saved MERRA2 monthly statistics for 2008 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2008\MERRA2_2008_monthly_stats.parquet
Processing files for the year 2009...


Processing files for 2009: 100%|████████████████| 9125/9125 [05:46<00:00, 26.32it/s]


Grouping data by month and calculating statistics for the year 2009...
Saved MERRA2 monthly statistics for 2009 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2009\MERRA2_2009_monthly_stats.parquet
Processing files for the year 2010...


Processing files for 2010: 100%|████████████████| 9125/9125 [06:00<00:00, 25.35it/s]


Grouping data by month and calculating statistics for the year 2010...
Saved MERRA2 monthly statistics for 2010 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2010\MERRA2_2010_monthly_stats.parquet
Processing files for the year 2011...


Processing files for 2011: 100%|████████████████| 9125/9125 [05:52<00:00, 25.89it/s]


Grouping data by month and calculating statistics for the year 2011...
Saved MERRA2 monthly statistics for 2011 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2011\MERRA2_2011_monthly_stats.parquet
Processing files for the year 2012...


Processing files for 2012: 100%|████████████████| 9150/9150 [05:36<00:00, 27.20it/s]


Grouping data by month and calculating statistics for the year 2012...
Saved MERRA2 monthly statistics for 2012 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2012\MERRA2_2012_monthly_stats.parquet
Processing files for the year 2013...


Processing files for 2013: 100%|████████████████| 9125/9125 [05:43<00:00, 26.59it/s]


Grouping data by month and calculating statistics for the year 2013...
Saved MERRA2 monthly statistics for 2013 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2013\MERRA2_2013_monthly_stats.parquet
Processing files for the year 2014...


Processing files for 2014: 100%|████████████████| 9125/9125 [05:35<00:00, 27.17it/s]


Grouping data by month and calculating statistics for the year 2014...
Saved MERRA2 monthly statistics for 2014 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2014\MERRA2_2014_monthly_stats.parquet
Processing files for the year 2015...


Processing files for 2015: 100%|████████████████| 9124/9124 [05:52<00:00, 25.90it/s]


Grouping data by month and calculating statistics for the year 2015...
Saved MERRA2 monthly statistics for 2015 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2015\MERRA2_2015_monthly_stats.parquet
Processing files for the year 2016...


Processing files for 2016: 100%|████████████████| 9124/9124 [05:42<00:00, 26.64it/s]


Grouping data by month and calculating statistics for the year 2016...
Saved MERRA2 monthly statistics for 2016 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2016\MERRA2_2016_monthly_stats.parquet
Processing files for the year 2017...


Processing files for 2017: 100%|████████████████| 9100/9100 [05:52<00:00, 25.82it/s]


Grouping data by month and calculating statistics for the year 2017...
Saved MERRA2 monthly statistics for 2017 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2017\MERRA2_2017_monthly_stats.parquet
Processing files for the year 2018...


Processing files for 2018: 100%|████████████████| 9100/9100 [05:49<00:00, 26.03it/s]


Grouping data by month and calculating statistics for the year 2018...
Saved MERRA2 monthly statistics for 2018 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2018\MERRA2_2018_monthly_stats.parquet
Processing files for the year 2019...


Processing files for 2019: 100%|████████████████| 9100/9100 [05:54<00:00, 25.65it/s]


Grouping data by month and calculating statistics for the year 2019...
Saved MERRA2 monthly statistics for 2019 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2019\MERRA2_2019_monthly_stats.parquet
Processing files for the year 2020...


Processing files for 2020: 100%|████████████████| 9125/9125 [05:48<00:00, 26.17it/s]


Grouping data by month and calculating statistics for the year 2020...
Saved MERRA2 monthly statistics for 2020 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2020\MERRA2_2020_monthly_stats.parquet
Processing files for the year 2021...


Processing files for 2021: 100%|████████████████| 9100/9100 [06:09<00:00, 24.61it/s]


Grouping data by month and calculating statistics for the year 2021...
Saved MERRA2 monthly statistics for 2021 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2021\MERRA2_2021_monthly_stats.parquet
Processing files for the year 2022...


Processing files for 2022: 100%|████████████████| 9100/9100 [06:02<00:00, 25.11it/s]


Grouping data by month and calculating statistics for the year 2022...
Saved MERRA2 monthly statistics for 2022 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2022\MERRA2_2022_monthly_stats.parquet
Processing files for the year 2023...


Processing files for 2023: 100%|████████████████| 9100/9100 [06:08<00:00, 24.67it/s]


Grouping data by month and calculating statistics for the year 2023...
Saved MERRA2 monthly statistics for 2023 to Z:\Thesis\Data\GEE\MERRA2_aer\MERRA2_num_data\Monthly_stats\2023\MERRA2_2023_monthly_stats.parquet
