In [None]:
# Convert ERA5 surface NetCDF files to Parquet.
# Adds a pressure_level column set to 1013 mb for surface (sfc) data.
# TODO: remove the extra 'number' and 'expver' columns if they exist (not done in this cell yet).

# Import libs
import os
import xarray as xr
import pandas as pd
from tqdm import tqdm

# --- Run configuration ---
# Root folder that holds the source NetCDF files
root_dir = r"Z:\Thesis\Data\Met\ERA5"

# Root folder that receives the Parquet output (created if it does not exist)
output_root_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
os.makedirs(output_root_dir, exist_ok=True)

# Years to process, as strings (2019 through 2023)
years = list(map(str, range(2019, 2024)))

# Countries to process; uncomment an entry to include it in the run
countries = [
    #"Bahrain",
    "Kuwait",
    #"Oman",
    #"Qatar",
    #"Saudi Arabia",
    #"United Arab Emirates",
    #"Yemen"
]

# Data subfolders to process
subfolders = ["surface"]

# Collect the full path of every NetCDF (.nc) file found under
# <root_dir>/<year>/<country>/<subfolder> for the configured selections.
netcdf_files = []

for year in years:
    year_dir = os.path.join(root_dir, year)
    for country in countries:
        country_dir = os.path.join(year_dir, country)
        for subfolder in subfolders:
            subfolder_dir = os.path.join(country_dir, subfolder)
            # Skip combinations that have no data folder. isdir (rather than
            # exists) also skips a stray regular file of that name, which
            # os.listdir would otherwise fail on.
            if not os.path.isdir(subfolder_dir):
                continue
            # os.listdir returns names in arbitrary order; sort them so files
            # are processed in the same order on every run.
            netcdf_files.extend(
                os.path.join(subfolder_dir, file_name)
                for file_name in sorted(os.listdir(subfolder_dir))
                if file_name.endswith('.nc')
            )


# Convert every collected NetCDF file to a Parquet file, mirroring the input
# folder layout (relative to root_dir) under output_root_dir. A file that
# fails at any step is reported and skipped; the run continues.
for netcdf_path in tqdm(netcdf_files, desc="Processing files"):
    tqdm.write(f"Processing file: {netcdf_path}")

    # Open the NetCDF file with xarray
    try:
        ds = xr.open_dataset(netcdf_path)
    except Exception as e:
        # The original message used "{netcdf_path: {e}}", which treats the
        # error text as a format spec and raises inside the handler.
        tqdm.write(f"Error reading {netcdf_path}: {e}")
        continue

    # Flatten the dataset into a DataFrame; the `with` block closes the
    # dataset afterwards so file handles do not pile up across the run.
    try:
        with ds:
            df = ds.to_dataframe().reset_index()
    except Exception as e:
        tqdm.write(f"Error converting dataset to DataFrame for {netcdf_path}: {e}")
        continue

    # The time coordinate is named either 'time' or 'valid_time'
    time_column = next((c for c in ('time', 'valid_time') if c in df.columns), None)
    if time_column is None:
        tqdm.write(f"No 'time' or 'valid_time' column found in {netcdf_path}. Skipping.")
        continue
    # tqdm.write (instead of print) keeps the progress bar intact
    tqdm.write(f"Before conversion - Data type of '{time_column}': {df[time_column].dtype}")
    tqdm.write(f"Sample '{time_column}' values:\n{df[time_column].head()}")

    # Files under a 'surface' folder get a constant pressure_level of 1013.0,
    # placed immediately after the time column.
    if 'surface' in os.path.normpath(netcdf_path).split(os.sep):
        df['pressure_level'] = 1013.0
        cols = [c for c in df.columns if c != 'pressure_level']
        cols.insert(cols.index(time_column) + 1, 'pressure_level')
        df = df[cols]

    # If the time column holds integers, treat them as Unix seconds and
    # convert them to timezone-aware (UTC) datetimes.
    if pd.api.types.is_integer_dtype(df[time_column]):
        try:
            df[time_column] = pd.to_datetime(df[time_column], unit='s', utc=True)
        except Exception as e:
            tqdm.write(f"Error converting '{time_column}' to datetime for {netcdf_path}: {e}")
            continue

    # Mirror the input folder structure under the output root
    relative_dir = os.path.dirname(os.path.relpath(netcdf_path, root_dir))
    output_dir = os.path.join(output_root_dir, relative_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Swap only the trailing extension; str.replace would also rewrite any
    # '.nc' that appears earlier in the file name.
    stem = os.path.splitext(os.path.basename(netcdf_path))[0]
    parquet_path = os.path.join(output_dir, stem + '.parquet')

    # Write the DataFrame to a Parquet file
    try:
        df.to_parquet(parquet_path, index=False)
        tqdm.write(f"Successfully converted {netcdf_path} to {parquet_path}")
    except Exception as e:
        tqdm.write(f"Error writing Parquet file for {netcdf_path}: {e}")

In [None]:
# Convert ERA5 pressure-level NetCDF files to Parquet.
# A pressure_level column set to 1013 mb is added only for files under a 'surface' folder.
# TODO: remove the extra 'number' and 'expver' columns if they exist (not done in this cell yet).

# Import libs
import os
import xarray as xr
import pandas as pd
from tqdm import tqdm

# --- Run configuration ---
# Root folder that holds the source NetCDF files
root_dir = r"Z:\Thesis\Data\Met\ERA5"

# Root folder that receives the Parquet output (created if it does not exist)
output_root_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
os.makedirs(output_root_dir, exist_ok=True)

# Years to process, as strings (2017 only)
years = list(map(str, range(2017, 2018)))

# Countries to process
countries = [
    "Bahrain",
    "Kuwait",
    "Oman",
    "Qatar",
    "Saudi Arabia",
    "United Arab Emirates",
    "Yemen",
]

# Data subfolders to process
subfolders = ["pressure"]

# Collect the full path of every NetCDF (.nc) file found under
# <root_dir>/<year>/<country>/<subfolder> for the configured selections.
netcdf_files = []

for year in years:
    year_dir = os.path.join(root_dir, year)
    for country in countries:
        country_dir = os.path.join(year_dir, country)
        for subfolder in subfolders:
            subfolder_dir = os.path.join(country_dir, subfolder)
            # Skip combinations that have no data folder. isdir (rather than
            # exists) also skips a stray regular file of that name, which
            # os.listdir would otherwise fail on.
            if not os.path.isdir(subfolder_dir):
                continue
            # os.listdir returns names in arbitrary order; sort them so files
            # are processed in the same order on every run.
            netcdf_files.extend(
                os.path.join(subfolder_dir, file_name)
                for file_name in sorted(os.listdir(subfolder_dir))
                if file_name.endswith('.nc')
            )


# Convert every collected NetCDF file to a Parquet file, mirroring the input
# folder layout (relative to root_dir) under output_root_dir. A file that
# fails at any step is reported and skipped; the run continues.
for netcdf_path in tqdm(netcdf_files, desc="Processing files"):
    tqdm.write(f"Processing file: {netcdf_path}")

    # Open the NetCDF file with xarray
    try:
        ds = xr.open_dataset(netcdf_path)
    except Exception as e:
        # The original message used "{netcdf_path: {e}}", which treats the
        # error text as a format spec and raises inside the handler.
        tqdm.write(f"Error reading {netcdf_path}: {e}")
        continue

    # Flatten the dataset into a DataFrame. This step used to sit inside the
    # `except` block after `continue`, so it never ran and `df` was either
    # undefined or left over from a previous cell. The `with` block closes
    # the dataset once the data has been copied out.
    try:
        with ds:
            df = ds.to_dataframe().reset_index()
    except Exception as e:
        tqdm.write(f"Error converting dataset to DataFrame for {netcdf_path}: {e}")
        continue

    # The time coordinate is named either 'time' or 'valid_time'
    time_column = next((c for c in ('time', 'valid_time') if c in df.columns), None)
    if time_column is None:
        tqdm.write(f"No 'time' or 'valid_time' column found in {netcdf_path}. Skipping.")
        continue
    # tqdm.write (instead of print) keeps the progress bar intact
    tqdm.write(f"Before conversion - Data type of '{time_column}': {df[time_column].dtype}")
    tqdm.write(f"Sample '{time_column}' values:\n{df[time_column].head()}")

    # Files under a 'surface' folder get a constant pressure_level of 1013.0,
    # placed immediately after the time column.
    if 'surface' in os.path.normpath(netcdf_path).split(os.sep):
        df['pressure_level'] = 1013.0
        cols = [c for c in df.columns if c != 'pressure_level']
        cols.insert(cols.index(time_column) + 1, 'pressure_level')
        df = df[cols]

    # If the time column holds integers, treat them as Unix seconds and
    # convert them to timezone-aware (UTC) datetimes.
    if pd.api.types.is_integer_dtype(df[time_column]):
        try:
            df[time_column] = pd.to_datetime(df[time_column], unit='s', utc=True)
        except Exception as e:
            tqdm.write(f"Error converting '{time_column}' to datetime for {netcdf_path}: {e}")
            continue

    # Mirror the input folder structure under the output root
    relative_dir = os.path.dirname(os.path.relpath(netcdf_path, root_dir))
    output_dir = os.path.join(output_root_dir, relative_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Swap only the trailing extension; str.replace would also rewrite any
    # '.nc' that appears earlier in the file name.
    stem = os.path.splitext(os.path.basename(netcdf_path))[0]
    parquet_path = os.path.join(output_dir, stem + '.parquet')

    # Write the DataFrame to a Parquet file
    try:
        df.to_parquet(parquet_path, index=False)
        tqdm.write(f"Successfully converted {netcdf_path} to {parquet_path}")
    except Exception as e:
        tqdm.write(f"Error writing Parquet file for {netcdf_path}: {e}")

In [None]:
import os
import xarray as xr
import pandas as pd
from tqdm import tqdm

# --- Run configuration ---
# Root folder that holds the source NetCDF files
root_dir = r"Z:\Thesis\Data\Met\ERA5"

# Root folder that receives the Parquet output (created if it does not exist)
output_root_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
os.makedirs(output_root_dir, exist_ok=True)

# Years to process, as strings (2017 only)
years = list(map(str, range(2017, 2018)))

# Countries to process
countries = [
    "Bahrain",
    "Kuwait",
    "Oman",
    "Qatar",
    "Saudi Arabia",
    "United Arab Emirates",
    "Yemen",
]

# Data subfolders to process
subfolders = [
    "surface",
    "pressure",
]

# Full paths of every NetCDF (.nc) file found under
# <root_dir>/<year>/<country>/<subfolder> for the configured selections.
netcdf_files = []

# (file path, error message) pairs for files that could not be converted
skipped_files = []

for year in years:
    year_dir = os.path.join(root_dir, year)
    for country in countries:
        country_dir = os.path.join(year_dir, country)
        for subfolder in subfolders:
            subfolder_dir = os.path.join(country_dir, subfolder)
            # Skip combinations that have no data folder. isdir (rather than
            # exists) also skips a stray regular file of that name, which
            # os.listdir would otherwise fail on.
            if not os.path.isdir(subfolder_dir):
                continue
            # os.listdir returns names in arbitrary order; sort them so files
            # are processed in the same order on every run.
            netcdf_files.extend(
                os.path.join(subfolder_dir, file_name)
                for file_name in sorted(os.listdir(subfolder_dir))
                if file_name.endswith('.nc')
            )

# Convert every collected NetCDF file to a Parquet file, mirroring the input
# folder layout (relative to root_dir) under output_root_dir. Any failure is
# recorded in skipped_files and the run moves on to the next file.
for netcdf_path in tqdm(netcdf_files, desc="Processing files"):
    try:
        tqdm.write(f"Processing file: {netcdf_path}")

        # Read the file and flatten it into a DataFrame. The context manager
        # closes the dataset afterwards so file handles do not pile up.
        with xr.open_dataset(netcdf_path) as ds:
            df = ds.to_dataframe().reset_index()

        # The time coordinate is named either 'time' or 'valid_time'
        time_column = next((c for c in ('time', 'valid_time') if c in df.columns), None)
        if time_column is None:
            raise ValueError("No 'time' or 'valid_time' column found")

        # Files under a 'surface' folder get a constant pressure_level of
        # 1013.0, placed immediately after the time column.
        if 'surface' in os.path.normpath(netcdf_path).split(os.sep):
            df['pressure_level'] = 1013.0
            cols = [c for c in df.columns if c != 'pressure_level']
            cols.insert(cols.index(time_column) + 1, 'pressure_level')
            df = df[cols]

        # Mirror the input folder structure under the output root
        relative_dir = os.path.dirname(os.path.relpath(netcdf_path, root_dir))
        output_dir = os.path.join(output_root_dir, relative_dir)
        os.makedirs(output_dir, exist_ok=True)

        # Swap only the trailing extension; str.replace would also rewrite
        # any '.nc' that appears earlier in the file name.
        stem = os.path.splitext(os.path.basename(netcdf_path))[0]
        parquet_path = os.path.join(output_dir, stem + '.parquet')

        # Write the DataFrame to a Parquet file
        df.to_parquet(parquet_path, index=False)
        tqdm.write(f"Successfully converted {netcdf_path} to {parquet_path}")

    except Exception as e:
        # Remember the file and its error for the end-of-run summary
        skipped_files.append((netcdf_path, str(e)))
        tqdm.write(f"Error processing {netcdf_path}: {e}")

# Report the outcome of the run: either confirm that nothing was skipped, or
# list every skipped file with its error and save that list as a CSV.
if not skipped_files:
    print("\nAll files processed successfully!")
else:
    print("\nSummary of Skipped Files:")
    for skipped_path, message in skipped_files:
        print(f"{skipped_path}: {message}")

    # Save the skipped-file list in the output root for later investigation
    skipped_csv_path = os.path.join(output_root_dir, "skipped_files.csv")
    skipped_df = pd.DataFrame(skipped_files, columns=["File Path", "Error Message"])
    skipped_df.to_csv(skipped_csv_path, index=False)
    print(f"Skipped files list saved to {skipped_csv_path}")


In [None]:
import os
import xarray as xr
import pandas as pd
from tqdm import tqdm

# --- Run configuration ---
# Root folder that holds the source NetCDF files
root_dir = r"Z:\Thesis\Data\Met\ERA5"

# Root folder that receives the Parquet output (created if it does not exist)
output_root_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
os.makedirs(output_root_dir, exist_ok=True)

# Years to process, as strings (1980 only)
years = list(map(str, range(1980, 1981)))

# Countries to process; uncomment an entry to include it in the run
countries = [
    #"Bahrain",
    #"Kuwait",
    #"Oman",
    #"Qatar",
    "Saudi Arabia",
    "United Arab Emirates",
    #"Yemen"
]

# Data subfolders to process
subfolders = [
    "surface",
    "pressure",
]

# Function to adjust country names for output folder
def adjust_country_name(country):
    """Return the output-folder spelling of a country name.

    "Saudi Arabia" and "United Arab Emirates" are returned with their spaces
    replaced by underscores; any other value is returned unchanged.
    """
    folder_names = {
        "Saudi Arabia": "Saudi_Arabia",
        "United Arab Emirates": "United_Arab_Emirates",
    }
    return folder_names.get(country, country)

# Full paths of every NetCDF (.nc) file found under
# <root_dir>/<year>/<country>/<subfolder> for the configured selections.
netcdf_files = []

# (file path, error message) pairs for files that could not be converted
skipped_files = []

for year in years:
    year_dir = os.path.join(root_dir, year)
    for country in countries:
        country_dir = os.path.join(year_dir, country)
        for subfolder in subfolders:
            subfolder_dir = os.path.join(country_dir, subfolder)
            # Skip combinations that have no data folder. isdir (rather than
            # exists) also skips a stray regular file of that name, which
            # os.listdir would otherwise fail on.
            if not os.path.isdir(subfolder_dir):
                continue
            # os.listdir returns names in arbitrary order; sort them so files
            # are processed in the same order on every run.
            netcdf_files.extend(
                os.path.join(subfolder_dir, file_name)
                for file_name in sorted(os.listdir(subfolder_dir))
                if file_name.endswith('.nc')
            )

# Convert every collected NetCDF file to a Parquet file under output_root_dir,
# standardising the vertical-level column name to 'level' and using the
# underscore spelling of the country folder. Any failure is recorded in
# skipped_files and the run moves on to the next file.
for netcdf_path in tqdm(netcdf_files, desc="Processing files"):
    try:
        tqdm.write(f"Processing file: {netcdf_path}")

        # Read the file and flatten it into a DataFrame. The context manager
        # closes the dataset afterwards so file handles do not pile up.
        with xr.open_dataset(netcdf_path) as ds:
            df = ds.to_dataframe().reset_index()

        # A file counts as surface data when it sits under a 'surface' folder
        is_surface = 'surface' in os.path.normpath(netcdf_path).split(os.sep)

        # Standardise the level column name to 'level'
        if 'pressure_level' in df.columns:
            df = df.rename(columns={'pressure_level': 'level'})
            tqdm.write(f"Renamed 'pressure_level' to 'level' for file: {netcdf_path}")
        elif 'level' not in df.columns:
            # Neither column exists: surface files get a constant level,
            # anything else is treated as a broken pressure file.
            if is_surface:
                df['level'] = 1013.0
                tqdm.write(f"Added 'level' column with value 1013.0 for surface file: {netcdf_path}")
            else:
                raise ValueError("'level' column is missing in the pressure file")

        # Verify that all expected levels are present. This only applies to
        # pressure files: surface files carry the single constant 1013.0 and
        # would otherwise trigger the warning on every file.
        expected_levels = [1000, 925, 850, 700, 500, 300, 200, 100, 50, 10]
        if not is_surface:
            missing_levels = set(expected_levels) - set(df['level'].unique())
            if missing_levels:
                tqdm.write(f"Warning: Missing levels in {netcdf_path}: {missing_levels}")

        # The time coordinate is named either 'time' or 'valid_time'
        time_column = next((c for c in ('time', 'valid_time') if c in df.columns), None)
        if time_column is None:
            raise ValueError("No 'time' or 'valid_time' column found")

        # Mirror the input folder structure, with the country folder (second
        # path component below root_dir) renamed for the output tree.
        parts = os.path.relpath(netcdf_path, root_dir).split(os.sep)
        if len(parts) >= 2:
            parts[1] = adjust_country_name(parts[1])
        adjusted_relative_path = os.path.join(*parts)
        output_dir = os.path.join(output_root_dir, os.path.dirname(adjusted_relative_path))
        os.makedirs(output_dir, exist_ok=True)

        # Swap only the trailing extension; str.replace would also rewrite
        # any '.nc' that appears earlier in the file name.
        stem = os.path.splitext(os.path.basename(netcdf_path))[0]
        parquet_path = os.path.join(output_dir, stem + '.parquet')

        # Write the DataFrame to a Parquet file
        df.to_parquet(parquet_path, index=False)
        tqdm.write(f"Successfully converted {netcdf_path} to {parquet_path}")

    except Exception as e:
        # Remember the file and its error for the end-of-run summary
        skipped_files.append((netcdf_path, str(e)))
        tqdm.write(f"Error processing {netcdf_path}: {e}")

# Report the outcome of the run: either confirm that nothing was skipped, or
# list every skipped file with its error and save that list as a CSV.
if not skipped_files:
    print("\nAll files processed successfully!")
else:
    print("\nSummary of Skipped Files:")
    for skipped_path, message in skipped_files:
        print(f"{skipped_path}: {message}")

    # Save the skipped-file list in the output root for later investigation
    skipped_csv_path = os.path.join(output_root_dir, "skipped_files.csv")
    skipped_df = pd.DataFrame(skipped_files, columns=["File Path", "Error Message"])
    skipped_df.to_csv(skipped_csv_path, index=False)
    print(f"Skipped files list saved to {skipped_csv_path}")


Processing files:   0%|                                      | 0/66 [00:00<?, ?it/s]

Processing file: Z:\Thesis\Data\Met\ERA5\1980\Saudi Arabia\surface\1980_Saudi Arabia_surface_10m_u_component_of_wind.nc


Processing files:   0%|                                      | 0/66 [00:03<?, ?it/s]

Added 'level' column with value 1013.0 for surface file: Z:\Thesis\Data\Met\ERA5\1980\Saudi Arabia\surface\1980_Saudi Arabia_surface_10m_u_component_of_wind.nc


Processing files:   0%|                                      | 0/66 [00:03<?, ?it/s]



Processing files:   2%|▍                             | 1/66 [00:08<09:03,  8.36s/it]

Successfully converted Z:\Thesis\Data\Met\ERA5\1980\Saudi Arabia\surface\1980_Saudi Arabia_surface_10m_u_component_of_wind.nc to Z:\Thesis\Data\Met\ERA5_parquet_test\1980\Saudi_Arabia\surface\1980_Saudi Arabia_surface_10m_u_component_of_wind.parquet
Processing file: Z:\Thesis\Data\Met\ERA5\1980\Saudi Arabia\surface\1980_Saudi Arabia_surface_10m_v_component_of_wind.nc


Processing files:   2%|▍                             | 1/66 [00:11<09:03,  8.36s/it]

Added 'level' column with value 1013.0 for surface file: Z:\Thesis\Data\Met\ERA5\1980\Saudi Arabia\surface\1980_Saudi Arabia_surface_10m_v_component_of_wind.nc


In [6]:
import os
import xarray as xr
import pandas as pd
from tqdm import tqdm

# --- Run configuration ---
# Root folder that holds the source NetCDF files
root_dir = r"Z:\Thesis\Data\Met\ERA5"

# Root folder that receives the Parquet output (created if it does not exist)
output_root_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
os.makedirs(output_root_dir, exist_ok=True)

# Years to process, as strings (1996 only)
years = list(map(str, range(1996, 1997)))

# Countries to process; uncomment an entry to include it in the run
countries = [
    #"Bahrain",
    #"Kuwait",
    #"Oman",
    #"Qatar",
    "Saudi Arabia",
    #"United Arab Emirates",
    #"Yemen"
]

# Data subfolders to process; uncomment "surface" to include it in the run
subfolders = [
    #"surface",
    "pressure",
]

# Function to adjust country names for output folder
def adjust_country_name(country):
    """Return the output-folder spelling of a country name.

    "Saudi Arabia" and "United Arab Emirates" are returned with their spaces
    replaced by underscores; any other value is returned unchanged.
    """
    folder_names = {
        "Saudi Arabia": "Saudi_Arabia",
        "United Arab Emirates": "United_Arab_Emirates",
    }
    return folder_names.get(country, country)

# Full paths of every NetCDF (.nc) file found under
# <root_dir>/<year>/<country>/<subfolder> for the configured selections.
netcdf_files = []

# (file path, error message) pairs for files that could not be converted
skipped_files = []

for year in years:
    year_dir = os.path.join(root_dir, year)
    for country in countries:
        country_dir = os.path.join(year_dir, country)
        for subfolder in subfolders:
            subfolder_dir = os.path.join(country_dir, subfolder)
            # Skip combinations that have no data folder. isdir (rather than
            # exists) also skips a stray regular file of that name, which
            # os.listdir would otherwise fail on.
            if not os.path.isdir(subfolder_dir):
                continue
            # os.listdir returns names in arbitrary order; sort them so files
            # are processed in the same order on every run.
            netcdf_files.extend(
                os.path.join(subfolder_dir, file_name)
                for file_name in sorted(os.listdir(subfolder_dir))
                if file_name.endswith('.nc')
            )

# Convert every collected NetCDF file to a Parquet file under output_root_dir,
# standardising the vertical-level column name to 'level' and using the
# underscore spelling of the country in both the folder and the file name.
# Any failure is recorded in skipped_files and the run moves on.
for netcdf_path in tqdm(netcdf_files, desc="Processing files"):
    try:
        tqdm.write(f"Processing file: {netcdf_path}")

        # Read the file and flatten it into a DataFrame. The context manager
        # closes the dataset afterwards so file handles do not pile up.
        with xr.open_dataset(netcdf_path) as ds:
            df = ds.to_dataframe().reset_index()

        # A file counts as surface data when it sits under a 'surface' folder
        is_surface = 'surface' in os.path.normpath(netcdf_path).split(os.sep)

        # Standardise the level column name to 'level'
        if 'pressure_level' in df.columns:
            df = df.rename(columns={'pressure_level': 'level'})
            tqdm.write(f"Renamed 'pressure_level' to 'level' for file: {netcdf_path}")
        elif 'level' not in df.columns:
            # Neither column exists: surface files get a constant level,
            # anything else is treated as a broken pressure file.
            if is_surface:
                df['level'] = 1013.0
                tqdm.write(f"Added 'level' column with value 1013.0 for surface file: {netcdf_path}")
            else:
                raise ValueError("'level' column is missing in the pressure file")

        # Verify that all expected levels are present. This only applies to
        # pressure files: surface files carry the single constant 1013.0 and
        # would otherwise trigger the warning on every file.
        expected_levels = [1000, 925, 850, 700, 500, 300, 200, 100, 50, 10]
        if not is_surface:
            missing_levels = set(expected_levels) - set(df['level'].unique())
            if missing_levels:
                tqdm.write(f"Warning: Missing levels in {netcdf_path}: {missing_levels}")

        # The time coordinate is named either 'time' or 'valid_time'
        time_column = next((c for c in ('time', 'valid_time') if c in df.columns), None)
        if time_column is None:
            raise ValueError("No 'time' or 'valid_time' column found")

        # Mirror the input folder structure, with the country folder (second
        # path component below root_dir) renamed for the output tree.
        parts = os.path.relpath(netcdf_path, root_dir).split(os.sep)
        if len(parts) >= 2:
            parts[1] = adjust_country_name(parts[1])
        adjusted_relative_path = os.path.join(*parts)

        # Build the output file name: drop only the trailing extension
        # (str.replace would also rewrite any earlier '.nc'), then apply the
        # underscore spelling of the first matching country name.
        stem = os.path.splitext(os.path.basename(netcdf_path))[0]
        for spaced_name in ("Saudi Arabia", "United Arab Emirates"):
            if spaced_name in stem:
                stem = stem.replace(spaced_name, adjust_country_name(spaced_name))
                break
        parquet_file = stem + '.parquet'

        # Define the output directory and file path
        output_dir = os.path.join(output_root_dir, os.path.dirname(adjusted_relative_path))
        os.makedirs(output_dir, exist_ok=True)
        parquet_path = os.path.join(output_dir, parquet_file)

        # Write the DataFrame to a Parquet file
        df.to_parquet(parquet_path, index=False)
        tqdm.write(f"Successfully converted {netcdf_path} to {parquet_path}")

    except Exception as e:
        # Remember the file and its error for the end-of-run summary
        skipped_files.append((netcdf_path, str(e)))
        tqdm.write(f"Error processing {netcdf_path}: {e}")

# Report the outcome of the run: either confirm that nothing was skipped, or
# list every skipped file with its error and save that list as a CSV.
if not skipped_files:
    print("\nAll files processed successfully!")
else:
    print("\nSummary of Skipped Files:")
    for skipped_path, message in skipped_files:
        print(f"{skipped_path}: {message}")

    # Save the skipped-file list in the output root for later investigation
    skipped_csv_path = os.path.join(output_root_dir, "skipped_files.csv")
    skipped_df = pd.DataFrame(skipped_files, columns=["File Path", "Error Message"])
    skipped_df.to_csv(skipped_csv_path, index=False)
    print(f"Skipped files list saved to {skipped_csv_path}")


Processing files:   0%|                                       | 0/7 [00:00<?, ?it/s]

Processing file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_geopotential.nc


Processing files:   0%|                                       | 0/7 [00:46<?, ?it/s]

Renamed 'pressure_level' to 'level' for file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_geopotential.nc


Processing files:  14%|████▎                         | 1/7 [01:47<10:44, 107.35s/it]

Successfully converted Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_geopotential.nc to Z:\Thesis\Data\Met\ERA5_parquet_test\1996\Saudi_Arabia\pressure\1996_Saudi_Arabia_pressure_geopotential.parquet
Processing file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_relative_humidity.nc


Processing files:  14%|████▎                         | 1/7 [03:12<10:44, 107.35s/it]

Renamed 'pressure_level' to 'level' for file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_relative_humidity.nc


Processing files:  29%|████████▌                     | 2/7 [04:18<11:04, 132.88s/it]

Successfully converted Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_relative_humidity.nc to Z:\Thesis\Data\Met\ERA5_parquet_test\1996\Saudi_Arabia\pressure\1996_Saudi_Arabia_pressure_relative_humidity.parquet
Processing file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_temperature.nc


Processing files:  29%|████████▌                     | 2/7 [05:37<11:04, 132.88s/it]

Renamed 'pressure_level' to 'level' for file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_temperature.nc


Processing files:  43%|████████████▊                 | 3/7 [06:40<09:09, 137.28s/it]

Successfully converted Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_temperature.nc to Z:\Thesis\Data\Met\ERA5_parquet_test\1996\Saudi_Arabia\pressure\1996_Saudi_Arabia_pressure_temperature.parquet
Processing file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_u_component_of_wind.nc


Processing files:  43%|████████████▊                 | 3/7 [07:55<09:09, 137.28s/it]

Renamed 'pressure_level' to 'level' for file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_u_component_of_wind.nc


Processing files:  57%|█████████████████▏            | 4/7 [08:58<06:52, 137.44s/it]

Successfully converted Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_u_component_of_wind.nc to Z:\Thesis\Data\Met\ERA5_parquet_test\1996\Saudi_Arabia\pressure\1996_Saudi_Arabia_pressure_u_component_of_wind.parquet
Processing file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_vertical_velocity.nc


Processing files:  57%|█████████████████▏            | 4/7 [10:07<06:52, 137.44s/it]

Renamed 'pressure_level' to 'level' for file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_vertical_velocity.nc


Processing files:  71%|█████████████████████▍        | 5/7 [11:08<04:29, 134.96s/it]

Successfully converted Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_vertical_velocity.nc to Z:\Thesis\Data\Met\ERA5_parquet_test\1996\Saudi_Arabia\pressure\1996_Saudi_Arabia_pressure_vertical_velocity.parquet
Processing file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_vorticity.nc


Processing files:  71%|█████████████████████▍        | 5/7 [12:16<04:29, 134.96s/it]

Renamed 'pressure_level' to 'level' for file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_vorticity.nc


Processing files:  86%|█████████████████████████▋    | 6/7 [13:20<02:13, 133.76s/it]

Successfully converted Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_vorticity.nc to Z:\Thesis\Data\Met\ERA5_parquet_test\1996\Saudi_Arabia\pressure\1996_Saudi_Arabia_pressure_vorticity.parquet
Processing file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_v_component_of_wind.nc


Processing files:  86%|█████████████████████████▋    | 6/7 [14:25<02:13, 133.76s/it]

Renamed 'pressure_level' to 'level' for file: Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_v_component_of_wind.nc


Processing files: 100%|██████████████████████████████| 7/7 [15:27<00:00, 132.57s/it]

Successfully converted Z:\Thesis\Data\Met\ERA5\1996\Saudi Arabia\pressure\1996_Saudi Arabia_pressure_v_component_of_wind.nc to Z:\Thesis\Data\Met\ERA5_parquet_test\1996\Saudi_Arabia\pressure\1996_Saudi_Arabia_pressure_v_component_of_wind.parquet

All files processed successfully!



