In [3]:
import os
import pandas as pd

# Root of the ERA5 parquet tree, laid out as <base_dir>/<year>/<country>/<folder>/*.parquet
base_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
years = range(2000, 2001)  # range() excludes its stop value, so this covers 2000 only

# Two-letter code -> country folder name on disk (uncomment entries to process them)
countries_dict = {
    "SA": "Saudi_Arabia",
    #"YE": "Yemen",
    #"OM": "Oman",
    #"QA": "Qatar",
    #"BH": "Bahrain",
    #"AE": "United_Arab_Emirates",
    #"KW": "Kuwait"
}

# Sub-folders visited inside every country folder
folders = [#"surface",
    "pressure"
]

# (file path, error message) for every file that raised while being processed
skipped_files = []

# Directory that receives the skipped_files.csv summary
output_root_dir = base_dir

# Walk year -> country -> folder and rename 'valid_time' to 'time' in each parquet file
for year in years:
    for code, country in countries_dict.items():
        for folder in folders:
            folder_path = os.path.join(base_dir, str(year), country, folder)

            # isdir rather than exists: a plain file with this name must not reach os.listdir
            if not os.path.isdir(folder_path):
                print(f"Folder does not exist: {folder_path}")
                continue

            for file in os.listdir(folder_path):
                # Process only parquet files
                if not file.endswith(".parquet"):
                    continue

                file_path = os.path.join(folder_path, file)
                # Sibling temp file; it does not end in ".parquet", so a leftover
                # is never picked up by the filter above.
                temp_path = file_path + ".tmp"

                try:
                    df = pd.read_parquet(file_path)

                    # Files without 'valid_time' are left untouched
                    if 'valid_time' in df.columns:
                        df = df.rename(columns={'valid_time': 'time'})

                        # Write next to the original and swap afterwards, so a failed
                        # or interrupted write cannot destroy the only copy of the data.
                        df.to_parquet(temp_path, index=False)
                        os.replace(temp_path, file_path)
                        print(f"Updated column in file: {file_path}")
                except Exception as e:
                    # Report the failure, remember the file, and carry on with the rest
                    print(f"Error processing file {file_path}: {e}")
                    skipped_files.append((file_path, str(e)))

                    # Best-effort removal of a partially written temp file
                    if os.path.exists(temp_path):
                        try:
                            os.remove(temp_path)
                        except OSError:
                            pass

# After processing, check and report any skipped files
if skipped_files:
    print("\nSummary of Skipped Files:")
    for file, error in skipped_files:
        print(f"{file}: {error}")

    # Save the skipped files information to a CSV for further investigation
    skipped_df = pd.DataFrame(skipped_files, columns=["File Path", "Error Message"])
    csv_path = os.path.join(output_root_dir, "skipped_files.csv")
    skipped_df.to_csv(csv_path, index=False)
    print(f"Skipped files list saved to {csv_path}")
else:
    print("\nAll files processed successfully!")


Updated column in file: Z:\Thesis\Data\Met\ERA5_parquet_test\1999\Saudi_Arabia\pressure\1999_Saudi_Arabia_pressure_geopotential.parquet

All files processed successfully!


In [None]:
# Test run: write the renamed files to a separate output folder (do this first)

import os
import pandas as pd

# Root of the ERA5 parquet tree and the separate tree that receives the modified copies
base_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
output_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test\column_change_test"  # Output directory for modified files
years = range(2019, 2020)  # range() excludes its stop value, so this covers 2019 only

# Two-letter code -> country name
countries_dict = {
    "SA": "Saudi Arabia",
    "YE": "Yemen",
    "OM": "Oman",
    "QA": "Qatar",
    "BH": "Bahrain",
    "AE": "United Arab Emirates",
    "KW": "Kuwait"
}

# Sub-folders visited inside every country folder
folders = ["surface", "pressure"]

# Walk year -> country -> folder, writing a renamed copy of every parquet file to output_dir
for year in years:
    for code, country in countries_dict.items():
        # NOTE(review): the other cells in this notebook (and the paths in their
        # recorded output) use underscore-separated folder names such as
        # "Saudi_Arabia". Try the name as written first, then the underscore
        # spelling, so multi-word countries are found under either layout.
        folder_names = [country]
        underscored = country.replace(" ", "_")
        if underscored != country:
            folder_names.append(underscored)

        for folder in folders:
            # Defaults use the name as written; also used for the "missing" message
            input_folder_path = os.path.join(base_dir, str(year), country, folder)
            output_folder_path = os.path.join(output_dir, str(year), country, folder)

            for name in folder_names:
                candidate = os.path.join(base_dir, str(year), name, folder)
                if os.path.isdir(candidate):
                    input_folder_path = candidate
                    # Mirror the spelling that exists on disk in the output tree
                    output_folder_path = os.path.join(output_dir, str(year), name, folder)
                    break
            else:
                print(f"Input folder does not exist: {input_folder_path}")
                continue

            os.makedirs(output_folder_path, exist_ok=True)  # Create the output directory if it doesn't exist

            for file in os.listdir(input_folder_path):
                # Only parquet files are copied
                if not file.endswith(".parquet"):
                    continue

                input_file_path = os.path.join(input_folder_path, file)
                output_file_path = os.path.join(output_folder_path, file)

                try:
                    df = pd.read_parquet(input_file_path)

                    # Rename 'valid_time' when present; other files are copied as-is
                    if 'valid_time' in df.columns:
                        df = df.rename(columns={'valid_time': 'time'})
                        print(f"Renaming 'valid_time' to 'time' in file: {input_file_path}")

                    # Save the (possibly renamed) DataFrame to the output directory
                    df.to_parquet(output_file_path, index=False)
                    print(f"Saved updated file to: {output_file_path}")
                except Exception as e:
                    print(f"Error processing file {input_file_path}: {e}")


In [None]:
import os
import pandas as pd

# Root of the ERA5 parquet tree, laid out as <base_dir>/<year>/<country>/<folder>/*.parquet
base_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"
years = range(1989, 2001)  # range() excludes its stop value, so this covers 1989 to 2000

# Two-letter code -> country folder name on disk
countries_dict = {
    "SA": "Saudi_Arabia",
    "YE": "Yemen",
    "OM": "Oman",
    "QA": "Qatar",
    "BH": "Bahrain",
    "AE": "United_Arab_Emirates",
    "KW": "Kuwait"
}

# Sub-folders visited inside every country folder
folders = ["surface",
           "pressure"
          ]

# Defined here so the cell does not depend on variables left over from another
# cell: (file path, error message) for every file that raised in this run.
skipped_files = []

# Directory that receives the skipped_files.csv summary
output_root_dir = base_dir

# Walk year -> country -> folder and rename 'valid_time' to 'time' in each parquet file
for year in years:
    for code, country in countries_dict.items():
        for folder in folders:
            folder_path = os.path.join(base_dir, str(year), country, folder)

            # isdir rather than exists: a plain file with this name must not reach os.listdir
            if not os.path.isdir(folder_path):
                print(f"Folder does not exist: {folder_path}")
                continue

            for file in os.listdir(folder_path):
                # Only parquet files are processed
                if not file.endswith(".parquet"):
                    continue

                file_path = os.path.join(folder_path, file)
                # Sibling temp file; it does not end in ".parquet", so a leftover
                # is never picked up by the filter above.
                temp_path = file_path + ".tmp"

                try:
                    df = pd.read_parquet(file_path)

                    # Files without 'valid_time' are left untouched
                    if 'valid_time' in df.columns:
                        df = df.rename(columns={'valid_time': 'time'})

                        # Write next to the original and swap afterwards, so a failed
                        # or interrupted write cannot destroy the only copy of the data.
                        df.to_parquet(temp_path, index=False)
                        os.replace(temp_path, file_path)
                        print(f"Updated column in file: {file_path}")
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")
                    # Record the failure so the summary below reflects this run
                    skipped_files.append((file_path, str(e)))

                    # Best-effort removal of a partially written temp file
                    if os.path.exists(temp_path):
                        try:
                            os.remove(temp_path)
                        except OSError:
                            pass

# Print the summary of skipped files
if skipped_files:
    print("\nSummary of Skipped Files:")
    for file, error in skipped_files:
        print(f"{file}: {error}")

    # Save the skipped files to a CSV for further investigation
    skipped_df = pd.DataFrame(skipped_files, columns=["File Path", "Error Message"])
    skipped_df.to_csv(os.path.join(output_root_dir, "skipped_files.csv"), index=False)
    print(f"Skipped files list saved to {os.path.join(output_root_dir, 'skipped_files.csv')}")
else:
    print("\nAll files processed successfully!")


Updated column in file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_10m_u_component_of_wind.parquet
Updated column in file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_10m_v_component_of_wind.parquet
Updated column in file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_2m_dewpoint_temperature.parquet
Updated column in file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_2m_temperature.parquet
Updated column in file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_convective_available_potential_energy.parquet
Updated column in file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_evaporation.parquet
Updated column in file: Z:\Thesis\Data\Met\ERA5_parquet_test\1989\Saudi_Arabia\surface\1989_Saudi_Arabia_surface_geopotential.parquet
Upd

In [None]:
# The cell above used to work; I think it ran out of memory while processing the files.

In [None]:
import os
import pandas as pd
from pyarrow.parquet import ParquetFile

# Root of the ERA5 parquet tree; files live under <base_dir>/<year>/<country>/<folder>/
base_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"

# Years to process. range() excludes its stop value, so this is 1985 only;
# widen the bounds to cover more years.
years = range(1985, 1986)

# Two-letter code -> country folder name. Only Saudi Arabia is active;
# uncomment further entries to include them.
countries_dict = dict(
    SA="Saudi_Arabia",
    # YE="Yemen",
    # OM="Oman",
    # QA="Qatar",
    # BH="Bahrain",
    # AE="United_Arab_Emirates",
    # KW="Kuwait",
)

# Sub-folders visited inside every country folder (add "surface" if needed)
folders = [
    "pressure",
]

# Function to process parquet file in chunks
def process_parquet_in_chunks(file_path, output_path, chunk_size=1000):
    """Copy a parquet file to ``output_path`` batch by batch, renaming 'valid_time' to 'time'.

    The input is read in batches of at most ``chunk_size`` rows, and every batch
    is appended to a single output file through one ``ParquetWriter``. Earlier
    code called ``DataFrame.to_parquet`` once per batch, which rewrote the output
    each time, so only the last batch survived.

    Parameters
    ----------
    file_path : str
        Parquet file to read.
    output_path : str
        Parquet file to create (overwritten if it already exists).
    chunk_size : int, default 1000
        Maximum number of rows per batch.

    Returns
    -------
    None
        Errors are caught and printed rather than raised. A partially written
        output file is removed on failure.
    """
    writer = None
    try:
        # Imported locally: the cell only imports ParquetFile at module level.
        import pyarrow as pa
        from pyarrow.parquet import ParquetWriter

        def to_output_table(arrow_data, schema=None):
            # Round-trip through pandas (as the per-batch code did), rename the
            # column, and drop the index like to_parquet(index=False).
            df = arrow_data.to_pandas()
            if 'valid_time' in df.columns:
                df = df.rename(columns={'valid_time': 'time'})
            return pa.Table.from_pandas(df, schema=schema, preserve_index=False)

        parquet_file = ParquetFile(file_path)
        output_schema = None
        try:
            for batch in parquet_file.iter_batches(batch_size=chunk_size):
                # After the first batch, reuse its schema so every batch is
                # written with identical column types.
                table = to_output_table(batch, output_schema)
                if writer is None:
                    output_schema = table.schema
                    writer = ParquetWriter(output_path, output_schema)
                writer.write_table(table)

            if writer is None:
                # Input had no rows: still emit an (empty) output file with the renamed column
                table = to_output_table(parquet_file.schema_arrow.empty_table())
                writer = ParquetWriter(output_path, table.schema)
                writer.write_table(table)
        finally:
            # Closing writes the parquet footer; without it the file is unreadable
            if writer is not None:
                writer.close()

        print(f"Successfully processed and saved: {output_path}")
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        # Do not leave a truncated output behind that could be mistaken for a complete one
        if writer is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass

# Iterate over years, countries, and folders, writing "<name>_processed.parquet"
# next to every input parquet file.
for year in years:
    for code, country in countries_dict.items():
        for folder in folders:
            folder_path = os.path.join(base_dir, str(year), country, folder)

            # isdir rather than exists: a plain file with this name must not reach os.listdir
            if not os.path.isdir(folder_path):
                print(f"Folder does not exist: {folder_path}")
                continue

            for file in os.listdir(folder_path):
                if not file.endswith(".parquet"):
                    continue
                # Outputs of an earlier run also end in ".parquet"; skip them so a
                # re-run does not create "_processed_processed" copies.
                if file.endswith("_processed.parquet"):
                    continue

                file_path = os.path.join(folder_path, file)
                # splitext touches only the extension; str.replace would rewrite
                # every ".parquet" occurrence anywhere in the path.
                stem, extension = os.path.splitext(file_path)
                output_file_path = f"{stem}_processed{extension}"

                try:
                    # Process and save the parquet file in chunks
                    process_parquet_in_chunks(file_path, output_file_path)
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")


In [None]:
import os
import pandas as pd
from pyarrow.parquet import ParquetFile
from tqdm import tqdm

# Root of the ERA5 parquet tree; files live under <base_dir>/<year>/<country>/<folder>/
base_dir = r"Z:\Thesis\Data\Met\ERA5_parquet_test"

# Years to process. range() excludes its stop value, so this is 1986 and 1987;
# adjust the bounds as needed.
years = range(1986, 1988)

# Two-letter code -> country folder name; all seven countries are active
countries_dict = dict(
    SA="Saudi_Arabia",
    YE="Yemen",
    OM="Oman",
    QA="Qatar",
    BH="Bahrain",
    AE="United_Arab_Emirates",
    KW="Kuwait",
)

# Sub-folders visited inside every country folder (add "surface" if needed)
folders = [
    "pressure",
]

# Function to process parquet file in chunks
def process_parquet_in_chunks(file_path, output_path, chunk_size=100_000):
    """Copy a parquet file to ``output_path`` batch by batch, renaming 'valid_time' to 'time'.

    The input is read in batches of at most ``chunk_size`` rows, with a tqdm
    progress bar, and every batch is appended to one output file through a single
    ``ParquetWriter``. Earlier code called ``to_parquet(..., append=True)``, which
    the pyarrow engine does not accept, so the second batch raised and the output
    held only the first batch.

    Parameters
    ----------
    file_path : str
        Parquet file to read.
    output_path : str
        Parquet file to create (overwritten if it already exists).
    chunk_size : int, default 100_000
        Maximum number of rows per batch.

    Returns
    -------
    None
        Errors are caught and printed rather than raised. A partially written
        output file is removed on failure.
    """
    writer = None
    try:
        # Imported locally: the cell only imports ParquetFile at module level.
        import pyarrow as pa
        from pyarrow.parquet import ParquetWriter

        def to_output_table(arrow_data, schema=None):
            # Round-trip through pandas (as the per-batch code did), rename the
            # column, and drop the index like to_parquet(index=False).
            df = arrow_data.to_pandas()
            if 'valid_time' in df.columns:
                df = df.rename(columns={'valid_time': 'time'})
            return pa.Table.from_pandas(df, schema=schema, preserve_index=False)

        parquet_file = ParquetFile(file_path)

        # Ceiling division: "// chunk_size + 1" over-counts by one whenever the row
        # count is an exact multiple of chunk_size. Used only as the progress-bar total.
        total_batches = -(-parquet_file.metadata.num_rows // chunk_size)

        output_schema = None
        try:
            batches = tqdm(parquet_file.iter_batches(batch_size=chunk_size),
                           desc=f"Processing {os.path.basename(file_path)}",
                           total=total_batches)
            for batch in batches:
                # After the first batch, reuse its schema so every batch is
                # written with identical column types.
                table = to_output_table(batch, output_schema)
                if writer is None:
                    output_schema = table.schema
                    writer = ParquetWriter(output_path, output_schema)
                writer.write_table(table)

            if writer is None:
                # Input had no rows: still emit an (empty) output file with the renamed column
                table = to_output_table(parquet_file.schema_arrow.empty_table())
                writer = ParquetWriter(output_path, table.schema)
                writer.write_table(table)
        finally:
            # Closing writes the parquet footer; without it the file is unreadable
            if writer is not None:
                writer.close()

        print(f"Successfully processed and saved: {output_path}")
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        # Do not leave a truncated output behind that could be mistaken for a complete one
        if writer is not None and os.path.exists(output_path):
            try:
                os.remove(output_path)
            except OSError:
                pass

# Iterate over years, countries, and folders (each level with its own progress bar),
# writing "<name>_processed.parquet" next to every input parquet file.
for year in tqdm(years, desc="Processing Years"):
    for code, country in tqdm(countries_dict.items(), desc="Processing Countries"):
        for folder in tqdm(folders, desc="Processing Folders"):
            folder_path = os.path.join(base_dir, str(year), country, folder)

            # isdir rather than exists: a plain file with this name must not reach os.listdir
            if not os.path.isdir(folder_path):
                print(f"Folder does not exist: {folder_path}")
                continue

            # Outputs of an earlier run also end in ".parquet"; leave them out so a
            # re-run does not create "_processed_processed" copies.
            files = [
                f for f in os.listdir(folder_path)
                if f.endswith(".parquet") and not f.endswith("_processed.parquet")
            ]

            for file in tqdm(files, desc=f"Processing Files in {folder_path}"):
                file_path = os.path.join(folder_path, file)
                # splitext touches only the extension; str.replace would rewrite
                # every ".parquet" occurrence anywhere in the path.
                stem, extension = os.path.splitext(file_path)
                output_file_path = f"{stem}_processed{extension}"

                try:
                    # Process and save the parquet file in chunks
                    process_parquet_in_chunks(file_path, output_file_path)
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")
