In [1]:
import pandas as pd
import os


In [2]:
def merge_parquet_folder(folder_path, output_file):
    """
    Load all .parquet files from a folder, merge them, and save to output_file.

    Files are read in sorted path order and concatenated row-wise; the merged
    frame gets a fresh 0..n-1 index (``ignore_index=True``). If ``output_file``
    itself lives inside ``folder_path`` it is skipped as an input, so
    re-running the merge does not fold a previous result back into the new
    one and duplicate its rows.

    Args:
        folder_path (str): Path to the folder containing .parquet files.
        output_file (str): Output .parquet file path (e.g. "merged.parquet").

    Returns:
        pd.DataFrame: The merged DataFrame.

    Raises:
        ValueError: If the folder holds no .parquet inputs (ignoring
            ``output_file`` itself).
    """
    def _canonical(path):
        # Resolve symlinks / relative segments so the same file reached via
        # two different spellings still compares equal.
        return os.path.normcase(os.path.realpath(path))

    output_key = _canonical(output_file)

    # Every entry whose name ends with .parquet, as a path under folder_path
    candidates = (os.path.join(folder_path, name)
                  for name in os.listdir(folder_path)
                  if name.endswith('.parquet'))

    # Drop the output file (if present) and fix a deterministic read order
    parquet_files = sorted(path for path in candidates
                           if _canonical(path) != output_key)

    if not parquet_files:
        # f-string rather than "+" so a pathlib.Path folder_path still yields
        # the intended ValueError instead of a TypeError from str + Path.
        raise ValueError(f"No .parquet files found in folder: {folder_path}")

    # Load all files into a list of DataFrames
    dataframes = [pd.read_parquet(path) for path in parquet_files]

    # Merge into a single DataFrame
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Save to output
    merged_df.to_parquet(output_file)

    print(f"Merged {len(parquet_files)} files into {output_file}")
    return merged_df

In [3]:
# Merge each dataset's folder of .parquet files into one sibling .parquet
# file, keeping the merged frames in memory under matching names.
dataset_1 = merge_parquet_folder(
    folder_path="../../datasets/dataset_1",
    output_file="../../datasets/dataset_1.parquet",
)
dataset_2 = merge_parquet_folder(
    folder_path="../../datasets/dataset_2",
    output_file="../../datasets/dataset_2.parquet",
)
dataset_3 = merge_parquet_folder(
    folder_path="../../datasets/dataset_3",
    output_file="../../datasets/dataset_3.parquet",
)

Merged 3 files into ../../datasets/dataset_1.parquet
Merged 5 files into ../../datasets/dataset_2.parquet
Merged 1 files into ../../datasets/dataset_3.parquet
