# In [3]:
import pandas as pd
import zstandard as zstd
import os
from io import BytesIO
def combine_csv_zst_files(input_dir: str, output_file: str) -> None:
    """
    Combine multiple .csv.zst files into one .csv.zst file.

    Every entry of ``input_dir`` whose name ends with ``.csv.zst`` is
    decompressed, parsed with pandas and appended, in sorted file-name order,
    to a single table. The table is written to ``output_file`` as
    zstd-compressed (level 3) UTF-8 CSV without the index column.

    Inputs that decompress to nothing, fail to decompress, or fail to parse
    are reported on stdout and skipped. If the combined table is empty, a
    warning is printed and no output file is written.

    Args:
        input_dir (str): Path to the directory containing .csv.zst files.
        output_file (str): Path to the output .csv.zst file.
    """
    output_path = os.path.abspath(output_file)
    frames = []  # one DataFrame per successfully parsed input file

    # os.listdir() returns entries in arbitrary order; sort so the row order
    # of the combined file is reproducible between runs and machines.
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith('.csv.zst'):
            continue
        file_path = os.path.join(input_dir, file_name)

        # If the output lives inside input_dir, a file left over from an
        # earlier run would otherwise be folded back in and duplicate rows.
        if os.path.abspath(file_path) == output_path:
            continue

        # Decompress the .csv.zst file; a corrupt frame skips this file
        # instead of aborting the whole batch.
        try:
            with open(file_path, 'rb') as f:
                decompressed_data = zstd.ZstdDecompressor().stream_reader(f).read()
        except zstd.ZstdError as e:
            print(f"Error decompressing {file_name}: {e}")
            continue

        if not decompressed_data:
            print(f"File {file_name} is empty or not properly decompressed.")
            continue

        # Load the decompressed data into a DataFrame. The broad catch is
        # deliberate: any unparsable file is reported and skipped.
        try:
            frames.append(pd.read_csv(BytesIO(decompressed_data)))
        except Exception as e:
            print(f"Error reading decompressed data from {file_name}: {e}")

    # Concatenate once at the end: calling pd.concat inside the loop re-copies
    # every previously loaded row for each new file (quadratic time/memory).
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Check if the DataFrame is empty before saving
    if combined_df.empty:
        print("Warning: Combined DataFrame is empty. No data to save.")
        return

    # Serialize and compress before opening the output file, so a failure
    # here does not leave a truncated or empty file behind.
    csv_bytes = combined_df.to_csv(index=False).encode('utf-8')
    compressed_data = zstd.ZstdCompressor(level=3).compress(csv_bytes)
    with open(output_file, 'wb') as f:
        f.write(compressed_data)
    print(f"Combined file saved as: {output_file}")
# Example usage. Guarded so that importing this file does not start a
# (potentially long-running) combine job as a side effect.
if __name__ == "__main__":
    # Replace with the directory containing .csv.zst files
    input_directory = "/Users/glennfor/Desktop/test/options/XNAS-20250113-HMGC8QH5JX"
    # Replace with the desired output file path
    output_combined_file = "combined_output.csv.zst"
    combine_csv_zst_files(input_directory, output_combined_file)



# Cell output: KeyboardInterrupt: