In [1]:
import pandas as pd
import os

In [2]:
# fruits.parquet: rename the "fruit" column to "ingredients" and write the
# result back over the same file.
fruits_path = "../datasets/dataset_1/fruits.parquet"
df1_fruits = pd.read_parquet(fruits_path).rename(columns={"fruit": "ingredients"})
df1_fruits.to_parquet(fruits_path)

In [3]:
# vegetables_seafood.parquet: rename the "Food and Serving" column to
# "ingredients" and write the result back over the same file.
vegetables_seafood_path = "../datasets/dataset_1/vegetables_seafood.parquet"
df1_vegetables_seafood = pd.read_parquet(vegetables_seafood_path).rename(
    columns={"Food and Serving": "ingredients"}
)
df1_vegetables_seafood.to_parquet(vegetables_seafood_path)

In [4]:
# vegetables.parquet: rename the "Name" column to "ingredients" and write the
# result back over the same file.
vegetables_path = "../datasets/dataset_1/vegetables.parquet"
df1_vegetables = pd.read_parquet(vegetables_path).rename(columns={"Name": "ingredients"})
df1_vegetables.to_parquet(vegetables_path)

In [5]:
# The dataset_2 recipe files that need their column renamed
file_paths_ds2 = [
    "../datasets/dataset_2/all-recipes_0.parquet",
    "../datasets/dataset_2/all-recipes_1.parquet",
    "../datasets/dataset_2/all-recipes_2.parquet",
    "../datasets/dataset_2/all-recipes_3.parquet",
]

# For each file: read it, rename "input" to "ingredients", and write the
# result back to the same path.
for path in file_paths_ds2:
    df = pd.read_parquet(path).rename(columns={"input": "ingredients"})
    # index=False: the DataFrame index is not written into the file
    df.to_parquet(path, index=False)

In [6]:
def merge_parquet_folder(folder_path, output_file):
    """
    Load all .parquet files from a folder, merge them, and save to output_file.

    Files are read in sorted path order and stacked row-wise; the merged
    frame gets a fresh 0..n-1 index. If output_file itself lives inside
    folder_path it is skipped as an input, so re-running the merge does not
    fold the previous result back in and duplicate every row.

    Args:
        folder_path (str): Path to the folder containing .parquet files.
        output_file (str): Output .parquet file path (e.g. "merged.parquet").

    Returns:
        pd.DataFrame: The merged DataFrame.

    Raises:
        ValueError: If the folder holds no .parquet files to merge
            (not counting output_file itself).
    """
    def _canonical(path):
        # Normalise so "./a/../a/x.parquet" and "a/x.parquet" compare equal.
        return os.path.normcase(os.path.abspath(path))

    output_key = _canonical(output_file)

    # Candidate inputs: every *.parquet entry in the folder except the output
    # file, which may be left over from an earlier run of this function.
    parquet_files = sorted(
        candidate
        for candidate in (os.path.join(folder_path, name)
                          for name in os.listdir(folder_path)
                          if name.endswith('.parquet'))
        if _canonical(candidate) != output_key
    )

    if not parquet_files:
        raise ValueError("No .parquet files found in folder: " + folder_path)

    # Load all files into a list of DataFrames (sorted order keeps the row
    # order of the result stable between runs)
    dataframes = [pd.read_parquet(file) for file in parquet_files]

    # Merge into a single DataFrame; ignore_index discards the per-file
    # indexes in favour of one continuous index
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Save to output
    merged_df.to_parquet(output_file)

    print(f"Merged {len(parquet_files)} files into {output_file}")
    return merged_df

In [8]:
# Merge each dataset folder into one parquet file written next to that folder,
# keeping the merged frame in memory under a matching name.
dataset_1 = merge_parquet_folder(
    folder_path="../datasets/dataset_1",
    output_file="../datasets/dataset_1.parquet",
)
dataset_2 = merge_parquet_folder(
    folder_path="../datasets/dataset_2",
    output_file="../datasets/dataset_2.parquet",
)
dataset_3 = merge_parquet_folder(
    folder_path="../datasets/dataset_3",
    output_file="../datasets/dataset_3.parquet",
)

Merged 3 files into ../datasets/dataset_1.parquet
Merged 4 files into ../datasets/dataset_2.parquet
Merged 1 files into ../datasets/dataset_3.parquet
