In [1]:
import logging
from pathlib import Path

import numpy as np
import polars as pl

from utils import ProjectConfig

In [2]:
# Display limits for rendered polars tables in this notebook.
MAX_TABLE_ROWS = 2000
MAX_TABLE_WIDTH_CHARS = 200
MAX_STRING_CHARS = 200

cfg = pl.Config()
# Each setter returns the Config class, so the calls can be chained; the
# chained expression is still the cell's displayed value.
(
    cfg.set_tbl_rows(MAX_TABLE_ROWS)
    .set_tbl_width_chars(MAX_TABLE_WIDTH_CHARS)
    .set_fmt_str_lengths(MAX_STRING_CHARS)
)

polars.config.Config

In [3]:
pc = ProjectConfig()

# Parquet files in the data directory that are left out of the merge below.
EXCLUDED_PARQUET_NAMES = (
    "RecipeNLG_dataset.parquet",
    "common_ingredient_images_dataset.parquet",
    "extracted_common_images.parquet",
)
excluded_paths = {pc.data_root_dir.joinpath(name) for name in EXCLUDED_PARQUET_NAMES}

# Filter instead of list.remove(): remove() raises ValueError when one of the
# excluded files is not on disk, which would abort the notebook.
# sorted() makes the load order independent of the filesystem's glob() order.
parquet_files = sorted(
    path
    for path in pc.data_root_dir.glob("*.parquet")
    if path not in excluded_paths
)
parquet_files

[WindowsPath('C:/Users/teddy/Documents/01-Berkeley/210/data/FoodClassification.parquet'),
 WindowsPath('C:/Users/teddy/Documents/01-Berkeley/210/data/Fruits-360.parquet'),
 WindowsPath('C:/Users/teddy/Documents/01-Berkeley/210/data/FruitsAndVegetablesImageRecognitionDataset.parquet'),
 WindowsPath('C:/Users/teddy/Documents/01-Berkeley/210/data/FruitsClassification.parquet'),
 WindowsPath('C:/Users/teddy/Documents/01-Berkeley/210/data/GroceryStoreDataset.parquet')]

In [4]:
# Read every source file, then concatenate once. Concatenating inside the loop
# rebuilds the accumulated frame on every iteration and needs an `i == 0`
# special case to initialise it.
frames = []
for i, file in enumerate(parquet_files):
    print(f"Processing {i} {file}")
    df = pl.read_parquet(file)
    frames.append(df)

# pl.concat raises on an empty list, so an empty `parquet_files` fails loudly
# here instead of leaving `final_df` undefined (or stale from an earlier run).
final_df = pl.concat(frames)

Processing 0 C:\Users\teddy\Documents\01-Berkeley\210\data\FoodClassification.parquet
Processing 1 C:\Users\teddy\Documents\01-Berkeley\210\data\Fruits-360.parquet
Processing 2 C:\Users\teddy\Documents\01-Berkeley\210\data\FruitsAndVegetablesImageRecognitionDataset.parquet
Processing 3 C:\Users\teddy\Documents\01-Berkeley\210\data\FruitsClassification.parquet
Processing 4 C:\Users\teddy\Documents\01-Berkeley\210\data\GroceryStoreDataset.parquet


In [5]:
# Number of rows per class label, largest classes first.
rows_per_class = final_df.group_by("ClassId").agg(
    pl.col("ClassId").count().alias("count")
)
final_counts = rows_per_class.sort("count", descending=True)
final_counts

ClassId,count
str,u32
"""Apple""",2218
"""Banana""",1985
"""Mango""",1972
"""Grape""",1940
"""Strawberry""",1940
"""water""",863
"""bread-white""",595
"""salad-leaf-salad-green""",535
"""apple_hit_1""",468
"""tomato""",450


In [6]:
# Persist the per-class counts (computed before any labels are dropped or
# renamed). write_parquet() returns None, so its result is not bound to a name.
# The relative path resolves against the kernel's current working directory.
final_counts.write_parquet(
    "unique_ingredients.parquet",
    compression="zstd",
    compression_level=3,
    statistics=True,
)

In [7]:
## Delete
# Class labels removed from the combined dataset. Matching is exact and
# case-sensitive, and happens BEFORE the renames below.
DROPPED_CLASS_IDS = [
    "Grape",
    "grapes",
    "water",
    "Mango",
    "mango",
    "bread-french-white-flour",
    "bread-sourdough",
    "salad-leaf-salad-green",
    "apple_hit_1",
    "coffee-with-caffeine",
    "butter",
    "pear",
    "pear_1",
    "apple_rotten_1",
    "apple_crimson_snow_1",  # Extra Apples
    "apple_red_delicios_1",  # Extra Apples
    "apple_red_yellow_1",  # Extra Apples
    "Juice",
    "mixed-vegetables",
    "mixed-salad-chopped-without-sauce",
    "espresso-with-caffeine",
    # NOTE(review): other labels here use "Sour-Cream" / "Soy-Milk" casing;
    # confirm the source label is really "Sour-milk" and not "Sour-Milk".
    "Sour-milk",
    "tea",
    "tea-green",
    "jam",
    "eggplant",
    "eggplant_violet_1",
    "potatoes-steamed",
    "Melon",
    "pear_3",
    "chips-french-fries",
    "white-coffee-with-caffeine",
    "Pepper",
    "pizza-margherita-baked",
    "turnip",
    "pasta-spaghetti",
    "soy beans",
    "lettuce",
    "garlic",
    "dark-chocolate",
    "jalepeno",
    "beetroot",
    "kiwi",
    "chilli pepper",
    "sweet-pepper",
    "water-mineral",
    "watermelon",
    "boisson-au-glucose-50g",
    "paprika",
    "mixed-nuts",
    "raddish",
    "beer",
    "cauliflower",
    "pomegranate",
    "croissant",
    "french-beans",
    "gruyere",
    "sweetpotato",
    "parmesan",
    "biscuits",
    "honey",
    "sauce-savoury",
    "Soy-Milk",
    "Soyghurt",
    "salmon",
    "salami",
    "mayonnaise",
    "Peach",
]

## Rename
# Ordered (regex pattern, replacement) pairs, applied one after another with
# `str.replace`, which treats the pattern as a regex and replaces only the
# first match in each value. Patterns without anchors match as substrings
# (e.g. "Milk" also rewrites the "Milk" inside a longer label).
#
# Patterns containing `\b` MUST be raw strings: in a normal Python string
# "\b" is a backspace character, so the regex never matches and the label is
# silently left unchanged.
CLASS_ID_RENAMES = [
    (r"\bCabbage\b", "cabbage"),
    ("Pineapple", "pineapple"),
    (r"\bApple\b", "apple"),
    ("apple_braeburn_1", "apple"),
    ("apple_6", "apple"),
    ("apple_granny_smith_1", "apple"),
    ("apple_golden_1", "apple"),
    ("apple_golden_2", "apple"),
    ("apple_golden_3", "apple"),
    ("apple_red_1", "apple"),
    ("apple_red_2", "apple"),
    ("apple_red_3", "apple"),
    ("apple_pink_lady_1", "apple"),
    (r"\bBanana\b", "banana"),
    (r"\bStrawberry\b", "strawberries"),
    ("bread-white", "bread"),
    ("bread-wholemeal", "bread"),
    ("bread-whole-wheat", "bread"),
    ("wine-red", "red wine"),
    ("wine-white", "white wine"),
    ("Milk", "milk"),
    ("Yoghurt", "yoghurt"),
    ("cucumber_1", "cucumber"),
    ("cucumber_3", "cucumber"),
    ("zucchini_dark_1", "zucchini"),
    ("zucchini_1", "zucchini"),
    ("hard-cheese", "cheese"),
    ("soft-cheese", "cheese"),
    ("tomato-sauce", "tomato sauce"),
    ("carrot_1", "carrot"),
    ("leaf-spinach", "spinach"),
    ("cabbage_white_1", "cabbage"),
    ("sweetcorn", "corn"),
    ("capsicum", "bell pepper"),
    ("Satsuma", "orange"),
    ("oranges", "orange"),
    ("mandarine", "orange"),
    # The optional "fresh " prefix keeps this rule idempotent: a plain
    # "ginger" pattern turns "fresh ginger" into "fresh fresh ginger" when the
    # cell is run a second time.
    (r"(?:fresh )?ginger", "fresh ginger"),
    ("ham-raw", "ham"),
    ("Sour-Cream", "sour cream"),
    ("Lemon", "lemon"),
    ("Avocado", "avocado"),
    ("Mushroom", "mushroom"),
    ("Onion", "onion"),
    # NOTE(review): nectarines are folded into "orange" - confirm this is intended.
    ("Nectarine", "orange"),
]

# Drop the unwanted classes in a single pass. The explicit null check keeps
# the behavior of the chained `!=` filters, which also discard null labels.
final_df = final_df.filter(
    pl.col("ClassId").is_not_null() & ~pl.col("ClassId").is_in(DROPPED_CLASS_IDS)
)

# Fold all renames into one expression so the column is rewritten once; the
# replacements still apply in list order.
renamed_class_id = pl.col("ClassId")
for pattern, replacement in CLASS_ID_RENAMES:
    renamed_class_id = renamed_class_id.str.replace(pattern, replacement)
final_df = final_df.with_columns(renamed_class_id)

# Sanity check: no label should still contain the capitalised "Mushroom".
final_df.filter(pl.col("ClassId").str.contains("Mushroom")).select("ClassId").unique()

ClassId
str
