# Hummus-OFF Attribute Coverage Analysis Script

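This Python script analyzes the "coverage" of product attributes from an Open Food Facts (OFF) dataset within a collection of recipes. The primary goal is to determine, for each product attribute, how many unique recipes are associated with products that possess that attribute with a valid, non-generic value, i.e. a value that is not missing, not empty, and not one of the placeholders `unknown` or `none`. The count is also reported as a percentage of the recipes in the collection.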

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
import gc

# --- Load the recipe table and the recipe-title -> OFF-product mapping -------
hummus_df = pd.read_csv(
    "../csv_file/pp_recipes_normalized_by_pipeline.csv",
    sep=";",
    low_memory=False,
    on_bad_lines="skip",
)

# Denominator of every coverage percentage computed further down. The
# numerator there is a count of distinct `recipe_id` values, so the
# denominator is counted the same way instead of using the raw row count
# (the two only agree when no recipe_id is repeated across rows).
# NOTE(review): rows with a missing recipe_id are not counted by nunique().
original_total_unique_recipes = hummus_df['recipe_id'].nunique()
print(f"Hummus loaded: {len(hummus_df)} rows")

mapping_df = pd.read_csv(
    "../csv_file/file_off_hummus.csv",
    sep=",",
    low_memory=False,
    on_bad_lines="skip",
)
print(f"Mapping loaded: {len(mapping_df)} righe")

# Keep only the recipes whose normalized title appears in the mapping; recipes
# without a mapping can never contribute to a numerator.
valid_titles_in_mapping = set(mapping_df['title_normalized'].unique())
hummus_df_relevant = hummus_df[hummus_df['title_normalized'].isin(valid_titles_in_mapping)].copy()
print(f"Hummus filtered on recipes with mapping (relevant for the numerator): {len(hummus_df_relevant)} righe")

# OFF products referenced by at least one mapping row; used to filter the OFF
# file while it is streamed.
needed_off_products = set(mapping_df['product_name_normalized'].unique())
off_file_path = "../csv_file/off_normalized_final.csv"

# Read only the header row (nrows=0) to discover the OFF column names without
# loading any data. Every column is treated as an attribute to check.
off_cols_df = pd.read_csv(off_file_path, sep="\t", nrows=0)
off_cols = off_cols_df.columns.tolist()
off_attributes = list(off_cols)
print(f"Founded {len(off_attributes)} attributes to check in off_df")

# For every OFF attribute, the set of normalized product names (restricted to
# the products referenced by the mapping) that carry a usable value for it.
valid_products_per_attribute = {attr: set() for attr in off_attributes}

# Values that count as "no information" once surrounding whitespace is removed.
placeholder_values = ['', 'unknown', 'none']

processed_chunks = 0

# The OFF file is streamed in chunks of 300000 rows. The reader is used as a
# context manager so the underlying file handle is closed even if processing
# a chunk raises.
with pd.read_csv(
    off_file_path,
    sep="\t",
    low_memory=False,
    on_bad_lines="skip",
    chunksize=300000,
    iterator=True,
    usecols=off_attributes,
) as off_reader:
    for chunk in tqdm(off_reader, desc="Processing OFF chunks", unit="chunk"):
        processed_chunks += 1

        # Keep only rows with a product name that the mapping actually needs.
        names = chunk['product_name_normalized']
        chunk = chunk[names.notna() & names.isin(needed_off_products)]
        if chunk.empty:
            del chunk
            gc.collect()
            continue

        product_names = chunk['product_name_normalized']
        for attribute in off_attributes:
            if attribute not in chunk.columns:
                continue

            values = chunk[attribute]
            # Stringify and strip once per column, then test all placeholders
            # with a single membership check. notna() is still required
            # because a missing value stringifies to 'nan', which is not a
            # placeholder.
            stripped = values.astype(str).str.strip()
            mask = values.notna() & ~stripped.isin(placeholder_values)
            valid_products_per_attribute[attribute].update(product_names[mask])

# Per-attribute results: number of covered recipes and the matching percentage.
attribute_counts = {}
attribute_percentages = {}

# Lookup from normalized recipe title to the set of recipe ids carrying that
# title, built only from the recipes that have a mapping.
title_to_recipe_ids = (
    hummus_df_relevant.groupby('title_normalized')['recipe_id'].apply(set).to_dict()
    if len(hummus_df_relevant) > 0
    else {}
)

no_ids = frozenset()

for attr in tqdm(off_attributes, desc="Calculating percentages", unit="attribute"):
    products_with_value = valid_products_per_attribute.get(attr, set())
    covered_recipe_ids = set()

    if products_with_value and title_to_recipe_ids:
        # Titles mapped to at least one product that has a valid value for
        # this attribute.
        product_hit = mapping_df['product_name_normalized'].isin(products_with_value)
        matched_titles = set(mapping_df.loc[product_hit, 'title_normalized'].unique())

        # Union of the recipe ids behind those titles; titles absent from the
        # lookup contribute nothing.
        for matched_title in matched_titles:
            covered_recipe_ids |= title_to_recipe_ids.get(matched_title, no_ids)

    covered_total = len(covered_recipe_ids)
    attribute_counts[attr] = covered_total

    if original_total_unique_recipes > 0:
        attribute_percentages[attr] = (covered_total / original_total_unique_recipes) * 100
    else:
        attribute_percentages[attr] = 0


# Assemble the per-attribute table (one row per attribute), ordered from the
# highest to the lowest coverage percentage.
count_column = pd.Series(attribute_counts)
percentage_column = pd.Series(attribute_percentages)
result_df = pd.DataFrame(
    {'count': count_column, 'percentage': percentage_column}
).sort_values('percentage', ascending=False)

# Human-readable percentage with two decimals and a trailing percent sign.
result_df['percentage_str'] = result_df['percentage'].apply("{:.2f}%".format)
result_df.index.name = 'attribute'

# Persist the table; the attribute names are written as the index column.
output_path = "../csv_file/hummus_off_attribute_coverage_85.csv"
result_df.to_csv(output_path)

Hummus caricato: 507335 righe
Mapping caricato: 37796563 righe
Hummus filtrato su ricette con mapping (rilevanti per il numeratore): 493749 righe
Trovati 208 attributi da controllare in off_df


Processing OFF chunks: 9chunk [15:44, 104.96s/chunk]
Calculating percentages: 100%|██████████| 208/208 [07:53<00:00,  2.28s/attribute]
