In [None]:
import pandas as pd
import tqdm

def build_recipe_product_map(mapping_df: pd.DataFrame, recipe_titles: set) -> dict:
    """Group the mapping rows into ``{recipe title: {OFF product names}}``.

    Only rows whose ``title_normalized`` is contained in *recipe_titles* are
    kept. Columns are walked with ``zip`` instead of ``DataFrame.iterrows``,
    which avoids building a Series object for every row.
    """
    recipe_to_products = {}
    title_product_pairs = zip(
        mapping_df["title_normalized"],
        mapping_df["product_name_normalized"],
    )
    for title, product in title_product_pairs:
        if title in recipe_titles:
            recipe_to_products.setdefault(title, set()).add(product)
    return recipe_to_products


def products_with_attribute(off_df: pd.DataFrame, attr: str) -> set:
    """Return the product names that have a non-null, non-blank *attr* value.

    A value counts as blank when its string form is empty after stripping
    whitespace. Rows with a null product name are dropped as well.
    """
    populated = off_df[["product_name_normalized", attr]].dropna()
    is_non_blank = populated[attr].astype(str).str.strip() != ""
    return set(populated.loc[is_non_blank, "product_name_normalized"])


def count_covered_recipes(recipe_to_products: dict, valid_products: set) -> int:
    """Count the recipes linked to at least one product in *valid_products*."""
    return sum(
        1
        for products in recipe_to_products.values()
        if not valid_products.isdisjoint(products)
    )


def coverage_percentage(count: int, total: int) -> float:
    """Return ``count / total`` as a percentage, or 0.0 when *total* is 0."""
    if not total:
        return 0.0
    return count / total * 100


# The guard keeps the helpers importable without touching the CSV files. When
# the cell/script is executed directly (``__name__ == "__main__"`` also holds
# inside a notebook) everything below runs and the names stay module-level
# globals, exactly as before.
if __name__ == "__main__":
    print("Caricamento dei dataframe...")

    hummus_df = pd.read_csv(
        "../csv_file/pp_recipes_normalized_by_pipeline.csv",
        sep=";",
        low_memory=False,
        on_bad_lines="skip",
    )
    # NOTE: only the first 100000 OFF rows are read, so coverage is computed
    # against that sample of the products file.
    off_df = pd.read_csv(
        "../csv_file/off_normalized_final.csv",
        sep="\t",
        low_memory=False,
        on_bad_lines="skip",
        nrows=100000,
    )
    mapping_df = pd.read_csv(
        "../csv_file/file_off_hummus.csv",
        sep=",",
        low_memory=False,
        on_bad_lines="skip",
    )

    print("Dataframe caricati")

    # Keep only the products and recipes that appear in the mapping file.
    off_df = off_df[off_df['product_name_normalized'].isin(mapping_df['product_name_normalized'])]
    hummus_df = hummus_df[hummus_df['title_normalized'].isin(mapping_df['title_normalized'])]
    mapping_df = mapping_df.drop_duplicates()

    # Row count of the filtered recipe frame (kept for reference; it is NOT
    # the denominator of the coverage percentage, see below).
    total_hummus = len(hummus_df)
    print("Dataframe filtrati")

    # Null titles can never be matched through the mapping, so they are left
    # out of the population.
    unique_hummus_recipes = set(hummus_df["title_normalized"].dropna())
    # Coverage is counted per unique title, so the denominator must be the
    # number of unique titles too. Dividing by the row count deflates the
    # percentage whenever normalized titles repeat.
    total_unique_recipes = len(unique_hummus_recipes)

    hummus_to_off = build_recipe_product_map(mapping_df, unique_hummus_recipes)

    off_attributes = [col for col in off_df.columns if col != "product_name_normalized"]

    print("Costruzione dizionari di attributi aggregati...")

    # attribute -> set of product names for which that attribute is filled in
    attribute_presence_map = {
        attr: products_with_attribute(off_df, attr) for attr in off_attributes
    }

    print("Inizio il calcolo delle statistiche...")

    attribute_counts = {}
    attribute_percentages = {}

    for attr in tqdm.tqdm(off_attributes, desc="Processing attributes"):
        count = count_covered_recipes(hummus_to_off, attribute_presence_map[attr])
        attribute_counts[attr] = count
        attribute_percentages[attr] = coverage_percentage(count, total_unique_recipes)

    print("Generazione file CSV finale...")

    result_df = pd.DataFrame({
        'count': attribute_counts,
        'percentage': attribute_percentages
    })

    # Sort on the numeric value first; the column becomes text afterwards.
    result_df = result_df.sort_values('percentage', ascending=False)
    result_df['percentage'] = result_df['percentage'].apply(lambda x: f"{x:.2f}%")
    result_df.to_csv("../csv_file/hummus_off_attribute_coverage.csv")

    print("Fatto! File salvato in ../csv_file/hummus_off_attribute_coverage.csv")


Caricamento dei dataframe...
Dataframe caricati
Dataframe filtrati
Costruzione dizionari di attributi aggregati...
Inizio il calcolo delle statistiche...


Processing attributes: 100%|██████████| 207/207 [00:00<00:00, 571.81it/s]

Generazione file CSV finale...
Fatto! File salvato in ../csv_file/hummus_off_attribute_coverage.csv



