# Test the model for the food name translation

In [None]:
from analisys import translate_to_english_test

# Sample food names, some already in English and some in Italian, French or
# Spanish, fed one by one to the translation test helper.
food_list = [
    "pane al mais",  # Corn bread in Italian
    "apple",  # English
    "mela",  # Apple in Italian
    "taco",  # English
    "gâteau",  # Cake in French
    "spaghetti",  # English
    "burrito",  # English
    "alimento",  # Food in Spanish
    "salmon",  # English
    "limone",  # Lemon in Italian
    "cheeseburger",  # English
    "pasta",  # English
    "tortilla",  # English
    "sushi",  # English
    "zucchini",  # English
    "carne",  # Meat in Spanish
    "frutta",  # Fruit in Italian
    "okonomiyaki",  # Japanese savory pancake
    "pomme de terre",  # Potato in French
    '"baguette parisien"',  # French; the string itself contains double quotes
]

# Call the helper once per sample, in list order. `response` is overwritten on
# every iteration and never displayed here, so any visible output presumably
# comes from translate_to_english_test itself — TODO confirm.
for food in food_list:
    response = translate_to_english_test(text=food)

# Test brand filtering accuracy

In [None]:
from analisys import test_filtering_brand_accuracy

# Path handed to the brand-filtering accuracy helper imported above.
file = "../csv_file/brand_filter_test.csv"
test_filtering_brand_accuracy(file)

# Test user attributes extraction from user descriptions

In [None]:
from analisys import test_attribute_extraction

# Path passed to test_attribute_extraction; judging by the file name it holds
# user descriptions — confirm against the helper.
file = "../csv_file/user_description.csv"
test_attribute_extraction(file)

# Test user attributes extraction accuracy

In [None]:
import pandas as pd
from analisys import clean_text, compute_similarity

# Compare two attribute tables row by row: df1 is read from the
# tab-separated "..._giovanni_zedda.csv" file and df2 from the
# comma-separated "..._infered_LLM.csv" file. Per-row details are printed,
# followed by aggregate scores.

# Minimum similarity_score for a row to count as a correct row.
fuzzy_threshold = 70
count_over_threshold_row = 0
count_over_threshold = 0

# Attributes compared between the two files. The list is shared by both
# column selections and by the final percentage so they cannot drift apart.
attribute_columns = ['age', 'weight', 'height', 'gender', 'user_constraints']

df1 = pd.read_csv("../csv_file/member_description_list_for_test_attributes_giovanni_zedda.csv", sep="\t")
df2 = pd.read_csv("../csv_file/member_description_list_for_test_attributes_infered_LLM.csv", sep=",")

# clean_text is applied element-wise to every cell of both tables.
df1 = df1.map(clean_text)
df2 = df2.map(clean_text)

df1 = df1[attribute_columns].add_suffix('_1')
df2 = df2[attribute_columns].add_suffix('_2')

# Rows are paired purely by position: both indexes are reset to 0..n-1 before
# the column-wise concatenation.
df = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)

# compute_similarity receives one paired row and must yield two values per
# row, stored as similarity_score and count_sim.
df[['similarity_score', 'count_sim']] = df.apply(compute_similarity, axis=1)

for i, row in df.iterrows():
    if row['similarity_score'] >= fuzzy_threshold:
        count_over_threshold_row += 1

    count_over_threshold += row['count_sim']

    print(f"Row {i} - Similarity: {row['similarity_score']:.2f}")
    print(f"Row {i} - correct count: {row['count_sim']:.2f}")

    print(f"  df1 - age: {row['age_1']}, weight: {row['weight_1']}, height: {row['height_1']}, gender: {row['gender_1']}")
    print(f"  df2 - age: {row['age_2']}, weight: {row['weight_2']}, height: {row['height_2']}, gender: {row['gender_2']}")
    print(f"  df1 - User Constraints: {row['user_constraints_1']}")
    print(f"  df2 - User Constraints: {row['user_constraints_2']}")
    print("-" * 50, "\n\n")

# Normalise by the number of rows, not by the last loop index: the index is
# one less than the row count, which inflated both percentages and divided by
# zero when the table held a single row.
total_rows = len(df)
if total_rows:
    # Assumes count_sim is the number of matching attributes in a row, at most
    # len(attribute_columns) — TODO confirm against compute_similarity.
    total_attributes = total_rows * len(attribute_columns)
    print(f"fuzzy score: {df['similarity_score'].mean()}, percentage of right inference: {round(count_over_threshold / total_attributes * 100)}%, perccentage of right row: {round(count_over_threshold_row / total_rows * 100)}%")
else:
    print("No rows to compare.")


Row 0 - Similarity: 77.00
Row 0 - correct count: 4.00
  df1 - age: 30-50, weight: nan, height: nan, gender: female
  df2 - age: 30-50, weight: nan, height: nan, gender: nan
  df1 - User Constraints: physical activity category: gardening; physical activity category: reading
  df2 - User Constraints: physical activity category: gardening
-------------------------------------------------- 


Row 1 - Similarity: 56.67
Row 1 - correct count: 3.00
  df1 - age: 30-50, weight: nan, height: nan, gender: female
  df2 - age: 25-40, weight: nan, height: nan, gender: female
  df1 - User Constraints: dietary preference: chili's; dietary preference: chicken; dietary preference: dumplins
  df2 - User Constraints: physical activity category: ; religious constraint:
-------------------------------------------------- 


Row 2 - Similarity: 100.00
Row 2 - correct count: 5.00
  df1 - age: 18-25, weight: nan, height: nan, gender: nan
  df2 - age: 18-25, weight: nan, height: nan, gender: nan
  df1 - User Con

# Analysis of the brand threshold

In [None]:
from analisys import count_products_by_brand_threshold

# Dataset path plus the values 2 through 9 (range's upper bound is exclusive),
# passed to count_products_by_brand_threshold; presumably each value is tried
# as a brand threshold — confirm against the helper.
csv_file = "../csv_file/off_english.csv"
count_products_by_brand_threshold(csv_file, range(2, 10))

# Analysis of the number of instances per column in the OFF dataset

In [None]:
from analisys import number_of_instance_for_columns

# Arguments for number_of_instance_for_columns: the chunk size first, then the
# source and destination CSV paths.
chunk_size = 120_000  # presumably rows per chunk — confirm against the helper
input_file = "../csv_file/off_english.csv"
output_file = "../csv_file/column_analysis.csv"

number_of_instance_for_columns(input_file, output_file, chunk_size)

# Analysis of quantity attributes from the OFF dataset


In [None]:
from analisys import analisys_quantities

# Source and destination CSV paths for analisys_quantities. The meaning of
# n=5 is not visible from this notebook — check the helper's signature.
input_file = "../csv_file/off_english.csv"
output_file = "../csv_file/quantities.csv"
analisys_quantities(input_file, output_file, n=5)

# Analysis of attribute distribution in the OFF dataset

In [None]:
from analisys import plot_populated_counts

# Settings consumed by the plot_populated_counts call that follows.
csv_file_path = "../csv_file/off_english.csv"
output_directory = "../csv_file"
output_file = "current_attributes_analisys.png"

# Every column appears exactly once. The nutriscore_grade / nova_group /
# ecoscore_grade / ecoscore_score group had been pasted twice, which would
# make the helper process those four columns a second time.
columns_to_plot = [
    "product_name",
    "allergens",
    "traces_en",
    "calcium_100g",
    "iron_100g",
    "vitamin-c_100g",
    "vitamin-a_100g",
    "nutriscore_score",
    "nutrition-score-fr_100g",
    "nutriscore_grade",
    "nova_group",
    "ecoscore_grade",
    "ecoscore_score",
    "generic_name",
    "ingredients_text",
]

# Pass the settings defined above to plot_populated_counts (imported from the
# local `analisys` module); presumably it writes `output_file` under
# `output_directory` — confirm against its implementation.
plot_populated_counts(csv_file_path, output_directory, columns_to_plot, output_file)

In [None]:
from analisys import plot_populated_counts

# Settings consumed by the plot_populated_counts call that follows.
csv_file_path = "../csv_file/off_english.csv"
output_directory = "../csv_file"
output_file = "future_attributes_analisys.png"

# Every entry ends with a comma. Without one, Python joins adjacent string
# literals, which previously produced the bogus names "food_groups_enstores"
# and "countriescountries_en" and dropped four real columns.
# NOTE(review): the other tag columns here end in "_tags" (food_groups_tags,
# ingredients_tags, brands_tags); confirm that "countries_tag" and
# "cities_tag" really exist under those names in the CSV.
columns_to_plot = [
    "product_name",
    "quantity",
    "serving_size",
    "serving_quantity",
    "food_groups",
    "food_groups_tags",
    "food_groups_en",
    "stores",
    "countries",
    "countries_en",
    "countries_tag",
    "purchase_places",
    "origins_en",
    "cities",
    "cities_tag",
    "image_url",
    "ingredients_text",
    "ingredients_tags",
    "additives_n",
    "additives_en",
    "brands",
    "brands_tags",
    "generic_name",
    "main_category_en",
    "owner",
    "brand_owner",
    "packaging",
    "categories",
]

# Pass the settings defined above to plot_populated_counts (imported from the
# local `analisys` module); presumably it writes `output_file` under
# `output_directory` — confirm against its implementation.
plot_populated_counts(csv_file_path, output_directory, columns_to_plot, output_file)

In [None]:
from analisys import plot_populated_counts

# Settings consumed by the plot_populated_counts call that follows.
csv_file_path = "../csv_file/off_english.csv"
output_directory = "../csv_file"
output_file = "shared_attributes_analisys.png"

# Columns whose names carry the "_100g" suffix, kept in plotting order.
nutrient_columns_100g = (
    "energy_100g",
    "energy-from-fat_100g",
    "fat_100g",
    "saturated-fat_100g",
    "cholesterol_100g",
    "sodium_100g",
    "carbohydrates_100g",
    "fiber_100g",
    "sugars_100g",
    "proteins_100g",
)

# nutriscore_score comes first; it is not 100% in common with hummus.
columns_to_plot = ["nutriscore_score", *nutrient_columns_100g]

# Pass the settings defined above to plot_populated_counts (imported from the
# local `analisys` module); presumably it writes `output_file` under
# `output_directory` — confirm against its implementation.
plot_populated_counts(csv_file_path, output_directory, columns_to_plot, output_file)