# SIMILARITY WITH A CLASSIC BERT (Naive approach)

In [None]:
from entity_linking import find_most_similar_pairs, read_specified_columns

# Load the (off, foodkg) name pairs from the entity-linking test file.
file1_path = "../csv_file/entity_linking_test.csv"
lists = read_specified_columns(
    file_path=file1_path,
    elenco_colonne=["off", "foodkg"],
    delimiter=",",
)

# Split the rows column-wise: first field -> list1, second field -> list2.
list1 = [pair[0] for pair in lists]
list2 = [pair[1] for pair in lists]

# Match the two lists against each other with the naive approach.
most_similar_pairs = find_most_similar_pairs(list1, list2)

# Report every returned (left, right, similarity) triple.
print("Most similar couples:\n")
for left, right, similarity in most_similar_pairs:
    print(f"({left}) --- ({right}) --- Similarity: {similarity:.3f}")

# UNIVERSITY OF BARI METHOD

In [None]:
from tqdm import tqdm
from entity_linking import (
    RecipeTransformer,
    compute_embeddings,
    find_similar_by_title,
    read_specified_columns,
)

# Read the (off, foodkg) name pairs to be linked.
file1_path = "../csv_file/entity_linking_test.csv"
lists = read_specified_columns(
    file_path=file1_path,
    elenco_colonne=["off", "foodkg"],
    delimiter=",",
)
list1 = [pair[0] for pair in lists]
list2 = [pair[1] for pair in lists]

# Build the transformer that is passed to every embedding/search call below.
transformer_name = "davanstrien/autotrain-recipes-2451975973"
transformer = RecipeTransformer(transformer_name)

# Embed all of list2 once, before the search loop.
print("Calculating embeddings for list2...")
embeddings2 = compute_embeddings(list2, transformer)

# Pair each list2 title with its position: [(0, title0), (1, title1), ...].
entities_list2 = [(position, title) for position, title in enumerate(list2)]

# For each list1 title, keep the match returned by the search and its score.
most_similar_pairs = []
print("Searching for the most similar recipes...")
for query_title in tqdm(list1, desc="Similarity search"):
    best_match, best_score = find_similar_by_title(
        query_title, entities_list2, embeddings2, transformer
    )
    # presumably best_match is one of the (index, title) entries, so [1] is
    # the title -- verify against find_similar_by_title
    most_similar_pairs.append((query_title, best_match[1], best_score))

# Report the collected (query, match, score) triples.
print("Most similar recipe pairs found:\n")
for left, right, similarity in most_similar_pairs:
    print(f"({left}) --------- ({right}) --------- Similarity: {similarity:.3f}")

# HYBRID METHOD WITH INDICATOR TEST

In [None]:
from entity_linking import find_k_most_similar_pairs_with_indicators

# Each scenario is (label printed next to the result, list1, list2, extra
# keyword arguments). The first three pass use_indicator=True with numeric
# fields in each tuple; the last one passes plain name pairs and relies on the
# function's default.
indicator_scenarios = [
    (
        "actual contribution",
        [("Pasta", 30, 5, 10, "Pasta"), ("Pane", 50, 1, 10, "Pane")],
        [("Riso", 40, 2, 8, "Riso"), ("Pizza", 20, 10, 12, "Pizza")],
        {"use_indicator": True},
    ),
    (
        "negative contribution",
        [("Pasta", 100, 0, 0, "Pasta"), ("Pane", 0, 0, 0, "Pane")],
        [("Riso", 0, 2, 8, "Riso"), ("Pizza", 0, 50, 50, "Pizza")],
        {"use_indicator": True},
    ),
    (
        "positive contribution",
        [("Pasta", 33, 33, 33, "Pasta"), ("Pane", 0, 0, 0, "Pane")],
        [("Riso", 0, 2, 8, "Riso"), ("Pizza", 33, 33, 33, "Pizza")],
        {"use_indicator": True},
    ),
    (
        "no contribution",
        [("pasta", "pasta"), ("pane", "pane")],
        [("riso", "riso"), ("pizza", "pizza")],
        {},
    ),
]

# Run the scenarios in order and print each label with its result.
for label, list1, list2, extra_kwargs in indicator_scenarios:
    result = find_k_most_similar_pairs_with_indicators(list1, list2, **extra_kwargs)
    print(label, result)

# TEST OF VARIOUS BERT MODELS

Search for the best threshold value for each BERT model on a validation set.

In [None]:
from entity_linking import read_specified_columns, evaluate_entity_linking_method
import csv

# Validation pairs (normalized OFF name, normalized FoodKG name).
file_path = "../csv_file/entity_linking_test_normalized_validation.csv"
column_list = ["off_normalized", "foodkg_normalized"]
data = read_specified_columns(file_path, elenco_colonne=column_list, delimiter=",")

# Models taken from https://huggingface.co/spaces/mteb/leaderboard on 09/12/2024
list_of_models = [

    # top 5 in pair classification (around 10000000 parameter)
    # voyage is not free
    # "meta-llama/Meta-Llama-3-8B-Instruct",  # has a problem with the token padding
    "nvidia/NV-Embed-v2",
    "Salesforce/SFR-Embedding-Mistral",
    "compressa-ai/Compressa-Embeddings",

    # top 3 under 1000000 parameters
    "dunzhang/stella_en_400M_v5",
    "llmrails/ember-v1",
    "WhereIsAI/UAE-Large-V1",

    # top 3 under 100000 parameters
    "infgrad/stella-base-en-v2",
    "intfloat/e5-small",
    "BAAI/bge-small-en-v1.5",

    # top 5 overall
    # nvidia/NV-Embed-v2 already tested
    "dunzhang/stella_en_1.5B_v5",
    "BAAI/bge-en-icl",
    "blevlabs/stella_en_v5",
    "Salesforce/SFR-Embedding-2_R",

    # top 3 in sts
    "Lajavaness/bilingual-embedding-large",
    "ilhamdprastyo/jina-embeddings-v3-tei",
    "jinaai/jina-embeddings-v3",
]


column_names = [
    "model_name",
    "vocab_size",
    "number_of_parameters",
    "accuracy",
    "accuracy_on_considered",
    "number_of_TP_and_TN",
    "threshold",
]

# Candidate thresholds 0.80, 0.81, ..., 0.99; the same grid is used for every model.
threshold = [(i / 100) for i in range(80, 100, 1)]

output_file = "../csv_file/bert_comparison_validation.csv"

with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(column_names)

    for model in list_of_models:
        # The returned sequences get their own names. Unpacking them back into
        # `threshold` (as the cell used to do) replaced the candidate grid, so
        # every model after the first was evaluated with whatever the previous
        # call returned instead of the 0.80-0.99 grid.
        (
            model_names,
            vocab_sizes,
            parameter_counts,
            accuracies,
            accuracies_considered,
            tp_tn_counts,
            used_thresholds,
        ) = evaluate_entity_linking_method(
            data, show_progress=False, model=model, threshold_list=threshold
        )

        # One CSV row per aligned entry of the returned sequences. The first
        # column is the identifier from list_of_models, not the returned model
        # name (which is only kept in the zip so the row count is unchanged).
        for _, vocab_size, n_parameters, accuracy, accuracy_considered, tp_tn, used_threshold in zip(
            model_names,
            vocab_sizes,
            parameter_counts,
            accuracies,
            accuracies_considered,
            tp_tn_counts,
            used_thresholds,
        ):
            # NOTE(review): accuracy_on_considered is rounded to an integer while
            # accuracy keeps two decimals -- confirm this asymmetry is intended,
            # since the test-set cell picks the best threshold from this column.
            writer.writerow(
                [
                    model,
                    vocab_size,
                    n_parameters,
                    round(accuracy, 2),
                    round(accuracy_considered),
                    tp_tn,
                    used_threshold,
                ]
            )

print(f"file created {output_file}.")


Determine the best BERT model on a test set, using for each model the threshold selected on the validation set.

In [None]:
from entity_linking import read_specified_columns, evaluate_entity_linking_method
import csv

# Test pairs (normalized OFF name, normalized FoodKG name).
file_path = "../csv_file/entity_linking_test_normalized_test.csv"
column_list = ["off_normalized", "foodkg_normalized"]
data = read_specified_columns(file_path, elenco_colonne=column_list, delimiter=",")

# Per-threshold validation results written by the validation cell.
file1_path = "../csv_file/bert_comparison_validation.csv"
column_list = ["model_name", "threshold", "accuracy_on_considered"]
reader = read_specified_columns(file1_path, elenco_colonne=column_list, delimiter=",")

list_of_models = []
list_of_threshold = []
list_of_accuracy = []

# model -> [threshold, accuracy] of the best validation row seen so far.
# "Best" means the highest accuracy_on_considered; ties go to the lower threshold.
# Values are stored as read from the CSV and converted only for comparison.
model_threshold_dictionary = {}

for model, threshold, accuracy in reader:
    list_of_models.append(model)
    list_of_threshold.append(threshold)
    list_of_accuracy.append(accuracy)

    incumbent = model_threshold_dictionary.get(model)
    if incumbent is None:
        model_threshold_dictionary[model] = [threshold, accuracy]
        continue

    incumbent_threshold, incumbent_accuracy = incumbent
    candidate_score = float(accuracy)
    incumbent_score = float(incumbent_accuracy)
    # Boolean `or`/`and` (not bitwise `|`/`&`): the thresholds are parsed only
    # when the accuracies actually tie.
    if candidate_score > incumbent_score or (
        candidate_score == incumbent_score
        and float(threshold) < float(incumbent_threshold)
    ):
        model_threshold_dictionary[model] = [threshold, accuracy]

print(model_threshold_dictionary)


column_names = [
    "model_name",
    "vocab_size",
    "number_of_parameters",
    "accuracy",
    "accuracy_on_considered",
    "number_of_TP_and_TN",
    "threshold",
]

output_file = "../csv_file/bert_comparison_on_test_set.csv"

with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(column_names)

    for model, (selected_threshold, _selected_accuracy) in model_threshold_dictionary.items():
        try:
            threshold_list = [float(selected_threshold)]
        except ValueError:
            print(f"Invalid threshold for model {model}: {selected_threshold}")
            continue  # skip the model when its threshold is not a valid number

        # Evaluate the model on the test set with its single selected threshold.
        (
            model_names,
            vocab_sizes,
            parameter_counts,
            accuracies,
            accuracies_considered,
            tp_tn_counts,
            used_thresholds,
        ) = evaluate_entity_linking_method(
            data, show_progress=False, model=model, threshold_list=threshold_list
        )

        # One CSV row per aligned entry of the returned sequences.
        for returned_name, vocab_size, n_parameters, test_accuracy, test_accuracy_considered, tp_tn, used_threshold in zip(
            model_names,
            vocab_sizes,
            parameter_counts,
            accuracies,
            accuracies_considered,
            tp_tn_counts,
            used_thresholds,
        ):
            writer.writerow(
                [
                    returned_name,
                    vocab_size,
                    n_parameters,
                    round(test_accuracy, 2),
                    round(test_accuracy_considered),
                    tp_tn,
                    used_threshold,
                ]
            )

print(f"file created {output_file}.")

In [1]:
import os
import sys
import csv

from entity_linking import read_specified_columns

def _is_blank_row(row) -> bool:
    """Return True when a row carries no usable name.

    A row may be None, a plain string, or a sequence of fields; a sequence is
    blank when every field is None or the empty string.
    """
    if row is None:
        return True
    if isinstance(row, str):
        return row == ""
    return all(field is None or field == "" for field in row)


def _unique_non_blank_rows(rows) -> list:
    """Return the rows with blanks and repeats removed, in first-seen order.

    Rows must be hashable because they are tracked in a set.
    """
    seen = set()
    unique_rows = []
    for row in rows:
        if _is_blank_row(row) or row in seen:
            continue
        seen.add(row)
        unique_rows.append(row)
    return unique_rows


def _write_rows_csv(path: str, header_row: list[str], rows) -> None:
    """Write header_row followed by rows to a UTF-8 CSV file at path."""
    with open(path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(header_row)
        writer.writerows(rows)


header = ["name_normalized"]
file_off = "../csv_file/off_recipe_for_linking.csv"
file_hummus = "../csv_file/hum_recipe_for_linking.csv"
file_foodkg = "../csv_file/foodkg_recipe_for_linking.csv"

print("starting the file creation\n\n")

# Columns of the hummus file to be used for the merging
hummus_file_path = "../csv_file/pp_recipes_normalized_by_pipeline.csv"
hummus_column: list[str] = [
    "title_normalized"
]
# NOTE(review): rows are presumably 1-tuples (they are hashed and handed to
# csv.writer.writerow). A tuple never equals "", so the previous `row != ""`
# test could not reject a blank name; _is_blank_row looks inside the row.
list_hummus_recipe = _unique_non_blank_rows(
    read_specified_columns(hummus_file_path, hummus_column, delimiter=";")
)

print("numero ricette hummus: ", len(list_hummus_recipe))

_write_rows_csv(file_hummus, header, list_hummus_recipe)


# WAITING FOR THE NEW NORMALIZATION
# Columns of the off file to be used for the merging
# off_file_path = "../csv_file/off_normalized_final.csv"
# off_column = [
#     "product_name_normalized",
# ]
# list_off_recipe = _unique_non_blank_rows(
#     read_specified_columns(off_file_path, off_column, delimiter="\t")
# )
#
# print("numero ricette off: ", len(list_off_recipe))
#
# _write_rows_csv(file_off, header, list_off_recipe)


# Columns of the foodkg file to be used for the merging
foodkg_file_path = "../csv_file/ingredients_food_kg_normalizzed_by_pipeline.csv"
foodkg_column = [
    "ingredient_normalized"
]
list_foodkg_recipe = _unique_non_blank_rows(
    read_specified_columns(foodkg_file_path, foodkg_column, delimiter=",")
)

# This count is for the FoodKG rows; the label used to say "off".
print("numero ricette foodkg: ", len(list_foodkg_recipe))

_write_rows_csv(file_foodkg, header, list_foodkg_recipe)

starting the file creation


numero ricette hummus:  313821
numero ricette off:  13663
