# SIMILARITY WITH A CLASSIC BERT

In [16]:
from entity_linking import find_most_similar_pairs, read_specified_columns

# Evaluation CSV; only the "off" and "foodkg" columns are requested.
file1_path = "../csv_file/entity_linking_test.csv"
lists = read_specified_columns(
    file_path=file1_path,
    elenco_colonne=["off", "foodkg"],
    delimiter=",",
)

# Split the rows into two parallel lists by position.
# NOTE(review): presumably position 0 is "off" and position 1 is "foodkg",
# following the requested column order -- confirm in read_specified_columns.
list1 = [row[0] for row in lists]
list2 = [row[1] for row in lists]

most_similar_pairs = find_most_similar_pairs(list1, list2)

# Report every (item from list1, item from list2, score) triple.
print("Most similar couples:\n")
for left_name, right_name, similarity in most_similar_pairs:
    print(f"({left_name}) --- ({right_name}) --- Similarity: {similarity:.3f}")

Most similar couples:

(Protein brownie mini) --- (brownie) --- Similarity: 0.686
(Tablet) --- (rennet tablet) --- Similarity: 0.607
(Saucisse a tartiner 170g) --- (hake fillets) --- Similarity: 0.327
(Noisettes crues) --- (italian - flavored croutons) --- Similarity: 0.411
(Hummus gegrilde groenten) --- (hummus) --- Similarity: 0.606
(Land O' Frost Premium Cured Roast Beef) --- (roast beef) --- Similarity: 0.743
(pepper jelly) --- (pepper jelly) --- Similarity: 1.000
(Whole Chocolate Milk) --- (chocolate milk) --- Similarity: 0.951
(Filet de poulet) --- (boneless pork filet) --- Similarity: 0.344
(Olivenmix mit Kräuter) --- (olive) --- Similarity: 0.621
(Rinder-Hackfleisch) --- (white wine vinegar) --- Similarity: 0.247
(ICA i♥eco 12 ekologiska ägg från frigående) --- (red enchilada sauce) --- Similarity: 0.272
(Harina de trigo especial reposteria) --- (italian - flavored croutons) --- Similarity: 0.281
(Sardines in water) --- (sardines) --- Similarity: 0.873
(Oeufs frais) --- (oatmea

# UNIVERSITY OF BARI METHOD

In [None]:
from tqdm import tqdm
from entity_linking import (
    RecipeTransformer,
    compute_embeddings,
    find_similar_by_title,
    read_specified_columns,
)

# Evaluation CSV; only the "off" and "foodkg" columns are requested.
file1_path = "../csv_file/entity_linking_test.csv"
lists = read_specified_columns(
    file_path=file1_path,
    elenco_colonne=["off", "foodkg"],
    delimiter=",",
)

# Split the rows into two parallel lists by position.
# NOTE(review): presumably position 0 is "off" and position 1 is "foodkg",
# following the requested column order -- confirm in read_specified_columns.
list1 = [row[0] for row in lists]
list2 = [row[1] for row in lists]

# Build the recipe transformer from its model identifier.
transformer_name = "davanstrien/autotrain-recipes-2451975973"
transformer = RecipeTransformer(transformer_name)

# Embed every candidate in list2 once, up front, so the search loop below
# can reuse the same embeddings for each query.
print("Calculating embeddings for list2...")
embeddings2 = compute_embeddings(list2, transformer)

# (index, title) pairs for list2, in the same order as list2 itself.
entities_list2 = list(enumerate(list2))

# For each title in list1, keep the best candidate returned by the search.
print("Searching for the most similar recipes...")
most_similar_pairs = []
for query_title in tqdm(list1, desc="Similarity search"):
    best_match, best_score = find_similar_by_title(
        query_title, entities_list2, embeddings2, transformer
    )
    # best_match is indexed at 1 to get the title of the (index, title) entry.
    most_similar_pairs.append((query_title, best_match[1], best_score))

# Print one line per (query title, matched title, score) triple.
print("Most similar recipe pairs found:\n")
for left_name, right_name, similarity in most_similar_pairs:
    print(f"({left_name}) --------- ({right_name}) --------- Similarity: {similarity:.3f}")

Calculating embeddings for list2...


Processing Titles embeddings: 100%|██████████| 106/106 [00:00<00:00, 192.97batch/s]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s]


Searching for the most similar recipes...


Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 224.86batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 124.99batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 183.04batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 115.96batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 94.25batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 110.50batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 140.12batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 168.86batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 141.13batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 199.91batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 142.09batch/s]
Processing Titles embeddings: 100%|██████████| 1/1 [00:00<00:00, 102.07batch/s]
Processing Titles embeddings: 100%|██████

Most similar recipe pairs found:

(Protein brownie mini) --------- (7 - Up soda) --------- Similarity: 0.886
(Tablet) --------- (lollipops) --------- Similarity: 0.819
(Saucisse a tartiner 170g) --------- (Dijon mustard) --------- Similarity: 0.881
(Noisettes crues) --------- (tzatziki) --------- Similarity: 0.899
(Hummus gegrilde groenten) --------- (Dijon mustard) --------- Similarity: 0.894
(Land O' Frost Premium Cured Roast Beef) --------- (chicken sandwich steaks) --------- Similarity: 0.855
(pepper jelly) --------- (pepper jelly) --------- Similarity: 1.000
(Whole Chocolate Milk) --------- (chocolate bar) --------- Similarity: 0.888
(Filet de poulet) --------- (Dijon mustard) --------- Similarity: 0.894
(Olivenmix mit Kräuter) --------- (prosciutto ham) --------- Similarity: 0.844
(Rinder-Hackfleisch) --------- (tzatziki) --------- Similarity: 0.846
(ICA i♥eco 12 ekologiska ägg från frigående) --------- (tzatziki) --------- Similarity: 0.850
(Harina de trigo especial reposteria) 




# HYBRID METHOD WITH INDICATOR TEST

In [5]:
from entity_linking import find_k_most_similar_pairs_with_indicators

# Smoke-test scenarios for find_k_most_similar_pairs_with_indicators.
# Each entry is (printed label, first list, second list, extra keyword args).
# The first three scenarios pass 5-tuples with three numeric fields and
# use_indicator=True; the last passes plain 2-tuples and relies on the
# function's default arguments.
indicator_cases = [
    (
        "actual contribution",
        [("Pasta", 30, 5, 10, "Pasta"), ("Pane", 50, 1, 10, "Pane")],
        [("Riso", 40, 2, 8, "Riso"), ("Pizza", 20, 10, 12, "Pizza")],
        {"use_indicator": True},
    ),
    (
        "negative contribution",
        [("Pasta", 100, 0, 0, "Pasta"), ("Pane", 0, 0, 0, "Pane")],
        [("Riso", 0, 2, 8, "Riso"), ("Pizza", 0, 50, 50, "Pizza")],
        {"use_indicator": True},
    ),
    (
        "positive contribution",
        [("Pasta", 33, 33, 33, "Pasta"), ("Pane", 0, 0, 0, "Pane")],
        [("Riso", 0, 2, 8, "Riso"), ("Pizza", 33, 33, 33, "Pizza")],
        {"use_indicator": True},
    ),
    (
        "no contribution",
        [("pasta", "pasta"), ("pane", "pane")],
        [("riso", "riso"), ("pizza", "pizza")],
        {},
    ),
]

# Run the scenarios in order and print each result next to its label.
for label, list1, list2, extra_kwargs in indicator_cases:
    result = find_k_most_similar_pairs_with_indicators(list1, list2, **extra_kwargs)
    print(label, result)

actual contribution [(0.4346590819370404, 'Pasta', 'Pizza'), (0.3310670316900095, 'Pane', 'Pizza')]
negative contribution [(0.35608773076070105, 'Pasta', 'Pizza'), (0.30325937271118164, 'Pane', 'Pizza')]
positive contribution [(0.4426902711391449, 'Pasta', 'Pizza'), (0.30325937271118164, 'Pane', 'Pizza')]
no contribution [(0.3926902711391449, 'pasta', 'pizza'), (0.30325937271118164, 'pane', 'pizza')]


In [1]:
from entity_linking import read_csv, evaluate_entity_linking_method

# Evaluation data used to score every candidate model.
file_path = "../csv_file/entity_linking_test.csv"
data = read_csv(file_path)

# Full pool of embedding models considered for entity linking. Kept under its
# own name so it is no longer silently overwritten by the short list below;
# assign it to list_of_models to run the complete sweep.
all_candidate_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-t5-base",
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
    "sentence-transformers/paraphrase-mpnet-base-v2",
    "sentence-transformers/stsb-roberta-large",
    "sentence-transformers/all-MiniLM-L12-v2",
    "sentence-transformers/distilbert-base-nli-stsb-mean-tokens",
    "sentence-transformers/msmarco-distilbert-base-v4",
    "flax-sentence-embeddings/all_datasets_v4_MiniLM-L6",
    "sentence-transformers/multi-qa-distilbert-cos-v1",
    "sentence-transformers/all-distilroberta-v1",
    "sentence-transformers/stsb-roberta-base-v2",
    "sentence-transformers/paraphrase-xlm-r-multilingual-v1",
    "sentence-transformers/multi-qa-MiniLM-L12-cos-v1",
    "sentence-transformers/msmarco-MiniLM-L12-v3",
    "sentence-transformers/msmarco-distilbert-base-v3",
    "sentence-transformers/paraphrase-albert-small-v2",
    "sentence-transformers/all-MiniLM-L6-cos-v1",
    "sentence-transformers/average_word_embeddings_glove.6B.300d",
    "flax-sentence-embeddings/all_datasets_v3_MiniLM-L6",
]

# Subset actually evaluated in this run.
list_of_models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-t5-base",
    "sentence-transformers/stsb-roberta-large",
]

# Track both the winning model's name and its score, so the final summary
# line can say which model won rather than only how well it did.
best_model = None
best_accuracy = 0

for model in list_of_models:
    # NOTE(review): the returned value is printed with a "%" suffix, so it is
    # presumably an accuracy on a 0-100 scale -- confirm in entity_linking.
    accuracy_method_1 = evaluate_entity_linking_method(data, show_progress=True, model=model)
    print(f"{model}: {accuracy_method_1:.2f}%")
    # Always accept the first model, so a name is reported even when every
    # accuracy is 0; afterwards only a strictly better score replaces it.
    if best_model is None or accuracy_method_1 > best_accuracy:
        best_model = model
        best_accuracy = accuracy_method_1

print(f"Best model: {best_model} ({best_accuracy:.2f}%)")

  from tqdm.autonotebook import tqdm, trange



Original OFF: Protein brownie mini, Linked FoodKG: brownie, 
  Similarity: 0.63, 
  Correct: True

Original OFF: Tablet, Linked FoodKG: rennet tablet, 
  Similarity: 0.65, 
  Correct: True

Original OFF: Saucisse a tartiner 170g, Linked FoodKG: smoked bacon, 
  Similarity: 0.32, 
  Correct: False

Original OFF: Noisettes crues, Linked FoodKG: italian - flavored croutons, 
  Similarity: 0.42, 
  Correct: False

Original OFF: Hummus gegrilde groenten, Linked FoodKG: hummus, 
  Similarity: 0.72, 
  Correct: True

Original OFF: Land O' Frost Premium Cured Roast Beef, Linked FoodKG: roast beef, 
  Similarity: 0.64, 
  Correct: True

Original OFF: pepper jelly, Linked FoodKG: pepper jelly, 
  Similarity: 1.00, 
  Correct: True

Original OFF: Whole Chocolate Milk, Linked FoodKG: chocolate milk, 
  Similarity: 0.86, 
  Correct: True

Original OFF: Filet de poulet, Linked FoodKG: boneless pork filet, 
  Similarity: 0.35, 
  Correct: False

Original OFF: Olivenmix mit Kräuter, Linked FoodKG: ol