In [7]:
import pandas as pd
import numpy as np
from rapidfuzz import fuzz
import os

In [8]:
# ------------------------
# Step 1: Load datasets
# ------------------------
# Read the three parquet files in order and bind them to df1, df2, df3.
df1, df2, df3 = map(
    pd.read_parquet,
    (
        "../datasets/dataset_1.parquet",
        "../datasets/dataset_2.parquet",
        "../datasets/dataset_3.parquet",
    ),
)

In [9]:
def normalize_ingredient(s):
    """Return ``s`` with surrounding whitespace removed and letters lower-cased."""
    trimmed = s.strip()
    return trimmed.lower()

def extract_unique_ingredients(df):
    """Collect the distinct, normalized ingredient names in ``df["ingredients"]``.

    Each non-null cell may be a comma-separated string or a list / tuple /
    NumPy array of names. Cells of any other type, and items that are not
    strings, are skipped. Every name is passed through
    ``normalize_ingredient``; names that are empty after normalization
    (e.g. produced by a trailing comma or a whitespace-only item) are dropped
    so they are not counted as an ingredient.

    Args:
        df: DataFrame with an ``"ingredients"`` column.

    Returns:
        set of normalized, non-empty ingredient strings.
    """
    ingredients = set()
    for row in df["ingredients"].dropna():
        if isinstance(row, str):
            items = row.split(',')
        elif isinstance(row, (list, np.ndarray, tuple)):
            items = row
        else:
            continue
        for ing in items:
            if not isinstance(ing, str):
                continue
            name = normalize_ingredient(ing)
            if name:  # "a,,b", "a,b," or "  " would otherwise add "" to the set
                ingredients.add(name)
    return ingredients
def build_canonical_map(ingredients_list, threshold=85):
    """Map every ingredient name to a canonical representative via fuzzy matching.

    Distinct names are visited shortest first, with equal-length names in
    alphabetical order. Each name is scored with ``fuzz.token_sort_ratio``
    against the canonical names collected so far and is mapped to the first
    one whose score is at least ``threshold``. A name with no such match
    becomes a new canonical name and maps to itself.

    Args:
        ingredients_list: Iterable of ingredient strings; duplicates are ignored.
        threshold: Minimum similarity score required to merge two names
            (rapidfuzz scores range from 0 to 100).

    Returns:
        dict mapping each distinct input name to its canonical name.
    """
    # Progress bar is optional: import it here (the notebook's import cell does
    # not provide it) and fall back to plain iteration when tqdm is unavailable.
    try:
        from tqdm.auto import tqdm as progress_bar
    except ImportError:
        progress_bar = None

    # Sorting by (length, text) makes the visiting order - and therefore the
    # resulting map - independent of set iteration order, which changes between
    # kernel sessions because of string hash randomization.
    ingredients = sorted(set(ingredients_list), key=lambda name: (len(name), name))
    canonical = []
    mapping = {}

    print("⚡ Optimized fuzzy canonical map building...")
    if progress_bar is None:
        iterator = ingredients
    else:
        iterator = progress_bar(ingredients, desc="Fuzzy matching", unit="ingredient")

    for ing in iterator:
        for canon in canonical:
            # Every (ing, canon) pair is scored exactly once, so no memoization
            # is needed. score_cutoff lets rapidfuzz stop early on pairs that
            # cannot reach the threshold (those come back as 0).
            if fuzz.token_sort_ratio(ing, canon, score_cutoff=threshold) >= threshold:
                mapping[ing] = canon
                break
        else:
            # No existing canonical name was close enough.
            canonical.append(ing)
            mapping[ing] = ing

    return mapping

def canonicalize_set(ingredient_set, mapping):
    """Replace each ingredient by its canonical name; names absent from ``mapping`` are kept as-is."""
    canonical_names = set()
    for name in ingredient_set:
        canonical_names.add(mapping.get(name, name))
    return canonical_names

def jaccard_index(a, b):
    """Return the size of the intersection of ``a`` and ``b`` divided by the size of their union.

    Returns 0.0 when the union is empty.
    """
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)

In [10]:
# Extract the normalized ingredient vocabulary of every dataset
set1_raw, set2_raw, set3_raw = (
    extract_unique_ingredients(frame) for frame in (df1, df2, df3)
)

# Pool all names so a single fuzzy map covers the three datasets
all_ingredients = list(set1_raw | set2_raw | set3_raw)
canonical_map = build_canonical_map(all_ingredients, threshold=85)

# Rewrite each vocabulary in terms of canonical names
canon1, canon2, canon3 = (
    canonicalize_set(raw_names, canonical_map)
    for raw_names in (set1_raw, set2_raw, set3_raw)
)


KeyboardInterrupt: 

In [None]:
# Pairwise Jaccard similarity between the canonicalized ingredient sets.
datasets = {
    "dataset_1": canon1,
    "dataset_2": canon2,
    "dataset_3": canon3,
}
names = list(datasets.keys())

# Diagonal entries are fixed at 1.0; every other entry is rounded to 4 decimals.
results = pd.DataFrame(
    [
        [
            1.0 if row == col else round(jaccard_index(datasets[row], datasets[col]), 4)
            for col in names
        ]
        for row in names
    ],
    index=names,
    columns=names,
    dtype=float,
)

In [None]:
# ------------------------
# Step 5: Save result
# ------------------------
# Write the similarity matrix to CSV (the folder is created if it is missing),
# then echo the matrix.
output_dir = "./results"
csv_path = os.path.join(output_dir, "jaccard_index_matrix.csv")

os.makedirs(output_dir, exist_ok=True)
results.to_csv(csv_path)
print(results)