In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from rapidfuzz import fuzz
from tqdm import tqdm
import re
from functools import lru_cache
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from collections import Counter


In [3]:
# Read the three recipe datasets from their parquet files.
dataset_1, dataset_2, dataset_3 = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../datasets/dataset_1.parquet",
        "../datasets/dataset_2.parquet",
        "../datasets/dataset_3.parquet",
    )
)

In [4]:
def normalize_ingredient(ingredient):
    """Return an order-insensitive, punctuation-free form of an ingredient.

    The text is lower-cased and stripped, every character that is neither a
    word character nor whitespace is removed, and the remaining tokens are
    sorted alphabetically and re-joined with single spaces.

    Args:
        ingredient: Raw ingredient string.

    Returns:
        The normalized string (empty if nothing but punctuation/whitespace
        was supplied).
    """
    lowered = ingredient.lower().strip()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    tokens = without_punctuation.split()
    # Sorting makes "olive oil" and "oil olive" normalize identically.
    tokens.sort()
    return " ".join(tokens)

def extract_all_ingredients(dfs):
    """Collect the distinct normalized ingredients found across several frames.

    Each frame's ``'ingredients'`` column is scanned after dropping missing
    values. A string cell is split on commas; a list, tuple or NumPy array
    cell is iterated as-is; any other cell type is skipped. Non-string items
    inside a cell are ignored.

    Args:
        dfs: Iterable of DataFrames that each have an ``'ingredients'`` column.

    Returns:
        Alphabetically sorted list of unique, non-empty normalized ingredient
        strings.
    """
    ingredients_set = set()
    for df in dfs:
        for row in df['ingredients'].dropna():
            if isinstance(row, str):
                items = row.split(',')
            elif isinstance(row, (list, np.ndarray, tuple)):
                items = row
            else:
                continue
            for ing in items:
                if not isinstance(ing, str):
                    continue
                normalized = normalize_ingredient(ing)
                # Blank or punctuation-only tokens (e.g. from "salt,,pepper"
                # or a trailing comma) normalize to "" and are not ingredients.
                if normalized:
                    ingredients_set.add(normalized)
    return sorted(ingredients_set)

# Pool the three frames and gather every distinct normalized ingredient.
dfs = [
    dataset_1,
    dataset_2,
    dataset_3,
]
all_ingredients = extract_all_ingredients(dfs=dfs)


In [5]:
def build_canonical_map_semantic(ingredients_list, threshold=85, model_name="all-MiniLM-L6-v2",
                                 distance_threshold=1.2):
    """Map ingredient spellings to one canonical spelling per semantic cluster.

    The ingredients are normalized, embedded with a SentenceTransformer model
    on CPU, and grouped with average-linkage agglomerative clustering on
    cosine distance. Inside each cluster the normalized form that occurs most
    often in ``ingredients_list`` becomes the canonical one (ties go to the
    first member encountered). A member is mapped to the canonical only if
    its ``fuzz.ratio`` against the canonical is at least ``threshold``;
    members below the threshold are left out of the map.

    Args:
        ingredients_list: Iterable of ingredient strings; duplicates are
            allowed and drive the frequency-based canonical choice.
        threshold: Minimum ``fuzz.ratio`` score for a member to be mapped to
            its cluster's canonical.
        model_name: Name passed to ``SentenceTransformer``.
        distance_threshold: Cut-off passed to ``AgglomerativeClustering``
            as ``distance_threshold``.

    Returns:
        Dict whose keys and values are spellings taken from
        ``ingredients_list`` (for each normalized form, the last spelling
        seen). Empty input yields an empty dict.
    """
    print("🧹 Preprocessing ingredients...")
    normalized_to_original = {}
    # Count occurrences per normalized form in the *input*; counting the
    # de-duplicated values instead would make every frequency equal to 1.
    freq_counter = Counter()
    for ing in ingredients_list:
        norm = normalize_ingredient(ing)
        normalized_to_original[norm] = ing
        freq_counter[norm] += 1
    unique_normalized = list(normalized_to_original)

    if not unique_normalized:
        # Nothing to embed or cluster.
        return {}

    if len(unique_normalized) == 1:
        # A single item forms its own cluster; skip loading the model and the
        # clustering step entirely.
        clusters = [0]
    else:
        print("🧠 Loading embedding model (CPU)...")
        model = SentenceTransformer(model_name)

        print("🔤 Encoding ingredients (CPU)...")
        embeddings = model.encode(
            unique_normalized,
            show_progress_bar=True,
            batch_size=128,           # Good default for CPU
            convert_to_numpy=True,
            device="cpu"
        )

        print("🧱 Clustering...")
        clustering_model = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            metric="cosine",
            linkage="average"
        )
        clusters = clustering_model.fit_predict(embeddings)

    print("📎 Building cluster dictionary...")
    cluster_dict = {}
    for norm, cluster_id in zip(unique_normalized, clusters):
        cluster_dict.setdefault(cluster_id, []).append(norm)

    print("📎 Building canonical map...")
    canonical_map = {}

    for members in tqdm(cluster_dict.values(), desc="Choosing canonicals"):
        # Most frequent normalized form wins; max() keeps the first on ties.
        canonical_normalized = max(members, key=freq_counter.__getitem__)
        canonical_original = normalized_to_original[canonical_normalized]

        # Fuzzy string similarity filters out cluster members whose spelling
        # is too far from the canonical to be a safe rewrite.
        for member_normalized in members:
            if fuzz.ratio(member_normalized, canonical_normalized) >= threshold:
                canonical_map[normalized_to_original[member_normalized]] = canonical_original

    return canonical_map

# Build the ingredient -> canonical-ingredient lookup from the pooled vocabulary.
canonical_map = build_canonical_map_semantic(ingredients_list=all_ingredients)

🧹 Preprocessing ingredients...
🧠 Loading embedding model (CPU)...
🔤 Encoding ingredients (CPU)...


Batches:   0%|          | 0/108587 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def apply_canonical_map(df, mapping):
    """Rewrite each row's ingredients using a canonical-ingredient mapping.

    Every cell of ``df['ingredients']`` is turned into a list of items: a
    string cell is split on commas, a list/tuple/NumPy-array cell is used
    as-is, and any other cell (including missing values) is treated as having
    no ingredients. Each string item is normalized, looked up in ``mapping``
    (falling back to the normalized item itself), de-duplicated, sorted and
    joined with ``", "``.

    Args:
        df: DataFrame with an ``'ingredients'`` column.
        mapping: Dict from normalized ingredient to canonical ingredient.

    Returns:
        List of strings, one per row of ``df``, in row order.
    """
    cleaned_rows = []
    print(" Applying canonical ingredient mapping...")
    for row in tqdm(df['ingredients'], desc="Processing rows", unit="row"):
        # Handle the same cell types as extract_all_ingredients; casting the
        # column to str would turn a list/array cell into its repr text.
        if isinstance(row, str):
            raw_items = row.split(',')
        elif isinstance(row, (list, np.ndarray, tuple)):
            raw_items = row
        else:
            raw_items = ()
        canonical_items = set()
        for raw in raw_items:
            if not isinstance(raw, str):
                continue
            item = normalize_ingredient(raw)
            # Skip blank tokens so they do not show up as an empty entry.
            if item:
                canonical_items.add(mapping.get(item, item))
        cleaned_rows.append(", ".join(sorted(canonical_items)))
    return cleaned_rows

# Canonicalize the ingredient text of each dataset with the shared mapping.
canon_1, canon_2, canon_3 = (
    apply_canonical_map(frame, canonical_map)
    for frame in (dataset_1, dataset_2, dataset_3)
)


In [None]:
# Learn the vocabulary and IDF weights from all three corpora together.
# Fitting on canon_1 alone would pick the max_features terms and IDF weights
# from dataset 1 only, so the 2-vs-3 comparison would be made in dataset 1's
# feature space and terms absent from dataset 1 would be ignored.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
vectorizer.fit([*canon_1, *canon_2, *canon_3])

tfidf_1 = vectorizer.transform(canon_1)
tfidf_2 = vectorizer.transform(canon_2)
tfidf_3 = vectorizer.transform(canon_3)

# Explicit row-wise L2 normalization so every document vector has unit length
# (or stays all-zero) before similarities are computed.
tfidf_1 = normalize(tfidf_1, norm='l2', axis=1)
tfidf_2 = normalize(tfidf_2, norm='l2', axis=1)
tfidf_3 = normalize(tfidf_3, norm='l2', axis=1)

def avg_cosine(tfidf_a, tfidf_b):
    """Return the mean cosine similarity over all row pairs of two matrices.

    The mean of all pairwise cosines equals the dot product of the two sums
    of unit-length rows divided by the number of pairs, so the full
    ``n_a x n_b`` similarity matrix never has to be built.

    Args:
        tfidf_a: 2-D array or sparse matrix, one document per row.
        tfidf_b: 2-D array or sparse matrix with the same number of columns.

    Returns:
        The average of ``cos(a_i, b_j)`` over every row ``a_i`` of
        ``tfidf_a`` and every row ``b_j`` of ``tfidf_b``.
    """
    # Re-normalize here so the result is a true cosine even if the caller
    # passes rows that are not unit length.
    unit_a = normalize(tfidf_a, norm='l2', axis=1)
    unit_b = normalize(tfidf_b, norm='l2', axis=1)
    # .sum(axis=0) may return a 1 x d matrix for sparse input; flatten it.
    row_sum_a = np.asarray(unit_a.sum(axis=0)).ravel()
    row_sum_b = np.asarray(unit_b.sum(axis=0)).ravel()
    n_pairs = unit_a.shape[0] * unit_b.shape[0]
    return (row_sum_a @ row_sum_b) / n_pairs

# Mean cross-dataset similarity for each unordered pair of datasets.
sim_1_2, sim_1_3, sim_2_3 = (
    avg_cosine(left, right)
    for left, right in (
        (tfidf_1, tfidf_2),
        (tfidf_1, tfidf_3),
        (tfidf_2, tfidf_3),
    )
)

# Symmetric 3x3 matrix; the diagonal is fixed at 1.0 rather than computed
# with avg_cosine.
sim_matrix = np.eye(3)
sim_matrix[0, 1] = sim_matrix[1, 0] = sim_1_2
sim_matrix[0, 2] = sim_matrix[2, 0] = sim_1_3
sim_matrix[1, 2] = sim_matrix[2, 1] = sim_2_3

In [None]:

# ----------------------
# Save as CSV
# ----------------------
labels = ["dataset_1", "dataset_2", "dataset_3"]
sim_df = pd.DataFrame(sim_matrix, index=labels, columns=labels)

# The default is an absolute path at the filesystem root; set the RESULTS_DIR
# environment variable to write somewhere else (e.g. a project-relative
# folder) without editing the notebook.
output_dir = os.environ.get("RESULTS_DIR", "/results")
os.makedirs(output_dir, exist_ok=True)
sim_df.to_csv(os.path.join(output_dir, "avg_cosine_similarity_matrix.csv"))

# ----------------------
# Print result
# ----------------------
print("Average Cosine Similarity Matrix:")
print(sim_df)