In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

In [2]:
# Load datasets
dataset_1 = pd.read_parquet("../../datasets/dataset_1.parquet")
dataset_2 = pd.read_parquet("../../datasets/dataset_2.parquet")
dataset_3 = pd.read_parquet("../../datasets/dataset_3.parquet")

In [3]:
def clean_ingredients(df: pd.DataFrame) -> pd.Series:
    """Clean ingredient strings by filling NA and converting to string."""
    return df['ingredients'].fillna("").astype(str)

ingredients_1 = clean_ingredients(dataset_1)
ingredients_2 = clean_ingredients(dataset_2)
ingredients_3 = clean_ingredients(dataset_3)


In [4]:
# ----------------------
# TF-IDF Vectorization
# ----------------------
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_1 = vectorizer.fit_transform(ingredients_1)
tfidf_2 = vectorizer.transform(ingredients_2)
tfidf_3 = vectorizer.transform(ingredients_3)

# Normalize vectors
tfidf_1 = normalize(tfidf_1, norm='l2', axis=1)
tfidf_2 = normalize(tfidf_2, norm='l2', axis=1)
tfidf_3 = normalize(tfidf_3, norm='l2', axis=1)

In [5]:
# ----------------------
# Compute Average Cosine Similarities
# ----------------------
def avg_cosine(tfidf_a, tfidf_b):
    return cosine_similarity(tfidf_a, tfidf_b).mean()

sim_1_2 = avg_cosine(tfidf_1, tfidf_2)
sim_1_3 = avg_cosine(tfidf_1, tfidf_3)
sim_2_3 = avg_cosine(tfidf_2, tfidf_3)

# Construct symmetric similarity matrix
sim_matrix = np.array([
    [1.0,     sim_1_2, sim_1_3],
    [sim_1_2, 1.0,     sim_2_3],
    [sim_1_3, sim_2_3, 1.0]
])

In [6]:

# ----------------------
# Save as CSV
# ----------------------
labels = ["dataset_1", "dataset_2", "dataset_3"]
sim_df = pd.DataFrame(sim_matrix, index=labels, columns=labels)

output_dir = "results"
os.makedirs(output_dir, exist_ok=True)
sim_df.to_csv(os.path.join(output_dir, "avg_cosine_similarity_matrix.csv"))

# ----------------------
# Print result
# ----------------------
print("Average Cosine Similarity Matrix:")
print(sim_df)

Average Cosine Similarity Matrix:
           dataset_1  dataset_2  dataset_3
dataset_1   1.000000   0.012810   0.013978
dataset_2   0.012810   1.000000   0.043379
dataset_3   0.013978   0.043379   1.000000
