In [29]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

In [30]:
# Read the two parquet files that are compared below.
# The dataset_2 load is kept commented out, so dataset_2 is not part of this run.
# dataset_2 = pd.read_parquet("../../datasets/dataset_2.parquet")
dataset_1, dataset_3 = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../../datasets/dataset_1.parquet",
        "../../datasets/dataset_3.parquet",
    )
)

In [31]:
def clean_ingredients(df: pd.DataFrame) -> pd.Series:
    """Return the ``ingredients`` column as strings, with missing entries blanked.

    Args:
        df: Frame that has an ``ingredients`` column.

    Returns:
        A Series in which missing values were first replaced by ``""`` and
        every value was then cast with ``astype(str)``.
    """
    column = df['ingredients']
    without_na = column.fillna("")
    return without_na.astype(str)

# Run the cleaner over each loaded dataset, in order (dataset_2 stays disabled).
ingredients_1, ingredients_3 = map(clean_ingredients, (dataset_1, dataset_3))
#ingredients_2 = clean_ingredients(dataset_2)


In [32]:
# ----------------------
# TF-IDF Vectorization
# ----------------------
# The vocabulary (at most 10,000 terms, English stop words removed) and the
# IDF weights are fitted on dataset_1 only; dataset_3 is then mapped into that
# same feature space with transform().
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

# Scale every row to unit L2 length right after vectorizing, so each name is
# bound exactly once.
tfidf_1 = normalize(vectorizer.fit_transform(ingredients_1), norm='l2', axis=1)
tfidf_3 = normalize(vectorizer.transform(ingredients_3), norm='l2', axis=1)

In [33]:
# Mean pairwise cosine similarity between dataset_1 and dataset_3.
#
# For row-wise unit vectors a_i and b_j the mean over all pairs factorises:
#   mean_ij(a_i . b_j) = (sum_i a_i) . (sum_j b_j) / (n_1 * n_3)
# so the dense n_1 x n_3 matrix that cosine_similarity() would allocate is
# never materialised; the work is linear in the number of stored non-zeros.
# Rows are re-normalised here (the same scaling cosine_similarity applies
# internally; all-zero rows stay zero) so the result does not depend on how
# the inputs were scaled upstream.
unit_1 = normalize(tfidf_1, norm='l2', axis=1)
unit_3 = normalize(tfidf_3, norm='l2', axis=1)
n_pairs = unit_1.shape[0] * unit_3.shape[0]

# Summing over axis 0 of a sparse matrix gives a 1 x V np.matrix; flatten it
# to a plain 1-D array before taking the dot product.
total_1 = np.asarray(unit_1.sum(axis=0)).ravel()
total_3 = np.asarray(unit_3.sum(axis=0)).ravel()

# From dataset_1 to dataset_3
sim_1_to_3 = np.dot(total_1, total_3) / n_pairs

# Cosine similarity is symmetric, so the 3 -> 1 direction has the same mean.
avg_similarity = sim_1_to_3

# NOTE(review): the diagonal is pinned to 1.0 (a vector's similarity with
# itself). That is not the mean pairwise similarity *within* a dataset, so
# diagonal and off-diagonal entries are not directly comparable.
sim_matrix = np.array([
    [1.0, avg_similarity],
    [avg_similarity, 1.0]
])

In [35]:
# Label both axes of the 2x2 matrix with the dataset names
labels = ["dataset_1", "dataset_3"]
sim_df = pd.DataFrame(data=sim_matrix, columns=labels, index=labels)

# ----------------------
# Save as CSV
# ----------------------
# The output directory is relative to the working directory and is created on demand.
output_dir = "results"
os.makedirs(output_dir, exist_ok=True)
csv_path = os.path.join(output_dir, "avg_cosine_similarity_matrix.csv")
sim_df.to_csv(csv_path)

# ----------------------
# Print result
# ----------------------
# Heading and table are emitted on separate lines by a single print call.
print("Average Cosine Similarity Matrix:", sim_df, sep="\n")

Average Cosine Similarity Matrix:
           dataset_1  dataset_3
dataset_1   1.000000   0.013978
dataset_3   0.013978   1.000000
