# SnackTrack ML --- Content-Based Filtering Analysis

This notebook builds and evaluates the **content-based filtering** pipeline that
powers SnackTrack's recipe recommendations. We walk through every stage:

1. **Ingredient vectorization** -- TF-IDF + TruncatedSVD to produce 128-D ingredient vectors
   (matching the `ingredient_vector` column in the `recipes` table)
2. **Latent space visualization** -- t-SNE projections colored by cuisine
3. **User preference vectors** -- Weighted aggregation of recipe vectors from interactions
   (replicating `retrain_user_model()` from `app/recommender/hybrid.py`)
4. **Weight sensitivity** -- How changes in `INTERACTION_WEIGHTS` affect preference vectors
5. **Similarity sanity check** -- Top-K retrieval using cosine similarity
6. **Nutrition vector analysis** -- 12-D nutrition features, PCA/t-SNE, KMeans clustering

> **Prerequisites**: Run notebooks `00` and `01` first so that Parquet datasets are available.

In [None]:
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore", category=FutureWarning)

# Allow imports from the parent directory
sys.path.insert(0, "..")

from notebooks.utils.plot_helpers import (
    setup_plot_style,
    plot_latent_space_2d,
    plot_feature_distributions,
    SNACKTRACK_COLORS,
    PALETTE,
)
from notebooks.utils.data_loader import load_kaggle_dataset, extract_vae_features

# Apply the shared plotting style provided by notebooks.utils.plot_helpers
# before any figure is created in the cells that follow.
setup_plot_style()

# Reaching this line means every import above resolved successfully.
print("Environment ready.")

## 1. Build Ingredient Vectors

The `recipe_ingredients` Kaggle dataset contains ~40K recipes with:
- A list of **ingredients** (free-text)
- A **cuisine** label

We transform the ingredient lists into dense 128-dimensional vectors using:
1. **TF-IDF** on the ingredient text (captures ingredient frequency/importance)
2. **TruncatedSVD** to reduce to 128 dimensions (matching the DB `ingredient_vector` schema)

In [None]:
# Read the recipe/ingredient table. When the file is missing we fall back to an
# empty DataFrame so that later cells can detect the gap via `recipe_ing.empty`.
try:
    recipe_ing = load_kaggle_dataset("recipe_ingredients")
except FileNotFoundError as e:
    print(f"Dataset not found: {e}")
    print("Please run notebook 00 first.")
    recipe_ing = pd.DataFrame()
else:
    # Report the basic shape and schema, then preview the first few rows.
    n_rows, n_cols = recipe_ing.shape
    print(f"Loaded recipe_ingredients: {n_rows:,} rows, {n_cols} columns")
    print(f"Columns: {list(recipe_ing.columns)}")
    display(recipe_ing.head(3))

In [None]:
EMBEDDING_DIM = 128  # Matches DB ingredient_vector dimension


def ingredients_to_text(val):
    """Convert one raw ingredient cell into a single whitespace-joined string.

    Supported inputs:
      * list / tuple / numpy array -> elements joined with single spaces
        (list columns read back from Parquet commonly arrive as numpy arrays)
      * JSON-encoded list string such as '["salt", "pepper"]' -> decoded, joined
      * any other string -> returned unchanged
      * None / NaN / pd.NA -> "" so the empty-row filter below removes the row
        instead of vectorizing the literal words "None" or "nan"
      * anything else -> ``str(val)``
    """
    import json  # local import: keeps this cell self-contained

    if isinstance(val, (list, tuple, np.ndarray)):
        return " ".join(str(v) for v in val)

    if isinstance(val, str):
        # Handle JSON-encoded lists like '["salt", "pepper"]'
        if val.startswith("["):
            try:
                decoded = json.loads(val)
            except (json.JSONDecodeError, TypeError):
                decoded = None
            if isinstance(decoded, list):
                return " ".join(str(v) for v in decoded)
        return val

    # Missing values must become empty text, not the strings "None"/"nan".
    if val is None or val is pd.NA or (isinstance(val, float) and np.isnan(val)):
        return ""

    return str(val)


if not recipe_ing.empty:
    # Identify the ingredients column (may be named 'ingredients' or 'ingredient_list')
    ing_col = next(
        (c for c in ["ingredients", "ingredient_list", "ingredient"] if c in recipe_ing.columns),
        None,
    )

    # Identify cuisine column (optional; later cells use it only for labels)
    cuisine_col = next(
        (c for c in ["cuisine", "cuisine_type", "cuisine_types"] if c in recipe_ing.columns),
        None,
    )

    if ing_col is None:
        raise ValueError(f"Could not find ingredients column. Available: {list(recipe_ing.columns)}")

    # Convert ingredient lists to strings if they are stored as lists
    recipe_ing["ingredients_text"] = recipe_ing[ing_col].apply(ingredients_to_text)

    # Drop rows with empty/missing ingredients; the index is reset so that row i
    # of `recipe_ing` lines up with row i of `ingredient_vectors`.
    recipe_ing = recipe_ing[recipe_ing["ingredients_text"].str.strip().str.len() > 0].reset_index(drop=True)
    print(f"Recipes with valid ingredients: {len(recipe_ing):,}")

    # Step 1: TF-IDF vectorization (unigrams + bigrams, rare/ubiquitous terms pruned)
    tfidf = TfidfVectorizer(
        max_features=5000,
        stop_words="english",
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.95,
    )
    tfidf_matrix = tfidf.fit_transform(recipe_ing["ingredients_text"])
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

    # Step 2: TruncatedSVD to reduce to 128 dimensions
    svd = TruncatedSVD(n_components=EMBEDDING_DIM, random_state=42)
    ingredient_vectors = svd.fit_transform(tfidf_matrix)
    explained_var = svd.explained_variance_ratio_.sum()

    print(f"Ingredient vectors shape: {ingredient_vectors.shape}")
    print(f"Explained variance: {explained_var:.1%}")
    print(f"\nVector statistics:")
    print(f"  Mean: {ingredient_vectors.mean():.4f}")
    print(f"  Std:  {ingredient_vectors.std():.4f}")
    print(f"  Min:  {ingredient_vectors.min():.4f}")
    print(f"  Max:  {ingredient_vectors.max():.4f}")
    print(f"  L2 norm (mean): {np.linalg.norm(ingredient_vectors, axis=1).mean():.4f}")
else:
    # Define both sentinels so later cells can test them without a NameError.
    ingredient_vectors = None
    cuisine_col = None
    print("No recipe ingredients data available.")

## 2. Ingredient Space Visualization

We use **t-SNE** to project the 128-D ingredient vectors down to 2 dimensions
and color each point by its **cuisine type**. This reveals whether ingredient
compositions naturally cluster by cuisine -- a prerequisite for effective
content-based recommendations.

In [None]:
if ingredient_vectors is None:
    print("Skipping visualization -- no ingredient vectors available.")
else:
    # Both plotting variants draw the same seeded random subset, because running
    # t-SNE on the full dataset may be too slow.
    MAX_TSNE_SAMPLES = 5000
    n_samples = min(len(ingredient_vectors), MAX_TSNE_SAMPLES)

    rng = np.random.RandomState(42)
    sample_idx = rng.choice(len(ingredient_vectors), size=n_samples, replace=False)

    if cuisine_col is not None:
        sample_vectors = ingredient_vectors[sample_idx]
        sample_cuisines = recipe_ing[cuisine_col].iloc[sample_idx].values

        # Collapse everything outside the ten most frequent cuisines (counted on
        # the full table, not the sample) into "other" to keep the legend readable.
        top_cuisines = recipe_ing[cuisine_col].value_counts().head(10).index.tolist()
        cuisine_labels = [c if c in top_cuisines else "other" for c in sample_cuisines]

        print(f"Running t-SNE on {n_samples:,} samples...")
        fig = plot_latent_space_2d(
            sample_vectors,
            labels=cuisine_labels,
            title=f"Ingredient Space (t-SNE, {n_samples:,} recipes, top-10 cuisines)",
        )
        plt.show()

        # Report how many sampled recipes landed in each label bucket.
        cuisine_series = pd.Series(cuisine_labels)
        print(f"\nCuisine distribution in sample:")
        print(cuisine_series.value_counts().to_string())
    else:
        print("No cuisine column found -- plotting without color labels.")
        fig = plot_latent_space_2d(
            ingredient_vectors[sample_idx],
            title=f"Ingredient Space (t-SNE, {n_samples:,} recipes)",
        )
        plt.show()

## 3. Preference Vector Construction

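This section replicates the core logic from `retrain_user_model()` in
`app/recommender/hybrid.py`. Given a set of user interactions, we compute a
**preference vector** as a weighted sum of the recipe ingredient vectors, divided
by the total absolute weight and then L2-normalized to unit length. Each
per-interaction weight is clipped to [-10, 10] before aggregation.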

The interaction weights used in production are:

| Interaction Type | Weight |
|-----------------|--------|
| `cook` | 5.0 |
| `log` | 4.0 |
| `swap_accept` | 3.0 |
| `rate` | 1.0 x rating value |
| `view` | 1.0 |
| `swap_reject` | -2.0 |

In [None]:
# Production interaction weights (from app/recommender/hybrid.py)
# Maps interaction type -> multiplier applied to that recipe's vector when the
# user preference vector is aggregated. Positive values pull the preference
# vector toward the recipe; the negative value pushes it away.
INTERACTION_WEIGHTS = {
    "cook": 5.0,           # largest positive weight in the table
    "log": 4.0,
    "swap_accept": 3.0,
    "rate": 1.0,   # multiplied by actual rating value
    "view": 1.0,
    "swap_reject": -2.0,   # the only negative weight
}


def compute_preference_vector(
    interactions: list[dict],
    recipe_vectors: np.ndarray,
    recipe_ids: list,
    weights: dict | None = None,
) -> np.ndarray:
    """Compute a user preference vector from weighted recipe interactions.

    Matches the logic in retrain_user_model():
    1. Look up the recipe's ingredient vector for each interaction
    2. Multiply by the interaction weight (and rating value for 'rate' type)
    3. Compute weighted average
    4. Normalize to unit vector
    """
    if weights is None:
        weights = INTERACTION_WEIGHTS

    # Build recipe_id -> vector lookup
    id_to_idx = {rid: i for i, rid in enumerate(recipe_ids)}

    vectors = []
    w_list = []

    for interaction in interactions:
        rid = interaction["recipe_id"]
        if rid not in id_to_idx:
            continue

        vec = recipe_vectors[id_to_idx[rid]]
        itype = interaction["interaction_type"]
        ivalue = float(interaction.get("interaction_value", 0) or 0)

        w = weights.get(itype, 1.0)
        if itype == "rate":
            w *= ivalue

        vectors.append(vec)
        w_list.append(w)

    if not vectors:
        return np.zeros(recipe_vectors.shape[1])

    vectors_arr = np.array(vectors)
    weights_arr = np.array(w_list).reshape(-1, 1)

    # Clip extreme weights
    weights_arr = np.clip(weights_arr, -10, 10)
    total_weight = np.abs(weights_arr).sum()
    if total_weight == 0:
        total_weight = 1.0

    pref_vec = (vectors_arr * weights_arr).sum(axis=0) / total_weight

    # Normalize to unit vector
    norm = np.linalg.norm(pref_vec)
    if norm > 0:
        pref_vec = pref_vec / norm

    return pref_vec


if ingredient_vectors is not None:
    # Use the dataset's own ids when present; otherwise fall back to row positions.
    # Either way recipe_ids[i] corresponds to ingredient_vectors[i].
    if "id" in recipe_ing.columns:
        recipe_ids = recipe_ing["id"].tolist()
    else:
        recipe_ids = list(range(len(recipe_ing)))

    # Fixed script of interaction types for the simulated user (one per recipe).
    interaction_types = ["cook", "log", "view", "rate", "swap_accept", "view",
                         "cook", "log", "view", "rate", "swap_reject",
                         "view", "cook", "log", "rate"]

    # Simulate a user who has interacted with up to 15 random recipes. Sampling
    # without replacement raises ValueError if more items are requested than
    # exist, so cap the request at the number of available recipes.
    n_interactions = min(len(interaction_types), len(recipe_ids))
    rng = np.random.RandomState(123)
    sample_recipe_indices = rng.choice(len(recipe_ids), size=n_interactions, replace=False)

    sample_interactions = []
    for i, idx in enumerate(sample_recipe_indices):
        interaction = {
            "recipe_id": recipe_ids[idx],
            "interaction_type": interaction_types[i],
            # Only 'rate' interactions carry a value: a random rating of 3, 4 or 5.
            "interaction_value": rng.choice([3, 4, 5]) if interaction_types[i] == "rate" else None,
        }
        sample_interactions.append(interaction)

    # Compute preference vector
    pref_vec = compute_preference_vector(
        sample_interactions, ingredient_vectors, recipe_ids
    )

    print(f"Sample user has {len(sample_interactions)} interactions")
    print(f"Preference vector shape: {pref_vec.shape}")
    print(f"Preference vector L2 norm: {np.linalg.norm(pref_vec):.4f}")
    print(f"Non-zero dimensions: {(pref_vec != 0).sum()}/{len(pref_vec)}")

    # Show interaction breakdown
    interaction_summary = pd.DataFrame(sample_interactions)
    print(f"\nInteraction breakdown:")
    print(interaction_summary["interaction_type"].value_counts().to_string())
else:
    pref_vec = None
    print("Skipping -- no ingredient vectors available.")

## 4. Interaction Weight Sensitivity

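How sensitive is the resulting preference vector to changes in the interaction weights?
We scale each weight individually by a set of multipliers (0x to 5x) and measure the
**cosine distance** between the resulting preference vector and the baseline. A
**heatmap** shows which weights have the most influence. Because each per-interaction
weight is clipped to [-10, 10], large multipliers on already-large weights saturate:
`cook` (5.0) yields the same vector at 2x, 3x, and 5x.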

In [None]:
if ingredient_vectors is not None and pref_vec is not None:
    # Weight multipliers to test (1.0x reproduces the baseline, so its column is ~0)
    multipliers = [0.0, 0.5, 1.0, 2.0, 3.0, 5.0]
    interaction_types_to_test = ["cook", "log", "swap_accept", "rate", "view", "swap_reject"]

    # Compute baseline
    baseline_vec = compute_preference_vector(
        sample_interactions, ingredient_vectors, recipe_ids, INTERACTION_WEIGHTS
    )

    # Build cosine distance matrix: rows = interaction types, columns = multipliers
    distance_matrix = np.zeros((len(interaction_types_to_test), len(multipliers)))

    for i, itype in enumerate(interaction_types_to_test):
        for j, mult in enumerate(multipliers):
            # Scale exactly one weight; all others stay at their production value.
            # NOTE: compute_preference_vector clips each weight to [-10, 10], so
            # large multipliers on large base weights saturate at the clip bound.
            modified_weights = INTERACTION_WEIGHTS.copy()
            modified_weights[itype] = INTERACTION_WEIGHTS[itype] * mult

            modified_vec = compute_preference_vector(
                sample_interactions, ingredient_vectors, recipe_ids, modified_weights
            )

            # Cosine distance = 1 - cosine_similarity
            if np.linalg.norm(modified_vec) > 0 and np.linalg.norm(baseline_vec) > 0:
                cos_sim = cosine_similarity(
                    baseline_vec.reshape(1, -1), modified_vec.reshape(1, -1)
                )[0, 0]
                # Floating-point rounding can push cos_sim marginally above 1,
                # which would show up as "-0.0000" in the heatmap; clamp at 0.
                distance_matrix[i, j] = max(0.0, 1.0 - cos_sim)
            else:
                # A zero vector has no direction; report a fixed distance of 1.0.
                distance_matrix[i, j] = 1.0

    # Plot heatmap
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(
        distance_matrix,
        xticklabels=[f"{m:.1f}x" for m in multipliers],
        yticklabels=interaction_types_to_test,
        annot=True,
        fmt=".4f",
        cmap="YlOrRd",
        ax=ax,
        cbar_kws={"label": "Cosine Distance from Baseline"},
    )
    ax.set_xlabel("Weight Multiplier", fontsize=12)
    ax.set_ylabel("Interaction Type", fontsize=12)
    ax.set_title("Preference Vector Sensitivity to Interaction Weights", fontsize=14)
    plt.tight_layout()
    plt.show()

    # Identify most sensitive weight (largest distance reached across all multipliers)
    max_sensitivity = distance_matrix.max(axis=1)
    most_sensitive = interaction_types_to_test[np.argmax(max_sensitivity)]
    print(f"\nMost sensitive interaction type: '{most_sensitive}' "
          f"(max cosine distance = {max_sensitivity.max():.4f})")
    print(f"Least sensitive: '{interaction_types_to_test[np.argmin(max_sensitivity)]}' "
          f"(max cosine distance = {max_sensitivity.min():.4f})")
else:
    print("Skipping weight sensitivity analysis -- no vectors available.")

## 5. Content Similarity Sanity Check

For a sample preference vector, we retrieve the **top-10 most similar recipes**
by cosine similarity and display their names, cuisines, and similarity scores.
This replicates the core retrieval step of `get_content_recommendations()`.

In [None]:
if ingredient_vectors is not None and pref_vec is not None:
    # Score every recipe against the user's preference vector.
    similarities = cosine_similarity(pref_vec.reshape(1, -1), ingredient_vectors)[0]

    # Indices of the top-10 scores, best first.
    top_k = 10
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    # Resolve once which column (if any) holds a human-readable recipe name.
    name_col = next(
        (col for col in ["title", "name", "recipe_name", "dish"] if col in recipe_ing.columns),
        None,
    )
    show_cuisine = bool(cuisine_col and cuisine_col in recipe_ing.columns)

    # Assemble one display row per retrieved recipe.
    results = []
    for rank, idx in enumerate(top_indices, 1):
        recipe_row = recipe_ing.iloc[idx]
        row = {"Rank": rank, "Similarity": f"{similarities[idx]:.4f}"}

        # Fall back to the positional index when no name column exists.
        if name_col is not None:
            row["Recipe"] = recipe_row[name_col]
        else:
            row["Recipe"] = f"Recipe #{idx}"

        if show_cuisine:
            row["Cuisine"] = recipe_row[cuisine_col]

        # Truncate long ingredient strings to an 80-character preview.
        ing_text = recipe_row.get("ingredients_text", "")
        if len(str(ing_text)) > 80:
            row["Ingredients (preview)"] = ing_text[:80] + "..."
        else:
            row["Ingredients (preview)"] = ing_text

        results.append(row)

    results_df = pd.DataFrame(results)
    print(f"Top-{top_k} recipes most similar to sample user's preference vector:\n")
    display(results_df)

    # Histogram of all scores, with the lowest retrieved score marked as the cut-off.
    threshold = similarities[top_indices[-1]]
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.hist(similarities, bins=80, color=SNACKTRACK_COLORS["primary"], edgecolor="white", alpha=0.8)
    ax.axvline(threshold, color="red", linestyle="--",
               linewidth=1.5, label=f"Top-{top_k} threshold = {threshold:.3f}")
    ax.set_xlabel("Cosine Similarity to Preference Vector")
    ax.set_ylabel("Number of Recipes")
    ax.set_title("Similarity Score Distribution", fontsize=13)
    ax.legend()
    plt.tight_layout()
    plt.show()
else:
    print("Skipping similarity check -- no vectors or preference vector available.")

## 6. Nutrition Vector Analysis

Beyond ingredient vectors, SnackTrack uses a **12-dimensional nutrition vector**
for each recipe (see `extract_vae_features()` in `notebooks/utils/data_loader.py`).
The twelve dimensions are:

| Dim | Feature |
|-----|----------|
| 0 | calories |
| 1 | protein_g |
| 2 | carbs_g |
| 3 | fat_g |
| 4 | sodium_mg |
| 5 | fiber_g |
| 6 | sugar_g |
| 7 | ready_in_minutes |
| 8 | servings |
| 9 | vegetarian flag |
| 10 | vegan flag |
| 11 | gluten_free flag |

We visualize these features with PCA/t-SNE and apply KMeans clustering to see
if natural recipe groupings emerge from nutrition data alone.

In [None]:
# Load a dataset with nutrition columns
nutrition_df = None

# Try multiple Kaggle sources for nutrition data; the first one that exists and
# exposes at least three recognised nutrition columns wins.
for ds_name in ["global_food_nutrition", "epicurious", "daily_food_nutrition", "diet_recommendations"]:
    try:
        candidate = load_kaggle_dataset(ds_name)
    except FileNotFoundError:
        continue

    # Check if it has enough nutrition columns
    nutrition_cols_present = sum(
        1 for c in ["calories", "protein_g", "protein", "carbs_g", "carbohydrates",
                    "fat_g", "fat", "fiber_g", "fiber", "sugar_g", "sugar"]
        if c in candidate.columns
    )
    if nutrition_cols_present >= 3:
        nutrition_df = candidate
        print(f"Using '{ds_name}' for nutrition analysis ({len(nutrition_df):,} rows)")
        break

if nutrition_df is not None:
    # Standardize column names for extract_vae_features()
    col_renames = {
        "protein": "protein_g",
        "carbohydrates": "carbs_g",
        "carbs": "carbs_g",
        "total_carbohydrate_g": "carbs_g",
        "fat": "fat_g",
        "total_fat_g": "fat_g",
        "fiber": "fiber_g",
        "dietary_fiber_g": "fiber_g",
        "sugar": "sugar_g",
        "sugars_g": "sugar_g",
        "sodium": "sodium_mg",
        "energy_kcal": "calories",
    }

    # Only rename an alias when its canonical name is not already present or
    # claimed by an earlier alias. Otherwise the frame would end up with two
    # columns sharing one name, and selecting that name would return a
    # DataFrame instead of a Series.
    occupied = set(nutrition_df.columns)
    rename_map = {}
    for col in nutrition_df.columns:
        target = col_renames.get(col)
        if target is not None and target not in occupied:
            rename_map[col] = target
            occupied.add(target)
    nutrition_df = nutrition_df.rename(columns=rename_map)

    # Extract 12D feature vectors (same logic as production)
    features_12d = extract_vae_features(nutrition_df)
    print(f"Extracted 12D nutrition vectors: {features_12d.shape}")

    # Remove rows with all zeros. Absolute values are summed so that a row whose
    # positive and negative entries cancel is not mistaken for an empty row.
    # Rows containing NaN still compare False and are dropped.
    nonzero_mask = np.abs(features_12d).sum(axis=1) > 0
    features_12d = features_12d[nonzero_mask]
    print(f"After removing zero rows: {features_12d.shape}")

    # Plot feature distributions (names follow the 12 feature dimensions in order)
    feature_names = [
        "calories", "protein_g", "carbs_g", "fat_g", "sodium_mg",
        "fiber_g", "sugar_g", "ready_in_min", "servings",
        "vegetarian", "vegan", "gluten_free",
    ]
    fig = plot_feature_distributions(features_12d, feature_names, title="12D Nutrition Feature Distributions")
    plt.show()
else:
    features_12d = None
    print("No nutrition dataset with sufficient columns found.")

In [None]:
def _scatter_by_cluster(ax, coords, labels, n_clusters):
    """Draw one scatter series per cluster id on ``ax``, coloured from PALETTE.

    ``coords`` holds 2-D points (one row per sample) and ``labels`` the cluster
    id of each row; both must be aligned.
    """
    for cluster_id in range(n_clusters):
        members = labels == cluster_id
        ax.scatter(
            coords[members, 0],
            coords[members, 1],
            label=f"Cluster {cluster_id}",
            color=PALETTE[cluster_id % len(PALETTE)],
            alpha=0.5,
            s=15,
        )


if features_12d is not None and len(features_12d) > 100:
    # Put every feature on a comparable scale before PCA and clustering.
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features_12d)

    # --- PCA: linear 2-D projection of the scaled features ---
    pca = PCA(n_components=2, random_state=42)
    pca_coords = pca.fit_transform(features_scaled)

    print(f"PCA explained variance: {pca.explained_variance_ratio_.sum():.1%}")
    print(f"  PC1: {pca.explained_variance_ratio_[0]:.1%}")
    print(f"  PC2: {pca.explained_variance_ratio_[1]:.1%}")

    # --- KMeans clustering on the full scaled feature set ---
    n_clusters = 6
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(features_scaled)

    # Plot at most MAX_VIS points so the PCA panel stays legible.
    MAX_VIS = 5000
    if len(pca_coords) > MAX_VIS:
        rng = np.random.RandomState(42)
        vis_idx = rng.choice(len(pca_coords), size=MAX_VIS, replace=False)
    else:
        vis_idx = np.arange(len(pca_coords))

    fig, axes = plt.subplots(1, 2, figsize=(18, 7))

    # (a) PCA colored by cluster
    _scatter_by_cluster(axes[0], pca_coords[vis_idx], cluster_labels[vis_idx], n_clusters)
    axes[0].set_title(f"PCA of 12D Nutrition Vectors (K={n_clusters})", fontsize=13)
    axes[0].set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.0%} var)")
    axes[0].set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.0%} var)")
    axes[0].legend(fontsize=9)

    # (b) t-SNE colored by cluster, fitted on its own seeded subset
    from sklearn.manifold import TSNE

    tsne_sample = min(3000, len(features_scaled))
    rng2 = np.random.RandomState(42)
    tsne_idx = rng2.choice(len(features_scaled), size=tsne_sample, replace=False)

    print(f"Running t-SNE on {tsne_sample:,} nutrition samples...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=30)
    tsne_coords = tsne.fit_transform(features_scaled[tsne_idx])

    _scatter_by_cluster(axes[1], tsne_coords, cluster_labels[tsne_idx], n_clusters)
    axes[1].set_title(f"t-SNE of 12D Nutrition Vectors (K={n_clusters})", fontsize=13)
    axes[1].set_xlabel("t-SNE dim 1")
    axes[1].set_ylabel("t-SNE dim 2")
    axes[1].legend(fontsize=9)

    plt.tight_layout()
    plt.show()

    # Per-cluster size and mean of the first four (unscaled) feature dimensions.
    print(f"\nCluster sizes:")
    for cl in range(n_clusters):
        mask = cluster_labels == cl
        cluster_means = features_12d[mask].mean(axis=0)
        summary_line = (
            f"  Cluster {cl}: {mask.sum():>6,} recipes  "
            f"| avg cal={cluster_means[0]:.0f}  "
            f"prot={cluster_means[1]:.1f}g  "
            f"carbs={cluster_means[2]:.1f}g  "
            f"fat={cluster_means[3]:.1f}g"
        )
        print(summary_line)
else:
    print("Skipping PCA/t-SNE/KMeans -- insufficient nutrition data.")

print("\nContent-based analysis complete.")