In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # Added numpy for percentile calculation

# ---------------------------------------------
# Setup
# ---------------------------------------------
# Seaborn grid style for every figure created after this point.
sns.set_style("whitegrid")
# Default figure size (width, height) for figures created without an explicit figsize.
plt.rcParams.update({"figure.figsize": (16, 10)})

# Assuming 'df' is your DataFrame loaded and columns cleaned

# ---------------------------------------------
# Define columns to explore
# ---------------------------------------------
# Columns to explore, listed in the order they appear in the plot grid.
nutritional_cols = (
    # Nutrient amount columns
    [
        "calories_cal",
        "protein_g",
        "totalfat_g",
        "saturatedfat_g",
        "cholesterol_mg",
        "sodium_mg",
        "totalcarbohydrate_g",
        "dietaryfiber_g",
        "sugars_g",
    ]
    # Recipe-level columns
    + ["duration", "ingredients_sizes"]
    # Score columns
    + ["who_score", "fsa_score", "nutri_score"]
)

# Nicely formatted titles for plots
# Human-readable label per column; the plot title is derived from it below.
_percentile_labels = {
    "calories_cal": "Calories",
    "protein_g": "Protein",
    "totalfat_g": "Total Fat",
    "saturatedfat_g": "Saturated Fat",
    "cholesterol_mg": "Cholesterol",
    "sodium_mg": "Sodium",
    "totalcarbohydrate_g": "Total Carbohydrates",
    "dietaryfiber_g": "Dietary Fiber",
    "sugars_g": "Sugar",
    "duration": "Cooking Duration",
    "ingredients_sizes": "Ingredient Sizes",
}
# Score columns get a title without the "(99th Percentile)" suffix.
_score_labels = {
    "who_score": "WHO Score",
    "fsa_score": "FSA Score",
    "nutri_score": "Nutritional Score",
}

# Column name -> plot title, in the same key order as the two label maps.
titles = {
    **{
        col: f"{label} Distribution (99th Percentile)"
        for col, label in _percentile_labels.items()
    },
    **{col: f"{label} Distribution" for col, label in _score_labels.items()},
}

# ---------------------------------------------
# Plot all nutritional distributions in a grid (with percentile filtering)
# ---------------------------------------------
# One histogram per entry of `nutritional_cols`, laid out three per row.
# Expects `df`, `nutritional_cols` and `titles` to be defined before this point.
n = len(nutritional_cols)
rows = max(1, (n + 2) // 3)  # ceil(n / 3); at least one row so subplots() stays valid

fig, axes = plt.subplots(rows, 3, figsize=(18, 5 * rows))
axes = axes.flatten()

# Columns whose x-axis is not clipped to [0, q99].
score_cols = {'who_score', 'fsa_score', 'nutri_score'}

for i, col in enumerate(nutritional_cols):
    ax = axes[i]
    if col not in df.columns:
        ax.set_visible(False)  # Hide axis if column not found
        continue

    data_to_plot = df[col].dropna()  # Drop NaNs for calculation and plotting

    # Reset per column: q99 must never leak from a previous iteration
    # (a non-numeric or empty column leaves it unset).
    q99 = None
    filtered_data = data_to_plot
    if pd.api.types.is_numeric_dtype(data_to_plot) and not data_to_plot.empty:
        q99 = np.percentile(data_to_plot, 99)
        # Only clip the upper tail when the 99th percentile is positive;
        # otherwise keep the data unfiltered.
        if q99 > 0:
            filtered_data = data_to_plot[data_to_plot <= q99]

    axis_label = col.replace('_', ' ').title()
    title = titles.get(col, axis_label)  # fall back to the axis label if no title is configured

    if not filtered_data.empty:
        sns.histplot(filtered_data, bins=50, ax=ax, color='mediumseagreen', edgecolor='black', kde=True)
        ax.set_title(title, fontsize=14)
        ax.set_xlabel(axis_label)
        # Focus the view on [0, q99] for the non-score columns that were filtered.
        if col not in score_cols and q99 is not None and q99 > 0:
            ax.set_xlim(0, q99)
        ax.set_ylabel('Frequency')
    else:
        ax.set_title(f"{title}\n(No data after filtering)", fontsize=14)
        ax.set_xlabel(axis_label)
        ax.set_ylabel('Frequency')

# Hide any grid cells beyond the last plotted column. Uses n rather than the
# loop variable so this also works when nutritional_cols is empty.
for j in range(n, len(axes)):
    axes[j].set_visible(False)

fig.suptitle('Nutritional Attribute Distributions (Filtered to 99th Percentile)', fontsize=20)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave room at the top for the suptitle
plt.show()

In [None]:
# --------------------------------------------------------------
# bm25_search.py (initial unoptimized version)
# Task 3 – Simple BM25-based retrieval for recipe search
# --------------------------------------------------------------

import os
import re
import pickle
import joblib
import pandas as pd
from rank_bm25 import BM25Okapi
from text_preprocessing import preprocess_text


# -------------------------------------------------
# 1. Load the preprocessed dataset
# -------------------------------------------------
def load_dataset(path="data/hummus_recipes_preprocessed.csv"):
    """
    Read the recipe CSV at ``path`` into a DataFrame and return it.

    Prints the source path before reading and the resulting shape afterwards.
    """
    print(f"📂 Loading dataset from: {path}")
    recipes = pd.read_csv(path, low_memory=True)
    shape = recipes.shape
    print(f"✅ Dataset shape: {shape}")
    return recipes


# -------------------------------------------------
# 2. Build or load BM25 index
# -------------------------------------------------
def build_bm25_index(df, index_path="data/bm25_index.pkl", tokens_path="data/tokenized_docs.pkl"):
    """
    Builds or loads a cached BM25 index and tokenized documents.

    If both cache files exist and the cached token list has one entry per row
    of ``df``, the cached pair is returned. Otherwise the index is rebuilt from
    the available ``processed_*`` text columns and written back to the cache.

    Side effect: when the index is (re)built, a ``combined_text`` column is
    added to ``df`` in place.

    Args:
        df: DataFrame holding the processed text columns.
        index_path: File the pickled BM25 object is read from / written to.
        tokens_path: File the tokenized documents are read from / written to.

    Returns:
        Tuple ``(bm25, tokenized_docs)``.

    Raises:
        ValueError: If none of the expected text columns exist in ``df``.
    """
    if os.path.exists(index_path) and os.path.exists(tokens_path):
        print(f"📦 Loading cached BM25 index from disk...")
        # NOTE(security): pickle/joblib deserialization can execute arbitrary
        # code. Only point these paths at cache files this project wrote itself.
        with open(index_path, "rb") as f:
            bm25 = pickle.load(f)
        tokenized_docs = joblib.load(tokens_path)
        # A cache built from a different dataset would map scores to the wrong
        # rows, so only trust it when it has one document per recipe.
        if len(tokenized_docs) == len(df):
            print(f"✅ Loaded BM25 index for {len(tokenized_docs)} recipes.")
            return bm25, tokenized_docs
        print(
            f"⚠️ Cached index covers {len(tokenized_docs)} recipes but the dataset "
            f"has {len(df)}; rebuilding."
        )

    print("⚙️ Building new BM25 index...")

    # Combine relevant processed text columns
    text_cols = ["processed_title", "processed_ingredients", "processed_tags", "processed_directions"]
    available_cols = [c for c in text_cols if c in df.columns]
    if not available_cols:
        raise ValueError(
            f"None of the expected text columns {text_cols} are present in the dataset."
        )
    print(f"Using columns: {available_cols}")

    # Combine text into one document per recipe. astype(str) keeps the join
    # from failing on any non-string cell.
    df["combined_text"] = df[available_cols].fillna("").astype(str).agg(" ".join, axis=1)

    # Tokenize (already preprocessed, so whitespace splitting is enough)
    tokenized_docs = [doc.split() for doc in df["combined_text"]]

    # Build BM25
    bm25 = BM25Okapi(tokenized_docs)

    # Save both index and tokenized docs for reuse; create the target
    # directories first so a fresh checkout without them does not fail here.
    for cache_path in (index_path, tokens_path):
        os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
    with open(index_path, "wb") as f:
        pickle.dump(bm25, f)
    joblib.dump(tokenized_docs, tokens_path)

    print(f"✅ BM25 index built and saved for {len(tokenized_docs)} recipes.")
    return bm25, tokenized_docs


# -------------------------------------------------
# 3. Preprocess and search
# -------------------------------------------------
def preprocess_query(query: str):
    """Run the query through ``preprocess_text`` and return its result."""
    processed = preprocess_text(query)
    return processed


def search_bm25(query: str, bm25, df, top_k=5):
    """
    Search using BM25 and return top_k results.

    Args:
        query: Raw query text; it is passed through ``preprocess_query``.
        bm25: Index object exposing ``get_scores(tokens)``.
        df: DataFrame whose row positions line up with the indexed documents.
        top_k: Maximum number of results to print and return.

    Returns:
        The ``top_k`` highest-scoring rows of ``df``, best first, restricted to
        whichever of ``title``, ``calories_cal``, ``totalfat_g`` and
        ``protein_g`` exist. An empty DataFrame is returned when the query has
        no tokens after preprocessing or when ``top_k`` is not positive.
    """
    print(f"\n🔍 Query: {query}")
    tokens = preprocess_query(query)
    if not tokens:
        print("⚠️ Query resulted in no valid tokens after preprocessing.")
        return pd.DataFrame()

    if top_k <= 0:
        # argsort()[-0:] is the full slice, so top_k=0 would return every row.
        return pd.DataFrame()

    scores = bm25.get_scores(tokens)
    top_indices = scores.argsort()[-top_k:][::-1]

    has_title = "title" in df.columns
    has_calories = "calories_cal" in df.columns

    print(f"\nTop {top_k} results:")
    for rank, idx in enumerate(top_indices, 1):
        # Scores are positional, so use .iloc here too. Label-based .loc would
        # pick the wrong rows (or fail) on any non-default index.
        title = df["title"].iloc[idx] if has_title else "(no title)"
        cal = df["calories_cal"].iloc[idx] if has_calories else "?"
        print(f"{rank}. {title}  ({cal} cal)")

    # Tolerate missing columns in the return, as the print loop above does.
    result_cols = [
        c for c in ("title", "calories_cal", "totalfat_g", "protein_g") if c in df.columns
    ]
    return df.iloc[top_indices][result_cols]


# -------------------------------------------------
# 4. Run example
# -------------------------------------------------
if __name__ == "__main__":
    # Load the data, build (or load) the index, then run the example queries.
    df = load_dataset()
    bm25, _ = build_bm25_index(df)

    # Example queries
    for example_query in (
        "low fat chicken under 500 calories",
        "high protein vegan salad",
    ):
        search_bm25(example_query, bm25, df, top_k=5)


In [None]:
# --------------------------------------------------------------
# semantic_retrieval.py (initial unoptimized version)
# Task 4: Semantic Embeddings & Retrieval (BERT-based)
# --------------------------------------------------------------

import os
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from text_preprocessing import preprocess_text


# -------------------------------------------------
# 1. Device check and model setup
# -------------------------------------------------
def setup_device_and_model(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """
    Pick the compute device and load the SentenceTransformer model onto it.

    CUDA is used when ``torch.cuda.is_available()`` reports a GPU; otherwise
    the model is loaded on the CPU.

    Returns:
        Tuple ``(model, device)`` where ``device`` is ``"cuda"`` or ``"cpu"``.
    """
    gpu_available = torch.cuda.is_available()
    device = "cuda" if gpu_available else "cpu"

    if gpu_available:
        print(f"✅ GPU detected: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️ No GPU detected, using CPU")

    encoder = SentenceTransformer(model_name, device=device)
    print(f"Loaded model: {model_name} on {device.upper()}")
    return encoder, device


# -------------------------------------------------
# 2. Load dataset
# -------------------------------------------------
def load_dataset(input_path="data/hummus_recipes_preprocessed.csv"):
    """
    Read the recipe CSV at ``input_path`` into a DataFrame and return it.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    path_exists = os.path.exists(input_path)
    if path_exists:
        recipes = pd.read_csv(input_path)
        print(f"✅ Dataset loaded: {recipes.shape}")
        return recipes
    raise FileNotFoundError(f"❌ Dataset not found at: {input_path}")


# -------------------------------------------------
# 3. Build combined text field
# -------------------------------------------------
def build_combined_text(df):
    """
    Add a ``combined_text`` column that joins the processed text columns.

    Only those of ``processed_title``, ``processed_ingredients``,
    ``processed_tags`` and ``processed_directions`` that exist in ``df`` are
    used. Missing values become empty strings and the parts are joined with a
    single space. ``df`` is modified in place and also returned.
    """
    candidate_cols = (
        "processed_title",
        "processed_ingredients",
        "processed_tags",
        "processed_directions",
    )
    present_cols = [name for name in candidate_cols if name in df.columns]
    print(f"Using columns for embeddings: {present_cols}")

    text_parts = df[present_cols].fillna("")
    joined = text_parts.agg(" ".join, axis=1)
    df["combined_text"] = joined
    print("✅ Combined text field created.")
    return df


# -------------------------------------------------
# 4. Create embeddings and save
# -------------------------------------------------
def create_and_save_embeddings(df, model, device, output_dir="data", batch_size=96):
    """
    Generates embeddings for ``df["combined_text"]`` and saves them as .npy.

    The texts are encoded with ``model.encode`` using L2-normalized output and
    written to ``<output_dir>/recipe_embeddings.npy``. ``batch_size`` is passed
    to the encoder exactly as given; it is NOT adjusted automatically, so lower
    it manually if the device runs out of memory.

    Args:
        df: DataFrame with a ``combined_text`` column.
        model: Encoder exposing a SentenceTransformer-style ``encode`` method.
        device: Device name or object; used only in the progress message.
        output_dir: Directory the .npy file is written to (created if missing).
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        The embeddings array returned by ``model.encode``.

    Raises:
        KeyError: If ``df`` has no ``combined_text`` column.
    """
    # Validate the input before touching the filesystem, so a bad call does
    # not leave an empty output directory behind.
    if "combined_text" not in df.columns:
        raise KeyError(
            "combined_text column is missing; run build_combined_text(df) first"
        )

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "recipe_embeddings.npy")

    # str() lets callers pass either "cuda"/"cpu" or a torch.device object.
    print(f"\n⚙️  Generating embeddings on {str(device).upper()} (batch size={batch_size}) ...")
    texts = df["combined_text"].tolist()

    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    )

    np.save(output_path, embeddings)
    print(f"✅ Embeddings created and saved → {output_path}")
    print(f"Shape: {embeddings.shape}")

    return embeddings


# -------------------------------------------------
# 5. Semantic search
# -------------------------------------------------
def semantic_search(query, model, df, embeddings, top_k=5):
    """
    Perform semantic retrieval based on cosine similarity between
    the query embedding and recipe embeddings.

    Args:
        query: Raw query text; it is passed through ``preprocess_text``.
        model: Encoder exposing a SentenceTransformer-style ``encode`` method.
        df: DataFrame whose row positions line up with ``embeddings``.
        embeddings: 2-D array of recipe embeddings, one row per recipe.
        top_k: Maximum number of results; negative values are treated as 0.

    Returns:
        DataFrame with ``title``, ``calories_cal``, ``totalfat_g``,
        ``protein_g`` and a ``similarity`` column, best match first. An empty
        DataFrame is returned when the query has no tokens after preprocessing.
    """
    print(f"\n🔍 Query: {query}")
    query_tokens = preprocess_text(query)
    if not query_tokens:
        # Encoding an empty string would still produce a ranking, but a
        # meaningless one, so stop here.
        print("⚠️ Query resulted in no valid tokens after preprocessing.")
        return pd.DataFrame()

    cleaned_query = " ".join(query_tokens)
    query_emb = model.encode([cleaned_query], normalize_embeddings=True)

    scores = cosine_similarity(query_emb, embeddings)[0]
    # Clamp so a negative top_k cannot turn into an "all but the last k" slice.
    top_indices = np.argsort(-scores)[:max(top_k, 0)]

    results = df.iloc[top_indices][["title", "calories_cal", "totalfat_g", "protein_g"]].copy()
    results["similarity"] = scores[top_indices]
    print("\nTop results:\n")
    print(results)
    return results


# -------------------------------------------------
# 6. Main
# -------------------------------------------------
def main():
    """Load the model and dataset, build and save the embeddings, then run sample queries."""
    model, device = setup_device_and_model()
    recipes = build_combined_text(load_dataset())

    embeddings = create_and_save_embeddings(recipes, model, device, batch_size=96)

    # Optional: Test queries
    sample_queries = (
        "low carb chicken meal",
        "high protein vegan breakfast",
        "sugar free dessert",
        "quick healthy pasta dinner",
    )
    for sample in sample_queries:
        semantic_search(sample, model, recipes, embeddings, top_k=5)


if __name__ == "__main__":
    main()
