# Feature Engineering for Food Recommendation System

This notebook performs feature engineering on the cleaned Open Food Facts dataset using PySpark for scalable processing.

## Data Sources:
- `../data/cleaned_food_data_filtered.csv` - Original cleaned food data
- `../data/engineered_features_filtered.csv` - Previously engineered data

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
import sys
import os
from datetime import datetime

# PySpark imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import (
    VectorAssembler, StandardScaler as SparkScaler,
    HashingTF, IDF, Tokenizer, StopWordsRemover,
    StringIndexer, OneHotEncoder, Bucketizer
)
from pyspark.ml import Pipeline
from pyspark.sql.functions import (
    udf, col, when, isnan, isnull,
    regexp_replace, lower, trim, size
)

print("Libraries imported successfully!")

In [None]:
# Start (or reuse) the Spark session used by every later cell.
session_builder = (
    SparkSession.builder
    .appName("FoodRecommendationFeatureEngineering")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
)
spark = session_builder.getOrCreate()

# Keep notebook output readable: only warnings and errors from Spark.
spark.sparkContext.setLogLevel("WARN")
print(f"Spark session created! Version: {spark.version}")

# Read the cleaned CSV; the first row is the header and column types are inferred.
input_csv_path = '../data/cleaned_food_data_filtered.csv'
df_spark = spark.read.csv(input_csv_path, header=True, inferSchema=True)

loaded_row_count = df_spark.count()
loaded_column_count = len(df_spark.columns)
print(f"Data loaded: {loaded_row_count:,} rows, {loaded_column_count} columns")

# Print the inferred schema so column names and types can be checked by eye.
print("\nData Schema:")
df_spark.printSchema()

## Text Feature Engineering

Create features from text fields (ingredients, categories).

In [None]:
def _count_csv_items(column_name):
    """Return a Column counting the non-blank comma-separated items in a string column.

    NULL and empty strings count as 0 items. A plain ``size(split(...))``
    reports -1 for NULL input under Spark's default (non-ANSI) settings and 1
    for an empty string, which would leak bogus counts into later scaling.
    """
    return F.expr(
        f"size(filter(split(coalesce(`{column_name}`, ''), ','), "
        "item -> trim(item) != ''))"
    )


def create_text_features(df):
    """Create text-based features from the ingredient and category columns.

    Columns added when ``ingredients_text`` is present:
        ingredients_filtered: lower-cased ingredients text with every
            character other than ASCII letters, whitespace and commas removed.
        ingredient_count: number of non-blank comma-separated items in
            ``ingredients_filtered`` (0 for NULL or empty text).
        contains_<allergen>: 1 if ``ingredients_filtered`` contains the
            allergen word as a substring, otherwise 0 (also 0 for NULL text).

    Column added when ``main_category`` is present:
        category_count: number of non-blank comma-separated items in
            ``main_category`` (0 for NULL or empty text).

    Args:
        df: Spark DataFrame to extend. Missing source columns are skipped.

    Returns:
        A new DataFrame with the feature columns appended.
    """
    print("Creating text features...")

    # Clean ingredients text
    if 'ingredients_text' in df.columns:
        # NOTE(review): the pattern keeps ASCII letters only, so accented
        # letters are dropped as well - confirm that is intended for
        # non-English ingredient lists.
        df = df.withColumn(
            'ingredients_filtered',
            F.regexp_replace(F.lower(F.col('ingredients_text')), r'[^a-zA-Z\s,]', '')
        )

        # Count ingredients, ignoring blank items left behind by the cleaning
        # step and treating missing text as zero ingredients.
        df = df.withColumn('ingredient_count', _count_csv_items('ingredients_filtered'))

        # Allergen flags
        # NOTE(review): matching is by substring, so 'nuts' also fires on
        # 'peanuts' and 'eggs' does not fire on singular 'egg' - confirm this
        # is the intended matching rule.
        allergens = ['gluten', 'milk', 'eggs', 'nuts', 'peanuts', 'soy']
        for allergen in allergens:
            df = df.withColumn(
                f'contains_{allergen}',
                F.when(F.col('ingredients_filtered').contains(allergen), 1).otherwise(0)
            )

    # Category processing
    if 'main_category' in df.columns:
        df = df.withColumn('category_count', _count_csv_items('main_category'))

    return df

# Apply text features
df_with_text = create_text_features(df_spark)
print("Text features created successfully!")

# Preview only the columns that exist: create_text_features skips its
# ingredient-based outputs when 'ingredients_text' is absent, and selecting a
# missing column would raise an AnalysisException.
text_preview_cols = [
    name
    for name in ['product_name', 'ingredient_count', 'contains_gluten', 'contains_milk']
    if name in df_with_text.columns
]
df_with_text.select(text_preview_cols).show(5)

## Nutritional Feature Engineering

Create nutritional ratios and health scores.

In [None]:
def create_nutritional_features(df):
    """Create nutritional ratio, health-score and energy-bucket features.

    Columns added (each only when its source columns are present):
        fat_energy_ratio: ``fat_100g * 9 / energy_100g`` where
            ``energy_100g > 0``, otherwise 0.
        carb_energy_ratio: ``carbohydrates_100g * 4 / energy_100g`` where
            ``energy_100g > 0``, otherwise 0.
        health_score: integer 5..1 for Nutri-Score grades a..e
            (case-insensitive); 0 for NULL or any other value.
        energy_category: Bucketizer index (as a double) of ``energy_100g``
            over the splits (-inf, 100, 300, 500, +inf).

    Args:
        df: Spark DataFrame to extend. Missing source columns are skipped.

    Returns:
        A new DataFrame with the feature columns appended.
    """
    print("Creating nutritional features...")

    has_energy = 'energy_100g' in df.columns

    def _energy_share(grams_column, energy_per_gram):
        # Share of total energy contributed by one macronutrient; 0 when the
        # energy value is missing or not positive (avoids division by zero).
        energy = F.col('energy_100g')
        return F.when(
            energy > 0,
            (F.col(grams_column) * energy_per_gram) / energy
        ).otherwise(0)

    # Energy ratios
    # NOTE(review): the factors 9 and 4 look like kcal-per-gram values -
    # confirm 'energy_100g' is expressed in kcal and not kJ.
    if has_energy and 'fat_100g' in df.columns:
        df = df.withColumn('fat_energy_ratio', _energy_share('fat_100g', 9))

    # Both columns are required: the ratio reads 'energy_100g' too, and
    # referencing a missing column raises an AnalysisException.
    if has_energy and 'carbohydrates_100g' in df.columns:
        df = df.withColumn('carb_energy_ratio', _energy_share('carbohydrates_100g', 4))

    # Health score based on nutriscore
    if 'nutriscore_grade' in df.columns:
        nutriscore_map = {'a': 5, 'b': 4, 'c': 3, 'd': 2, 'e': 1}
        grade = F.lower(F.col('nutriscore_grade'))
        # Native column expression instead of a Python UDF: same mapping,
        # with NULL and unknown grades falling through to 0, but evaluated
        # inside Spark without per-row Python serialization.
        score = F.lit(0)
        for letter, points in nutriscore_map.items():
            score = F.when(grade == letter, points).otherwise(score)
        df = df.withColumn('health_score', score.cast('int'))

    # Energy category
    if has_energy:
        energy_buckets = [-float('inf'), 100, 300, 500, float('inf')]
        bucketizer = Bucketizer(splits=energy_buckets, inputCol='energy_100g', outputCol='energy_category')
        df = bucketizer.transform(df)

    return df

# Apply nutritional features
df_with_nutrition = create_nutritional_features(df_with_text)
print("Nutritional features created successfully!")

# Preview only the columns that exist: each nutritional feature is created
# only when its source columns are present, and selecting a missing column
# would raise an AnalysisException.
nutrition_preview_cols = [
    name
    for name in ['product_name', 'health_score', 'fat_energy_ratio', 'energy_category']
    if name in df_with_nutrition.columns
]
df_with_nutrition.select(nutrition_preview_cols).show(5)

In [None]:
# Feature Scaling
print("\nApplying feature scaling...")

# Spark dtype names treated as numeric, and the cap on how many columns are
# assembled into the feature vector.
NUMERIC_DTYPES = {'int', 'bigint', 'float', 'double'}
MAX_SCALED_FEATURES = 10

# Numeric columns, skipping any whose name contains 'id' (identifier-like).
candidate_cols = [
    col_name
    for col_name, dtype in df_with_nutrition.dtypes
    if dtype in NUMERIC_DTYPES and 'id' not in col_name.lower()
]

# Keep only columns with at least one non-null value. All counts are computed
# in a single aggregation instead of launching one Spark job per column.
numerical_cols = []
if candidate_cols:
    non_null_counts = df_with_nutrition.agg(
        *[F.count(F.col(col_name)).alias(col_name) for col_name in candidate_cols]
    ).first()
    numerical_cols = [
        col_name
        for col_name, non_null_count in zip(candidate_cols, non_null_counts)
        if non_null_count > 0
    ]

print(f"Found {len(numerical_cols)} numerical columns for scaling")

if len(numerical_cols) >= 2:
    scaled_cols = numerical_cols[:MAX_SCALED_FEATURES]
    if len(scaled_cols) < len(numerical_cols):
        print(f"Scaling only the first {len(scaled_cols)} columns: {scaled_cols}")

    # Create feature vector and scale
    # NOTE(review): VectorAssembler defaults to handleInvalid="error", so any
    # null left in these columns makes the fit fail - confirm the inputs are
    # fully populated or choose an explicit null strategy.
    assembler = VectorAssembler(inputCols=scaled_cols, outputCol="features")
    scaler = SparkScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)

    pipeline = Pipeline(stages=[assembler, scaler])
    model = pipeline.fit(df_with_nutrition)
    df_final = model.transform(df_with_nutrition)

    print("Feature scaling completed!")
else:
    df_final = df_with_nutrition
    print("Insufficient numerical features for scaling")

print(f"Final dataset: {df_final.count():,} rows, {len(df_final.columns)} columns")

In [None]:
# Save engineered features
print("\nSaving engineered features...")

try:
    # Save as CSV sample (memory efficient): only the first 50,000 rows are
    # collected to the driver as a pandas DataFrame.
    sample_df = df_final.limit(50000).toPandas()
    output_path = '../data/engineered_features_updated.csv'
    sample_df.to_csv(output_path, index=False)
    print(f"Sample features saved to: {output_path}")

    # Every feature the engineering steps can add. The list includes all six
    # allergen flags, and is then narrowed to the columns actually present,
    # because each feature is only created when its source column exists.
    expected_features = [
        'ingredients_filtered', 'ingredient_count', 'category_count',
        'contains_gluten', 'contains_milk', 'contains_eggs', 'contains_nuts',
        'contains_peanuts', 'contains_soy',
        'fat_energy_ratio', 'carb_energy_ratio', 'health_score', 'energy_category'
    ]
    final_columns = df_final.columns
    new_features = [name for name in expected_features if name in final_columns]

    # Save metadata
    metadata = {
        'timestamp': datetime.now().isoformat(),
        'spark_version': spark.version,
        'total_rows': df_final.count(),
        'total_features': len(final_columns),
        'new_features': new_features
    }

    with open('../data/feature_metadata.json', 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print("Feature engineering completed successfully!")
    print(f"Created {len(metadata['new_features'])} new features")

except Exception as e:
    # Best-effort save: report the failure and let the notebook continue.
    print(f"Error saving: {e}")

# Cleanup
print("\nFeature engineering pipeline completed.")
print("Run 'spark.stop()' when finished to free resources.")