# Feast Feature Store Explorer with Spark Backend

This notebook demonstrates how to query and explore the Feast feature store configured with a Spark offline backend and Iceberg tables on LakeFS.

## Data Flow
```
dlt (Kaggle) → Avro → MinIO → Spark → Iceberg (LakeFS) → Feast
```

## 1. Environment Setup

In [None]:
import os
import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory,
# so this cell assumes the kernel was started one level below the root.
project_root = Path.cwd().parent

# Make project packages importable. The membership check keeps the cell
# idempotent: re-running it must not keep prepending duplicate sys.path entries.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

# Defaults for local development (adjust as needed). setdefault() never
# overrides a value that is already exported in the environment.
# NOTE(review): the access key / secret below look like sample quickstart
# credentials - confirm no real secret is ever committed here.
os.environ.setdefault("LAKEFS_ENDPOINT_URL", "http://localhost:8000")
os.environ.setdefault("LAKEFS_ACCESS_KEY_ID", "AKIAIOSFOLQUICKSTART")
os.environ.setdefault("LAKEFS_SECRET_ACCESS_KEY", "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY")
os.environ.setdefault("LAKEFS_REPOSITORY", "kronodroid")
os.environ.setdefault("LAKEFS_BRANCH", "main")
os.environ.setdefault("REDIS_CONNECTION_STRING", "redis://localhost:16379")

print(f"Project root: {project_root}")
print(f"LakeFS endpoint: {os.environ['LAKEFS_ENDPOINT_URL']}")

## 2. Initialize Spark Session

Create a Spark session configured for Iceberg + LakeFS.

In [None]:
from engines.spark_engine.dfp_spark.session import get_spark_session, SparkConfig

# Session settings: application name plus 2g each for driver and executor.
spark_config = SparkConfig(app_name="feast_explorer", driver_memory="2g", executor_memory="2g")

# Obtain the Spark session from the project helper using the settings above.
spark = get_spark_session(config=spark_config)

# Report which Spark build and application name the session is using.
print("Spark version: {}".format(spark.version))
print("Spark app name: {}".format(spark.sparkContext.appName))

## 3. Initialize Feast Feature Store

Connect to the Feast feature store with Spark offline store configuration.

In [None]:
from feast import FeatureStore

# Directory holding feature_store.yaml, resolved relative to the project root.
feast_repo_path = project_root.joinpath("feature_stores", "feast_store")

# Open the feature store from that repository directory.
store = FeatureStore(repo_path=str(feast_repo_path))

# Echo the key settings of the loaded store.
print("Feast project: {}".format(store.project))
print("Registry path: {}".format(store.config.registry))
print("Offline store type: {}".format(store.config.offline_store.type))

## 4. List All Feature Views

Display all registered feature views in the Feast registry.

In [None]:
import pandas as pd

# Fetch each category of feature view from the registry. These three names
# are reused by later cells.
# NOTE(review): depending on the Feast version, list_feature_views() and
# list_batch_feature_views() may return overlapping objects - confirm before
# treating the counts below as disjoint.
feature_views = store.list_feature_views()
batch_feature_views = store.list_batch_feature_views()
on_demand_feature_views = store.list_on_demand_feature_views()

# Print one count per category between two separator rules.
print("\n📊 Feature Views Summary")
print("=" * 50)
print(f"Regular Feature Views: {len(feature_views)}")
print(f"Batch Feature Views: {len(batch_feature_views)}")
print(f"On-Demand Feature Views: {len(on_demand_feature_views)}")
print("=" * 50)

## 5. Feature Views Details

Display detailed information about each feature view.

In [None]:
def display_feature_view_details(fv):
    """Print a human-readable summary of a single feature view to stdout.

    Args:
        fv: Feature-view-like object. ``name`` and ``ttl`` are required.
            ``entities``, ``online``, ``tags``, ``batch_source`` / ``source``
            and ``schema`` are read defensively and may be missing or ``None``.

    Returns:
        None. The summary is printed.
    """
    print(f"\n🔹 Feature View: {fv.name}")
    print(f"   {'─' * 45}")

    # Entities may be objects carrying a ``name`` or plain strings; a missing
    # or None collection is treated as empty instead of raising.
    entity_names = [
        str(e.name) if hasattr(e, "name") else str(e)
        for e in (getattr(fv, "entities", None) or [])
    ]
    print(f"   Entities: {', '.join(entity_names)}")

    # TTL
    print(f"   TTL: {fv.ttl}")

    # Online serving flag, when the object exposes one
    online = getattr(fv, "online", "N/A")
    print(f"   Online: {online}")

    # Tags are only printed when non-empty
    tags = getattr(fv, "tags", None)
    if tags:
        print(f"   Tags: {tags}")

    # Prefer ``batch_source``; fall back to ``source`` when it is absent
    source = getattr(fv, "batch_source", getattr(fv, "source", None))
    if source:
        # Sources without a name are labelled by their class name
        source_name = getattr(source, "name", type(source).__name__)
        print(f"   Source: {source_name}")
        if hasattr(source, "table"):
            print(f"   Table: {source.table}")

    # Schema fields; a None schema is treated as empty
    schema = getattr(fv, "schema", None) or []
    if schema:
        print(f"   Features ({len(schema)}):")
        for field in schema:
            desc = getattr(field, "description", "")
            desc_str = f" - {desc}" if desc else ""
            print(f"      • {field.name}: {field.dtype}{desc_str}")

# Section banner, then one detail listing per regular feature view.
print("\n" + "=" * 60, "REGULAR FEATURE VIEWS", "=" * 60, sep="\n")
for fv in feature_views:
    display_feature_view_details(fv)

In [None]:
# Section banner, then one detail listing per batch feature view.
print("\n" + "=" * 60, "BATCH FEATURE VIEWS", "=" * 60, sep="\n")
for fv in batch_feature_views:
    display_feature_view_details(fv)

In [None]:
# Print a banner followed by one section per on-demand feature view.
print("\n" + "=" * 60)
print("ON-DEMAND FEATURE VIEWS")
print("=" * 60)
for odfv in on_demand_feature_views:
    print(f"\n🔸 On-Demand Feature View: {odfv.name}")
    print(f"   {'─' * 45}")

    # The keys of source_feature_view_projections are listed as the source
    # feature views.
    sources = list(odfv.source_feature_view_projections)
    print(f"   Source FVs: {', '.join(sources)}")

    # Computed fields; a missing or None schema is treated as empty.
    schema = getattr(odfv, "schema", None) or []
    if schema:
        print(f"   Computed Features ({len(schema)}):")
        for field in schema:
            print(f"      • {field.name}: {field.dtype}")

## 6. Entities

List all entities defined in the feature store.

In [None]:
# List all entities registered in the feature store.
entities = store.list_entities()

print("\n" + "=" * 60)
print("ENTITIES")
print("=" * 60)


def _entity_join_keys(entity):
    """Return an entity's join keys as a list of strings.

    Reads ``join_keys`` when the object has it and otherwise falls back to a
    singular ``join_key``. A plain string is wrapped in a list so it is not
    split into characters when joined.
    NOTE(review): Feast's Entity appears to expose only ``join_key`` -
    confirm against the installed Feast version.
    """
    keys = getattr(entity, "join_keys", None)
    if keys is None:
        keys = getattr(entity, "join_key", None)
    if keys is None:
        return []
    if isinstance(keys, str):
        return [keys]
    return [str(key) for key in keys]


# One row per entity; an empty description is shown as "N/A".
entity_data = [
    {
        "Name": entity.name,
        "Join Keys": ", ".join(_entity_join_keys(entity)),
        "Value Type": str(entity.value_type),
        "Description": entity.description or "N/A",
    }
    for entity in entities
]

# Explicit columns keep the table headers visible when the registry is empty.
entities_df = pd.DataFrame(
    entity_data,
    columns=["Name", "Join Keys", "Value Type", "Description"],
)
display(entities_df)

## 7. Data Sources

List all data sources configured in the feature store.

In [None]:
# Pull every data source registered in the feature store.
data_sources = store.list_data_sources()

print("\n" + "=" * 60, "DATA SOURCES", "=" * 60, sep="\n")

# Optional columns, in output order: (column label, attribute on the source).
# A column is only filled in for sources that actually have the attribute.
optional_columns = (("Table", "table"), ("Timestamp Field", "timestamp_field"))

source_data = []
for source in data_sources:
    source_info = {"Name": source.name, "Type": type(source).__name__}
    for column_label, attribute in optional_columns:
        if hasattr(source, attribute):
            source_info[column_label] = getattr(source, attribute)
    source_data.append(source_info)

sources_df = pd.DataFrame(source_data)
display(sources_df)

## 8. Feature Views Summary Table

Create a summary table of all feature views with their key attributes.

In [None]:
def get_fv_summary(fv, fv_type="FeatureView"):
    """Build a one-row summary dict for a feature view.

    Args:
        fv: Feature-view-like object; only ``name`` is required, every other
            attribute falls back to a placeholder when absent.
        fv_type: Label stored under the "Type" key.

    Returns:
        Dict with the keys Name, Type, Entities, # Features, TTL, Online,
        Source and Tags, in that order.
    """
    fields = getattr(fv, 'schema', [])

    # Entities can be objects with a ``name`` or plain values rendered via str().
    entity_labels = []
    if hasattr(fv, 'entities'):
        for ent in fv.entities:
            entity_labels.append(ent.name if hasattr(ent, 'name') else str(ent))

    tag_map = getattr(fv, 'tags', {})

    # ``batch_source`` wins; ``source`` is only the fallback when it is absent.
    fallback_source = getattr(fv, 'source', None)
    origin = getattr(fv, 'batch_source', fallback_source)
    if origin:
        origin_label = getattr(origin, 'name', 'N/A')
    else:
        origin_label = 'N/A'

    summary = {}
    summary["Name"] = fv.name
    summary["Type"] = fv_type
    summary["Entities"] = ", ".join(entity_labels)
    summary["# Features"] = len(fields)
    summary["TTL"] = str(getattr(fv, 'ttl', 'N/A'))
    summary["Online"] = getattr(fv, 'online', 'N/A')
    summary["Source"] = origin_label
    if tag_map:
        summary["Tags"] = ", ".join(f"{k}={v}" for k, v in tag_map.items())
    else:
        summary["Tags"] = "N/A"
    return summary

# Collect one summary row per feature view, grouped by category:
# regular first, then batch, then on-demand.
all_fv_data = [get_fv_summary(fv, "FeatureView") for fv in feature_views]
all_fv_data.extend(
    get_fv_summary(fv, "BatchFeatureView") for fv in batch_feature_views
)

# On-demand views get a hand-built row: their "Source" column lists the keys
# of source_feature_view_projections, and the remaining columns are fixed
# placeholders.
for odfv in on_demand_feature_views:
    sources = list(odfv.source_feature_view_projections)
    # A missing or None schema counts as zero features instead of raising.
    schema = getattr(odfv, "schema", None) or []
    all_fv_data.append({
        "Name": odfv.name,
        "Type": "OnDemandFeatureView",
        "Entities": "N/A",
        "# Features": len(schema),
        "TTL": "N/A",
        "Online": True,
        "Source": ", ".join(sources),
        "Tags": "N/A"
    })

fv_summary_df = pd.DataFrame(all_fv_data)
print("\n📋 Feature Views Summary Table")
print("=" * 80)
display(fv_summary_df)

## 9. Query Feature View with Spark (Example)

Demonstrate how to fetch historical features using the Spark offline store.

In [None]:
from datetime import datetime, timedelta

# Create a sample entity DataFrame for historical feature retrieval.
# This would typically come from your application data.
#
# The clock is read once so the three timestamps are exactly one day apart;
# calling datetime.now() per row makes them drift by a few microseconds.
# NOTE(review): datetime.now() is naive local time - confirm whether the
# offline store expects UTC / timezone-aware event timestamps.
reference_time = datetime.now()
entity_df = pd.DataFrame({
    "sample_id": ["sample_001", "sample_002", "sample_003"],
    "event_timestamp": [
        reference_time - timedelta(days=days_back) for days_back in (1, 2, 3)
    ],
})

print("Sample entity DataFrame:")
display(entity_df)

In [None]:
# Historical feature retrieval example. It is left disabled because it needs
# running infrastructure; uncomment the lines below to execute it.
# It uses the Spark offline store configured in feature_store.yaml.

# feature_refs = [
#     "malware_sample_features:app_package",
#     "malware_sample_features:is_malware",
#     "malware_sample_features:data_source",
#     "malware_sample_features:dataset_split",
# ]

# training_df = store.get_historical_features(
#     entity_df=entity_df,
#     features=feature_refs,
# ).to_df()

# print("Historical features retrieved via Spark:")
# display(training_df)

# Tell the reader why this cell produced no data.
print("ℹ️  Historical feature retrieval is commented out.")
print("   Uncomment the code above when infrastructure (LakeFS, Spark, Iceberg) is running.")

## 10. Registry Inspection

Inspect the Feast registry directly for additional metadata.

In [None]:
# Print project-level settings of the loaded feature store.
print("\n" + "=" * 60)
print("REGISTRY INFORMATION")
print("=" * 60)

print(f"\nProject: {store.project}")
print(f"Provider: {store.config.provider}")
print("\nOffline Store Configuration:")
print(f"  Type: {store.config.offline_store.type}")

# Show Spark configuration from the offline store, when it has one.
offline_store_config = store.config.offline_store
if hasattr(offline_store_config, 'spark_conf'):
    print("\nSpark Configuration:")
    # spark_conf can be present but None (it is optional in Feast's Spark
    # offline store config), so fall back to an empty mapping.
    for key, value in (offline_store_config.spark_conf or {}).items():
        # Mask any setting whose key suggests it holds a credential.
        lowered_key = key.lower()
        if any(marker in lowered_key for marker in ('secret', 'password', 'key')):
            print(f"    {key}: ***")
        else:
            print(f"    {key}: {value}")

print("\nOnline Store Configuration:")
print(f"  Type: {store.config.online_store.type}")

## 11. Cleanup

In [None]:
# The Spark session is deliberately left running so earlier cells can be
# re-run. Uncomment the next line to stop it when done.
# spark.stop()

print("\n✅ Notebook complete!")
print("   Spark session is still active. Call spark.stop() when finished.")