# LLM Pipeline Demo for Wearable Insights

This notebook demonstrates the end-to-end pipeline for generating insights from wearable data using LLMs. We'll use the processed data and prompt templates from previous notebooks to generate personalized insights.

In [None]:
# Import necessary libraries
import sys
import os
import json
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# Add the src directory to the path so we can import our modules
sys.path.append('../')
from src.data_loader import DataLoader
from src.feature_engineer import FeatureEngineer
from src.insight_prompt_builder import InsightPromptBuilder
from src.llm_engine import LLMEngine

# Set up plotting
%matplotlib inline

## 1. Load Processed Data

First, let's load the processed data from the previous notebooks.

In [None]:
# Directory holding the JSON artefacts this notebook reads
processed_dir = '../data/processed'

# Combined feature dictionaries (one entry per day)
features_path = os.path.join(processed_dir, 'combined_features.json')
with open(features_path, 'r') as f:
    combined_features = json.load(f)

# The user's goal profile
goals_path = os.path.join(processed_dir, 'user_goals.json')
with open(goals_path, 'r') as f:
    user_goals = json.load(f)

# Quick sanity summary of what was loaded
print(f"Loaded data for {len(combined_features)} days")
print(f"User: {user_goals['name']}")

In [None]:
# Order the date keys and take the last one as the most recent day.
# NOTE(review): assumes the keys sort chronologically as strings
# (e.g. ISO-formatted dates) — confirm against the upstream notebook.
dates = sorted(combined_features)
latest_date = dates[-1]
latest_features = combined_features[latest_date]

print(f"Latest data date: {latest_date}")
print(f"Number of features: {len(latest_features)}")

## 2. Initialize Components

Now, let's initialize all the components we need for the pipeline.

In [None]:
# Builds the prompts that are sent to the LLM
prompt_builder = InsightPromptBuilder()

# Sends prompts to the LLM and returns the generated insight
llm_engine = LLMEngine()

# Real insights are only requested when the OpenAI key is present in the
# environment; otherwise later cells fall back to simulated text.
api_key_set = "OPENAI_API_KEY" in os.environ
if not api_key_set:
    for warning_line in (
        "WARNING: OpenAI API key not set. Some cells will not generate actual insights.",
        "To set the API key, run: export OPENAI_API_KEY='your-api-key'",
    ):
        print(warning_line)

## 3. End-to-End Pipeline: Daily Insight Generation

Let's implement a complete pipeline for generating daily insights.

In [None]:
def generate_daily_insight(date, features, user_goals, tone="coach"):
    """Produce the insight record for a single day.

    Relies on the notebook-level ``prompt_builder``, ``llm_engine`` and
    ``api_key_set`` names defined in the initialization cell.

    Args:
        date: Label of the day; interpolated into the prompt's time range
            and echoed back in the result.
        features: Feature data for that day, passed to the prompt builder.
        user_goals: Mapping with a ``'primary_goals'`` list whose entries
            carry ``'area'`` and ``'goal'`` keys.
        tone: Tone forwarded to the prompt builder (default ``"coach"``).

    Returns:
        dict with the keys ``"date"``, ``"prompt"``, ``"insight"`` and
        ``"metadata"``. When no API key is set, the insight is a placeholder
        string and the metadata describes a simulated run.
    """
    # First goal filed under the 'fitness' area, else a generic fallback
    fitness_goal = "improving overall fitness"
    for entry in user_goals['primary_goals']:
        if entry['area'] == 'fitness':
            fitness_goal = entry['goal']
            break

    prompt = prompt_builder.build_prompt(
        features,
        tone=tone,
        user_goal=fitness_goal,
        time_range=f"data from {date}"
    )

    if not api_key_set:
        # No key available: return a clearly-marked stand-in instead of calling the LLM
        insight = "[This is a simulated insight. Set OPENAI_API_KEY to generate real insights.]"
        metadata = {"model": "simulation", "latency_seconds": 0, "total_tokens": 0}
    else:
        insight, metadata = llm_engine.generate_insight(prompt)

    return dict(date=date, prompt=prompt, insight=insight, metadata=metadata)

In [None]:
# Generate insight for the latest day
latest_insight = generate_daily_insight(latest_date, latest_features, user_goals)

print(f"=== DAILY INSIGHT FOR {latest_date} ===\n")
print(latest_insight["insight"])

# Metadata is only shown for real API calls
if api_key_set:
    latest_metadata = latest_insight["metadata"]
    print("\nMetadata:")
    print(f"- Model: {latest_metadata.get('model')}")
    print(f"- Tokens: {latest_metadata.get('total_tokens')}")

    # .get() returns None when the engine did not report a latency, and
    # formatting None with ':.2f' raises TypeError, so handle it explicitly.
    latency_seconds = latest_metadata.get('latency_seconds')
    if latency_seconds is None:
        print("- Latency: unavailable")
    else:
        print(f"- Latency: {latency_seconds:.2f} seconds")

## 4. Generate Multiple Insights

Now, let's generate daily insights for the most recent days (capped at three to limit API calls) and focused insights for different areas.

In [None]:
# Only the newest days (at most three) are processed to keep API usage low
days_to_process = min(3, len(dates))
recent_dates = dates[len(dates) - days_to_process:]

daily_insights = []
for day in recent_dates:
    daily_insights.append(
        generate_daily_insight(day, combined_features[day], user_goals)
    )
    print(f"Generated insight for {day}")

print(f"\nGenerated {len(daily_insights)} daily insights")

In [None]:
# One targeted insight per focus area, all built from the latest day's features
focus_areas = ["sleep", "recovery", "activity"]
focused_insights = []

for focus in focus_areas:
    # First user goal registered for this area, or None when there is none
    goal = None
    for entry in user_goals['primary_goals']:
        if entry['area'] == focus:
            goal = entry['goal']
            break

    prompt = prompt_builder.build_focused_prompt(
        latest_features,
        focus_area=focus,
        user_goal=goal
    )

    if not api_key_set:
        # No key available: use a clearly-marked stand-in instead of calling the LLM
        insight = f"[This is a simulated {focus}-focused insight. Set OPENAI_API_KEY to generate real insights.]"
        metadata = {"model": "simulation", "latency_seconds": 0, "total_tokens": 0}
    else:
        insight, metadata = llm_engine.generate_insight(prompt)

    focused_insights.append(
        dict(focus_area=focus, prompt=prompt, insight=insight, metadata=metadata)
    )

    print(f"Generated {focus}-focused insight")

print(f"\nGenerated {len(focused_insights)} focused insights")

## 5. Display Generated Insights

Let's display the generated insights in a readable format.

In [None]:
# Print every daily insight under a dated banner, followed by a divider
daily_divider = "\n" + "-" * 80 + "\n"
for record in daily_insights:
    print(f"=== DAILY INSIGHT FOR {record['date']} ===\n")
    print(record["insight"])
    print(daily_divider)

In [None]:
# Print every focused insight under an upper-cased area banner, followed by a divider
focused_divider = "\n" + "-" * 80 + "\n"
for record in focused_insights:
    area_label = record['focus_area'].upper()
    print(f"=== {area_label}-FOCUSED INSIGHT ===\n")
    print(record["insight"])
    print(focused_divider)

## 6. Save Generated Insights

Let's save the generated insights for future reference.

In [None]:
def _save_insights(insights, path):
    """Write a list of insight records to ``path`` as indented JSON.

    datetime/timedelta values found directly in each record's ``'metadata'``
    dict are converted with ``str``; every other value is passed through
    unchanged. The input records are not modified.
    """
    serializable_insights = [
        {
            **insight,
            'metadata': {
                k: str(v) if isinstance(v, (datetime, timedelta)) else v
                for k, v in insight['metadata'].items()
            },
        }
        for insight in insights
    ]

    # Encode fully before opening the file so a serialization error cannot
    # leave a truncated or empty output file behind.
    payload = json.dumps(serializable_insights, indent=2)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)


# Create outputs directory if it doesn't exist
outputs_dir = '../outputs'
os.makedirs(outputs_dir, exist_ok=True)

# Save daily insights
daily_insights_path = os.path.join(outputs_dir, 'daily_insights.json')
_save_insights(daily_insights, daily_insights_path)

# Save focused insights
focused_insights_path = os.path.join(outputs_dir, 'focused_insights.json')
_save_insights(focused_insights, focused_insights_path)

print(f"Saved daily insights to {daily_insights_path}")
print(f"Saved focused insights to {focused_insights_path}")

## 7. Implement a Complete Insight Pipeline Function

Let's create a reusable function that implements the complete insight generation pipeline.

In [None]:
def _group_records_by_date(records):
    """Group record dicts by their 'date' value, keeping first-seen order."""
    grouped = {}
    for record in records:
        grouped.setdefault(record['date'], []).append(record)
    return grouped


def _write_insights_json(insights, path):
    """Write insight records to ``path`` as indented JSON.

    datetime/timedelta values found directly in each record's 'metadata'
    dict are converted with ``str``; the input records are not modified.
    """
    serializable_insights = []
    for insight in insights:
        serializable_insight = insight.copy()
        serializable_insight['metadata'] = {
            k: str(v) if isinstance(v, (datetime, timedelta)) else v
            for k, v in insight['metadata'].items()
        }
        serializable_insights.append(serializable_insight)

    # Encode before opening the file so a serialization error cannot leave a
    # truncated output file behind.
    payload = json.dumps(serializable_insights, indent=2)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)


def generate_insights_pipeline(data_dir, raw_data_files, user_goals_file, output_dir, 
                              generate_daily=True, generate_focused=True, 
                              focus_areas=None, days_to_process=1):
    """Complete pipeline for generating insights from raw wearable data.

    Loads the raw files, extracts per-day features, generates daily and/or
    focused insights, and writes them to ``output_dir`` as JSON.

    Note: daily insights are produced by the notebook-level
    ``generate_daily_insight`` function, which uses the notebook-level
    ``prompt_builder`` / ``llm_engine``; the real-vs-simulated decision for
    focused insights reads the notebook-level ``api_key_set`` flag.

    Args:
        data_dir: Directory handed to ``DataLoader``.
        raw_data_files: File names to load. A ``.csv`` file whose name
            contains "hrv" is used as HRV data; a ``.json`` file whose name
            contains "activity" is used as activity/sleep data (matching is
            case-insensitive). The JSON payload must provide ``'activities'``
            and ``'sleep_sessions'`` lists of records with a ``'date'`` key.
        user_goals_file: File name of the user goals, loaded via the loader.
        output_dir: Directory for the JSON outputs (created if missing).
        generate_daily: Whether to generate and save daily insights.
        generate_focused: Whether to generate and save focused insights
            (based on the most recent processed day).
        focus_areas: Areas for focused insights; defaults to
            ``["sleep", "recovery", "activity"]`` when empty or None.
        days_to_process: Number of most recent days to process; must be >= 1.

    Returns:
        dict with ``"daily_insights"`` and ``"focused_insights"`` lists, or
        None (after printing an error) when ``days_to_process`` is below 1,
        a required data file is missing, or the data contains no dates.
    """
    # A value of 0 would make the "last N days" slice select *every* day and
    # a negative value would drop the oldest days, so reject both up front.
    if days_to_process < 1:
        print("Error: days_to_process must be at least 1")
        return None

    # Initialize components
    loader = DataLoader(data_dir=data_dir)
    feature_eng = FeatureEngineer()
    prompt_builder = InsightPromptBuilder()
    llm_engine = LLMEngine()
    
    # Step 1: Load and process raw data
    print("Step 1: Loading and processing raw data...")
    
    # Load user goals
    user_goals = loader.load_file(user_goals_file)
    
    # Classify each raw data file by extension and name
    hrv_data = None
    activity_sleep_data = None
    
    for file in raw_data_files:
        data = loader.load_file(file)
        
        # Compare on the lower-cased name so extension and keyword checks agree
        file_name = file.lower()
        if file_name.endswith('.csv') and 'hrv' in file_name:
            hrv_data = data
        elif file_name.endswith('.json') and 'activity' in file_name:
            activity_sleep_data = data
    
    if hrv_data is None or activity_sleep_data is None:
        print("Error: Missing required data files")
        return None
    
    # Step 2: Extract features
    print("Step 2: Extracting features...")
    
    # Process HRV data
    normalized_hrv = loader.normalize_data(hrv_data, 'hrv')
    cleaned_hrv = loader.clean_data(normalized_hrv)
    daily_hrv = loader.segment_by_day(cleaned_hrv)
    
    # Process activity data: one DataFrame per date
    activity_dfs_by_day = {
        date: pd.DataFrame(activities)
        for date, activities in _group_records_by_date(activity_sleep_data['activities']).items()
    }
    
    # Process sleep data: one DataFrame per date
    sleep_dfs_by_day = {
        date: pd.DataFrame(sleeps)
        for date, sleeps in _group_records_by_date(activity_sleep_data['sleep_sessions']).items()
    }
    
    # Union of the dates seen by any source, in sorted order.
    # NOTE(review): sorting assumes all three sources use mutually comparable
    # date keys (e.g. all strings) — confirm the key type of segment_by_day.
    all_dates = sorted(
        set(daily_hrv.keys())
        | set(activity_dfs_by_day.keys())
        | set(sleep_dfs_by_day.keys())
    )
    
    if not all_dates:
        # Without this guard the focused step would fail on recent_dates[-1]
        print("Error: No dated records found in the input data")
        return None
    
    # Limit to the specified number of most recent days (slicing past the
    # start of the list simply yields every date)
    recent_dates = all_dates[-days_to_process:]
    
    # Training load is computed once from every day's activities
    activity_dfs = list(activity_dfs_by_day.values())
    training_load = feature_eng.calculate_training_load(activity_dfs)
    
    # Extract features for each day; a source with no data for the day
    # contributes an empty feature dict
    combined_features_by_day = {}
    for date in recent_dates:
        hrv_features = feature_eng.extract_hrv_features(daily_hrv[date]) if date in daily_hrv else {}
        activity_features = feature_eng.extract_activity_features(activity_dfs_by_day[date]) if date in activity_dfs_by_day else {}
        sleep_features = feature_eng.extract_sleep_features(sleep_dfs_by_day[date]) if date in sleep_dfs_by_day else {}
        
        combined_features_by_day[date] = feature_eng.combine_features(
            hrv_features, activity_features, sleep_features, training_load
        )
    
    # Step 3: Generate insights
    print("Step 3: Generating insights...")
    
    results = {
        "daily_insights": [],
        "focused_insights": []
    }
    
    # Generate daily insights
    if generate_daily:
        for date in recent_dates:
            features = combined_features_by_day[date]
            insight = generate_daily_insight(date, features, user_goals)
            results["daily_insights"].append(insight)
            print(f"Generated daily insight for {date}")
    
    # Generate focused insights
    if generate_focused:
        focus_areas = focus_areas or ["sleep", "recovery", "activity"]
        latest_date = recent_dates[-1]
        latest_features = combined_features_by_day[latest_date]
        
        for focus in focus_areas:
            # Get relevant user goal for this focus area
            goal = next((g['goal'] for g in user_goals['primary_goals'] if g['area'] == focus), None)
            
            # Build focused prompt
            prompt = prompt_builder.build_focused_prompt(
                latest_features,
                focus_area=focus,
                user_goal=goal
            )
            
            # Generate insight
            if api_key_set:
                insight, metadata = llm_engine.generate_insight(prompt)
            else:
                # Simulate insight if API key not set
                insight = f"[This is a simulated {focus}-focused insight. Set OPENAI_API_KEY to generate real insights.]"
                metadata = {"model": "simulation", "latency_seconds": 0, "total_tokens": 0}
            
            results["focused_insights"].append({
                "focus_area": focus,
                "prompt": prompt,
                "insight": insight,
                "metadata": metadata
            })
            
            print(f"Generated {focus}-focused insight")
    
    # Step 4: Save results
    print("Step 4: Saving results...")
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Save daily insights
    if generate_daily:
        daily_insights_path = os.path.join(output_dir, 'daily_insights.json')
        _write_insights_json(results["daily_insights"], daily_insights_path)
        print(f"Saved daily insights to {daily_insights_path}")
    
    # Save focused insights
    if generate_focused:
        focused_insights_path = os.path.join(output_dir, 'focused_insights.json')
        _write_insights_json(results["focused_insights"], focused_insights_path)
        print(f"Saved focused insights to {focused_insights_path}")
    
    print("Pipeline completed successfully!")
    return results

In [None]:
# Execute the full pipeline on the sample raw files: process the two most
# recent days and restrict focused insights to sleep and recovery.
sample_raw_files = ['sample_hrv_data.csv', 'sample_activity_sleep.json']
selected_focus_areas = ["sleep", "recovery"]

pipeline_results = generate_insights_pipeline(
    data_dir='../data/raw',
    raw_data_files=sample_raw_files,
    user_goals_file='user_goals.json',
    output_dir='../outputs',
    focus_areas=selected_focus_areas,
    days_to_process=2,
)

## Summary

In this notebook, we've demonstrated the complete end-to-end pipeline for generating insights from wearable data using LLMs:

1. **Data Loading**: We loaded the processed wearable data and user goals.

2. **Insight Generation**: We generated daily insights for recent days and focused insights for specific areas like sleep and recovery.

3. **Pipeline Implementation**: We implemented a reusable pipeline function that can process raw data files, extract features, generate insights, and save the results.

This pipeline can be integrated into a production system to automatically generate personalized insights from wearable data on a daily basis. The insights can be delivered to users through various channels, such as a mobile app, email, or a web dashboard.

In the next notebook, we'll explore how to implement an interactive agent that can respond to user queries about their wearable data.