# Wearable Data Preparation

This notebook demonstrates how to load, preprocess, and extract features from wearable device data. We'll use the sample data files provided in the `data/raw` directory to showcase the data pipeline.

In [None]:
# Import necessary libraries
import sys
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Add the src directory to the path so we can import our modules
sys.path.append('../')
from src.data_loader import DataLoader
from src.feature_engineer import FeatureEngineer

# Set up plotting.
# The matplotlib 'ggplot' style sheet is applied first and seaborn's
# "whitegrid" style second, so the seaborn call has the final say on any
# setting that both of them touch.
plt.style.use('ggplot')
sns.set(style="whitegrid")
# IPython magic: render figures inline, directly below the cell that draws them.
%matplotlib inline

## 1. Load Raw Data

First, we'll use our `DataLoader` class to load the sample data files.

In [None]:
# Initialize the data loader on the raw sample-data directory.
# The path is relative, so it is resolved against the kernel's working directory.
data_dir = '../data/raw'
loader = DataLoader(data_dir=data_dir)

# List available files so the file names used by the loading cells can be checked
available_files = loader.list_available_files()
print(f"Available data files: {available_files}")

In [None]:
# Load HRV data from the sample CSV through the project loader.
# NOTE(review): the result is used as a DataFrame below (.shape / .head()) —
# presumably load_file picks the parser from the file extension; confirm in DataLoader.
hrv_file = 'sample_hrv_data.csv'
hrv_data = loader.load_file(hrv_file)

# Print the dimensions, then display the first few rows as the cell output
print(f"HRV data shape: {hrv_data.shape}")
hrv_data.head()

In [None]:
# Read the combined activity/sleep export (JSON) through the project loader
activity_sleep_file = 'sample_activity_sleep.json'
activity_sleep_data = loader.load_file(activity_sleep_file)

# Report how many records each of the three top-level sections holds
print("Activity and Sleep data structure:")
print(
    f"- Number of activities: {len(activity_sleep_data['activities'])}",
    f"- Number of daily totals: {len(activity_sleep_data['daily_totals'])}",
    f"- Number of sleep sessions: {len(activity_sleep_data['sleep_sessions'])}",
    sep="\n",
)

In [None]:
# Load user goals (JSON) through the project loader
goals_file = 'user_goals.json'
user_goals = loader.load_file(goals_file)

# Display user goals.
# The profile is read as a mapping with 'name', 'age' and a 'primary_goals' list
# whose entries each carry 'area', 'goal' and 'priority'.
print(f"User: {user_goals['name']}, Age: {user_goals['age']}")
print("\nPrimary Goals:")
for goal in user_goals['primary_goals']:
    # .title() capitalises the area name for display only; the stored value is untouched
    print(f"- {goal['area'].title()}: {goal['goal']} (Priority: {goal['priority']})")

## 2. Normalize and Clean Data

Now we'll normalize and clean the data using our data loader's methods.

In [None]:
# Normalize HRV data.
# The second argument ('hrv') presumably selects the normalization rules for this
# data type — confirm against DataLoader.normalize_data.
normalized_hrv = loader.normalize_data(hrv_data, 'hrv')
print(f"Normalized HRV data shape: {normalized_hrv.shape}")
# Preview the normalized frame as the cell output
normalized_hrv.head()

In [None]:
# Clean HRV data (the cleaning rules live in DataLoader.clean_data)
cleaned_hrv = loader.clean_data(normalized_hrv)
# Compare this shape with the normalized shape printed earlier to see what cleaning removed
print(f"Cleaned HRV data shape: {cleaned_hrv.shape}")

# Check for missing values: per-column count of NaN entries left after cleaning
print("\nMissing values after cleaning:")
print(cleaned_hrv.isna().sum())

In [None]:
# Convert activities to DataFrame (one row per record in the 'activities' list)
activities_df = pd.DataFrame(activity_sleep_data['activities'])

# Normalize activity data.
# The 'activity' tag presumably selects activity-specific rules — confirm in DataLoader.
normalized_activities = loader.normalize_data(activities_df, 'activity')
print(f"Normalized activities data shape: {normalized_activities.shape}")
# Preview the normalized frame as the cell output
normalized_activities.head()

In [None]:
# Convert sleep sessions to DataFrame (one row per record in the 'sleep_sessions' list)
sleep_df = pd.DataFrame(activity_sleep_data['sleep_sessions'])

# Normalize sleep data.
# The 'sleep' tag presumably selects sleep-specific rules — confirm in DataLoader.
normalized_sleep = loader.normalize_data(sleep_df, 'sleep')
print(f"Normalized sleep data shape: {normalized_sleep.shape}")
# Preview the normalized frame as the cell output
normalized_sleep.head()

## 3. Segment Data by Day

Next, we'll segment the data by day to prepare for feature extraction.

In [None]:
# Segment HRV data by day.
# The result is used as a mapping below: one entry per day, each value a
# per-day frame that supports .head().
daily_hrv = loader.segment_by_day(cleaned_hrv)
print(f"Number of days in HRV data: {len(daily_hrv)}")

# Display the first day's data.
# "First" means the first key in the mapping's iteration order; this raises
# IndexError if segmentation produced no days.
first_day = list(daily_hrv.keys())[0]
print(f"\nHRV data for {first_day}:")
daily_hrv[first_day].head()

In [None]:
# Bucket the raw activity records by their 'date' field
# (each record in the source JSON already carries the day it belongs to)
activities_by_date = {}
for activity in activity_sleep_data['activities']:
    activities_by_date.setdefault(activity['date'], []).append(activity)

# Report the number of distinct days and the record count for the first one
print(f"Number of days in activity data: {len(activities_by_date)}")
print(f"Activities on {list(activities_by_date.keys())[0]}: {len(activities_by_date[list(activities_by_date.keys())[0]])}")

In [None]:
# Bucket the raw sleep sessions by their 'date' field
# (each session in the source JSON already carries the day it belongs to)
sleep_by_date = {}
for sleep in activity_sleep_data['sleep_sessions']:
    sleep_by_date.setdefault(sleep['date'], []).append(sleep)

print(f"Number of days in sleep data: {len(sleep_by_date)}")

## 4. Extract Features

Now we'll use our `FeatureEngineer` class to extract meaningful features from the data.

In [None]:
# Initialize feature engineer with its default settings; the same instance is
# reused by every extraction cell in this section.
feature_eng = FeatureEngineer()

In [None]:
# Run the HRV feature extractor once per daily segment
hrv_features_by_day = {
    date: feature_eng.extract_hrv_features(df)
    for date, df in daily_hrv.items()
}

# Show every feature computed for the first day in the mapping
# (the two-decimal format expects each feature value to be numeric)
first_day = list(hrv_features_by_day)[0]
print(f"HRV features for {first_day}:")
for feature, value in hrv_features_by_day[first_day].items():
    print(f"- {feature}: {value:.2f}")

In [None]:
# Detect stress windows separately within each day's HRV segment
stress_windows = {
    date: feature_eng.extract_stress_windows(df)
    for date, df in daily_hrv.items()
}

# Walk through whatever was detected on the first day
first_day = list(stress_windows)[0]
print(f"Stress windows for {first_day}:")
if not stress_windows[first_day]:
    print("No significant stress windows detected.")
else:
    # Windows are numbered from 1 for display
    for i, window in enumerate(stress_windows[first_day]):
        print(f"Window {i+1}:")
        print(f"- Start: {window['start_time']}")
        print(f"- End: {window['end_time']}")
        print(f"- Duration: {window['duration_min']:.1f} minutes")
        print(f"- Avg RMSSD: {window['avg_rmssd']:.1f}")

In [None]:
# One DataFrame per day, built from that day's grouped activity records.
# NOTE(review): these frames come from `activities_by_date`, not from the
# `normalized_activities` frame computed earlier — confirm that is intended.
activity_dfs_by_day = {
    date: pd.DataFrame(activities)
    for date, activities in activities_by_date.items()
}

# Per-day activity features
activity_features_by_day = {
    date: feature_eng.extract_activity_features(df)
    for date, df in activity_dfs_by_day.items()
}

# Show the features for the first day; values are printed as-is, unformatted
first_day = list(activity_features_by_day)[0]
print(f"Activity features for {first_day}:")
for feature, value in activity_features_by_day[first_day].items():
    print(f"- {feature}: {value}")

In [None]:
# One DataFrame per day, built from that day's grouped sleep sessions.
# NOTE(review): these frames come from `sleep_by_date`, not from the
# `normalized_sleep` frame computed earlier — confirm that is intended.
sleep_dfs_by_day = {
    date: pd.DataFrame(sleeps)
    for date, sleeps in sleep_by_date.items()
}

# Per-day sleep features
sleep_features_by_day = {
    date: feature_eng.extract_sleep_features(df)
    for date, df in sleep_dfs_by_day.items()
}

# Show the features for the first day; only plain int/float values get the
# two-decimal format, anything else is printed unchanged
first_day = list(sleep_features_by_day)[0]
print(f"Sleep features for {first_day}:")
for feature, value in sleep_features_by_day[first_day].items():
    if not isinstance(value, (int, float)):
        print(f"- {feature}: {value}")
    else:
        print(f"- {feature}: {value:.2f}")

In [None]:
# Training load is computed once over the whole series of per-day activity
# frames (in the mapping's iteration order), not day by day
activity_dfs = [*activity_dfs_by_day.values()]
training_load = feature_eng.calculate_training_load(activity_dfs)

# Report every metric with two decimals
print("Training load metrics:")
for metric, value in training_load.items():
    print(f"- {metric}: {value:.2f}")

## 5. Combine Features

Now we'll combine all the features into a single dictionary for each day.

In [None]:
def _date_key(day):
    """Return `day` as a string key so the same day compares equal across sources.

    Strings pass through untouched; any other key (e.g. a `datetime.date` or
    `pd.Timestamp`) is rendered as an ISO `YYYY-MM-DD` string.
    """
    return day if isinstance(day, str) else pd.Timestamp(day).date().isoformat()


# Re-key each per-day mapping with string dates. The activity and sleep keys
# come straight from JSON, while the HRV keys come from the loader's day
# segmentation and may be date objects; a date object never equals its string
# form, so without this step the same day would not merge across sources.
hrv_by_key = {_date_key(day): feats for day, feats in hrv_features_by_day.items()}
activity_by_key = {_date_key(day): feats for day, feats in activity_features_by_day.items()}
sleep_by_key = {_date_key(day): feats for day, feats in sleep_features_by_day.items()}

# Get all unique dates, sorted so the iteration order (and therefore the
# "first day" shown below) is the same on every run
all_dates = sorted(set(hrv_by_key) | set(activity_by_key) | set(sleep_by_key))

# Combine features for each day; a source with no data for that day contributes
# an empty dict. The training load is the same object for every day.
combined_features_by_day = {}
for date in all_dates:
    combined_features_by_day[date] = feature_eng.combine_features(
        hrv_by_key.get(date, {}),
        activity_by_key.get(date, {}),
        sleep_by_key.get(date, {}),
        training_load,
    )

# A few key features to highlight, when present
key_features = [
    'hrv_rmssd_mean', 'activity_total_steps', 'sleep_total_sleep_hours', 'recovery_score'
]

# Display combined features for the first (earliest-sorting) day
if all_dates:
    first_day = all_dates[0]
    print(f"Combined features for {first_day}:")
    print(f"Total features: {len(combined_features_by_day[first_day])}")

    for feature in key_features:
        if feature in combined_features_by_day[first_day]:
            value = combined_features_by_day[first_day][feature]
            # Only plain numbers take the two-decimal format; anything else
            # (e.g. a missing score) is printed as-is rather than raising
            if isinstance(value, (int, float)):
                print(f"- {feature}: {value:.2f}")
            else:
                print(f"- {feature}: {value}")
else:
    print("No per-day features are available to combine.")

## 6. Visualize Key Metrics

Let's visualize some key metrics to better understand the data.

In [None]:
# Create a DataFrame with one row per day and one column per feature.
# Building it row-wise keeps a dtype per column; going through
# `pd.DataFrame(...).T` upcasts every column to `object` as soon as a single
# feature is non-numeric, and plotting an object column then fails.
metrics_df = pd.DataFrame.from_dict(combined_features_by_day, orient='index')
metrics_df.index = pd.to_datetime(metrics_df.index)
metrics_df = metrics_df.sort_index()

# Select key metrics to plot
key_metrics = [
    'hrv_rmssd_mean', 'hrv_rmssd_min', 'hrv_rmssd_max',
    'activity_total_steps', 'activity_active_minutes',
    'sleep_total_sleep_hours', 'recovery_score'
]

# Filter metrics that exist in the DataFrame
available_metrics = [m for m in key_metrics if m in metrics_df.columns]

# Coerce to numeric so a stray None/string becomes NaN (a gap in the line)
# instead of making the plot call raise
plot_df = metrics_df[available_metrics].apply(pd.to_numeric, errors='coerce')

# Plot one stacked panel per metric
if available_metrics:
    # squeeze=False always returns a 2-D array of axes, so the single-metric
    # case needs no special handling
    fig, axes = plt.subplots(
        len(available_metrics), 1,
        figsize=(12, 3 * len(available_metrics)),
        squeeze=False,
    )
    for ax, metric in zip(axes[:, 0], available_metrics):
        plot_df[metric].plot(ax=ax, marker='o')
        ax.set_title(metric.replace('_', ' ').title())
        ax.set_xlabel('Date')
        ax.grid(True)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots(0, 1) would raise, so report the situation instead
    print("None of the key metrics are present in the combined features; nothing to plot.")

## 7. Save Processed Data

Finally, let's save the processed data for use in subsequent notebooks.

In [None]:
def _json_default(obj):
    """Convert values the `json` module cannot encode natively.

    NumPy scalars become plain Python scalars, NumPy arrays become lists, and
    date-like objects (anything exposing `isoformat`) become ISO strings.

    Raises:
        TypeError: for any other unsupported type, as `json` itself would.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if hasattr(obj, 'isoformat'):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Create processed data directory if it doesn't exist
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Convert dates to strings for JSON serialization (JSON object keys must be strings)
serializable_features = {str(date): features for date, features in combined_features_by_day.items()}

# Serialize to strings BEFORE opening the target files: opening with 'w'
# truncates immediately, so a serialization error mid-dump would otherwise
# leave an empty or half-written file behind.
features_json = json.dumps(serializable_features, indent=2, default=_json_default)
goals_json = json.dumps(user_goals, indent=2, default=_json_default)

# Save combined features
with open(os.path.join(processed_dir, 'combined_features.json'), 'w') as f:
    f.write(features_json)

# Save user goals alongside the features so later notebooks can read both from one place
with open(os.path.join(processed_dir, 'user_goals.json'), 'w') as f:
    f.write(goals_json)

print(f"Saved processed data to {processed_dir}")

## Summary

In this notebook, we've demonstrated the complete data preparation pipeline:

1. Loading raw data from CSV and JSON files
2. Normalizing and cleaning the data
3. Segmenting data by day
4. Extracting meaningful features from HRV, activity, and sleep data
5. Calculating training load metrics
6. Combining all features into a comprehensive dataset
7. Visualizing key metrics
8. Saving processed data for use in insight generation

This processed data will be used in subsequent notebooks to generate personalized insights using LLMs.