# DiaTrend Data Preparation

This notebook demonstrates how to load, process, and extract features from the DiaTrend dataset, which contains real-world wearable time-series data including glucose levels, insulin doses, timestamps, and user logs.

In [None]:
# Import necessary libraries
import sys
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# Put the parent directory (resolved relative to the current working directory)
# on the import path so the `src` package can be imported. This has to run
# before the two `from src...` imports below.
sys.path.append('../')
from src.data_loader import DataLoader
from src.feature_engineer import FeatureEngineer

# Plot configuration: render figures inline, switch to the 'ggplot' style sheet,
# and make 12x6 inches the default size for figures that don't pass their own figsize.
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Download and Load the DiaTrend Dataset

First, we'll download the DiaTrend dataset from Zenodo and load it into a pandas DataFrame.

In [None]:
# Build the loader, fetch the dataset, then parse what was fetched into a DataFrame.
data_loader = DataLoader()
diatrend_file = data_loader.download_diatrend_dataset()
diatrend_df = data_loader.load_diatrend_data(diatrend_file)

# Headline numbers: row count, covered time span, and number of distinct days.
print(f"DiaTrend dataset loaded with {len(diatrend_df)} records")
print(f"Time range: {diatrend_df['Time'].min()} to {diatrend_df['Time'].max()}")
print(f"Number of unique days: {diatrend_df['Day'].nunique()}")

# One bullet per column name, emitted by a single print call.
print("\nDataset columns:")
print(*(f"- {col}" for col in diatrend_df.columns), sep="\n")

# Leave the preview as the last expression so the notebook renders it as a table.
diatrend_df.head()

## 2. Explore the Dataset

Let's explore the dataset to understand its structure and content.

In [None]:
# Summary statistics for the two numeric signals.
print("Glucose Level Statistics:")
print(diatrend_df['GlucoseLevel'].describe())

print("\nInsulin Dose Statistics:")
print(diatrend_df['InsulinDose'].describe())

# Count non-null values in each column
print("\nNon-null values per column:")
print(diatrend_df.count())

# Share of records that carry a comment. The count is converted to a plain int
# and the percentage is guarded so an empty frame reports 0.00% instead of nan.
has_comment = diatrend_df['Comment'].notna()
comment_count = int(has_comment.sum())
total_records = len(diatrend_df)
comment_pct = comment_count / total_records * 100 if total_records else 0.0
print(f"\nNumber of entries with comments: {comment_count} ({comment_pct:.2f}%)")

# Show up to five comments. The fixed random_state makes the selection
# reproducible, so Restart & Run All prints the same comments every time.
if comment_count > 0:
    print("\nSample comments:")
    sample_comments = (
        diatrend_df.loc[has_comment, 'Comment']
        .sample(min(5, comment_count), random_state=42)
        .tolist()
    )
    for i, comment in enumerate(sample_comments, 1):
        print(f"{i}. {comment}")

## 3. Segment Data by Day

Now, let's segment the data by day to analyze daily patterns.

In [None]:
# Split the full frame into per-day pieces.
daily_data = data_loader.segment_diatrend_by_day(diatrend_df)

print(f"Segmented data into {len(daily_data)} days")

# The first key of the mapping serves as the worked example for the rest of the notebook.
sample_day = list(daily_data)[0]
sample_df = daily_data[sample_day]

print(f"\nSample day: {sample_day}")
print(f"Number of records: {len(sample_df)}")
print(f"Glucose readings: {sample_df['GlucoseLevel'].notna().sum()}")
print(f"Insulin doses: {sample_df['InsulinDose'].notna().sum()}")
print(f"Comments: {sample_df['Comment'].notna().sum()}")

# Two stacked panels sharing the time axis: glucose on top, insulin below.
fig = plt.figure(figsize=(14, 8))

# Top panel: glucose trace with the 70-180 band shaded.
ax1 = fig.add_subplot(211)
glucose_data = sample_df.dropna(subset=['GlucoseLevel'])
ax1.plot(glucose_data['Time'], glucose_data['GlucoseLevel'], 'o-', color='blue', label='Glucose')
ax1.set(ylabel='Glucose (mg/dL)', title=f'Glucose Levels for {sample_day}')
ax1.axhspan(70, 180, alpha=0.2, color='green', label='Target Range')
ax1.legend()

# Bottom panel: one stem per recorded insulin dose.
ax2 = fig.add_subplot(212, sharex=ax1)
insulin_data = sample_df.dropna(subset=['InsulinDose'])
ax2.stem(insulin_data['Time'], insulin_data['InsulinDose'], 'r-', label='Insulin')
ax2.set(ylabel='Insulin (units)', xlabel='Time', title=f'Insulin Doses for {sample_day}')
ax2.legend()

fig.tight_layout()
plt.show()

## 4. Extract Features from Daily Data

Now, let's extract meaningful features from the daily data using our feature engineering module.

In [None]:
# Build the feature extractor once and reuse it for every day.
feature_engineer = FeatureEngineer()

# Map each day key to the features extracted from that day's frame.
daily_features = {
    day_key: feature_engineer.extract_diatrend_features(day_frame)
    for day_key, day_frame in daily_data.items()
}

print(f"Extracted features for {len(daily_features)} days")

# Show the extracted features for the example day, one per line.
print(f"\nFeatures for {sample_day}:")
sample_day_features = daily_features[sample_day]
for feature, value in sample_day_features.items():
    print(f"- {feature}: {value}")

## 5. Analyze Glucose Volatility

Let's analyze glucose volatility in more detail.

In [None]:
# Calculate glucose volatility for the sample day
volatility_features = feature_engineer.calculate_glucose_volatility(sample_df)

print("Glucose Volatility Features:")
for feature, value in volatility_features.items():
    print(f"- {feature}: {value}")

# Rate of change between consecutive glucose readings, in mg/dL per minute.
glucose_df = sample_df[sample_df['GlucoseLevel'].notna()].copy()
glucose_df = glucose_df.sort_values('Time')
glucose_df['time_diff'] = glucose_df['Time'].diff().dt.total_seconds() / 60  # in minutes
glucose_df['glucose_diff'] = glucose_df['GlucoseLevel'].diff()
glucose_df['rate_of_change'] = glucose_df['glucose_diff'] / glucose_df['time_diff']

# Keep only rows with a finite rate. The first row is always NaN (no predecessor),
# and readings that share a timestamp give a zero time delta, hence inf or NaN.
# Timestamps and rates are taken through the same mask so x and y always have the
# same length; slicing the timestamps with [1:] would only drop the first row.
is_valid_roc = glucose_df['rate_of_change'].replace([np.inf, -np.inf], np.nan).notna()
valid_roc = glucose_df.loc[is_valid_roc, 'rate_of_change']
valid_roc_times = glucose_df.loc[is_valid_roc, 'Time']

plt.figure(figsize=(14, 6))
plt.plot(valid_roc_times, valid_roc, 'o-', color='purple')
plt.axhline(y=0, color='gray', linestyle='--')
plt.ylabel('Rate of Change (mg/dL per minute)')
plt.title(f'Glucose Rate of Change for {sample_day}')
plt.grid(True)
plt.show()

## 6. Analyze Meal Responses

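Let's identify potential meal times, using meal-related comments and insulin doses as indicators, and plot the glucose readings in the two hours before and after each one.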

In [None]:
# Extract meal-related features
meal_features = feature_engineer.extract_meal_related_features(sample_df)

print("Meal-Related Features:")
for feature, value in meal_features.items():
    print(f"- {feature}: {value}")

# Tunables for the plots below.
MAX_MEAL_PLOTS = 3                    # number of panels to draw, kept small for clarity
MEAL_WINDOW = pd.Timedelta(hours=2)   # how far before/after a meal to look at glucose
MIN_WINDOW_READINGS = 2               # a line needs at least two points

# Identify potential meal times from keywords in the comments.
meal_indicators = ['meal', 'breakfast', 'lunch', 'dinner', 'eating', 'food', 'carbs', 'snack']
meal_pattern = '|'.join(meal_indicators)
meal_rows = sample_df[sample_df['Comment'].str.lower().str.contains(meal_pattern, na=False)]

# Also consider insulin doses as potential meal indicators.
# (NaN > 0 evaluates to False, so missing doses are excluded by the comparison itself.)
insulin_rows = sample_df[sample_df['InsulinDose'] > 0]
meal_rows = pd.concat([meal_rows, insulin_rows]).drop_duplicates()

# First pass: gather the candidates that have enough glucose readings around them.
# Only these count towards the panel limit, so a skipped candidate no longer uses
# up a subplot slot or leaves a gap in the figure.
glucose_readings = sample_df[sample_df['GlucoseLevel'].notna()]
plottable_meals = []
for _, meal_row in meal_rows.iterrows():
    meal_time = meal_row['Time']
    # Series.between is inclusive on both ends by default.
    in_window = glucose_readings['Time'].between(meal_time - MEAL_WINDOW, meal_time + MEAL_WINDOW)
    meal_window = glucose_readings[in_window]
    if len(meal_window) < MIN_WINDOW_READINGS:
        continue
    plottable_meals.append((meal_row, meal_window))
    if len(plottable_meals) == MAX_MEAL_PLOTS:
        break

# Second pass: draw one panel per collected meal.
if plottable_meals:
    fig = plt.figure(figsize=(14, 10))

    for panel, (meal_row, meal_window) in enumerate(plottable_meals, 1):
        meal_time = meal_row['Time']
        ax = fig.add_subplot(MAX_MEAL_PLOTS, 1, panel)

        # Minutes relative to the meal, kept as a separate Series rather than
        # written into the filtered slice (avoids SettingWithCopyWarning).
        minutes = (meal_window['Time'] - meal_time).dt.total_seconds() / 60

        # Plot glucose
        ax.plot(minutes, meal_window['GlucoseLevel'], 'o-', color='blue')

        # Add meal time marker
        ax.axvline(x=0, color='red', linestyle='--', label='Meal Time')

        # Title carries the dose and the comment when the row has them.
        comment = meal_row.get('Comment', '')
        insulin = meal_row.get('InsulinDose', 0)
        title = f"Meal at {meal_time.strftime('%H:%M')}"
        if pd.notna(insulin) and insulin > 0:
            title += f" (Insulin: {insulin} units)"
        if pd.notna(comment) and comment:
            title += f"\nComment: {comment}"

        ax.set_title(title)
        ax.set_xlabel('Minutes relative to meal')
        ax.set_ylabel('Glucose (mg/dL)')
        ax.grid(True)

    fig.tight_layout()
    plt.show()
elif len(meal_rows) > 0:
    print("Potential meal times were found, but none had enough glucose readings nearby to plot.")
else:
    print("No clear meal times identified for this day.")

## 7. Analyze Comment Sentiment

Let's analyze the sentiment and tags in user comments.

In [None]:
# Run the comment analysis on the example day's frame.
comment_features = feature_engineer.analyze_comment_sentiment(sample_df)

print("Comment Analysis Features:")
for feature, value in comment_features.items():
    print(f"- {feature}: {value}")

# List every row that has a comment, prefixed with its HH:MM time.
comments = sample_df.dropna(subset=['Comment'])
if not comments.empty:
    print("\nAll comments for the day:")
    for row in comments.to_dict('records'):
        print(f"[{row['Time'].strftime('%H:%M')}] {row['Comment']}")

## 8. Save Processed Data

Finally, let's save the processed data and extracted features for use in other notebooks.

In [None]:
# Create processed directory if it doesn't exist
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

# Build each output path once and reuse it for writing and for the final report.
features_path = os.path.join(processed_dir, 'diatrend_daily_features.json')
goals_path = os.path.join(processed_dir, 'diatrend_user_goals.json')


def _json_default(obj):
    """Convert a value the json module cannot encode into a plain Python equivalent.

    Handles NumPy scalars and arrays, and any object exposing ``isoformat()``
    (dates, datetimes, pandas Timestamps). Raises TypeError for anything else,
    which is the contract json expects from a ``default`` hook.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if hasattr(obj, 'isoformat'):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Save daily features.
# JSON object keys must be str/int/float/bool/None, and the `default` hook is not
# applied to keys, so day keys are stringified up front (a no-op for str keys).
serializable_features = {str(day): features for day, features in daily_features.items()}
with open(features_path, 'w') as f:
    json.dump(serializable_features, f, indent=2, default=_json_default)

# Create sample user goals
user_goals = {
    "name": "Alex",
    "primary_goals": [
        {"area": "glucose", "goal": "Reduce post-meal glucose spikes"},
        {"area": "insulin", "goal": "Optimize insulin timing for better glucose control"},
        {"area": "lifestyle", "goal": "Understand how exercise affects glucose levels"}
    ],
    "diabetes_type": "Type 1",
    "target_glucose_range": "70-180 mg/dL"
}

# Save user goals
with open(goals_path, 'w') as f:
    json.dump(user_goals, f, indent=2)

print("Saved processed data and features to:")
print(f"- {features_path}")
print(f"- {goals_path}")

## 9. Summary

In this notebook, we've demonstrated how to:

1. Download and load the DiaTrend dataset
2. Explore the dataset structure and content
3. Segment the data by day
4. Extract meaningful features from daily data
5. Analyze glucose volatility
6. Identify and analyze meal responses
7. Analyze comment sentiment and tags
8. Save processed data for use in other notebooks

These processed features will be used in subsequent notebooks to generate personalized insights using LLMs.