In [1]:
import numpy as np
import pandas as pd
import os
from scipy import stats, signal
from datetime import timedelta


In [2]:
sensor_dtype = np.dtype([
    ("x", ">f4"),
    ("y", ">f4"),
    ("z", ">f4"),
    ("time", ">i8"),
])


def load_raw_sensor_data(path):
    """Load and process sensor data from binary files"""
    acc_files = sorted([f for f in os.listdir(path) if f.endswith('.bin') and 'accelerometer' in f])
    gyro_files = sorted([f for f in os.listdir(path) if f.endswith('.bin') and 'gyroscope' in f])

    all_acc_data = []
    all_gyro_data = []

    # Load accelerometer data
    for file in acc_files:
        boot_time_nanos = int(file.split("_")[0]) * 1e6
        file_path = os.path.join(path, file)
        acc_data = np.fromfile(file_path, dtype=sensor_dtype)
        first_event_time = acc_data['time'][0]
        corrected_timestamps = ((acc_data['time'] - first_event_time) + boot_time_nanos) / 1e9
        corrected_datetimes = pd.to_datetime(corrected_timestamps, unit='s')
        df = pd.DataFrame(acc_data[["x", "y", "z"]].byteswap().newbyteorder())
        df['time'] = corrected_datetimes
        all_acc_data.append(df)

    # Load gyroscope data
    for file in gyro_files:
        boot_time_nanos = int(file.split("_")[0]) * 1e6
        file_path = os.path.join(path, file)
        gyro_data = np.fromfile(file_path, dtype=sensor_dtype)
        first_event_time = gyro_data['time'][0]
        corrected_timestamps = ((gyro_data['time'] - first_event_time) + boot_time_nanos) / 1e9
        corrected_datetimes = pd.to_datetime(corrected_timestamps, unit='s')
        df = pd.DataFrame(gyro_data[["x", "y", "z"]].byteswap().newbyteorder())
        df['time'] = corrected_datetimes
        all_gyro_data.append(df)

    return pd.concat(all_acc_data), pd.concat(all_gyro_data)


def sync_sensors(acc_data, gyro_data):
    """Synchronize accelerometer and gyroscope data"""
    common_start_time = max(acc_data['time'].min(), gyro_data['time'].min())
    common_end_time = min(acc_data['time'].max(), gyro_data['time'].max())

    acc_synced = acc_data[(acc_data['time'] >= common_start_time) & (acc_data['time'] <= common_end_time)]
    gyro_synced = gyro_data[(gyro_data['time'] >= common_start_time) & (gyro_data['time'] <= common_end_time)]

    return acc_synced.reset_index(drop=True), gyro_synced.reset_index(drop=True)


In [3]:
def analyze_session(path):
    """Analyze a single session's IMU data"""
    # Load and sync the data
    acc_data, gyro_data = load_raw_sensor_data(path)
    acc_synced, gyro_synced = sync_sensors(acc_data, gyro_data)
    
    # Calculate session duration
    duration = (acc_synced['time'].max() - acc_synced['time'].min()).total_seconds()
    
    # Calculate temporal characteristics
    time_diffs_acc = acc_synced['time'].diff()[1:].dt.total_seconds()
    time_diffs_gyro = gyro_synced['time'].diff()[1:].dt.total_seconds()
    
    actual_rate_acc = 1 / time_diffs_acc.mean()
    rate_std_acc = np.std(1 / time_diffs_acc)
    
    # Calculate temporal offset between sensors
    temporal_offset = (acc_synced['time'] - gyro_synced['time']).dt.total_seconds() * 1000  # in milliseconds
    
    # Calculate signal characteristics for both sensors
    acc_stats = calculate_signal_stats(acc_synced)
    gyro_stats = calculate_signal_stats(gyro_synced)
    
    # Calculate spectral characteristics
    acc_spectral = calculate_spectral_components(acc_synced)
    gyro_spectral = calculate_spectral_components(gyro_synced)
    
    # Calculate data quality metrics
    expected_samples = int(duration * actual_rate_acc)
    missing_samples_pct = (1 - len(acc_synced) / expected_samples) * 100
    
    # Sample interval confidence
    interval_ci = stats.norm.interval(0.95, 
                                    loc=time_diffs_acc.mean() * 1000,  # convert to ms
                                    scale=time_diffs_acc.std() * 1000)
    
    return {
        'duration_hours': duration / 3600,
        'temporal_chars': {
            'mean_sampling_rate': actual_rate_acc,
            'sampling_rate_std': rate_std_acc,
            'temporal_offset_mean': temporal_offset.mean(),
            'temporal_offset_std': temporal_offset.std()
        },
        'acc_stats': acc_stats,
        'gyro_stats': gyro_stats,
        'acc_spectral': acc_spectral,
        'gyro_spectral': gyro_spectral,
        'data_quality': {
            'missing_samples_pct': missing_samples_pct,
            'timestamp_continuity': (time_diffs_acc < 0.02).mean() * 100,  # % of intervals < 20ms
            'interval_ci': interval_ci
        }
    }

def calculate_signal_stats(data):
    """Calculate comprehensive signal statistics"""
    stats_dict = {}
    for axis in ['x', 'y', 'z']:
        signal_data = data[axis]
        stats_dict[axis] = {
            'range': [signal_data.min(), signal_data.max()],
            'mean': signal_data.mean(),
            'std': signal_data.std(),
            'skewness': stats.skew(signal_data),
            'kurtosis': stats.kurtosis(signal_data),
            'snr': calculate_snr(signal_data)
        }
    return stats_dict

def calculate_snr(signal_data):
    """Calculate Signal-to-Noise Ratio"""
    f, psd = signal.welch(signal_data, fs=100)  # Assuming 100Hz sampling rate
    signal_power = np.sum(psd[f < 10])  # Signal components below 10Hz
    noise_power = np.sum(psd[f >= 10])  # Noise components above 10Hz
    return 10 * np.log10(signal_power / noise_power) if noise_power > 0 else 0

def calculate_spectral_components(data):
    """Calculate spectral components in three frequency bands"""
    spectral_stats = {}
    for axis in ['x', 'y', 'z']:
        f, psd = signal.welch(data[axis], fs=100)
        total_power = np.sum(psd)
        
        # Calculate power in each frequency band
        low_freq = np.sum(psd[(f >= 0.1) & (f < 3)]) / total_power * 100
        mid_freq = np.sum(psd[(f >= 3) & (f < 10)]) / total_power * 100
        high_freq = np.sum(psd[f >= 10]) / total_power * 100
        
        spectral_stats[axis] = {
            'low_freq': low_freq,
            'mid_freq': mid_freq,
            'high_freq': high_freq
        }
    return spectral_stats

def analyze_dataset(base_path):
    """Analyze all sessions in the dataset"""
    session_dirs = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]
    all_sessions = []
    
    for session_dir in session_dirs:
        session_path = os.path.join(base_path, session_dir)
        session_stats = analyze_session(session_path)
        all_sessions.append(session_stats)
    
    # Calculate dataset-wide statistics
    durations = [s['duration_hours'] for s in all_sessions]
    sampling_rates = [s['temporal_chars']['mean_sampling_rate'] for s in all_sessions]
    
    summary = {
        'total_sessions': len(all_sessions),
        'total_duration': sum(durations),
        'avg_session_duration': np.mean(durations) * 60,  # in minutes
        'session_duration_std': np.std(durations) * 60,  # in minutes
        'avg_sampling_rate': np.mean(sampling_rates),
        'total_samples': sum(int(d * 3600 * r) for d, r in zip(durations, sampling_rates))
    }
    
    return summary, all_sessions

In [4]:
# Usage
if __name__ == "__main__":
    dataset_path = "../data/raw"
    summary, sessions = analyze_dataset(dataset_path)
    
    print("\nDataset Summary:")
    print(f"Total Sessions: {summary['total_sessions']}")
    print(f"Total Duration: {summary['total_duration']:.1f} hours")
    print(f"Average Session Duration: {summary['avg_session_duration']:.1f} minutes")
    print(f"Session Duration Std: {summary['session_duration_std']:.1f} minutes")
    print(f"Average Sampling Rate: {summary['avg_sampling_rate']:.2f} Hz")
    print(f"Total Samples: {summary['total_samples']:,}")

    print("\nDetailed Signal Statistics:")
    print("\nAccelerometer Statistics:")
    for session_idx, session in enumerate(sessions):
        print(f"\nSession {session_idx + 1}:")
        acc_stats = session['acc_stats']
        for axis in ['x', 'y', 'z']:
            print(f"\n{axis}-axis:")
            print(f"Range: [{acc_stats[axis]['range'][0]:.1f}, {acc_stats[axis]['range'][1]:.1f}] m/s²")
            print(f"Mean: {acc_stats[axis]['mean']:.2f} m/s²")
            print(f"Std: {acc_stats[axis]['std']:.2f}")
            print(f"Skewness: {acc_stats[axis]['skewness']:.2f}")
            print(f"Kurtosis: {acc_stats[axis]['kurtosis']:.2f}")
            print(f"SNR: {acc_stats[axis]['snr']:.1f} dB")

    print("\nGyroscope Statistics:")
    for session_idx, session in enumerate(sessions):
        print(f"\nSession {session_idx + 1}:")
        gyro_stats = session['gyro_stats']
        for axis in ['x', 'y', 'z']:
            print(f"\n{axis}-axis:")
            print(f"Range: [{gyro_stats[axis]['range'][0]:.1f}, {gyro_stats[axis]['range'][1]:.1f}] rad/s")
            print(f"Mean: {gyro_stats[axis]['mean']:.2f} rad/s")
            print(f"Std: {gyro_stats[axis]['std']:.2f}")
            print(f"Skewness: {gyro_stats[axis]['skewness']:.2f}")
            print(f"Kurtosis: {gyro_stats[axis]['kurtosis']:.2f}")
            print(f"SNR: {gyro_stats[axis]['snr']:.1f} dB")

    print("\nSpectral Components:")
    for session_idx, session in enumerate(sessions):
        print(f"\nSession {session_idx + 1}:")
        
        print("\nAccelerometer Spectral Components:")
        for axis in ['x', 'y', 'z']:
            print(f"\nSpectral components for {axis}-axis:")
            print(f"Low-frequency (0.1-3 Hz): {session['acc_spectral'][axis]['low_freq']:.1f}%")
            print(f"Mid-frequency (3-10 Hz): {session['acc_spectral'][axis]['mid_freq']:.1f}%")
            print(f"High-frequency (>10 Hz): {session['acc_spectral'][axis]['high_freq']:.1f}%")
        
        print("\nGyroscope Spectral Components:")
        for axis in ['x', 'y', 'z']:
            print(f"\nSpectral components for {axis}-axis:")
            print(f"Low-frequency (0.1-3 Hz): {session['gyro_spectral'][axis]['low_freq']:.1f}%")
            print(f"Mid-frequency (3-10 Hz): {session['gyro_spectral'][axis]['mid_freq']:.1f}%")
            print(f"High-frequency (>10 Hz): {session['gyro_spectral'][axis]['high_freq']:.1f}%")

    print("\nTemporal Characteristics:")
    for session_idx, session in enumerate(sessions):
        print(f"\nSession {session_idx + 1}:")
        temporal = session['temporal_chars']
        print(f"Mean Sampling Rate: {temporal['mean_sampling_rate']:.2f} Hz")
        print(f"Sampling Rate Std: {temporal['sampling_rate_std']:.2f} Hz")
        print(f"Mean Temporal Offset: {temporal['temporal_offset_mean']:.2f} ms")
        print(f"Temporal Offset Std: {temporal['temporal_offset_std']:.2f} ms")

    print("\nData Quality Metrics:")
    for session_idx, session in enumerate(sessions):
        print(f"\nSession {session_idx + 1}:")
        quality = session['data_quality']
        print(f"Missing Samples: {quality['missing_samples_pct']:.2f}%")
        print(f"Timestamp Continuity: {quality['timestamp_continuity']:.2f}%")
        print(f"Sample Interval 95% CI: [{quality['interval_ci'][0]:.1f}, {quality['interval_ci'][1]:.1f}] ms")


Dataset Summary:
Total Sessions: 26
Total Duration: 9.6 hours
Average Session Duration: 22.2 minutes
Session Duration Std: 10.9 minutes
Average Sampling Rate: 51.84 Hz
Total Samples: 1,800,366

Detailed Signal Statistics:

Accelerometer Statistics:

Session 1:

x-axis:
Range: [-17.7, 36.4] m/s²
Mean: 0.07 m/s²
Std: 4.51
Skewness: 0.45
Kurtosis: -0.66
SNR: 12.9 dB

y-axis:
Range: [-44.2, 13.9] m/s²
Mean: -7.18 m/s²
Std: 2.89
Skewness: 0.74
Kurtosis: 4.63
SNR: 9.2 dB

z-axis:
Range: [-14.7, 34.2] m/s²
Mean: 2.22 m/s²
Std: 4.23
Skewness: -0.03
Kurtosis: -0.27
SNR: 12.8 dB

Session 2:

x-axis:
Range: [-36.1, 16.9] m/s²
Mean: -1.15 m/s²
Std: 6.33
Skewness: -0.34
Kurtosis: -0.61
SNR: 14.8 dB

y-axis:
Range: [-49.1, 35.8] m/s²
Mean: -5.39 m/s²
Std: 3.90
Skewness: 0.12
Kurtosis: 0.28
SNR: 9.1 dB

z-axis:
Range: [-21.6, 25.7] m/s²
Mean: 2.74 m/s²
Std: 3.79
Skewness: -0.18
Kurtosis: 0.20
SNR: 12.3 dB

Session 3:

x-axis:
Range: [-10.8, 20.1] m/s²
Mean: 0.05 m/s²
Std: 4.75
Skewness: 0.76
Kurtosi

In [ ]:
# Information that cannot be derived from raw sensor data alone:
"""
The following metrics from your LaTeX document cannot be calculated from the raw sensor data alone:

1. Number of unique subjects (requires metadata)
2. Active eating duration and non-eating duration (requires activity labels)
3. Movement pattern analysis specific to eating gestures:
   - Mean acceleration peak during bites
   - Standard deviation of bite acceleration peak
   - Typical movement duration
   - Primary movement axis identification
4. Activity distribution (eating vs. non-eating periods)

These metrics would need to be calculated using additional data sources such as:
- Subject identification metadata
- Activity labels or annotations
- Eating gesture annotations
- Manual or automated bite detection results
"""