In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()  # Automatically applies Seaborn’s default style
from scipy import stats, signal
from pathlib import Path
import re
import json


In [2]:
sensor_dtype = np.dtype([
    ("x", ">f4"),
    ("y", ">f4"),
    ("z", ">f4"),
    ("time", ">i8"),
])

In [3]:
def load_raw_sensor_data(path):
    """Load and process sensor data from binary files"""
    acc_files = sorted([f for f in os.listdir(path) if f.endswith('.bin') and 'accelerometer' in f])
    gyro_files = sorted([f for f in os.listdir(path) if f.endswith('.bin') and 'gyroscope' in f])
    
    all_acc_data = []
    all_gyro_data = []
    
    # Load accelerometer data
    for file in acc_files:
        boot_time_nanos = int(file.split("_")[0]) * 1e6
        file_path = os.path.join(path, file)
        acc_data = np.fromfile(file_path, dtype=sensor_dtype)
        first_event_time = acc_data['time'][0]
        corrected_timestamps = ((acc_data['time'] - first_event_time) + boot_time_nanos) / 1e9
        corrected_datetimes = pd.to_datetime(corrected_timestamps, unit='s')
        df = pd.DataFrame(acc_data[["x", "y", "z"]].byteswap().newbyteorder())
        df['time'] = corrected_datetimes
        all_acc_data.append(df)
    
    # Load gyroscope data
    for file in gyro_files:
        boot_time_nanos = int(file.split("_")[0]) * 1e6
        file_path = os.path.join(path, file)
        gyro_data = np.fromfile(file_path, dtype=sensor_dtype)
        first_event_time = gyro_data['time'][0]
        corrected_timestamps = ((gyro_data['time'] - first_event_time) + boot_time_nanos) / 1e9
        corrected_datetimes = pd.to_datetime(corrected_timestamps, unit='s')
        df = pd.DataFrame(gyro_data[["x", "y", "z"]].byteswap().newbyteorder())
        df['time'] = corrected_datetimes
        all_gyro_data.append(df)
        
    return pd.concat(all_acc_data), pd.concat(all_gyro_data)


In [4]:
def sync_sensors(acc_data, gyro_data):
    """Synchronize accelerometer and gyroscope data"""
    common_start_time = max(acc_data['time'].min(), gyro_data['time'].min())
    common_end_time = min(acc_data['time'].max(), gyro_data['time'].max())
    
    acc_synced = acc_data[(acc_data['time'] >= common_start_time) & (acc_data['time'] <= common_end_time)]
    gyro_synced = gyro_data[(gyro_data['time'] >= common_start_time) & (gyro_data['time'] <= common_end_time)]
    
    return acc_synced.reset_index(drop=True), gyro_synced.reset_index(drop=True)


In [5]:
def calculate_sampling_rate(data):
    """Calculate actual sampling rate statistics from timestamps"""
    time_diffs = np.diff(data['time'].astype(np.int64)) / 1e9  # Convert to seconds
    sampling_stats = {
        'mean_rate': 1 / np.mean(time_diffs),
        'std_rate': np.std(1 / time_diffs),
        'min_rate': 1 / np.max(time_diffs),
        'max_rate': 1 / np.min(time_diffs)
    }
    return sampling_stats

In [6]:
def calculate_signal_stats(data):
    """Calculate comprehensive signal statistics"""
    return {
        'range': {
            'min': data[['x', 'y', 'z']].min().to_dict(),
            'max': data[['x', 'y', 'z']].max().to_dict()
        },
        'mean': data[['x', 'y', 'z']].mean().to_dict(),
        'std': data[['x', 'y', 'z']].std().to_dict(),
        'skewness': data[['x', 'y', 'z']].apply(stats.skew).to_dict(),
        'kurtosis': data[['x', 'y', 'z']].apply(stats.kurtosis).to_dict()
    }

In [7]:
# def analyze_spectrum(data, fs):
#     """Perform spectral analysis using Welch's method"""
#     freqs, Pxx = signal.welch(data[['x', 'y', 'z']].values, fs=fs, nperseg=1024)
#     
#     # Calculate power in different frequency bands
#     low_mask = (freqs >= 0.1) & (freqs < 3)
#     mid_mask = (freqs >= 3) & (freqs < 10)
#     high_mask = freqs >= 10
#     
#     power_dist = {
#         'low_freq': {
#             'x': np.sum(Pxx[low_mask, 0]),
#             'y': np.sum(Pxx[low_mask, 1]),
#             'z': np.sum(Pxx[low_mask, 2])
#         },
#         'mid_freq': {
#             'x': np.sum(Pxx[mid_mask, 0]),
#             'y': np.sum(Pxx[mid_mask, 1]),
#             'z': np.sum(Pxx[mid_mask, 2])
#         },
#         'high_freq': {
#             'x': np.sum(Pxx[high_mask, 0]),
#             'y': np.sum(Pxx[high_mask, 1]),
#             'z': np.sum(Pxx[high_mask, 2])
#         }
#     }
#     
#     return freqs, Pxx, power_dist

In [8]:
base_path = "../data/raw"
output_path = "data_specs"
Path(output_path).mkdir(exist_ok=True)

In [9]:
# Store results
all_sessions = []
all_acc_data = []
all_gyro_data = []
session_durations = []
sampling_rates_acc = []
sampling_rates_gyro = []

In [10]:
for session_dir in Path(base_path).glob("*"):
    if not session_dir.is_dir():
        continue
    
    print(f"Processing {session_dir.name}...")
    
    # Extract subject info
    parts = session_dir.name.split('_')
    subject_id = int(re.findall(r'\d+', parts[0])[0])
    meal_num = int(parts[-1]) if len(parts) > 2 else 1
    
    # Load and process data
    acc_data, gyro_data = load_raw_sensor_data(session_dir)
    acc_synced, gyro_data = sync_sensors(acc_data, gyro_data)
    
    # Calculate session duration in minutes
    duration = (acc_synced['time'].max() - acc_synced['time'].min()).total_seconds() / 60
    session_durations.append(duration)
    
    # Calculate sampling rates
    sampling_rates_acc.append(calculate_sampling_rate(acc_synced))
    sampling_rates_gyro.append(calculate_sampling_rate(gyro_data))
    
    # Store data for overall analysis
    all_acc_data.append(acc_synced)
    all_gyro_data.append(gyro_data)
    all_sessions.append({
        'subject_id': subject_id,
        'meal_num': meal_num,
        'duration': duration
    })

Processing 17_meal_2...
Processing 1_meal_1...
Processing 22_meal_1...
Processing 20...
Processing 18...
Processing 9...
Processing 11...
Processing 7...
Processing 16...
Processing 6...
Processing 10...
Processing 19...
Processing 8...
Processing 21...
Processing 17_meal_1...
Processing 1_meal_2...
Processing 22_meal_2...
Processing 3**...
Processing 23...
Processing 4...
Processing 15...
Processing 12...
Processing 2...
Processing 13...
Processing 5...
Processing 14...


In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import os

# Combine all data
combined_acc = pd.concat(all_acc_data, ignore_index=True)
combined_gyro = pd.concat(all_gyro_data, ignore_index=True)

# Calculate mean sampling rate for spectral analysis
mean_fs_acc = np.mean([s['mean_rate'] for s in sampling_rates_acc])
mean_fs_gyro = np.mean([s['mean_rate'] for s in sampling_rates_gyro])

# Generate comprehensive results
results = {
    'general': {
        'total_sessions': len(all_sessions),
        'unique_subjects': len(set(s['subject_id'] for s in all_sessions)),
        'total_duration_minutes': sum(session_durations),
        'mean_duration_minutes': np.mean(session_durations),
        'std_duration_minutes': np.std(session_durations)
    },
    'sampling_rates': {
        'accelerometer': {
            'mean': np.mean([s['mean_rate'] for s in sampling_rates_acc]),
            'std': np.std([s['std_rate'] for s in sampling_rates_acc]),  # Fixed from np.mean to np.std
            'min': np.min([s['min_rate'] for s in sampling_rates_acc]),
            'max': np.max([s['max_rate'] for s in sampling_rates_acc])
        },
        'gyroscope': {
            'mean': np.mean([s['mean_rate'] for s in sampling_rates_gyro]),
            'std': np.std([s['std_rate'] for s in sampling_rates_gyro]),  # Fixed from np.mean to np.std
            'min': np.min([s['min_rate'] for s in sampling_rates_gyro]),
            'max': np.max([s['max_rate'] for s in sampling_rates_gyro])
        }
    },
    'accelerometer_stats': calculate_signal_stats(combined_acc),
    'gyroscope_stats': calculate_signal_stats(combined_gyro)
}

# Commenting out Spectral Analysis for simplicity, fix later if needed
# _, _, acc_spectrum = analyze_spectrum(combined_acc, mean_fs_acc)
# _, _, gyro_spectrum = analyze_spectrum(combined_gyro, mean_fs_gyro)
# results['spectral_analysis'] = {
#     'accelerometer': acc_spectrum,
#     'gyroscope': gyro_spectrum
# }

# Generate visualizations
plt.style.use('ggplot')  # Using a safe, guaranteed-to-exist style

# 1. Session Duration Distribution
plt.figure(figsize=(10, 6))
sns.histplot(session_durations, bins=20)
plt.title('Session Duration Distribution')
plt.xlabel('Duration (minutes)')
plt.ylabel('Count')
plt.savefig(os.path.join(output_path, 'session_durations.png'))
plt.close()

# 2. Sampling Rate Stability
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.hist([s['mean_rate'] for s in sampling_rates_acc], bins=20)
plt.title('Accelerometer Sampling Rate Distribution')
plt.xlabel('Sampling Rate (Hz)')
plt.ylabel('Count')

plt.subplot(1, 2, 2)
plt.hist([s['mean_rate'] for s in sampling_rates_gyro], bins=20)
plt.title('Gyroscope Sampling Rate Distribution')
plt.xlabel('Sampling Rate (Hz)')
plt.ylabel('Count')

plt.tight_layout()
plt.savefig(os.path.join(output_path, 'sampling_rates.png'))
plt.close()

# Commented out Spectral Power Distribution visualization for simplicity
# Uncomment and fix as needed later

# Save results to JSON
with open(os.path.join(output_path, 'analysis_results.json'), 'w') as f:
    json.dump(results, f, indent=4)

# Print summary
print("\nDataset Summary:")
print(f"Total Sessions: {results['general']['total_sessions']}")
print(f"Unique Subjects: {results['general']['unique_subjects']}")
print(f"Total Duration: {results['general']['total_duration_minutes']:.2f} minutes")
print(f"Mean Session Duration: {results['general']['mean_duration_minutes']:.2f} minutes")
print("\nSampling Rates:")
print(f"Accelerometer: {results['sampling_rates']['accelerometer']['mean']:.2f} ± {results['sampling_rates']['accelerometer']['std']:.2f} Hz")
print(f"Gyroscope: {results['sampling_rates']['gyroscope']['mean']:.2f} ± {results['sampling_rates']['gyroscope']['std']:.2f} Hz")



Dataset Summary:
Total Sessions: 26
Unique Subjects: 23
Total Duration: 576.52 minutes
Mean Session Duration: 22.17 minutes

Sampling Rates:
Accelerometer: 51.84 ± 265.82 Hz
Gyroscope: 51.03 ± 105.95 Hz


In [None]:
import matplotlib.pyplot as plt
print(plt.style.available)


In [None]:
plt.style.use('ggplot')  # Choose any that’s reliably listed in `plt.style.available`


In [None]:

import pandas as pd
import numpy as np
from scipy import stats

# Assuming data is loaded as 'acc_data' and 'gyro_data'
# Load your datasets
acc_data = pd.read_csv('accelerometer_data.csv')  # Replace with your actual file path
gyro_data = pd.read_csv('gyroscope_data.csv')     # Replace with your actual file path

# Parameters (assumed sampling rate)
sampling_rate = 100  # Hz, adjust as needed

class SensorAnalysis:
    def __init__(self, sampling_rate):
        self.sampling_rate = sampling_rate

    def calculate_sync_gaps(self, acc_data, gyro_data):
        """Calculate temporal offsets between synchronized sensors"""
        acc_times = acc_data['time'].astype(np.int64)
        gyro_times = gyro_data['time'].astype(np.int64)

        offset_stats = {
            'mean_offset_ms': (acc_times - gyro_times).mean() / 1e6,
            'std_offset_ms': (acc_times - gyro_times).std() / 1e6,
            'max_offset_ms': (acc_times - gyro_times).max() / 1e6
        }
        return offset_stats

    def analyze_sampling_consistency(self, data):
        """Analyze sampling interval consistency"""
        intervals = np.diff(data['time'].astype(np.int64)) / 1e9
        interval_stats = {
            'expected_interval': 1/self.sampling_rate,
            'mean_interval': np.mean(intervals),
            'std_interval': np.std(intervals),
            'confidence_interval': stats.t.interval(0.95, len(intervals)-1,
                                                  loc=np.mean(intervals),
                                                  scale=stats.sem(intervals))
        }
        return interval_stats

    def analyze_data_completeness(self, data, expected_samples):
        """Analyze data completeness and continuity"""
        actual_samples = len(data)
        missing_ratio = 1 - (actual_samples / expected_samples)

        # Analyze timestamp continuity
        time_gaps = np.diff(data['time'].astype(np.int64)) / 1e9
        continuity_stats = {
            'completeness_ratio': 1 - missing_ratio,
            'total_gaps': np.sum(time_gaps > (2/self.sampling_rate)),
            'max_gap_duration': np.max(time_gaps),
            'mean_gap_duration': np.mean(time_gaps[time_gaps > (2/self.sampling_rate)])
        }
        return continuity_stats

    def validate_sensor_ranges(self, data):
        """Validate sensor measurements against manufacturer specs"""
        # Add your sensor's specifications here
        ACC_RANGE = 16  # g, adjust based on your sensor
        GYRO_RANGE = 2000  # deg/s, adjust based on your sensor

        validation_stats = {
            'out_of_range_samples': len(data[
                (data['x'].abs() > ACC_RANGE) |
                (data['y'].abs() > ACC_RANGE) |
                (data['z'].abs() > ACC_RANGE)
            ]),
            'range_violation_ratio': len(data[
                (data['x'].abs() > ACC_RANGE) |
                (data['y'].abs() > ACC_RANGE) |
                (data['z'].abs() > ACC_RANGE)
            ]) / len(data)
        }
        return validation_stats

# Initialize the analysis class
analysis = SensorAnalysis(sampling_rate)

# Perform synchronization gap analysis
sync_gaps = analysis.calculate_sync_gaps(acc_data, gyro_data)
print("Synchronization Gap Analysis:")
print(sync_gaps)

# Analyze sampling consistency for accelerometer data
sampling_consistency_acc = analysis.analyze_sampling_consistency(acc_data)
print("\nSampling Consistency Analysis (Accelerometer):")
print(sampling_consistency_acc)

# Analyze sampling consistency for gyroscope data
sampling_consistency_gyro = analysis.analyze_sampling_consistency(gyro_data)
print("\nSampling Consistency Analysis (Gyroscope):")
print(sampling_consistency_gyro)

# Expected number of samples for data completeness analysis
expected_samples = int((acc_data['time'].iloc[-1] - acc_data['time'].iloc[0]) / (1e9 / sampling_rate))

# Analyze data completeness for accelerometer data
completeness_acc = analysis.analyze_data_completeness(acc_data, expected_samples)
print("\nData Completeness Analysis (Accelerometer):")
print(completeness_acc)

# Analyze data completeness for gyroscope data
completeness_gyro = analysis.analyze_data_completeness(gyro_data, expected_samples)
print("\nData Completeness Analysis (Gyroscope):")
print(completeness_gyro)

# Validate sensor ranges for accelerometer data
validation_acc = analysis.validate_sensor_ranges(acc_data)
print("\nSensor Range Validation (Accelerometer):")
print(validation_acc)

# Validate sensor ranges for gyroscope data
validation_gyro = analysis.validate_sensor_ranges(gyro_data)
print("\nSensor Range Validation (Gyroscope):")
print(validation_gyro)

# Consolidated Analysis Summary
print("\n--- Consolidated Analysis Summary ---")
print("Synchronization Gap Analysis:", sync_gaps)
print("Sampling Consistency (Accelerometer):", sampling_consistency_acc)
print("Sampling Consistency (Gyroscope):", sampling_consistency_gyro)
print("Data Completeness (Accelerometer):", completeness_acc)
print("Data Completeness (Gyroscope):", completeness_gyro)
print("Sensor Range Validation (Accelerometer):", validation_acc)
print("Sensor Range Validation (Gyroscope):", validation_gyro)
