In [10]:
import sys
sys.path.append('../src')  # Add source directory to path

In [11]:
from signal_processing.base_signal_processor import BaseSignalProcessor
from signal_processing.motion_artifact_detector import MotionArtifactDetector
from signal_processing.adaptive_filter import AdaptiveFilter
from signal_processing.kalman_filter import KalmanFilter
from signal_processing.wavelet_denoiser import WaveletDenoiser
from signal_processing.pipeline import SignalProcessingPipeline

### Loading Unified Data

In [12]:
processor = BaseSignalProcessor(data_path="../data/processed/cleaned_unified_dataset.parquet")
dataset = processor.load_data()
print(dataset.head())

Loading unified dataset...
                                 bvp  label  subject_id    dataset  \
2020-01-03 08:00:00+00:00   5.673109      0           2  physionet   
2020-01-03 08:00:00+00:00   7.687833      0           2  physionet   
2020-01-03 08:00:00+00:00   1.509560      0           2  physionet   
2020-01-03 08:00:00+00:00  12.999866      0           2  physionet   
2020-01-03 08:00:00+00:00  20.798602      0           2  physionet   

                                 device skin_tone  noise_level     acc_x  \
2020-01-03 08:00:00+00:00   apple_watch      V-VI      0.05088 -0.817685   
2020-01-03 08:00:00+00:00   apple_watch      I-II      0.07712 -0.973498   
2020-01-03 08:00:00+00:00   apple_watch    III-IV      0.06400 -1.054134   
2020-01-03 08:00:00+00:00  galaxy_watch    III-IV      0.09600 -1.000000   
2020-01-03 08:00:00+00:00  galaxy_watch      V-VI      0.07632 -1.000000   

                               acc_y     acc_z  
2020-01-03 08:00:00+00:00 -62.628226  4.996602

In [13]:
import numpy as np

In [14]:
def _robust_normalize(data: np.ndarray) -> np.ndarray:
    """Enhanced normalization with fallback"""
    data = np.nan_to_num(data, nan=np.median(data))
        
    # Fallback to std if IQR is zero
    q75, q25 = np.percentile(data, [75, 25])
    iqr = q75 - q25
    if iqr < 1e-6:
        std = np.std(data) + 1e-6
        normalized = (data - np.mean(data)) / std
    else:
        normalized = (data - np.median(data)) / iqr
        
    # Secondary clipping
    return np.clip(normalized, -3, 3)

# Compute and normalize accelerometer magnitude
dataset['acc_mag'] = np.sqrt(dataset['acc_x']**2 + dataset['acc_y']**2 + dataset['acc_z']**2)
dataset['acc_mag'] = _robust_normalize(dataset['acc_mag'].values)

### Motion Artifact Detection

In [15]:
detector = MotionArtifactDetector()
dataset = detector.detect_motion_bursts(dataset)
print(dataset[['acc_x', 'acc_y', 'acc_z', 'motion_burst']].head())

                              acc_x      acc_y     acc_z  motion_burst
2020-01-03 08:00:00+00:00 -0.817685 -62.628226  4.996602           0.0
2020-01-03 08:00:00+00:00 -0.973498 -62.739436  5.184150           0.0
2020-01-03 08:00:00+00:00 -1.054134 -62.992483  5.020381           0.0
2020-01-03 08:00:00+00:00 -1.000000 -69.300000  5.000000           0.0
2020-01-03 08:00:00+00:00 -1.000000 -69.300000  5.000000           0.0


In [16]:
num_unique_motion_bursts = dataset['motion_burst'].nunique()
motion_burst_counts = dataset['motion_burst'].value_counts()
print(f"Value counts of motion_burst:\n{motion_burst_counts}")

# Check motion burst distribution
motion_burst_counts = dataset['motion_burst'].value_counts(normalize=True) * 100
print(f"Motion Burst Distribution:\n{motion_burst_counts}")


Value counts of motion_burst:
motion_burst
0.0    6488769
1.0      65984
Name: count, dtype: int64
Motion Burst Distribution:
motion_burst
0.0    98.993341
1.0     1.006659
Name: proportion, dtype: float64


In [17]:
# import matplotlib.pyplot as plt
# # Visualize results
# plt.figure(figsize=(12, 6))
# plt.plot(dataset['acc_mag'], label="Accelerometer Magnitude")
# plt.plot(dataset['motion_burst'] * dataset['acc_mag'].max(), label="Motion Bursts", linestyle='--')
# plt.legend()
# plt.title("Motion Burst Detection")
# plt.show()

### Adaptive Filtering for Motion Artifact Removal

In [18]:
adaptive_filter = AdaptiveFilter()
cleaned_bvp = adaptive_filter.apply_adaptive_filter(
    noisy_signal=dataset['bvp'].values,
    reference_signal=dataset['acc_mag'].values,
    motion_burst=dataset['motion_burst'].values
)
dataset['bvp_cleaned'] = cleaned_bvp

INFO:root:Applying adaptive filtering... Input length: 6554753


### Apply kalman filter

In [19]:
kalman_filter = KalmanFilter()
bvp_smoothed = kalman_filter.apply_kalman_filter(
    signal=dataset['bvp_cleaned'].values,
    motion_burst=dataset['motion_burst'].values
)
dataset['bvp_smoothed'] = bvp_smoothed

### Wavelet Denoising

In [20]:
# notebooks/phase3_signal_processing.ipynb

wavelet_denoiser = WaveletDenoiser()
denoised_bvp = wavelet_denoiser.apply_wavelet_denoising(dataset['bvp_smoothed'].values,motion_burst=dataset['motion_burst'].values,skin_tone=dataset['skin_tone'].iloc[0])

# Verify lengths match before assignment
assert len(denoised_bvp) == len(dataset), "Denoised signal length mismatch"

dataset['bvp_denoised'] = denoised_bvp

In [21]:
dataset.head()

Unnamed: 0,bvp,label,subject_id,dataset,device,skin_tone,noise_level,acc_x,acc_y,acc_z,acc_mag,motion_burst,bvp_cleaned,bvp_smoothed,bvp_denoised
2020-01-03 08:00:00+00:00,5.673109,0,2,physionet,apple_watch,V-VI,0.05088,-0.817685,-62.628226,4.996602,62.832549,0.0,3.682249,3.682249,3.723733
2020-01-03 08:00:00+00:00,7.687833,0,2,physionet,apple_watch,I-II,0.07712,-0.973498,-62.739436,5.18415,62.960781,0.0,3.682249,3.682249,3.745387
2020-01-03 08:00:00+00:00,1.50956,0,2,physionet,apple_watch,III-IV,0.064,-1.054134,-62.992483,5.020381,63.201016,0.0,3.682249,3.682249,3.749604
2020-01-03 08:00:00+00:00,12.999866,0,2,physionet,galaxy_watch,III-IV,0.096,-1.0,-69.3,5.0,69.487337,0.0,3.682249,3.682249,3.719471
2020-01-03 08:00:00+00:00,20.798602,0,2,physionet,galaxy_watch,V-VI,0.07632,-1.0,-69.3,5.0,69.487337,0.0,3.682249,3.682249,3.669948


## Runnig the Pipeline

In [22]:
import pandas as pd

In [23]:
dataset_ = pd.read_parquet("../data/processed/cleaned_unified_dataset.parquet")
pipeline = SignalProcessingPipeline()
processed_df = pipeline.process_signal(dataset_)

INFO:root:Applying adaptive filtering... Input length: 10000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['bvp_cleaned'] = bvp_denoised
INFO:root:Applying adaptive filtering... Input length: 10000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['bvp_cleaned'] = bvp_denoised
INFO:root:Applying adaptive filtering... Input length: 10000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gui

In [24]:
processed_df.columns

Index(['bvp', 'label', 'subject_id', 'dataset', 'device', 'skin_tone',
       'noise_level', 'acc_x', 'acc_y', 'acc_z', 'acc_mag', 'motion_burst',
       'bvp_cleaned', 'bvp_smoothed', 'bvp_denoised'],
      dtype='object')

### Signal Quality Metrics

In [25]:
from scipy.signal import find_peaks
from fastdtw import fastdtw

class SignalQualityMetrics:
    def compute_snr(self, cleaned: np.ndarray, original: np.ndarray) -> float:
        """Cross-correlation aligned SNR calculation"""
        # 1. Remove DC offsets
        cleaned = cleaned - np.median(cleaned)
        original = original - np.median(original)
        
        # 2. Find optimal delay using cross-correlation
        corr = np.correlate(original, cleaned, mode='full')
        delay = corr.argmax() - (len(original) - 1)
        
        # 3. Align signals without warping
        if delay > 0:
            aligned_clean = cleaned[delay:]
            aligned_original = original[:-delay]
        else:
            aligned_clean = cleaned[:delay]
            aligned_original = original[-delay:]
            
        # 4. Trim to common length
        min_len = min(len(aligned_clean), len(aligned_original))
        aligned_clean = aligned_clean[:min_len]
        aligned_original = aligned_original[:min_len]
        
        # 5. Calculate proper SNR
        noise = aligned_original - aligned_clean
        signal_power = np.mean(aligned_original**2)
        noise_power = np.mean(noise**2)
        return 10 * np.log10(signal_power / (noise_power + 1e-9))

    def compute_artifact_density(self, motion_burst: np.ndarray) -> float:
        return 100 * np.mean(motion_burst)

In [26]:
# Compute metrics using original BVP as reference
metrics_calculator = SignalQualityMetrics()
snr = metrics_calculator.compute_snr(
    processed_df['bvp_cleaned'].values,
    processed_df['bvp'].values  # Use original signal as reference
)
artifact_density = metrics_calculator.compute_artifact_density(processed_df['motion_burst'].values)

print(f"SNR: {snr:.2f} dB, Artifact Density: {artifact_density:.2f}%")

SNR: -0.01 dB, Artifact Density: 1.01%


## Saving cleaned data

In [27]:
pipeline.save_cleaned_dataset(dataset, "../data/cleaned_signal_dataset")

Cleaned dataset saved to ../data/cleaned_signal_dataset
