In [1]:
import sys
sys.path.append('../src')  # Add source directory to path

In [2]:
from signal_processing.base_signal_processor import BaseSignalProcessor
from signal_processing.motion_artifact_detector import MotionArtifactDetector
from signal_processing.adaptive_filter import AdaptiveFilter
from signal_processing.kalman_filter import KalmanFilter
from signal_processing.wavelet_denoiser import WaveletDenoiser
from signal_processing.pipeline import SignalProcessingPipeline

### Loading Unified Data

In [3]:
# Load the unified dataset from the processed parquet file through the
# project's BaseSignalProcessor, then preview the first rows.
processor = BaseSignalProcessor(data_path="../data/processed/cleaned_unified_dataset.parquet")
dataset = processor.load_data()
print(dataset.head())

Loading unified dataset...
                                 bvp  label  subject_id    dataset  \
2020-01-03 08:00:00+00:00   5.673109      0           2  physionet   
2020-01-03 08:00:00+00:00   7.687833      0           2  physionet   
2020-01-03 08:00:00+00:00   1.509560      0           2  physionet   
2020-01-03 08:00:00+00:00  12.999866      0           2  physionet   
2020-01-03 08:00:00+00:00  20.798602      0           2  physionet   

                                 device skin_tone  noise_level     acc_x  \
2020-01-03 08:00:00+00:00   apple_watch      V-VI      0.05088 -0.817685   
2020-01-03 08:00:00+00:00   apple_watch      I-II      0.07712 -0.973498   
2020-01-03 08:00:00+00:00   apple_watch    III-IV      0.06400 -1.054134   
2020-01-03 08:00:00+00:00  galaxy_watch    III-IV      0.09600 -1.000000   
2020-01-03 08:00:00+00:00  galaxy_watch      V-VI      0.07632 -1.000000   

                               acc_y     acc_z  
2020-01-03 08:00:00+00:00 -62.628226  4.996602

In [4]:
import numpy as np

In [5]:
def _robust_normalize(data: np.ndarray) -> np.ndarray:
    """Enhanced normalization with fallback"""
    data = np.nan_to_num(data, nan=np.median(data))
        
    # Fallback to std if IQR is zero
    q75, q25 = np.percentile(data, [75, 25])
    iqr = q75 - q25
    if iqr < 1e-6:
        std = np.std(data) + 1e-6
        normalized = (data - np.mean(data)) / std
    else:
        normalized = (data - np.median(data)) / iqr
        
    # Secondary clipping
    return np.clip(normalized, -3, 3)

# Euclidean norm of the three accelerometer axes, robustly normalized and
# stored as the `acc_mag` column.
acc_squared_sum = dataset['acc_x']**2 + dataset['acc_y']**2 + dataset['acc_z']**2
dataset['acc_mag'] = _robust_normalize(np.sqrt(acc_squared_sum).values)

### Motion Artifact Detection

In [6]:
# Flag motion bursts. The frame returned by the detector replaces `dataset`
# and carries a `motion_burst` column, previewed here next to the raw
# accelerometer axes.
detector = MotionArtifactDetector()
dataset = detector.detect_motion_bursts(dataset)
print(dataset[['acc_x', 'acc_y', 'acc_z', 'motion_burst']].head())

                              acc_x      acc_y     acc_z  motion_burst
2020-01-03 08:00:00+00:00 -0.817685 -62.628226  4.996602           0.0
2020-01-03 08:00:00+00:00 -0.973498 -62.739436  5.184150           0.0
2020-01-03 08:00:00+00:00 -1.054134 -62.992483  5.020381           0.0
2020-01-03 08:00:00+00:00 -1.000000 -69.300000  5.000000           0.0
2020-01-03 08:00:00+00:00 -1.000000 -69.300000  5.000000           0.0


In [7]:
# Absolute number of rows per motion_burst value.
# NOTE(review): num_unique_motion_bursts is not read by any visible cell.
num_unique_motion_bursts = dataset['motion_burst'].nunique()
motion_burst_counts = dataset['motion_burst'].value_counts()
print(f"Value counts of motion_burst:\n{motion_burst_counts}")

# Check motion burst distribution
# (motion_burst_counts is rebound here: from this point it holds percentages,
# not absolute counts)
motion_burst_counts = dataset['motion_burst'].value_counts(normalize=True) * 100
print(f"Motion Burst Distribution:\n{motion_burst_counts}")


Value counts of motion_burst:
motion_burst
0.0    5412458
1.0    1142295
Name: count, dtype: int64
Motion Burst Distribution:
motion_burst
0.0    82.573028
1.0    17.426972
Name: proportion, dtype: float64


In [8]:
# Artifact density: mean of the motion_burst flag expressed as a percentage.
print(f"New artifact density: {dataset['motion_burst'].mean() * 100:.2f}%")
# Summary statistics of the accelerometer-magnitude column.
# NOTE(review): _robust_normalize clips to [-3, 3]; a printed max above 3 means
# acc_mag is no longer that clipped output — presumably detect_motion_bursts
# rewrites the column; confirm.
print(dataset['acc_mag'].describe())

New artifact density: 17.43%
count    6.554753e+06
mean     4.667615e-01
std      9.818329e-01
min     -8.811038e-01
25%     -4.866879e-02
50%      0.000000e+00
75%      9.513312e-01
max      1.122975e+01
Name: acc_mag, dtype: float64


In [9]:
# import matplotlib.pyplot as plt
# # Visualize results
# plt.figure(figsize=(12, 6))
# plt.plot(dataset['acc_mag'], label="Accelerometer Magnitude")
# plt.plot(dataset['motion_burst'] * dataset['acc_mag'].max(), label="Motion Bursts", linestyle='--')
# plt.legend()
# plt.title("Motion Burst Detection")
# plt.show()

### Adaptive Filtering for Motion Artifact Removal

In [10]:
# Adaptive filtering of the raw BVP column: the accelerometer magnitude is
# passed as the reference signal together with the motion-burst flags, and the
# filter output is stored as the new `bvp_cleaned` column.
adaptive_filter = AdaptiveFilter()
cleaned_bvp = adaptive_filter.apply_adaptive_filter(
    noisy_signal=dataset['bvp'].values,
    reference_signal=dataset['acc_mag'].values,
    motion_burst=dataset['motion_burst'].values
)
dataset['bvp_cleaned'] = cleaned_bvp

INFO:root:Applying adaptive filtering... Input length: 6554753


### Apply Kalman Filter

In [11]:
# Kalman filtering of the adaptively filtered signal (`bvp_cleaned`), again
# passing the motion-burst flags; the output is stored as `bvp_smoothed`.
kalman_filter = KalmanFilter()
bvp_smoothed = kalman_filter.apply_kalman_filter(
    signal=dataset['bvp_cleaned'].values,
    motion_burst=dataset['motion_burst'].values
)
dataset['bvp_smoothed'] = bvp_smoothed

### Wavelet Denoising

In [12]:
# notebooks/phase3_signal_processing.ipynb

# Wavelet denoising of the Kalman-smoothed signal (`bvp_smoothed`).
# NOTE(review): only the first row's skin_tone (`.iloc[0]`) is passed for the
# whole signal, while the dataset preview shows skin_tone varying between
# rows — confirm that a single value is what the denoiser expects.
wavelet_denoiser = WaveletDenoiser()
denoised_bvp = wavelet_denoiser.apply_wavelet_denoising(dataset['bvp_smoothed'].values,motion_burst=dataset['motion_burst'].values,skin_tone=dataset['skin_tone'].iloc[0])

# Verify lengths match before assignment
assert len(denoised_bvp) == len(dataset), "Denoised signal length mismatch"

# Store the denoised signal as a new column.
dataset['bvp_denoised'] = denoised_bvp

In [13]:
dataset.head()

Unnamed: 0,bvp,label,subject_id,dataset,device,skin_tone,noise_level,acc_x,acc_y,acc_z,acc_mag,is_clean,motion_burst,bvp_cleaned,bvp_smoothed,bvp_denoised
2020-01-03 08:00:00+00:00,5.673109,0,2,physionet,apple_watch,V-VI,0.05088,-0.817685,-62.628226,4.996602,-0.023022,False,0.0,3.6816,3.6816,3.685076
2020-01-03 08:00:00+00:00,7.687833,0,2,physionet,apple_watch,I-II,0.07712,-0.973498,-62.739436,5.18415,-0.021271,False,0.0,3.682223,3.681811,3.683689
2020-01-03 08:00:00+00:00,1.50956,0,2,physionet,apple_watch,III-IV,0.064,-1.054134,-62.992483,5.020381,-0.01799,False,0.0,3.683069,3.682137,3.682469
2020-01-03 08:00:00+00:00,12.999866,0,2,physionet,galaxy_watch,III-IV,0.096,-1.0,-69.3,5.0,1.016824,False,0.0,3.684116,3.682556,3.68184
2020-01-03 08:00:00+00:00,20.798602,0,2,physionet,galaxy_watch,V-VI,0.07632,-1.0,-69.3,5.0,1.016824,False,0.0,3.681431,3.682352,3.681621


In [14]:
print(dataset[['bvp', 'bvp_cleaned', 'bvp_smoothed']].std())

bvp             28.719013
bvp_cleaned      0.100806
bvp_smoothed     0.024714
dtype: float64


## Running the Pipeline

In [15]:
import pandas as pd

In [16]:
# Re-read the same parquet file into a separate frame (`dataset_`) and run the
# packaged SignalProcessingPipeline on it; the result is kept in `processed_df`.
dataset_ = pd.read_parquet("../data/processed/cleaned_unified_dataset.parquet")
pipeline = SignalProcessingPipeline()
processed_df = pipeline.process_signal(dataset_)

INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adaptive filtering... Input length: 10000
INFO:root:Applying adapt

In [17]:
processed_df.columns

Index(['bvp', 'label', 'subject_id', 'dataset', 'device', 'skin_tone',
       'noise_level', 'acc_x', 'acc_y', 'acc_z', 'is_clean', 'acc_mag',
       'motion_burst', 'bvp_cleaned'],
      dtype='object')

In [18]:
print(processed_df[['bvp', 'bvp_cleaned']].std())

bvp            28.719013
bvp_cleaned     1.605948
dtype: float64


In [19]:
import numpy as np

### Signal Quality Metrics

In [20]:
from scipy.signal import find_peaks
from scipy.signal import correlate

class SignalQualityMetrics:
    """Quality measures for a cleaned signal and its motion-burst flags."""

    def compute_snr(self, cleaned: np.ndarray, original: np.ndarray) -> float:
        """Return an SNR estimate in dB after lag-aligning the two signals.

        The lag between ``original`` and ``cleaned`` is estimated with an
        FFT-based cross-correlation on 3x-downsampled, mean-removed copies,
        searching at most 10% of the downsampled length in either direction.
        The full-resolution signals are then trimmed to their overlap and

            SNR = 10 * log10(mean(original**2) / (mean((original - cleaned)**2) + 1e-9))

        is returned. "Signal" power is therefore the power of ``original`` and
        "noise" is whatever the cleaning step removed.

        Args:
            cleaned: 1-D processed signal.
            original: 1-D reference signal with the same length as ``cleaned``.

        Returns:
            The SNR in decibels.

        Raises:
            ValueError: If the inputs are not 1-D, are empty, or differ in
                length.
        """
        cleaned = np.asarray(cleaned, dtype=float)
        original = np.asarray(original, dtype=float)
        if cleaned.ndim != 1 or original.ndim != 1:
            raise ValueError("compute_snr expects 1-D signals")
        if cleaned.shape != original.shape:
            raise ValueError(
                f"Signal length mismatch: cleaned={cleaned.shape[0]}, original={original.shape[0]}"
            )
        if cleaned.size == 0:
            raise ValueError("compute_snr requires non-empty signals")

        # 0. Downsample by a fixed factor; used only for the lag estimate
        ds_factor = 3
        cleaned_ds = cleaned[::ds_factor]
        original_ds = original[::ds_factor]

        # 1. Remove DC offsets using mean (faster than median)
        cleaned_ds = cleaned_ds - np.mean(cleaned_ds)
        original_ds = original_ds - np.mean(original_ds)

        # 2. FFT-based cross-correlation with limited search window
        max_lag = len(original_ds) // 10  # 10% maximum lag
        if max_lag > 0:
            corr = correlate(original_ds, cleaned_ds, mode='same', method='fft')
            center = len(original_ds) // 2  # index of zero lag in 'same' mode
            search_window = corr[center - max_lag:center + max_lag]
            delay = (int(np.argmax(search_window)) - max_lag) * ds_factor
        else:
            # Fewer than 10 downsampled samples: the search window would be
            # empty and np.argmax would raise, so assume no lag.
            delay = 0

        # 3. Direct slicing without full alignment: keep only the overlap
        aligned_clean = cleaned[max(-delay, 0):min(len(cleaned), len(cleaned) - delay)]
        aligned_original = original[max(delay, 0):min(len(original), len(original) + delay)]

        # 4. Vectorized power calculation (residual computed once)
        residual = aligned_original - aligned_clean
        n_samples = len(aligned_original)
        signal_power = np.dot(aligned_original, aligned_original) / n_samples
        noise_power = np.dot(residual, residual) / n_samples

        # The epsilon keeps the ratio finite when the two signals are identical.
        return float(10 * np.log10(signal_power / (noise_power + 1e-9)))

    def compute_artifact_density(self, motion_burst: np.ndarray) -> float:
        """Return the mean of ``motion_burst`` scaled to a percentage."""
        return 100 * np.mean(motion_burst)

In [21]:
# Compute metrics using original BVP as reference
# (compute_snr takes signal power from the original BVP and treats
# original minus cleaned as the noise term)
metrics_calculator = SignalQualityMetrics()
snr = metrics_calculator.compute_snr(
    processed_df['bvp_cleaned'].values,
    processed_df['bvp'].values  # Use original signal as reference
)

# Percentage of samples flagged as motion bursts in the pipeline output.
artifact_density = metrics_calculator.compute_artifact_density(processed_df['motion_burst'].values)

print(f"SNR: {snr:.2f} dB, Artifact Density: {artifact_density:.2f}%")

SNR: 0.09 dB, Artifact Density: 17.43%


## Saving cleaned data

In [22]:
# Persist the cleaned data through the pipeline's save helper.
# NOTE(review): this saves `dataset` (the step-by-step result built above), not
# `processed_df` returned by pipeline.process_signal — confirm which frame is
# meant to be written.
pipeline.save_cleaned_dataset(dataset, "../data/cleaned_signal_dataset")

Cleaned dataset saved to ../data/cleaned_signal_dataset
