In [1]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt

In [2]:
import sys

# Project source directory, relative to the notebook's working directory.
SRC_DIR = '../src'

# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)  # Add source directory to path

In [3]:
from feature_engineering.feature_pipeline import FeatureEngineeringPipeline

In [4]:
def load_data(file_path: str) -> pd.DataFrame:
    """
    Read a Parquet dataset into a DataFrame and report its size.

    A progress line is printed before reading, and a summary line with the
    row and column counts is printed afterwards.

    Args:
        file_path: Location of the Parquet data; passed unchanged to
            ``pd.read_parquet``

    Returns:
        The DataFrame produced by ``pd.read_parquet``
    """
    print(f"Loading data from {file_path}...")
    frame = pd.read_parquet(file_path)
    # shape is (rows, columns), i.e. the same numbers as len(frame) and
    # len(frame.columns)
    n_rows, n_cols = frame.shape
    print(f"Loaded {n_rows} samples with {n_cols} columns")
    return frame

In [5]:
# Notebook I/O locations, both relative to the notebook's working directory.
output_dir = "../features"                     # passed to os.makedirs and to the pipeline as output_dir
data_path = "../data/cleaned_signal_dataset"   # Parquet input read by load_data

In [6]:
# Create the output directory (and any missing parents); do nothing if it
# already exists.
os.makedirs(name=output_dir, exist_ok=True)

## Load Data

In [7]:
# Read the dataset at data_path into memory.
df = load_data(file_path=data_path)

Loading data from ../data/cleaned_signal_dataset...
Loaded 6554753 samples with 24 columns


## Print data info


In [8]:
# Each section is a heading plus a zero-argument callable, so every summary is
# computed only when its turn to be printed comes.
report_sections = (
    ("\nData preview:", df.head),
    ("\nData columns:", df.columns.tolist),
    ("\nData statistics:", df.describe),
)

for heading, build in report_sections:
    print(heading)
    print(build())


Data preview:
         bvp  label  subject_id    dataset        device skin_tone  \
0   5.673109      0           2  physionet   apple_watch      V-VI   
1   7.687833      0           2  physionet   apple_watch      I-II   
2   1.509560      0           2  physionet   apple_watch    III-IV   
3  12.999866      0           2  physionet  galaxy_watch    III-IV   
4  20.798602      0           2  physionet  galaxy_watch      V-VI   

   noise_level     acc_x      acc_y     acc_z  ...  bvp_smoothed  \
0      0.05088 -0.817685 -62.628226  4.996602  ...      1.427777   
1      0.07712 -0.973498 -62.739436  5.184150  ...      2.176976   
2      0.06400 -1.054134 -62.992483  5.020381  ...      2.724725   
3      0.09600 -1.000000 -69.300000  5.000000  ...      5.804123   
4      0.07632 -1.000000 -69.300000  5.000000  ...      8.756003   

   bvp_denoised  heart_rate  pulse_amplitude  ibi  signal_quality  \
0      1.330897    78.26087              0.0  0.0        0.548307   
1      1.507533  

## Define device info

In [9]:
# Device description handed to the feature engineering pipeline.
device_info = dict(
    device_type='apple_watch',
    sensor_quality=0.9,
    wearing_position='wrist_top',
)

## Initialize feature engineering pipeline

In [10]:
# The window length is stated in seconds and converted to samples, so it
# cannot drift out of step with the sampling rate.
SAMPLING_RATE_HZ = 30   # 30 Hz
WINDOW_SECONDS = 10     # 10 seconds at 30 Hz -> 300 samples

pipeline = FeatureEngineeringPipeline(
    window_size=WINDOW_SECONDS * SAMPLING_RATE_HZ,
    overlap=0.5,  # 50% overlap
    sampling_rate=SAMPLING_RATE_HZ,
    output_dir=output_dir,
    device_info=device_info,
    random_state=42,
)

In [11]:
# Define column names
ppg_col = 'bvp_denoised'
acc_x_col = 'acc_x' if 'acc_x' in df.columns else None
acc_y_col = 'acc_y' if 'acc_y' in df.columns else None
acc_z_col = 'acc_z' if 'acc_z' in df.columns else None
metadata_cols = ['device_type'] if 'device_type' in df.columns else None
target_col = 'stress_level' if 'stress_level' in df.columns else None

## Run the pipeline

In [12]:
# Gather the call arguments first so the column mapping and the tuning knobs
# can be read at a glance.
run_kwargs = {
    # input frame and column mapping
    'df': df,
    'ppg_col': ppg_col,
    'acc_x_col': acc_x_col,
    'acc_y_col': acc_y_col,
    'acc_z_col': acc_z_col,
    'metadata_cols': metadata_cols,
    'target_col': target_col,
    # processing options
    'batch_size': 100,   # Process 100 windows at a time
    'n_features': 20,    # Select top 20 features
    'visualize': True,   # Generate visualizations
}

selected_features = pipeline.run_pipeline(**run_kwargs)

Starting feature engineering pipeline...
Extracting features from 6554753 samples...
Created 43697 windows with 300 samples each and 50% overlap
Processed batch 1/437
Processed batch 2/437
Processed batch 3/437
Processed batch 4/437
Processed batch 5/437
Processed batch 6/437
Processed batch 7/437
Processed batch 8/437
Processed batch 9/437
Processed batch 10/437
Processed batch 11/437
Processed batch 12/437
Processed batch 13/437
Processed batch 14/437
Processed batch 15/437
Processed batch 16/437
Processed batch 17/437
Processed batch 18/437
Processed batch 19/437
Processed batch 20/437
Processed batch 21/437
Processed batch 22/437
Processed batch 23/437
Processed batch 24/437
Processed batch 25/437
Processed batch 26/437
Processed batch 27/437
Processed batch 28/437
Processed batch 29/437
Processed batch 30/437
Processed batch 31/437
Processed batch 32/437
Processed batch 33/437
Processed batch 34/437
Processed batch 35/437
Processed batch 36/437
Processed batch 37/437
Processed bat

## Print selected features

In [13]:

# List the columns of the pipeline's result, then confirm where the results
# were written.
feature_names = selected_features.columns.tolist()
print("\nSelected features:", feature_names, sep="\n")

print(
    "\nFeature engineering completed successfully!",
    f"Results saved to {output_dir}",
    sep="\n",
)


Selected features:
['window_idx', 'start_idx', 'end_idx', 'time_mean', 'time_median', 'time_std', 'time_var', 'time_range', 'time_iqr', 'time_mad', 'time_skew', 'time_kurtosis', 'time_min', 'time_max', 'time_peak_to_peak', 'time_p10', 'time_p25', 'time_p75', 'time_p90', 'time_mean_derivative', 'time_std_derivative', 'time_max_derivative', 'time_mean_2nd_derivative', 'time_std_2nd_derivative', 'time_max_2nd_derivative', 'time_peak_count', 'time_mean_peak_height', 'time_std_peak_height', 'time_mean_peak_interval', 'time_std_peak_interval', 'time_min_peak_interval', 'time_max_peak_interval', 'time_est_heart_rate', 'time_sdnn', 'time_rmssd', 'time_pnn50', 'time_pnn20', 'time_hrv_cv', 'freq_vlf_power', 'freq_lf_power', 'freq_hf_power', 'freq_cardiac_power', 'freq_total_power', 'freq_lf_power_norm', 'freq_hf_power_norm', 'freq_lf_hf_ratio', 'freq_spectral_centroid', 'freq_spectral_spread', 'freq_spectral_skewness', 'freq_spectral_kurtosis', 'freq_spectral_edge_90', 'freq_spectral_edge_95', 