* Step 1: daily merge - merging all of the modalities together-- for easy access -- per participant
* Step 2: Concatenate days : one dataframe per participant
* Step 3: Attach metadata: Add participant_id, optional time window info
* Step 4: Stack participants: combine all into one final dataset for ML and EDA
* Step 5: Merge in stress labels using algined timestamps

In [61]:
import os
import pandas as pd

# Path to the participant folder (e.g., "Participant 1")
parent_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9"
output_dir = os.path.join(parent_dir, "cleaned")

# Create the 'cleaned' subfolder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Identify date folders like '2023-12-24'
day_folders = sorted([
    f for f in os.listdir(parent_dir)
    if os.path.isdir(os.path.join(parent_dir, f)) and f.startswith("2023")
])

for day in day_folders:
    day_path = os.path.join(parent_dir, day)
    daily_dfs = []

    for root, dirs, files in os.walk(day_path):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                try:
                    df = pd.read_csv(file_path)
                    if "missing_value_reason" in df.columns:
                        df = df[df["missing_value_reason"] != "device_not_recording"]
                    daily_dfs.append(df)
                except Exception as e:
                    print(f"❌ Failed to read {file_path}: {e}")

    if not daily_dfs:
        print(f"⚠️ No valid CSV files found in {day}")
        continue

    # Merge and clean
    merged_df = pd.concat(daily_dfs, axis=1)
    merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
    merged_df = merged_df.dropna(axis=1, how='all')

    # Drop optional columns if they exist
    merged_df = merged_df.drop(columns=[
        col for col in ['timestamp_unix', 'participant_full_id'] if col in merged_df.columns
    ])

    # Convert timestamp and extract hour/minute
    if 'timestamp_iso' in merged_df.columns:
        try:
            merged_df['timestamp_iso'] = pd.to_datetime(merged_df['timestamp_iso'], errors='coerce')
            merged_df['hour'] = merged_df['timestamp_iso'].dt.hour
            merged_df['minute'] = merged_df['timestamp_iso'].dt.minute
        except Exception as e:
            print(f"⚠️ Failed to parse timestamp for {day}: {e}")

    # Save cleaned file
    output_path = os.path.join(output_dir, f"cleaned_{day}.csv")
    merged_df.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")



✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9\cleaned\cleaned_2023-12-16.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9\cleaned\cleaned_2023-12-17.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9\cleaned\cleaned_2023-12-18.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9\cleaned\cleaned_2023-12-19.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9\cleaned\cleaned_2023-12-20.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9\cleaned\cleaned_2023-12-21.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9\cleaned\cleaned_2023-12-22.csv


In [62]:
import os
import pandas as pd

input_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 9/cleaned"
output_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/cleaned_biomarkers/cleaned_09"

os.makedirs(output_dir, exist_ok=True)

for fname in os.listdir(input_dir):
    if not fname.endswith(".csv"):
        continue

    df = pd.read_csv(os.path.join(input_dir, fname))

    # Clean columns
    df.columns = df.columns.str.replace(r"^001_", "", regex=True)
    df.columns = df.columns.str.replace(r"\.\d+$", "", regex=True)
    df = df.loc[:, ~df.columns.duplicated()]

    # Save cleaned version
    df.to_csv(os.path.join(output_dir, fname), index=False)


### Check NA values in cleaned files before moving on


In [63]:
import os
import pandas as pd

cleaned_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/cleaned_biomarkers/cleaned_09"
na_summary = []

# Loop through each cleaned daily file
for file in sorted(os.listdir(cleaned_dir)):
    if file.endswith(".csv"):
        file_path = os.path.join(cleaned_dir, file)
        df = pd.read_csv(file_path)
        
        # Calculate % missing for each column
        na_percent = df.isna().mean() * 100
        na_row = na_percent.round(1).to_dict()
        na_row["file"] = file
        na_summary.append(na_row)

# Convert to DataFrame
na_df = pd.DataFrame(na_summary)
na_df.set_index("file", inplace=True)

# Show columns with highest average NA %
avg_na = na_df.mean().sort_values(ascending=False)

# Save for inspection
na_df.to_csv("participant9_na_summary.csv")
print("✅ Saved NA summary to participant9_na_summary.csv")

# Preview worst offenders
print("🔍 Columns with highest average missingness:")
print(avg_na.head(10))


✅ Saved NA summary to participant9_na_summary.csv
🔍 Columns with highest average missingness:
missing_value_reason     96.000000
prv_rmssd_ms             80.557143
respiratory_rate_brpm    79.300000
sleep_detection_stage     3.642857
body_position_right       3.485714
body_position_left        3.485714
vector_magnitude          3.428571
activity_class            3.428571
activity_counts           3.428571
activity_intensity        3.428571
dtype: float64


In [64]:
df = pd.read_csv('participant9_na_summary.csv')
df

Unnamed: 0,file,timestamp_iso,accelerometers_std_g,missing_value_reason,counts_x_axis,counts_y_axis,counts_z_axis,vector_magnitude,activity_class,activity_counts,...,met,prv_rmssd_ms,pulse_rate_bpm,respiratory_rate_brpm,sleep_detection_stage,step_counts,temperature_celsius,wearing_detection_percentage,hour,minute
0,cleaned_2023-12-16.csv,0.0,0.2,99.8,0.2,0.2,0.2,0.2,0.2,0.2,...,0.2,96.8,0.2,87.9,0.2,0.2,0.2,0.0,0.0,0.0
1,cleaned_2023-12-17.csv,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,69.7,0.0,76.2,0.0,0.0,0.0,0.0,0.0,0.0
2,cleaned_2023-12-18.csv,0.0,0.1,99.9,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,88.3,0.1,82.0,0.1,0.1,0.1,0.0,0.0,0.0
3,cleaned_2023-12-19.csv,0.0,2.5,97.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,79.9,2.5,79.0,2.5,2.5,2.5,0.0,0.0,0.0
4,cleaned_2023-12-20.csv,0.0,7.4,92.6,7.4,7.4,7.4,7.4,7.4,7.4,...,7.4,65.9,7.4,71.8,7.4,7.4,7.4,0.0,0.0,0.0
5,cleaned_2023-12-21.csv,0.0,2.4,97.6,2.4,2.4,2.4,2.4,2.4,2.4,...,2.4,79.7,2.4,75.8,2.4,2.4,2.4,0.0,0.0,0.0
6,cleaned_2023-12-22.csv,0.0,11.4,88.6,11.4,11.4,11.4,11.4,11.4,11.4,...,11.4,83.6,11.4,82.4,12.9,11.4,11.4,0.0,0.0,0.0


# Step 2: combine all participants and their modalities

In [65]:
import os
import pandas as pd

# Directory containing all cleaned participant folders
base_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/cleaned_biomarkers"

# Collect data from all participants
all_participants = []

for folder_name in sorted(os.listdir(base_dir)):
    if not folder_name.startswith("cleaned_"):
        continue  # Skip non-participant folders

    participant_id = folder_name.replace("cleaned_", "")  # e.g., '01'
    folder_path = os.path.join(base_dir, folder_name)

    daily_data = []
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".csv"):
            file_path = os.path.join(folder_path, file)
            
            # Extract date string from filename
            date_str = file.replace("cleaned_", "").replace(".csv", "")  # e.g., "2023-12-24"
            
            # Load CSV and tag with participant and date
            df = pd.read_csv(file_path)
            df["participant_id"] = participant_id
            df["date"] = date_str
            
            daily_data.append(df)

    # Merge all days for the participant
    if daily_data:
        participant_df = pd.concat(daily_data, ignore_index=True)
        all_participants.append(participant_df)

# Merge all participants into one dataset
master_df = pd.concat(all_participants, ignore_index=True)

# Save the result
master_df.to_csv("all_participants_cleaned_features.csv", index=False)
print("✅ Final master file saved: all_participants_cleaned_features.csv")



✅ Final master file saved: all_participants_cleaned_features.csv


In [66]:
master_df

Unnamed: 0,timestamp_iso,accelerometers_std_g,counts_x_axis,counts_y_axis,counts_z_axis,vector_magnitude,activity_class,activity_counts,activity_intensity,body_position_left,...,step_counts,temperature_celsius,wearing_detection_percentage,hour,minute,participant_id,date,missing_value_reason,prv_rmssd_ms,respiratory_rate_brpm
0,2023-12-24 23:58:00+00:00,0.176,776.0,1938.0,1869.0,2801.0,generic,57.0,MPA,standing,...,25.0,30.08,100.0,23,58,01,2023-12-24,,,
1,2023-12-25 00:00:00+00:00,0.122,441.0,1238.0,684.0,1481.0,generic,53.0,sedentary,sitting_reclining_lying,...,8.0,32.21,100.0,0,0,01,2023-12-25,,,
2,2023-12-25 00:01:00+00:00,0.025,483.0,1385.0,1019.0,1786.0,generic,61.0,sedentary,sitting_reclining_lying,...,0.0,32.56,100.0,0,1,01,2023-12-25,,,
3,2023-12-25 00:02:00+00:00,0.036,1098.0,1086.0,824.0,1750.0,generic,69.0,LPA,sitting_reclining_lying,...,6.0,32.82,100.0,0,2,01,2023-12-25,,,
4,2023-12-25 00:03:00+00:00,0.034,1494.0,1767.0,833.0,2459.0,generic,76.0,LPA,sitting_reclining_lying,...,5.0,32.95,100.0,0,3,01,2023-12-25,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101705,2023-12-22 17:32:00+00:00,0.107,2175.0,2027.0,2321.0,3771.0,walking,148.0,MPA,standing,...,44.0,30.37,100.0,17,32,09,2023-12-22,,,
101706,2023-12-22 17:33:00+00:00,0.058,1605.0,1090.0,1537.0,2475.0,generic,132.0,LPA,sitting_reclining_lying,...,15.0,30.07,100.0,17,33,09,2023-12-22,,,
101707,2023-12-22 17:34:00+00:00,0.104,1892.0,1306.0,1686.0,2850.0,walking,139.0,LPA,miscellaneous,...,59.0,29.83,100.0,17,34,09,2023-12-22,,,
101708,2023-12-22 17:35:00+00:00,0.072,886.0,1604.0,1837.0,2594.0,generic,143.0,sedentary,standing,...,52.0,29.42,100.0,17,35,09,2023-12-22,,,


In [76]:
master_df.isnull().sum()

timestamp_iso                      0
accelerometers_std_g               0
counts_x_axis                      0
counts_y_axis                      0
counts_z_axis                      0
vector_magnitude                   0
activity_class                  3885
activity_counts                    0
activity_intensity              3885
body_position_left              3974
body_position_right             3974
eda_scl_usiemens                   0
met                                0
pulse_rate_bpm                     0
sleep_detection_stage              0
step_counts                        0
temperature_celsius                0
wearing_detection_percentage       0
hour                               0
minute                             0
participant_id                     0
dtype: int64

In [75]:
# Set threshold (25% of total number of rows)
threshold_col = 0.25 * len(master_df)

# Identify columns with >25% missing values
cols_to_drop = master_df.columns[master_df.isnull().sum() > threshold_col]

# Drop those columns
master_df = master_df.drop(columns=cols_to_drop)

# Print confirmation
print(f"Dropped {len(cols_to_drop)} columns with more than 25% missing values:")
print(cols_to_drop.tolist())


Dropped 0 columns with more than 25% missing values:
[]


In [77]:
import numpy as np

# 1. Drop the 'date' column (optional if already removed)
master_df = master_df.drop(columns=["date"])

# 2. Identify numeric columns (excluding ID/timestamp)
numeric_cols = master_df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in ["participant_id", "hour", "minute"]]

# 3. Impute missing values with median per participant
master_df[numeric_cols] = (
    master_df.groupby("participant_id")[numeric_cols]
    .transform(lambda x: x.fillna(x.median()))
)


KeyError: "['date'] not found in axis"

In [78]:
master_df.dtypes.loc[["activity_class", "activity_counts", "activity_intensity", "body_position_left"]]


activity_class         object
activity_counts       float64
activity_intensity     object
body_position_left     object
dtype: object

In [79]:
cols_to_check = ["activity_class", "activity_intensity", "body_position_left"]

for col in cols_to_check:
    print(f"\nColumn: {col}")
    print(master_df[col].dropna().value_counts())



Column: activity_class
activity_class
still      49105
generic    38634
walking     9926
running      160
Name: count, dtype: int64

Column: activity_intensity
activity_intensity
sedentary    57555
LPA          22122
MPA          14501
VPA           3647
Name: count, dtype: int64

Column: body_position_left
body_position_left
sitting_reclining_lying    61853
standing                   32746
miscellaneous               3137
Name: count, dtype: int64


In [80]:
activity_class_map = {'still': 0, 'generic': 1, 'walking': 2, 'running': 3}
master_df['activity_class'] = master_df['activity_class'].map(activity_class_map)

In [81]:
activity_intensity_map = {'sedentary': 0, 'LPA': 1, 'MPA': 2, 'VPA': 3}
master_df['activity_intensity'] = master_df['activity_intensity'].map(activity_intensity_map)

In [82]:
master_df['body_position_left'], _ = pd.factorize(master_df['body_position_left'])

In [87]:
master_df['body_position_right'], _ = pd.factorize(master_df['body_position_right'])

In [83]:
master_df.isnull().sum()

timestamp_iso                      0
accelerometers_std_g               0
counts_x_axis                      0
counts_y_axis                      0
counts_z_axis                      0
vector_magnitude                   0
activity_class                  3885
activity_counts                    0
activity_intensity              3885
body_position_left                 0
body_position_right             3974
eda_scl_usiemens                   0
met                                0
pulse_rate_bpm                     0
sleep_detection_stage              0
step_counts                        0
temperature_celsius                0
wearing_detection_percentage       0
hour                               0
minute                             0
participant_id                     0
dtype: int64

In [88]:
# Columns to impute
cat_cols = ['activity_class', 'activity_intensity', 'body_position_left', 'body_position_right']

# Fill NaNs using median per participant
master_df[cat_cols] = (
    master_df.groupby("participant_id")[cat_cols]
    .transform(lambda x: x.fillna(x.median()))
)


In [89]:
master_df.isnull().sum()

timestamp_iso                   0
accelerometers_std_g            0
counts_x_axis                   0
counts_y_axis                   0
counts_z_axis                   0
vector_magnitude                0
activity_class                  0
activity_counts                 0
activity_intensity              0
body_position_left              0
body_position_right             0
eda_scl_usiemens                0
met                             0
pulse_rate_bpm                  0
sleep_detection_stage           0
step_counts                     0
temperature_celsius             0
wearing_detection_percentage    0
hour                            0
minute                          0
participant_id                  0
dtype: int64

In [90]:
output_path = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/cleaned_master_df.csv"
master_df.to_csv(output_path, index=False)
print(f"✅ Saved cleaned data to: {output_path}")


✅ Saved cleaned data to: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/cleaned_master_df.csv
