In [14]:
import os
import pandas as pd

# Path to the participant folder (e.g., "Participant 1")
parent_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1"
output_dir = os.path.join(parent_dir, "cleaned")

# Create the 'cleaned' subfolder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Identify date folders like '2023-12-24'
day_folders = sorted([
    f for f in os.listdir(parent_dir)
    if os.path.isdir(os.path.join(parent_dir, f)) and f.startswith("2023")
])

for day in day_folders:
    day_path = os.path.join(parent_dir, day)
    daily_dfs = []

    for root, dirs, files in os.walk(day_path):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                try:
                    df = pd.read_csv(file_path)
                    daily_dfs.append(df)
                except Exception as e:
                    print(f"❌ Failed to read {file_path}: {e}")

    if not daily_dfs:
        print(f"⚠️ No CSV files found in {day}")
        continue

    # Merge and clean
    merged_df = pd.concat(daily_dfs, axis=1)
    merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
    merged_df = merged_df.dropna(axis=1, how='all')

    # Drop optional columns if they exist
    merged_df = merged_df.drop(columns=[
        col for col in ['timestamp_unix', 'participant_full_id'] if col in merged_df.columns
    ])

    # Convert timestamp and extract hour/minute
    if 'timestamp_iso' in merged_df.columns:
        try:
            merged_df['timestamp_iso'] = pd.to_datetime(merged_df['timestamp_iso'], errors='coerce')
            merged_df['hour'] = merged_df['timestamp_iso'].dt.hour
            merged_df['minute'] = merged_df['timestamp_iso'].dt.minute
        except Exception as e:
            print(f"⚠️ Failed to parse timestamp for {day}: {e}")

    # Save cleaned file
    output_path = os.path.join(output_dir, f"cleaned_{day}.csv")
    merged_df.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")


✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1\cleaned\cleaned_2023-12-24.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1\cleaned\cleaned_2023-12-25.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1\cleaned\cleaned_2023-12-26.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1\cleaned\cleaned_2023-12-27.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1\cleaned\cleaned_2023-12-28.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1\cleaned\cleaned_2023-12-29.csv
✅ Saved: C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1\cleaned\cleaned_2023-12-30.csv


In [17]:
import os
import pandas as pd

input_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/raw_biomarkers/Participant 1/cleaned"
output_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/cleaned_biomarkers/cleaned_01"

os.makedirs(output_dir, exist_ok=True)

for fname in os.listdir(input_dir):
    if not fname.endswith(".csv"):
        continue

    df = pd.read_csv(os.path.join(input_dir, fname))

    # Clean columns
    df.columns = df.columns.str.replace(r"^001_", "", regex=True)
    df.columns = df.columns.str.replace(r"\.\d+$", "", regex=True)
    df = df.loc[:, ~df.columns.duplicated()]

    # Save cleaned version
    df.to_csv(os.path.join(output_dir, fname), index=False)


In [19]:
import pandas as pd

# Load a cleaned file from cleaned_01
df = pd.read_csv("C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/cleaned_biomarkers/cleaned_01/cleaned_2023-12-26.csv")
df.head()

Unnamed: 0,timestamp_iso,accelerometers_std_g,missing_value_reason,counts_x_axis,counts_y_axis,counts_z_axis,vector_magnitude,activity_class,activity_counts,activity_intensity,...,met,prv_rmssd_ms,pulse_rate_bpm,respiratory_rate_brpm,sleep_detection_stage,step_counts,temperature_celsius,wearing_detection_percentage,hour,minute
0,2023-12-26 00:00:00+00:00,0.176,,2854.0,2931.0,2178.0,4634.0,generic,112.0,MPA,...,4.88,,100.0,,0.0,79.0,32.08,100.0,0,0
1,2023-12-26 00:01:00+00:00,0.159,,2251.0,3572.0,1473.0,4471.0,generic,122.0,MPA,...,3.31,,97.0,,0.0,86.0,31.91,100.0,0,1
2,2023-12-26 00:02:00+00:00,0.211,,3155.0,4605.0,1652.0,5821.0,generic,126.0,MPA,...,3.78,,106.0,,0.0,119.0,31.79,100.0,0,2
3,2023-12-26 00:03:00+00:00,0.088,,1809.0,1151.0,1124.0,2420.0,generic,121.0,LPA,...,2.7,,99.0,,0.0,39.0,31.71,100.0,0,3
4,2023-12-26 00:04:00+00:00,0.129,,2395.0,1933.0,2382.0,3891.0,generic,136.0,LPA,...,2.42,,95.0,,0.0,84.0,31.67,100.0,0,4


### Check NA values in cleaned files before moving on


In [20]:
import os
import pandas as pd

cleaned_dir = "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis/data/cleaned_biomarkers/cleaned_01"
na_summary = []

# Loop through each cleaned daily file
for file in sorted(os.listdir(cleaned_dir)):
    if file.endswith(".csv"):
        file_path = os.path.join(cleaned_dir, file)
        df = pd.read_csv(file_path)
        
        # Calculate % missing for each column
        na_percent = df.isna().mean() * 100
        na_row = na_percent.round(1).to_dict()
        na_row["file"] = file
        na_summary.append(na_row)

# Convert to DataFrame
na_df = pd.DataFrame(na_summary)
na_df.set_index("file", inplace=True)

# Show columns with highest average NA %
avg_na = na_df.mean().sort_values(ascending=False)

# Save for inspection
na_df.to_csv("participant1_na_summary.csv")
print("✅ Saved NA summary to participant1_na_summary.csv")

# Preview worst offenders
print("🔍 Columns with highest average missingness:")
print(avg_na.head(10))


✅ Saved NA summary to participant1_na_summary.csv
🔍 Columns with highest average missingness:
prv_rmssd_ms             90.083333
respiratory_rate_brpm    86.483333
missing_value_reason     58.683333
body_position_left       35.428571
body_position_right      35.428571
counts_z_axis            35.414286
met                      35.414286
counts_x_axis            35.414286
temperature_celsius      35.414286
step_counts              35.414286
dtype: float64


In [21]:
df = pd.read_csv('participant1_na_summary.csv')
df

Unnamed: 0,file,timestamp_iso,accelerometers_std_g,missing_value_reason,counts_x_axis,counts_y_axis,counts_z_axis,vector_magnitude,activity_class,activity_counts,...,met,pulse_rate_bpm,sleep_detection_stage,step_counts,temperature_celsius,wearing_detection_percentage,hour,minute,prv_rmssd_ms,respiratory_rate_brpm
0,cleaned_2023-12-24.csv,0.0,99.9,0.1,99.9,99.9,99.9,99.9,99.9,99.9,...,99.9,99.9,99.9,99.9,99.9,99.9,0.0,0.0,,
1,cleaned_2023-12-25.csv,0.0,5.6,94.4,5.6,5.6,5.6,5.6,5.6,5.6,...,5.6,5.6,5.6,5.6,5.6,1.3,0.0,0.0,89.5,84.4
2,cleaned_2023-12-26.csv,0.0,27.2,72.8,27.2,27.2,27.2,27.2,27.2,27.2,...,27.2,27.2,27.2,27.2,27.2,23.5,0.0,0.0,97.8,96.5
3,cleaned_2023-12-27.csv,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.6,68.4
4,cleaned_2023-12-28.csv,0.0,6.5,93.5,6.5,6.5,6.5,6.5,6.5,6.5,...,6.5,6.5,6.5,6.5,6.5,0.0,0.0,0.0,92.0,90.7
5,cleaned_2023-12-29.csv,0.0,56.5,43.5,56.5,56.5,56.5,56.5,56.5,56.5,...,56.5,56.5,56.5,56.5,56.5,26.3,0.0,0.0,95.6,92.4
6,cleaned_2023-12-30.csv,0.0,52.2,47.8,52.2,52.2,52.2,52.2,52.2,52.2,...,52.2,52.2,52.2,52.2,52.2,49.9,0.0,0.0,91.0,86.5


In [22]:
# Drop the 'file' column (row index) and keep just the % NA values
na_pct = df.drop(columns='file').mean().sort_values()

# Filter to keep only features with < 30% missing on average
low_missing = na_pct[na_pct < 30.0]

print("✅ Features with < 30% missingness across days:")
print(low_missing)

✅ Features with < 30% missingness across days:
timestamp_iso                    0.0
minute                           0.0
hour                             0.0
wearing_detection_percentage    28.7
dtype: float64
