In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Cap DataFrame reprs at 10 rows so displayed frames stay short.
pd.options.display.max_rows = 10

# load_data.py

In [None]:
# Load the export only when `df_raw` does not exist yet in this kernel.
# Later cells rebind `df_raw` (datetime parsing, Watch filter), so re-running
# this cell in a live kernel does NOT restore the unmodified file contents.
try:
    df_raw
except NameError:
    df_raw = pd.read_csv("../data/raw/apple_health_export_2025-11-22.csv")

In [None]:
# Preview the first five raw records.
df_raw.head(n=5)

# preprocessing_raw_data.py
- fix dtypes
- handle missing values
- remove invalid rows
- timestamp → date
- rename columns

In [None]:
# List the distinct record types present in the `type` column.
pd.unique(df_raw["type"])

## fix_dtypes_datetime()

In [None]:
# fix_dtypes_datetime()
# Parse both interval boundaries into pandas datetimes; all other columns
# are carried over unchanged.
df_raw = df_raw.assign(
    startDate=pd.to_datetime(df_raw["startDate"]),
    endDate=pd.to_datetime(df_raw["endDate"]),
)

## select_datasource_watch()

In [None]:
# List the distinct data sources to see which names to filter on.
pd.unique(df_raw["sourceName"])

In [None]:
# select_datasource_watch()
# Keep only rows whose source name contains "Watch"; rows without a source
# name are treated as non-matching (na=False). `df_raw` is rebound, so every
# later cell only sees the Watch records.
df_raw = df_raw.loc[df_raw["sourceName"].str.contains("Watch", na=False)]
df_raw.head(n=5)

# preprocessing_sleep_data.py

## process_sleep_data()

In [None]:
# First look at the sleep-analysis records, with all columns still present.
df_sleep = df_raw.loc[df_raw["type"].eq("SleepAnalysis")]
df_sleep.head(n=5)

### select_sleep_data()

In [None]:
# select_sleep_data()
# Keep only the sleep-analysis rows and the three columns used downstream.
# The explicit .copy() makes df_sleep an independent frame, so adding columns
# to it later does not trigger pandas' SettingWithCopyWarning.
df_sleep = df_raw.loc[
    df_raw["type"] == "SleepAnalysis",
    ["value", "startDate", "endDate"],
].copy()

In [None]:
# Preview the selected sleep columns.
df_sleep.head(n=5)

### get_sleep_day()
If a sleep segment starts in the evening (18:00 or later), it is counted towards the next calendar day.

In [None]:
def get_sleep_day(start):
    """Return the calendar date a sleep segment is attributed to.

    Segments that start at 18:00 or later are counted for the following
    day; all other segments keep the date of their start timestamp.

    Parameters
    ----------
    start : timestamp-like object exposing ``.hour`` and ``.date()``
        Start of the sleep segment.

    Returns
    -------
    The attributed date, as returned by ``.date()``.
    """
    # Shift by one day for evening starts, by zero days otherwise.
    shift_days = 1 if start.hour >= 18 else 0
    return (start + pd.Timedelta(days=shift_days)).date()

In [None]:
# Attribute every segment to a sleep day based on its start timestamp.
df_sleep["sleep_day"] = df_sleep["startDate"].map(get_sleep_day)

In [None]:
# Check the new sleep_day column.
df_sleep.head(n=5)

###  select_time_20to12_as_sleep()

In [None]:
# Segments that start outside the 20:00-12:00 night window, i.e. with a start
# hour from 12 to 19 (both inclusive). Hour 12 is included here because a
# start at 12:xx already lies after the end of the window.
sleep_not_night = df_sleep[df_sleep["startDate"].dt.hour.between(12, 19)]
sleep_not_night

In [None]:
# select_time_20to12_as_sleep()
# Keep segments that start inside the night window 20:00-12:00. The lower
# bound is inclusive (>= 20) so segments starting between 20:00 and 20:59 are
# kept. .copy() detaches the result so later column assignments do not raise
# SettingWithCopyWarning.
df_sleep = df_sleep[
    (df_sleep["startDate"].dt.hour >= 20) | (df_sleep["startDate"].dt.hour < 12)
].copy()

### compute_sleep_duration_each_stage()

In [None]:
# compute_sleep_duration_in_minutes()
# `duration` is the segment length as a timedelta; `duration_minute` is the
# same length expressed in minutes.
df_sleep = df_sleep.assign(
    duration=df_sleep["endDate"] - df_sleep["startDate"],
    duration_minute=lambda frame: frame["duration"].dt.total_seconds() / 60,
)

In [None]:
# Check the new duration columns.
df_sleep.head(n=5)

### aggregate_daily_sleep_by_stage(df_sleep)
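Step 1: sum the segment durations (in minutes) per sleep day and sleep stage.
Step 2: pivot the result to one row per sleep day with one column per stage; stages that were not recorded on a day are filled with 0.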

In [None]:
# aggregated_daily_sleep_by_stage(), step 1:
# total minutes per (sleep day, stage), returned as a flat frame.
sleep_grouped = df_sleep.groupby(["sleep_day", "value"], as_index=False)[
    "duration_minute"
].sum()
sleep_grouped.head(n=5)

In [None]:
# aggregated_daily_sleep_by_stage(), step 2:
# one row per sleep day, one column per stage; stages missing on a day become 0.
sleep_agg = (
    sleep_grouped
    .pivot(index="sleep_day", columns="value", values="duration_minute")
    .fillna(0)
    .rename_axis(columns=None)  # drop the leftover "value" label on the column axis
    .reset_index()
)

sleep_agg

### rename_sleep_data_column(sleep_agg)

In [None]:
# Remove the "HKCategoryValueSleepAnalysis" text from every column label.
sleep_agg = sleep_agg.set_axis(
    sleep_agg.columns.str.replace("HKCategoryValueSleepAnalysis", "", regex=False),
    axis="columns",
)

In [None]:
# Show the column labels after the prefix has been stripped.
sleep_agg.columns

In [None]:
# Give the stage columns their final names; labels not listed here
# (e.g. sleep_day) are left untouched by rename.
sleep_agg = sleep_agg.rename(
    {
        'AsleepCore': 'sleepCore',
        'AsleepDeep': 'sleepDeep',
        'AsleepREM': 'sleepREM',
        'AsleepUnspecified': 'sleepUnspecified',
        'Awake': 'awake',
        'InBed': 'inBed',
    },
    axis="columns",
)

### completeness_check()

In [None]:
 # Days with incomplete stage data: at least one of core / deep / REM has a
 # total of 0 minutes. This is exactly the set of days removed by a filter
 # that requires all three stages to be non-zero.
 df_incomplete = sleep_agg[
     (sleep_agg["sleepCore"] == 0)
     | (sleep_agg["sleepDeep"] == 0)
     | (sleep_agg["sleepREM"] == 0)
 ]
 df_incomplete

In [None]:
# Keep only days on which core, deep and REM sleep all have a non-zero total;
# a day is dropped as soon as any one of the three stages is 0.
# .copy() makes df_sleep0 an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df_sleep0 = sleep_agg[
    (sleep_agg["sleepCore"] != 0) &
    (sleep_agg["sleepDeep"] != 0) &
    (sleep_agg["sleepREM"] != 0)
].copy()

### Check for NA / null values

In [None]:
# Count missing values per column. isnull() is an alias of isna(), and only
# the last expression of a cell is displayed, so a single call is enough.
df_sleep0.isna().sum()

### Duplicates

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted).
df_sleep0.duplicated(keep="first").sum()

### clean data with <4h sleep sum

In [None]:
# Total recorded minutes per day across all stages. Note that the sum
# includes the awake minutes as well as the sleep stages.
df_sleep0["sleep_sum"] = (
    df_sleep0["sleepCore"]
    .add(df_sleep0["sleepDeep"])
    .add(df_sleep0["sleepREM"])
    .add(df_sleep0['sleepUnspecified'])
    .add(df_sleep0["awake"])
)

In [None]:
# Days whose sleep_sum is below 240 minutes (4 hours) - shown for inspection.
sleep_less_than_4h = df_sleep0.loc[df_sleep0["sleep_sum"].lt(240)]
sleep_less_than_4h

In [None]:
# Keep the days with at least 240 minutes (4 hours) of sleep_sum.
df_sleep1 = df_sleep0.loc[df_sleep0["sleep_sum"].ge(240)]

### remove inBed

In [None]:
# The inBed column is not used by any of the following calculations.
df_sleep1 = df_sleep1.drop("inBed", axis="columns")

### calculate_sleep_index

In [None]:
# Share of sleep_sum that was not spent awake: (sleep_sum - awake) / sleep_sum.
df_sleep1["sleep_efficiency"] = (
    df_sleep1["sleep_sum"].sub(df_sleep1["awake"]).div(df_sleep1["sleep_sum"])
)

In [None]:
# Share of deep + REM minutes within the three staged sleep types
# (core, deep, REM); unspecified sleep and awake time are not part of it.
df_sleep1["deep_rem_ratio"] = (
    df_sleep1["sleepDeep"].add(df_sleep1["sleepREM"])
).div(
    df_sleep1["sleepCore"].add(df_sleep1["sleepDeep"]).add(df_sleep1["sleepREM"])
)

In [None]:
# deep_rem_score: deep_rem_ratio rescaled linearly so that `low` maps to 0
# and `high` maps to 1, then clipped to [0, 1].
# Rationale for the bounds (author's assumption):
#   20-50% reflects realistic wearable sleep distributions
#   below 20% -> insufficient restorative sleep
#   above 50% -> often rebound, short sleep, or artifact

low, high = 0.20, 0.50

df_sleep1["deep_rem_score"] = (
    df_sleep1["deep_rem_ratio"].sub(low).div(high - low).clip(lower=0, upper=1)
)

In [None]:
# sleep_quality = 100 * (0.6 * sleep_efficiency + 0.4 * deep_rem_score)
# The second term uses the clipped deep_rem_score, not the raw deep_rem_ratio.
df_sleep1["sleep_quality"] = (
    df_sleep1["sleep_efficiency"].mul(0.6)
    .add(df_sleep1["deep_rem_score"].mul(0.4))
    .mul(100)
)

In [None]:
# Display the daily sleep table with the derived index columns.
df_sleep1

### plot and check

In [None]:
# Histogram of sleep_sum converted from minutes to hours.
fig, ax = plt.subplots()
ax.hist(df_sleep1["sleep_sum"].div(60), bins=30)
ax.set_title("Verteilung der Schlafdauer (Stunden)", fontsize=14)
ax.set_xlabel("Dauer (Stunden)", fontsize=12)
ax.set_ylabel("Häufigkeit", fontsize=12)
plt.show()

In [None]:
# Box plot of sleep_sum (minutes) to spot outlying days.
fig, ax = plt.subplots()
ax.boxplot(df_sleep1["sleep_sum"])
ax.set_title("Sleep Duration sum (minutes)")
ax.set_ylabel("Minutes")
ax.set_ylim(bottom=0)
plt.show()

In [None]:
# Histogram of the composite sleep_quality score.
# The x axis carries the score; the y axis of a histogram is the number of
# days per bin, so it is labelled as a frequency rather than a percentage.
fig, ax = plt.subplots()
ax.hist(df_sleep1["sleep_quality"], bins=30)
ax.set_title("Sleep quality (percentage)")
ax.set_xlabel("Sleep quality (0-100)")
ax.set_ylabel("Frequency")
ax.set_ylim(bottom=0)
plt.show()

In [None]:
# Histogram of deep_rem_score.
# The x axis carries the score; the y axis of a histogram is the number of
# days per bin, so it is labelled as a frequency rather than a percentage.
fig, ax = plt.subplots()
ax.hist(df_sleep1["deep_rem_score"], bins=30)
ax.set_title("deep_rem_score [0,1]")
ax.set_xlabel("deep_rem_score")
ax.set_ylabel("Frequency")
ax.set_ylim(bottom=0)
plt.show()

# preprocessing_HRV_data.py

### select_HRV_data()

In [None]:
# select_hrv_data()
# Keep only the HRV (SDNN) rows and the three columns used downstream.
# .copy() makes df_hrv an independent frame, so adding or replacing columns
# later does not trigger pandas' SettingWithCopyWarning.
df_hrv = df_raw.loc[
    df_raw["type"] == "HeartRateVariabilitySDNN",
    ["value", "startDate", "endDate"],
].copy()
df_hrv.head()

### get_hrv_day()

In [None]:
def get_hrv_day(end):
    """Return the calendar date of an HRV sample's end timestamp.

    Parameters
    ----------
    end : timestamp-like object exposing ``.date()``

    Returns
    -------
    The result of ``end.date()``.
    """
    hrv_date = end.date()
    return hrv_date

In [None]:
# Attribute every HRV sample to the date of its end timestamp.
df_hrv["hrv_day"] = df_hrv["endDate"].map(get_hrv_day)

In [None]:
# Check the new hrv_day column.
df_hrv.head(n=5)

### fix_dtypes_hrv_value()

In [None]:
# Convert the HRV readings to numbers; entries that cannot be parsed become NaN.
df_hrv = df_hrv.assign(value=pd.to_numeric(df_hrv["value"], errors="coerce"))

In [None]:
# Check the converted value column.
df_hrv.head(n=5)

### aggregate_daily_mean_hrv()

In [None]:
# Daily mean of all HRV readings, one row per hrv_day.
hrv_agg = df_hrv.groupby("hrv_day", as_index=False)["value"].mean()
hrv_agg.head(n=5)

In [None]:
# From here on `df_hrv` refers to the daily aggregate (hrv_day, mean_hrv),
# no longer to the individual HRV samples.
df_hrv = hrv_agg.rename({"value": "mean_hrv"}, axis="columns")

In [None]:
# Display the daily mean HRV table.
df_hrv

### plot and check

In [None]:
# Histogram of the daily mean HRV.
fig, ax = plt.subplots()
ax.hist(df_hrv["mean_hrv"], bins=30)
ax.set_title("Verteilung der Mean HRV", fontsize=14)
ax.set_xlabel("Mean HRV (ms)", fontsize=12)
ax.set_ylabel("Häufigkeit", fontsize=12)
plt.show()

In [None]:
# Box plot of the daily mean HRV to spot outlying days.
fig, ax = plt.subplots()
ax.boxplot(df_hrv["mean_hrv"])
ax.set_title("mean HRV")
ax.set_ylabel("HRV")
plt.show()

# StepCount

In [None]:
# All step-count records, with every column still present.
df_stepCount_raw = df_raw.loc[df_raw["type"].eq("StepCount")]
df_stepCount_raw.head(n=5)

In [None]:
# Keep only the columns used downstream. .copy() makes the result an
# independent frame, so adding or replacing columns later does not trigger
# pandas' SettingWithCopyWarning.
df_stepCount_raw = df_stepCount_raw[["value", "startDate", "endDate"]].copy()

In [None]:
def get_stepCount_day(end):
    """Return the calendar date of a step-count record's end timestamp.

    Parameters
    ----------
    end : timestamp-like object exposing ``.date()``

    Returns
    -------
    The result of ``end.date()``.
    """
    step_date = end.date()
    return step_date

In [None]:
# Attribute every step-count record to the date of its end timestamp.
df_stepCount_raw["stepCount_day"] = df_stepCount_raw["endDate"].map(get_stepCount_day)

In [None]:
# Convert the step values to numbers; entries that cannot be parsed become NaN.
df_stepCount_raw = df_stepCount_raw.assign(
    value=pd.to_numeric(df_stepCount_raw["value"], errors="coerce")
)

In [None]:
# Display the prepared step-count records.
df_stepCount_raw

In [None]:
# Total steps per day, one row per stepCount_day.
step_agg = df_stepCount_raw.groupby("stepCount_day", as_index=False)["value"].sum()

In [None]:
# Name the aggregated column after what it holds.
df_stepCount = step_agg.rename({"value": "stepCount"}, axis="columns")

In [None]:
# Display the daily step-count table.
df_stepCount

In [None]:
# Histogram of the daily step totals, with a title and axis labels so the
# figure can be read on its own; plt.show() keeps the raw return value of
# hist() out of the cell output.
fig, ax = plt.subplots()
ax.hist(df_stepCount["stepCount"], bins=30)
ax.set_title("Daily step count")
ax.set_xlabel("Steps per day")
ax.set_ylabel("Frequency")
plt.show()

# Exercise

In [None]:
# List the available record types again to find the exercise-related one.
pd.unique(df_raw["type"])

In [None]:
# All exercise-time records, with every column still present.
df_sport_raw = df_raw.loc[df_raw["type"].eq("AppleExerciseTime")]
df_sport_raw

In [None]:
# Keep only the columns used downstream. .copy() makes the result an
# independent frame, so the column assignment below (and later ones) does not
# trigger pandas' SettingWithCopyWarning.
df_sport_raw = df_sport_raw[["value", "startDate", "endDate"]].copy()
# Convert the exercise values to numbers; unparseable entries become NaN.
df_sport_raw["value"] = pd.to_numeric(df_sport_raw["value"], errors="coerce")

In [None]:
def get_sport_day(end):
    """Return the calendar date of an exercise record's end timestamp.

    Parameters
    ----------
    end : timestamp-like object exposing ``.date()``

    Returns
    -------
    The result of ``end.date()``.
    """
    sport_date = end.date()
    return sport_date

In [None]:
# Attribute every exercise record to the date of its end timestamp.
df_sport_raw["sport_day"] = df_sport_raw["endDate"].map(get_sport_day)

In [None]:
# Sum of the exercise values per day, one row per sport_day.
sport_agg = df_sport_raw.groupby("sport_day", as_index=False)["value"].sum()
sport_agg

In [None]:
# Name the aggregated column after what it holds.
df_sport = sport_agg.rename({"value": "sportTime"}, axis="columns")
df_sport

# Periode

In [None]:
# List the available record types; bracket access is used for the column so
# it cannot be confused with a DataFrame attribute.
df_raw["type"].unique()

In [None]:
# Menstrual-flow records, shown for inspection with all columns kept.
# NOTE(review): this frame is not used by the merge cells visible in this
# notebook - confirm whether it should feed into the final table.
df_menstrual = df_raw.loc[df_raw["type"].eq("MenstrualFlow")]
df_menstrual

# Merge final data

In [None]:
# Join the daily sleep table with the daily mean HRV on the day key.
# Inner join: only days present in both tables are kept. The duplicate key
# column is dropped and the remaining key is renamed to `date`.
df_final = (
    df_sleep1
    .merge(df_hrv, how="inner", left_on="sleep_day", right_on="hrv_day")
    .drop(columns=["hrv_day"])
    .rename(columns={"sleep_day": "date"})
)

In [None]:
# Display the table after merging sleep and HRV.
df_final

In [None]:
# Add the daily step totals. Inner join: days without a step-count row are
# removed from df_final. Re-running this cell on an already merged df_final
# would add the step column a second time.
df_final = (
    df_final
    .merge(df_stepCount, how="inner", left_on="date", right_on="stepCount_day")
    .drop(columns=["stepCount_day"])
)

In [None]:
# Display the table after merging the step counts.
df_final

In [None]:
# Add the daily exercise totals. Left join: every day already in df_final is
# kept, and days without an exercise row get NaN in sportTime.
df_final = (
    df_final
    .merge(df_sport, how="left", left_on="date", right_on="sport_day")
    .drop(columns=["sport_day"])
)

In [None]:
# Order the final table chronologically.
df_final = df_final.sort_values("date")

In [None]:
# Write the merged daily table to disk without the index column.
# The target folder is created first, so the export also works when
# ../data/processed does not exist yet (to_csv does not create folders).
output_path = Path("../data/processed/data_final.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(output_path, index=False)