# NeoHealth AI â€“ Data Cleaning & Merging Pipeline

## Objective
This notebook cleans and processes multiple raw physiological and self-reported datasets related to menstrual health and merges them into a single machine-learning-ready dataset.

In [1]:
import pandas as pd
import numpy as np
import os

RAW_DIR = "../data/raw"
PROCESSED_DIR = "../data/processed"

os.makedirs(PROCESSED_DIR, exist_ok=True)

In [2]:
hormones = pd.read_csv(f"{RAW_DIR}/hormones_and_selfreport.csv")
sleep = pd.read_csv(f"{RAW_DIR}/sleep_score.csv")
stress = pd.read_csv(f"{RAW_DIR}/stress_score.csv")
heart = pd.read_csv(f"{RAW_DIR}/resting_heart_rate.csv")
steps = pd.read_csv(f"{RAW_DIR}/steps.csv")

print("Hormones:", hormones.shape)
print("Sleep:", sleep.shape)
print("Stress:", stress.shape)
print("Heart:", heart.shape)
print("Steps:", steps.shape)

Hormones: (5659, 22)
Sleep: (5308, 12)
Stress: (7932, 14)
Heart: (13737, 6)
Steps: (7666949, 6)


In [3]:
print("Hormones columns:\n", hormones.columns)
print("\nSleep columns:\n", sleep.columns)
print("\nStress columns:\n", stress.columns)
print("\nHeart columns:\n", heart.columns)
print("\nSteps columns:\n", steps.columns)

Hormones columns:
 Index(['id', 'study_interval', 'is_weekend', 'day_in_study', 'phase', 'lh',
       'estrogen', 'pdg', 'flow_volume', 'flow_color', 'appetite',
       'exerciselevel', 'headaches', 'cramps', 'sorebreasts', 'fatigue',
       'sleepissue', 'moodswing', 'stress', 'foodcravings', 'indigestion',
       'bloating'],
      dtype='object')

Sleep columns:
 Index(['id', 'study_interval', 'is_weekend', 'day_in_study', 'timestamp',
       'overall_score', 'composition_score', 'revitalization_score',
       'duration_score', 'deep_sleep_in_minutes', 'resting_heart_rate',
       'restlessness'],
      dtype='object')

Stress columns:
 Index(['id', 'study_interval', 'is_weekend', 'day_in_study', 'timestamp',
       'stress_score', 'sleep_points', 'max_sleep_points',
       'responsiveness_points', 'max_responsiveness_points', 'exertion_points',
       'max_exertion_points', 'status', 'calculation_failed'],
      dtype='object')

Heart columns:
 Index(['id', 'study_interval', 'is_we

In [4]:
# Hormones & symptoms (base dataset)
hormones = hormones[[
"id", "day_in_study", "phase",
"lh", "estrogen", "pdg",
"cramps", "fatigue", "moodswing",
"stress", "bloating", "sleepissue"
]]


# Sleep dataset (already daily)
sleep = sleep[[
"id", "day_in_study",
"overall_score", "deep_sleep_in_minutes",
"resting_heart_rate"
]]


# Stress dataset (may have multiple rows per day)
stress = stress[[
"id", "day_in_study", "stress_score"
]]


# Heart rate dataset
heart = heart[[
"id", "day_in_study", "value"
]]


# Steps dataset
steps = steps[[
"id", "day_in_study", "steps"
]]

In [5]:
stress_daily = stress.groupby(["id", "day_in_study"])["stress_score"].mean().reset_index()

In [6]:
heart_daily = heart.groupby(["id", "day_in_study"])["value"].mean().reset_index()
heart_daily = heart_daily.rename(columns={"value": "avg_resting_heart_rate"})

In [7]:
steps_daily = steps.groupby(["id", "day_in_study"])["steps"].sum().reset_index()
steps_daily = steps_daily.rename(columns={"steps": "daily_steps"})

In [8]:
df = hormones.merge(sleep, on=["id", "day_in_study"], how="left")
df = df.merge(stress_daily, on=["id", "day_in_study"], how="left")
df = df.merge(heart_daily, on=["id", "day_in_study"], how="left")
df = df.merge(steps_daily, on=["id", "day_in_study"], how="left")


df.head()

Unnamed: 0,id,day_in_study,phase,lh,estrogen,pdg,cramps,fatigue,moodswing,stress,bloating,sleepissue,overall_score,deep_sleep_in_minutes,resting_heart_rate,stress_score,avg_resting_heart_rate,daily_steps
0,1,1,Follicular,2.9,94.2,,Very Low/Little,High,Very Low/Little,Moderate,Very Low/Little,Low,,,,,74.785346,992.0
1,1,2,Follicular,1.2,226.3,,Very Low/Little,High,Very Low/Little,Moderate,Very Low/Little,Very High,,,,,80.407307,838.0
2,1,3,Follicular,3.5,276.8,,Very Low/Little,Very High,Very Low/Little,Low,Very Low/Little,Very High,,,,,84.686869,2586.0
3,1,4,Fertility,1.8,322.1,,Very Low/Little,High,Very Low/Little,Low,Very Low/Little,Very High,80.0,93.0,84.0,,83.852219,1275.0
4,1,5,Fertility,4.6,244.9,,Very Low/Little,High,Very Low/Little,Low,Very Low/Little,High,,,,,0.0,436.0


In [9]:
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

In [10]:
print(df.shape)
df.info()
df.head()

(5889, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5889 entries, 0 to 5888
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      5889 non-null   int64  
 1   day_in_study            5889 non-null   int64  
 2   phase                   5888 non-null   object 
 3   lh                      5889 non-null   float64
 4   estrogen                5889 non-null   float64
 5   pdg                     5889 non-null   float64
 6   cramps                  3440 non-null   object 
 7   fatigue                 3444 non-null   object 
 8   moodswing               3433 non-null   object 
 9   stress                  3440 non-null   object 
 10  bloating                3441 non-null   object 
 11  sleepissue              3442 non-null   object 
 12  overall_score           5889 non-null   float64
 13  deep_sleep_in_minutes   5889 non-null   float64
 14  resting_heart_rate      5889 

Unnamed: 0,id,day_in_study,phase,lh,estrogen,pdg,cramps,fatigue,moodswing,stress,bloating,sleepissue,overall_score,deep_sleep_in_minutes,resting_heart_rate,stress_score,avg_resting_heart_rate,daily_steps
0,1,1,Follicular,2.9,94.2,6.250051,Very Low/Little,High,Very Low/Little,Moderate,Very Low/Little,Low,76.704408,77.706259,67.187076,67.992872,74.785346,992.0
1,1,2,Follicular,1.2,226.3,6.250051,Very Low/Little,High,Very Low/Little,Moderate,Very Low/Little,Very High,76.704408,77.706259,67.187076,67.992872,80.407307,838.0
2,1,3,Follicular,3.5,276.8,6.250051,Very Low/Little,Very High,Very Low/Little,Low,Very Low/Little,Very High,76.704408,77.706259,67.187076,67.992872,84.686869,2586.0
3,1,4,Fertility,1.8,322.1,6.250051,Very Low/Little,High,Very Low/Little,Low,Very Low/Little,Very High,80.0,93.0,84.0,67.992872,83.852219,1275.0
4,1,5,Fertility,4.6,244.9,6.250051,Very Low/Little,High,Very Low/Little,Low,Very Low/Little,High,76.704408,77.706259,67.187076,67.992872,0.0,436.0


In [11]:
# Convert Likert-scale symptom columns to numeric
likert_mapping = {
    "Not at all": 0,
    "Low": 1,
    "Mild": 1,
    "Moderate": 2,
    "High": 3,
    "Very high": 4,
    "Very High": 4
}

symptom_cols = ["cramps","fatigue","moodswing","stress","bloating","sleepissue"]

for col in symptom_cols:
    df[col] = df[col].map(likert_mapping)

In [12]:
# Sort data for time-series features
df = df.sort_values(["id", "day_in_study"])

history_cols = ["lh","estrogen","pdg","stress","overall_score","daily_steps"]

for col in history_cols:
    df[f"{col}_prev1"] = df.groupby("id")[col].shift(1)
    df[f"{col}_prev2"] = df.groupby("id")[col].shift(2)

In [13]:
df["phase_simple"] = df["phase"].replace({
    "Menstrual": "Low Hormone",
    "Follicular": "Rising Hormone",
    "Fertility": "Peak Hormone",
    "Luteal": "High Progesterone"
})

In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
hormone_cols = ["lh","estrogen","pdg"]

df[hormone_cols] = scaler.fit_transform(df[hormone_cols])

In [15]:
df = df.dropna()

In [16]:
output_path = f"{PROCESSED_DIR}/final_dataset.csv"
df.to_csv(output_path, index=False)


print("Final dataset saved to:", output_path)

Final dataset saved to: ../data/processed/final_dataset.csv
