In [1]:
import pandas as pd

# Load all 4 WHOOP exports
sleep = pd.read_csv("../data/whoop/sleeps.csv")
workouts = pd.read_csv("../data/whoop/workouts.csv")
journal = pd.read_csv("../data/whoop/journal_entries.csv")
physio = pd.read_csv("../data/whoop/physiological_cycles.csv")


In [10]:


# Add a 'date' column based on WHOOP's cycle start time
for df in (sleep, workouts, journal, physio):
    df["date"] = pd.to_datetime(df["Cycle start time"]).dt.date


# ----------------------------------------------------
# 1) SLEEP FEATURES  (from sleeps.csv)
# ----------------------------------------------------
sleep_cols = [
    "date",
    "Sleep onset",
    "Wake onset",
    "Asleep duration (min)",
    "In bed duration (min)",
    "Light sleep duration (min)",
    "Deep (SWS) duration (min)",
    "REM duration (min)",
    "Awake duration (min)",
    "Sleep performance %",
    "Sleep efficiency %",
    "Sleep need (min)",
    "Sleep debt (min)",
    "Respiratory rate (rpm)",
    "Nap",
]

sleep_feat = sleep[sleep_cols].rename(columns={
    "Sleep onset": "sleep_onset",
    "Wake onset": "wake_onset",
    "Asleep duration (min)": "asleep_duration_min",
    "In bed duration (min)": "in_bed_duration_min",
    "Light sleep duration (min)": "light_sleep_duration_min",
    "Deep (SWS) duration (min)": "deep_sleep_duration_min",
    "REM duration (min)": "rem_duration_min",
    "Awake duration (min)": "awake_duration_min",
    "Sleep performance %": "sleep_performance_pct",
    "Sleep efficiency %": "sleep_efficiency_pct",
    "Sleep need (min)": "sleep_need_min",
    "Sleep debt (min)": "sleep_debt_min",
    "Respiratory rate (rpm)": "respiratory_rate_rpm",
    "Nap": "nap",
})
sleep_feat["sleep_hours"] = sleep_feat["asleep_duration_min"] / 60.0


# ----------------------------------------------------
# 2) PHYSIOLOGICAL FEATURES  (from physiological_cycles.csv)
# ----------------------------------------------------
physio_cols = [
    "date",
    "Recovery score %",
    "Resting heart rate (bpm)",
    "Heart rate variability (ms)",
    "Skin temp (celsius)",
    "Blood oxygen %",
    "Day Strain",
]

physio_feat = physio[physio_cols].rename(columns={
    "Recovery score %": "recovery_score_pct",
    "Resting heart rate (bpm)": "resting_hr_bpm",
    "Heart rate variability (ms)": "hrv_ms",
    "Skin temp (celsius)": "skin_temp_c",
    "Blood oxygen %": "blood_oxygen_pct",
    "Day Strain": "day_strain",
})


# ----------------------------------------------------
# 3) WORKOUT FEATURES (daily)  (from workouts.csv)
# ----------------------------------------------------
workouts_daily = (
    workouts.groupby("date", as_index=False)
    .agg({
        "Duration (min)": "sum",           # total duration per day
        "Activity Strain": "mean",         # avg strain per day
        "Energy burned (cal)": "sum",      # total calories per day
        "Max HR (bpm)": "max",             # max HR per day
        "Average HR (bpm)": "mean",        # avg HR per day
        "Workout start time": "min",       # first workout start
        "Workout end time": "max",         # last workout end
    })
    .rename(columns={
        "Duration (min)": "total_workout_minutes",
        "Activity Strain": "avg_activity_strain",
        "Energy burned (cal)": "total_workout_energy_burned_cal",
        "Max HR (bpm)": "max_hr_bpm",
        "Average HR (bpm)": "avg_hr_bpm",
        "Workout start time": "first_workout_start_time",
        "Workout end time": "last_workout_end_time",
    })
)

workouts_daily["had_workout"] = workouts_daily["total_workout_minutes"] > 0


# ----------------------------------------------------
# 4) JOURNAL FEATURES (alcohol, weed, caffeine, melatonin, late food)
#     from journal_entries.csv
# ----------------------------------------------------
# Pivot so each question becomes a column
journal_wide = (
    journal.pivot_table(
        index="date",
        columns="Question text",
        values="Answered yes",
        aggfunc="first",
    )
    .reset_index()
)

# Map original question text ‚Üí simpler column names
journal_wide = journal_wide.rename(columns={
    "Ate food close to bedtime?": "ate_food_close_to_bedtime",
    "Consumed caffeine?": "consumed_caffeine",
    "Have any alcoholic drinks?": "have_any_alcoholic_drinks",
    "Took a melatonin supplement?": "took_a_melatonin_supplement",
    "Used marijuana?": "used_marijuana",
})

# Build boolean flags
for col in [
    "ate_food_close_to_bedtime",
    "consumed_caffeine",
    "have_any_alcoholic_drinks",
    "took_a_melatonin_supplement",
    "used_marijuana",
]:
    if col in journal_wide.columns:
        journal_wide[col] = journal_wide[col].fillna(False).astype(bool)
    else:
        journal_wide[col] = False

journal_flags = journal_wide[[
    "date",
    "have_any_alcoholic_drinks",
    "consumed_caffeine",
    "ate_food_close_to_bedtime",
    "used_marijuana",
    "took_a_melatonin_supplement",
]].rename(columns={
    "have_any_alcoholic_drinks": "had_alcohol",
    "consumed_caffeine": "had_caffeine",
    "ate_food_close_to_bedtime": "ate_late",
    "used_marijuana": "used_marijuana",
    "took_a_melatonin_supplement": "used_melatonin",
})


# ----------------------------------------------------
# 5) MERGE EVERYTHING BY DATE
# ----------------------------------------------------
df = physio_feat.merge(sleep_feat, on="date", how="left")
df = df.merge(workouts_daily, on="date", how="left")
df = df.merge(journal_flags, on="date", how="left")

# Fill NaNs for booleans + workout metrics only
df = df.fillna({
    "total_workout_minutes": 0,
    "avg_activity_strain": 0,
    "total_workout_energy_burned_cal": 0,
    "max_hr_bpm": 0,
    "avg_hr_bpm": 0,
    "had_workout": False,
    "had_alcohol": False,
    "had_caffeine": False,
    "ate_late": False,
    "used_marijuana": False,
    "used_melatonin": False,
})

df = df.sort_values("date")

# ----------------------------------------------------
# 6) EXPORT
# ----------------------------------------------------
out_csv = "../data/modified/whoop_combined.csv"
out_json = "../data/modified/whoop.json"

df.to_csv(out_csv, index=False)
df.to_json(out_json, orient="records", indent=2, date_format="iso")

print(f"‚úÖ Combined WHOOP dataset created with {len(df)} days of data")
print("CSV:", out_csv)
print("JSON:", out_json)
print(df.head())


‚úÖ Combined WHOOP dataset created with 158 days of data
CSV: ../data/modified/whoop_combined.csv
JSON: ../data/modified/whoop.json
           date  recovery_score_pct  resting_hr_bpm  hrv_ms  skin_temp_c  \
157  2025-02-27                 NaN             NaN     NaN          NaN   
156  2025-02-28                23.0            65.0    56.0        34.99   
155  2025-02-28                23.0            65.0    56.0        34.99   
154  2025-03-01                53.0            64.0    56.0        34.61   
153  2025-03-02                71.0            62.0    69.0        34.70   

     blood_oxygen_pct  day_strain          sleep_onset           wake_onset  \
157               NaN         9.3                  NaN                  NaN   
156             97.00        17.8  2025-02-28 02:00:37  2025-02-28 09:44:56   
155             97.00        17.8  2025-02-28 12:11:03  2025-02-28 15:41:06   
154             96.34         9.3  2025-03-01 02:46:53  2025-03-01 11:54:16   
153             

  journal_wide[col] = journal_wide[col].fillna(False).astype(bool)
  journal_wide[col] = journal_wide[col].fillna(False).astype(bool)
  journal_wide[col] = journal_wide[col].fillna(False).astype(bool)
  journal_wide[col] = journal_wide[col].fillna(False).astype(bool)
  journal_wide[col] = journal_wide[col].fillna(False).astype(bool)
  df = df.fillna({


In [14]:
# Install required package if not already installed
%pip install ics -q


Note: you may need to restart the kernel to use updated packages.


In [None]:
from ics import Calendar
from datetime import datetime, date, timedelta
from pathlib import Path
import json

# ----------------------------------------------------
# CONFIG
# ----------------------------------------------------
CAL_DIR = Path("../data/calendars")          # folder with all your .ics files
OUTPUT_PATH = Path("../data/modified/calendar.json") # output JSON

# Match your WHOOP date range
START_DATE = date(2025, 2, 28)
END_DATE   = date(2025, 6, 18)


def any_keyword(events, keywords):
    """Return True if ANY event title contains ANY of the keywords."""
    titles = [ (e["title"] or "").lower() for e in events ]
    return any(
        any(k in title for k in keywords)
        for title in titles
    )


def has_long_task_block(task_events, gap_threshold_min=15, block_min=120):
    """
    Long task definition:
      - any single task >= block_min minutes, OR
      - cluster of tasks with gaps <= gap_threshold_min and
        total continuous block length >= block_min minutes
    """
    if not task_events:
        return False

    # If any single task is already >= block_min, we're done
    if any(e["duration_min"] >= block_min for e in task_events):
        return True

    # Sort by start time
    sorted_tasks = sorted(task_events, key=lambda e: e["start"])

    # Parse datetimes
    parsed = [
        (
            datetime.fromisoformat(e["start"]),
            datetime.fromisoformat(e["end"])
        )
        for e in sorted_tasks
    ]

    # Merge into contiguous blocks with small gaps
    blocks = []
    current_start, current_end = parsed[0]

    for s, e in parsed[1:]:
        gap_min = (s - current_end).total_seconds() / 60.0
        if gap_min <= gap_threshold_min:
            # same block; extend end if needed
            if e > current_end:
                current_end = e
        else:
            # close previous block
            blocks.append((current_start, current_end))
            current_start, current_end = s, e

    # last block
    blocks.append((current_start, current_end))

    # check block lengths
    for s, e in blocks:
        block_len_min = (e - s).total_seconds() / 60.0
        if block_len_min >= block_min:
            return True

    return False


def main():
    ics_files = list(CAL_DIR.glob("*.ics"))
    if not ics_files:
        print(f"‚ö†Ô∏è No .ics files found in {CAL_DIR}. Put your calendar exports there.")
        return

    print("Found .ics files:", [f.name for f in ics_files])

    events_by_day = {}

    # ------------------------------------------------
    # 1) READ ALL CALENDARS & COLLECT EVENTS PER DAY
    # ------------------------------------------------
    for ics_file in ics_files:
        print(f"üì• Reading {ics_file.name}")
        with open(ics_file, "r") as f:
            cal = Calendar(f.read())

        for e in cal.events:
            if e.begin is None or e.end is None:
                continue

            start_dt = e.begin.datetime
            end_dt = e.end.datetime
            event_date = start_dt.date()

            # filter to WHOOP date range
            if not (START_DATE <= event_date <= END_DATE):
                continue

            dur_min = int((end_dt - start_dt).total_seconds() / 60)
            date_str = event_date.isoformat()

            events_by_day.setdefault(date_str, []).append({
                "title": (e.name or "").strip(),
                "start": start_dt.isoformat(),
                "end": end_dt.isoformat(),
                "duration_min": dur_min,
                "source_calendar": ics_file.name,
            })

    # ------------------------------------------------
    # 2) SUMMARIZE PER DAY
    # ------------------------------------------------
    calendar_summary = []

    for date_str, events in events_by_day.items():
        total_busy = sum(e["duration_min"] for e in events)
        num_events = len(events)

        # late events (after 8pm)
        num_late = 0
        for e in events:
            try:
                hour = datetime.fromisoformat(e["start"]).hour
            except Exception:
                hour = 0
            if hour >= 20:
                num_late += 1

        # Flags via keywords
        has_exam = any_keyword(events, ["exam", "midterm", "quiz", "test"])
        has_workout = any_keyword(events, ["gym", "workout", "lift", "run", "cardio"])
        has_interview = any_keyword(events, ["interview", "screen", "onsite"])

        # Task-ish things (for has_task_due + long tasks)
        task_keywords = [
            "assignment", "hw", "homework", "project",
            "paper", "quiz", "exam", "due"
        ]
        has_task_due = any_keyword(events, task_keywords)

        # Extract only "task" events
        task_events = []
        for e in events:
            title_lower = (e["title"] or "").lower()
            if any(k in title_lower for k in task_keywords):
                task_events.append(e)

        has_long_tasks = has_long_task_block(
            task_events,
            gap_threshold_min=15,  # "little or no break"
            block_min=120          # 2 hours
        )

        calendar_summary.append({
            "date": date_str,
            "total_busy_minutes": total_busy,
            "num_events": num_events,
            "num_late_events": num_late,
            "has_exam": has_exam,
            "has_workout": has_workout,
            "has_interview": has_interview,
            "has_task_due": has_task_due,
            "has_long_tasks": has_long_tasks,
        })

    # ------------------------------------------------
    # 3) FILL IN MISSING DAYS WITH ZEROS (OPTIONAL BUT NICE)
    # ------------------------------------------------
    # build lookup by date
    by_date = {d["date"]: d for d in calendar_summary}

    all_days = []
    curr = START_DATE
    while curr <= END_DATE:
        date_str = curr.isoformat()
        if date_str in by_date:
            all_days.append(by_date[date_str])
        else:
            all_days.append({
                "date": date_str,
                "total_busy_minutes": 0,
                "num_events": 0,
                "num_late_events": 0,
                "has_exam": False,
                "has_workout": False,
                "has_interview": False,
                "has_task_due": False,
                "has_long_tasks": False,
            })
        curr += timedelta(days=1)

    calendar_summary = sorted(all_days, key=lambda x: x["date"])

    # ------------------------------------------------
    # 4) WRITE OUTPUT
    # ------------------------------------------------
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, "w") as f:
        json.dump(calendar_summary, f, indent=2)

    print(f"‚úÖ Wrote {len(calendar_summary)} days to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()


Found .ics files: ['vedantajwani@gmail.com.ics', 'vajwani5021@gmail.com.ics', 'vajwani@umass.edu.ics', 'Birthdays_vedantajwani@gmail.com.ics']
üì• Reading vedantajwani@gmail.com.ics
üì• Reading vajwani5021@gmail.com.ics
üì• Reading vajwani@umass.edu.ics
üì• Reading Birthdays_vedantajwani@gmail.com.ics
‚úÖ Wrote 111 days to ../data/modified/calendar.json


In [12]:
whoop = pd.read_json("../data/modified/whoop.json")
calendar = pd.read_json("../data/modified/calendar.json")

whoop["date"] = pd.to_datetime(whoop["date"])
calendar["date"] = pd.to_datetime(calendar["date"])

final = whoop.merge(calendar, on="date", how="left")

# Fill NaNs from calendar side (if any)
for col in calendar.columns:
    if col == "date":
        continue
    if final[col].dtype == bool or final[col].dtype == "boolean":
        final[col] = final[col].fillna(False)
    else:
        final[col] = final[col].fillna(0)

final.to_csv("../data/modified/final_dataset.csv", index=False)
final.to_json("../data/modified/final_dataset.json", orient="records", indent=2, date_format="iso")

print("‚úÖ Final dataset ready:", final.shape)


‚úÖ Final dataset ready: (158, 43)


In [13]:
import json

# 1. Load your original file (array of objects)
with open("../data/modified/final_dataset.json", "r") as f:
    data = json.load(f)  # this should be a list of dicts

print(type(data), len(data))  # optional sanity check

# 2. Write as NDJSON (one JSON object per line)
with open("../data/modified/final_data_nd.json", "w") as f:
    for row in data:
        f.write(json.dumps(row) + "\n")


<class 'list'> 158
