In [None]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [13]:
from pathlib import Path
import csv
import json
import numpy as np
import pandas as pd

INPUT_CSV = Path("../data/oura_synthetic_dataset.csv")

OUT_DIR = Path("../data")
OUT_JSON = OUT_DIR / "oura_synthetic_dataset.json"
OUT_JSONL = OUT_DIR / "oura_synthetic_dataset.jsonl"

OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Input:", INPUT_CSV)


Input: ../data/oura_synthetic_dataset.csv


In [14]:
rows = []

with INPUT_CSV.open(newline="") as f:
    reader = csv.reader(f)
    header = next(reader)

    for r in reader:
        # Remove trailing empty column if present
        if len(r) == len(header) + 1 and r[-1] == "":
            r = r[:-1]

        # Normalize length
        if len(r) < len(header):
            r += [""] * (len(header) - len(r))
        elif len(r) > len(header):
            r = r[:len(header)]

        rows.append(r)

raw_df = pd.DataFrame(rows, columns=header)

print("Raw shape:", raw_df.shape)
raw_df.head()


Raw shape: (105, 29)


Unnamed: 0,user_id,date,data_type,sleep_score,sleep_total_duration_sec,sleep_deep_duration_sec,sleep_rem_duration_sec,sleep_light_duration_sec,sleep_awake_time_sec,sleep_efficiency,...,activity_active_calories,activity_high_activity_time_sec,activity_medium_activity_time_sec,activity_low_activity_time_sec,body_temperature_deviation,stress_high_time_sec,recovery_high_time_sec,spo2_average,vo2_max,cardiovascular_age
0,user_001,2024-01-15,sleep,85.0,28800.0,7200.0,6480.0,14400.0,720.0,94.0,...,,,,,,,,,,
1,user_001,2024-01-15,readiness,,,,,,,,...,,,,,0.2,,,,,
2,user_001,2024-01-15,activity,,,,,,,,...,2456.0,892.0,3240.0,5820.0,7680.0,,,,,
3,user_001,2024-01-15,daily_stress,,,,,,,,...,,,,,,,2340.0,12600.0,,
4,user_001,2024-01-15,spo2,,,,,,,,...,,,,,,,,,96.8,


In [15]:
df = raw_df.copy()

# Empty strings → NaN
df = df.replace({"": np.nan})

# Parse date
df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.date

# Convert numeric columns
key_cols = {"user_id", "date", "data_type"}

for col in df.columns:
    if col not in key_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")

df.head()


  df = df.replace({"": np.nan})


Unnamed: 0,user_id,date,data_type,sleep_score,sleep_total_duration_sec,sleep_deep_duration_sec,sleep_rem_duration_sec,sleep_light_duration_sec,sleep_awake_time_sec,sleep_efficiency,...,activity_active_calories,activity_high_activity_time_sec,activity_medium_activity_time_sec,activity_low_activity_time_sec,body_temperature_deviation,stress_high_time_sec,recovery_high_time_sec,spo2_average,vo2_max,cardiovascular_age
0,user_001,2024-01-15,sleep,85.0,28800.0,7200.0,6480.0,14400.0,720.0,94.0,...,,,,,,,,,,
1,user_001,2024-01-15,readiness,,,,,,,,...,,,,,0.2,,,,,
2,user_001,2024-01-15,activity,,,,,,,,...,2456.0,892.0,3240.0,5820.0,7680.0,,,,,
3,user_001,2024-01-15,daily_stress,,,,,,,,...,,,,,,,2340.0,12600.0,,
4,user_001,2024-01-15,spo2,,,,,,,,...,,,,,,,,,96.8,


In [16]:
def first_non_null(series):
    s = series.dropna()
    return s.iloc[0] if len(s) else np.nan

merged = (
    df.groupby(["user_id", "date"], as_index=False)
      .agg({col: first_non_null for col in df.columns if col != "data_type"})
)

print("Merged shape:", merged.shape)
merged.head()


Merged shape: (26, 28)


Unnamed: 0,user_id,date,sleep_score,sleep_total_duration_sec,sleep_deep_duration_sec,sleep_rem_duration_sec,sleep_light_duration_sec,sleep_awake_time_sec,sleep_efficiency,sleep_latency_sec,...,activity_active_calories,activity_high_activity_time_sec,activity_medium_activity_time_sec,activity_low_activity_time_sec,body_temperature_deviation,stress_high_time_sec,recovery_high_time_sec,spo2_average,vo2_max,cardiovascular_age
0,user_001,2024-01-15,85.0,28800.0,7200.0,6480.0,14400.0,720.0,94.0,480.0,...,2456.0,892.0,3240.0,5820.0,0.2,,2340.0,12600.0,96.8,
1,user_001,2024-01-16,79.0,27000.0,6300.0,5940.0,13860.0,900.0,91.0,600.0,...,2234.0,745.0,2880.0,5040.0,0.3,,,,97.1,
2,user_001,2024-01-17,88.0,29700.0,7740.0,6840.0,14520.0,600.0,95.0,390.0,...,2765.0,1045.0,4320.0,7560.0,8640.0,-0.1,,,,
3,user_001,2024-01-18,86.0,29100.0,7380.0,6660.0,14460.0,600.0,94.0,420.0,...,2689.0,989.0,3960.0,7020.0,0.0,,2040.0,13680.0,,
4,user_002,2024-01-15,72.0,25200.0,5400.0,5040.0,13680.0,1080.0,88.0,720.0,...,2145.0,654.0,2160.0,4320.0,0.5,,,,,45.2


In [17]:
def to_record(row):
    def g(key, default=None):
        val = row.get(key)
        if pd.isna(val):
            return default
        if isinstance(val, (np.integer, np.floating)):
            return val.item()
        return val

    return {
        "user_id": g("user_id"),
        "date": str(g("date")),

        "sleep": {
            "score": g("sleep_score"),
            "total_duration_sec": g("sleep_total_duration_sec"),
            "deep_duration_sec": g("sleep_deep_duration_sec"),
            "rem_duration_sec": g("sleep_rem_duration_sec"),
            "light_duration_sec": g("sleep_light_duration_sec"),
            "avg_hrv": g("average_hrv"),
            "avg_heart_rate": g("average_heart_rate"),
        },

        "readiness": {
            "score": g("readiness_score"),
            "temperature_deviation": g("body_temperature_deviation"),
        },

        "activity": {
            "score": g("activity_score"),
            "steps": g("activity_steps"),
            "total_calories": g("activity_total_calories"),
            "active_calories": g("activity_active_calories"),
        }
    }

records = [to_record(r) for _, r in merged.iterrows()]

records[0]


{'user_id': 'user_001',
 'date': '2024-01-15',
 'sleep': {'score': 85.0,
  'total_duration_sec': 28800.0,
  'deep_duration_sec': 7200.0,
  'rem_duration_sec': 6480.0,
  'light_duration_sec': 14400.0,
  'avg_hrv': 65.0,
  'avg_heart_rate': 58.0},
 'readiness': {'score': 82.0, 'temperature_deviation': 0.2},
 'activity': {'score': None,
  'steps': 78.0,
  'total_calories': 8542.0,
  'active_calories': 2456.0}}

In [18]:
bad = []

for i, r in enumerate(records):
    if not r["date"]:
        bad.append((i, "missing date"))
    if not any([
        r["sleep"]["score"],
        r["readiness"]["score"],
        r["activity"]["score"],
    ]):
        bad.append((i, "no scores"))

if bad:
    print("⚠️ Issues:", bad[:5])
else:
    print("✅ Dataset looks good")


✅ Dataset looks good


In [19]:
# JSON array
with OUT_JSON.open("w") as f:
    json.dump(records, f, indent=2)

# JSONL (one record per line)
with OUT_JSONL.open("w") as f:
    for r in records:
        f.write(json.dumps(r) + "\n")

print("Saved:")
print(" -", OUT_JSON)
print(" -", OUT_JSONL)


Saved:
 - ../data/oura_synthetic_dataset.json
 - ../data/oura_synthetic_dataset.jsonl
