In [26]:
from pathlib import Path
import csv
import json
import numpy as np
import pandas as pd

# Source CSV consumed by the loading cell.
INPUT_CSV = Path("../data/oura_dataset_30days.csv")

# Output directory; created up front (a no-op when it already exists) so the
# export cell can open its files without further checks.
OUT_DIR = Path("../data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Export targets: one JSON array file and one JSON-Lines file.
OUT_JSON = OUT_DIR.joinpath("oura_synthetic_dataset2.json")
OUT_JSONL = OUT_DIR.joinpath("oura_synthetic_dataset2.jsonl")

print(f"Input: {INPUT_CSV}")


Input: ../data/oura_dataset_30days.csv


In [None]:
# Load the CSV by hand (rather than pd.read_csv) so that ragged rows can be
# normalised to the header width before the DataFrame is built.
rows = []

# Decode explicitly instead of relying on the platform default encoding.
# "utf-8-sig" also strips a leading BOM, which would otherwise end up glued to
# the first header name.
with INPUT_CSV.open(newline="", encoding="utf-8-sig") as f:
    reader = csv.reader(f)

    header = next(reader, None)
    if header is None:
        # A bare StopIteration from next() would be a confusing failure mode.
        raise ValueError(f"{INPUT_CSV} is empty: no header row found")
    width = len(header)

    for r in reader:
        # csv.reader yields [] for blank lines; they carry no data.
        if not r:
            continue

        # Force every row to exactly `width` fields: short rows are padded
        # with "" and surplus fields (e.g. the empty one produced by a
        # trailing comma) are dropped. Note that surplus fields are dropped
        # even when they are non-empty.
        r = (r + [""] * (width - len(r)))[:width]

        rows.append(r)

# Every value is still a string at this point; typing happens in a later cell.
raw_df = pd.DataFrame(rows, columns=header)

print("Raw shape:", raw_df.shape)
raw_df.head()


Raw shape: (435, 33)


Unnamed: 0,user_id,age,gender,date,day_of_week,data_type,activity_type,sleep_score,sleep_total_duration_sec,sleep_deep_duration_sec,...,activity_active_calories,activity_high_activity_time_sec,activity_medium_activity_time_sec,activity_low_activity_time_sec,body_temperature_deviation,stress_high_time_sec,recovery_high_time_sec,spo2_average,vo2_max,cardiovascular_age
0,user_001,28,Female,2024-01-01,Monday,sleep,Cycling,80.0,27456.0,5261.0,...,,,,,,,,,,
1,user_001,28,Female,2024-01-01,Monday,readiness,Cycling,,,,...,,,,,0.4,,,,,
2,user_001,28,Female,2024-01-01,Monday,activity,Cycling,,,,...,677.0,3291.0,5143.0,6956.0,,,,,,
3,user_001,28,Female,2024-01-01,Monday,daily_stress,Cycling,,,,...,,,,,,2668.0,12696.0,,,
4,user_001,28,Female,2024-01-01,Monday,spo2,Cycling,,,,...,,,,,,,,97.3,,


In [28]:
# Empty strings -> NaN. replace() returns a new frame, so raw_df is untouched.
df = raw_df.replace({"": np.nan})

# Normalise dates to "YYYY-MM-DD" strings; values that cannot be parsed are
# coerced to missing instead of raising.
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
df["date"] = parsed_dates.dt.strftime("%Y-%m-%d")

# Every column that is not listed here is converted to a numeric dtype;
# values that cannot be converted become NaN.
non_numeric = {"user_id", "date", "data_type", "gender", "day_of_week", "activity_type"}
numeric_cols = [name for name in df.columns if name not in non_numeric]

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df.head()


Unnamed: 0,user_id,age,gender,date,day_of_week,data_type,activity_type,sleep_score,sleep_total_duration_sec,sleep_deep_duration_sec,...,activity_active_calories,activity_high_activity_time_sec,activity_medium_activity_time_sec,activity_low_activity_time_sec,body_temperature_deviation,stress_high_time_sec,recovery_high_time_sec,spo2_average,vo2_max,cardiovascular_age
0,user_001,28,Female,2024-01-01,Monday,sleep,Cycling,80.0,27456.0,5261.0,...,,,,,,,,,,
1,user_001,28,Female,2024-01-01,Monday,readiness,Cycling,,,,...,,,,,0.4,,,,,
2,user_001,28,Female,2024-01-01,Monday,activity,Cycling,,,,...,677.0,3291.0,5143.0,6956.0,,,,,,
3,user_001,28,Female,2024-01-01,Monday,daily_stress,Cycling,,,,...,,,,,,2668.0,12696.0,,,
4,user_001,28,Female,2024-01-01,Monday,spo2,Cycling,,,,...,,,,,,,,97.3,,


In [29]:
def first_non_null(series):
    """Return the first non-missing value in ``series``.

    Missing entries are removed with ``Series.dropna``; if nothing is left
    (empty or all-missing input) ``np.nan`` is returned instead.
    """
    non_missing = series.dropna()
    if non_missing.empty:
        return np.nan
    return non_missing.iloc[0]

# Collapse all rows that share a (user_id, date) pair into a single row by
# taking, for every column except data_type, the first non-null value.
agg_spec = dict.fromkeys(
    (col for col in df.columns if col != "data_type"),
    first_non_null,
)
merged = df.groupby(["user_id", "date"], as_index=False).agg(agg_spec)

# data_type was left out of the aggregation; re-add it as an all-NaN column
# so the merged frame still exposes it.
merged["data_type"] = np.nan

print(f"Merged shape: {merged.shape}")
merged.head()


Merged shape: (90, 33)


Unnamed: 0,user_id,age,gender,date,day_of_week,activity_type,sleep_score,sleep_total_duration_sec,sleep_deep_duration_sec,sleep_rem_duration_sec,...,activity_high_activity_time_sec,activity_medium_activity_time_sec,activity_low_activity_time_sec,body_temperature_deviation,stress_high_time_sec,recovery_high_time_sec,spo2_average,vo2_max,cardiovascular_age,data_type
0,user_001,28,Female,2024-01-01,Monday,Cycling,80.0,27456.0,5261.0,6076.0,...,3291.0,5143.0,6956.0,0.4,2668.0,12696.0,97.3,49.0,25.0,
1,user_001,28,Female,2024-01-02,Tuesday,Rest Day,90.0,27779.0,6127.0,7446.0,...,213.0,1302.0,8149.0,0.0,2273.0,10724.0,,,,
2,user_001,28,Female,2024-01-03,Wednesday,Swimming,75.0,28064.0,6048.0,7459.0,...,4204.0,4381.0,7747.0,-0.2,3805.0,12827.0,97.8,,,
3,user_001,28,Female,2024-01-04,Thursday,Cycling,87.0,28321.0,5441.0,6698.0,...,3513.0,5548.0,7992.0,-0.2,3893.0,12343.0,,,,
4,user_001,28,Female,2024-01-05,Friday,Yoga,77.0,27249.0,4945.0,6994.0,...,1772.0,2795.0,7009.0,-0.3,3517.0,13098.0,97.5,,,


In [30]:
import math

# Column names of df, in their original order; used to lay out the exported records.
CSV_COLS = df.columns.tolist()

def to_none(x):
    """Map a missing-value marker to ``None`` so it serialises as JSON ``null``.

    Recognised markers are ``None``, ``pd.NA``, ``pd.NaT`` and any float NaN,
    including NumPy floating scalars such as ``np.float32`` that are not
    subclasses of the builtin ``float``.

    Parameters
    ----------
    x : object
        A single scalar cell value.

    Returns
    -------
    object
        ``None`` when ``x`` is one of the markers above, otherwise ``x`` itself,
        unchanged.
    """
    # Identity checks first: pd.NA and pd.NaT are singletons, and math.isnan
    # cannot be applied to them.
    if x is None or x is pd.NA or x is pd.NaT:
        return None
    if isinstance(x, (float, np.floating)) and math.isnan(x):
        return None
    return x

# One plain dict per merged row, laid out in CSV_COLS order with missing
# values mapped through to_none. Two keys are then fixed for every record:
#   - "data_type" is forced to None (it must be null after the merge);
#   - "source" is constant metadata, not derived from the data.
records = [
    {
        **{col: to_none(row.get(col)) for col in CSV_COLS},
        "data_type": None,
        "source": "oura_csv",
    }
    for row in merged.to_dict(orient="records")
]

records[0]


{'user_id': 'user_001',
 'age': 28,
 'gender': 'Female',
 'date': '2024-01-01',
 'day_of_week': 'Monday',
 'data_type': None,
 'activity_type': 'Cycling',
 'sleep_score': 80.0,
 'sleep_total_duration_sec': 27456.0,
 'sleep_deep_duration_sec': 5261.0,
 'sleep_rem_duration_sec': 6076.0,
 'sleep_light_duration_sec': 16119.0,
 'sleep_awake_time_sec': 897.0,
 'sleep_efficiency': 96.0,
 'sleep_latency_sec': 253.0,
 'average_heart_rate': 60.0,
 'lowest_heart_rate': 48.0,
 'average_hrv': 69.0,
 'average_breath_rate': 12.4,
 'readiness_score': 78.0,
 'activity_score': 90.0,
 'activity_steps': 12888.0,
 'activity_total_calories': 2298.0,
 'activity_active_calories': 677.0,
 'activity_high_activity_time_sec': 3291.0,
 'activity_medium_activity_time_sec': 5143.0,
 'activity_low_activity_time_sec': 6956.0,
 'body_temperature_deviation': 0.4,
 'stress_high_time_sec': 2668.0,
 'recovery_high_time_sec': 12696.0,
 'spo2_average': 97.3,
 'vo2_max': 49.0,
 'cardiovascular_age': 25.0,
 'source': 'oura_csv'}

In [31]:
# Serialise everything BEFORE opening the output files, so that a
# serialisation error cannot leave a truncated file behind.
# allow_nan=False makes the encoder raise ValueError on NaN/Infinity instead of
# writing those tokens, which are not part of the JSON standard.

# JSON array
json_text = json.dumps(records, indent=2, allow_nan=False)

# JSONL (one record per line)
jsonl_text = "".join(json.dumps(r, allow_nan=False) + "\n" for r in records)

# Explicit encoding so the files do not depend on the platform default.
with OUT_JSON.open("w", encoding="utf-8") as f:
    f.write(json_text)

with OUT_JSONL.open("w", encoding="utf-8") as f:
    f.write(jsonl_text)

print("Saved:")
print(" -", OUT_JSON)
print(" -", OUT_JSONL)


Saved:
 - ../data/oura_synthetic_dataset2.json
 - ../data/oura_synthetic_dataset2.jsonl
