In [113]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json
from datetime import datetime, timedelta
from TakeoutParser import TakeoutParser

# set up the directory system
data_directory = str(Path.home()) + "/class/s25/public-health/project3/jackson-data"
garmin_directory = data_directory + "/garmin_data/"
takeout_directory = data_directory + "/takeout/takeout-files/"
timeline_directory = data_directory + "/timeline/"


In [31]:
def ms_to_minutes(ms):
    return ms / 1000 / 60

def extract_first_key(dlist, key):
    if isinstance(dlist, list) and len(dlist) > 0 and isinstance(dlist[0], dict):
        return dlist[0].get(key)
    return None

In [104]:
# load up the Garmin data
sleep = pd.read_json(garmin_directory + "sleep.json")
with open(garmin_directory + "summarizedActivities.json") as f:
        raw_data = json.load(f)[0]['summarizedActivitiesExport']
activities = pd.DataFrame(raw_data)
wellness = pd.read_json(garmin_directory + "/wellness-metrics.json")

# select the features we want

# wellness 
wellness['duration'] = wellness['durationInMilliseconds'].apply(ms_to_minutes)
wellness = wellness[['calendarDate', 'duration', 'totalSteps', 'moderateIntensityMinutes', 'vigorousIntensityMinutes', 'minAvgHeartRate', 'maxAvgHeartRate', 'restingHeartRate',
                      'allDayStress', "bodyBattery"]]
wellness['aggregatorList'] = wellness['allDayStress'].apply(lambda x: x.get('aggregatorList'))
wellness['list0'] = wellness['aggregatorList'].apply(lambda x: x[0] if x else None)
# wellness['maxStressLevel'] = wellness['aggregatorList'].apply(lambda x: x[0]['maxStressLevel'] if x else None)
wellness['averageStressLevel'] = wellness['list0'].apply(lambda x: x.get('averageStressLevel'))
wellness['maxStressLevel'] = wellness['list0'].apply(lambda x: x.get("maxStressLevel"))
wellness = wellness.drop(["aggregatorList", "list0", "allDayStress"], axis=1)

# sleep
sleep = sleep[['calendarDate', "sleepStartTimestampGMT", "sleepEndTimestampGMT", "sleepWindowConfirmationType", "deepSleepSeconds", "lightSleepSeconds"]]


# activity
activities['calendarDate'] = activities['beginTimestamp'].apply(lambda x: datetime.utcfromtimestamp(x / 1000))
activities = activities[['calendarDate', 'activityType', 'name', 'sportType', 'avgHr', 'maxHr', 'calories', 'bmrCalories', 'duration', 'moderateIntensityMinutes', 'vigorousIntensityMinutes']]

# convert to datetime as needed
wellness['calendarDate'] = pd.to_datetime(wellness['calendarDate'])
sleep['calendarDate'] = pd.to_datetime(sleep['calendarDate'])
activities['calendarDate'] = pd.to_datetime(activities['calendarDate'])

# filter dates
start_date = '2025-01-20'
end_date = '2025-04-20'

wellness = wellness[(wellness['calendarDate'] >= start_date) & (wellness['calendarDate'] <= end_date)]
sleep = sleep[(sleep['calendarDate'] >= start_date) & (sleep['calendarDate'] <= end_date)]
activities = activities[(activities['calendarDate'] >= start_date) & (activities['calendarDate'] <= end_date)]


# joining together the tables on the date

step_one = pd.merge(sleep, activities, on='calendarDate', how='outer')
step_two = pd.merge(step_one, wellness, on='calendarDate', how='outer')

  activities['calendarDate'] = activities['beginTimestamp'].apply(lambda x: datetime.utcfromtimestamp(x / 1000))


In [106]:
# reading in timeline data

with open(timeline_directory + "jackson_timeline.json") as f:
    raw_data = json.load(f)

raw_data

timeline = pd.json_normalize(raw_data)
timeline

# merge the timeline data to the other data
timeline['calendarDate'] = pd.to_datetime(timeline['date'])

step_three = pd.merge(step_two, timeline, on='calendarDate', how='outer')
step_three


Unnamed: 0,calendarDate,sleepStartTimestampGMT,sleepEndTimestampGMT,sleepWindowConfirmationType,deepSleepSeconds,lightSleepSeconds,activityType,name,sportType,avgHr,...,restingHeartRate,bodyBattery,averageStressLevel,maxStressLevel,date,person,events.academic,events.work,events.activities,events.notes
0,2025-01-20 00:00:00,2025-01-20T04:00:00.0,2025-01-20T12:00:00.0,UNCONFIRMED,,,,,,,...,65.0,"{'userProfilePK': 123010558, 'calendarDate': '...",74.0,90.0,2025-01-20,Jackson,"[{'type': 'class', 'course': 'CS6501-005', 'de...",[],"[{'type': 'laundry', 'description': 'washing, ...",
1,2025-01-20 17:03:25,,,,,,running,Charlottesville Carrera,RUNNING,150.0,...,,,,,,,,,,
2,2025-01-21 00:00:00,2025-01-21T04:00:00.0,2025-01-21T12:00:00.0,UNCONFIRMED,,,,,,,...,65.0,"{'userProfilePK': 123010558, 'calendarDate': '...",36.0,91.0,2025-01-21,Jackson,[],"[{'job_title': 'Data Scientist Inter', 'hours'...",[],
3,2025-01-21 19:19:38,,,,,,strength_training,Strength,TRAINING,101.0,...,,,,,,,,,,
4,2025-01-22 00:00:00,2025-01-22T04:00:00.0,2025-01-22T12:00:00.0,UNCONFIRMED,,,,,,,...,,,-1.0,,2025-01-22,Jackson,"[{'type': 'class', 'course': 'CS6501-005', 'de...",[],"[{'type': 'practice music', 'description': 'pr...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,2025-04-17 18:11:39,,,,,,cycling,Charlottesville Ciclismo,CYCLING,156.0,...,,,,,,,,,,
168,2025-04-18 00:00:00,2025-04-18T04:09:00.0,2025-04-18T11:11:00.0,ENHANCED_CONFIRMED_FINAL,1080.0,13080.0,,,,,...,56.0,"{'userProfilePK': 123010558, 'calendarDate': '...",44.0,99.0,2025-04-18,Jackson,"[{'type': 'assignment', 'course': 'CS6501-006'...","[{'job_title': 'Data Scientist Intern', 'hours...","[{'type': 'Concert', 'description': 'Went to a...",
169,2025-04-18 18:39:34,,,,,,lap_swimming,Natación en piscina,SWIMMING,139.0,...,,,,,,,,,,
170,2025-04-19 00:00:00,2025-04-19T04:48:00.0,2025-04-19T12:35:00.0,ENHANCED_CONFIRMED,480.0,17340.0,,,,,...,57.0,"{'userProfilePK': 123010558, 'calendarDate': '...",46.0,99.0,2025-04-19,Jackson,[],[],"[{'type': 'Social', 'description': 'Went to fr...",


In [114]:
# now we need to get the takeout data
# ads, gmail, Image-Search, Maps.html, Search.html, video-search, youtube
directory = takeout_directory
tp = TakeoutParser(directory)
tp.scrape_all_takeout_files()

[#######] 100.00%

In [118]:
tmp_df = tp.df

tmp_df = tmp_df.groupby('date')['content'].apply(list).reset_index()

tmp_df['calendarDate'] = pd.to_datetime(tmp_df['date'])
step_four = pd.merge(step_three, tmp_df, on='calendarDate', how='inner')

step_four

Unnamed: 0,calendarDate,sleepStartTimestampGMT,sleepEndTimestampGMT,sleepWindowConfirmationType,deepSleepSeconds,lightSleepSeconds,activityType,name,sportType,avgHr,...,averageStressLevel,maxStressLevel,date_x,person,events.academic,events.work,events.activities,events.notes,date_y,content
0,2025-01-20,2025-01-20T04:00:00.0,2025-01-20T12:00:00.0,UNCONFIRMED,,,,,,,...,74.0,90.0,2025-01-20,Jackson,"[{'type': 'class', 'course': 'CS6501-005', 'de...",[],"[{'type': 'laundry', 'description': 'washing, ...",,2025-01-20,"[YouTube, YouTube, YouTube, YouTube, YouTube, ..."
1,2025-01-21,2025-01-21T04:00:00.0,2025-01-21T12:00:00.0,UNCONFIRMED,,,,,,,...,36.0,91.0,2025-01-21,Jackson,[],"[{'job_title': 'Data Scientist Inter', 'hours'...",[],,2025-01-21,"[YouTube, YouTube, YouTube, YouTube, YouTube, ..."
2,2025-01-22,2025-01-22T04:00:00.0,2025-01-22T12:00:00.0,UNCONFIRMED,,,,,,,...,-1.0,,2025-01-22,Jackson,"[{'type': 'class', 'course': 'CS6501-005', 'de...",[],"[{'type': 'practice music', 'description': 'pr...",,2025-01-22,"[Maps, Maps, Maps, Maps, Maps, Maps, Maps, Map..."
3,2025-01-23,2025-01-23T04:00:00.0,2025-01-23T12:00:00.0,UNCONFIRMED,,,,,,,...,32.0,83.0,2025-01-23,Jackson,[],"[{'job_title': 'Data Scientist Intern', 'hours...","[{'type': 'Errands', 'description': 'Needed to...",,2025-01-23,"[YouTube, YouTube, youtube.com, youtube.com, y..."
4,2025-01-24,2025-01-24T04:00:00.0,2025-01-24T12:00:00.0,UNCONFIRMED,,,,,,,...,19.0,76.0,2025-01-24,Jackson,[],"[{'job_title': 'Data Scientist Intern', 'hours...","[{'type': 'Social', 'description': 'Called my ...",,2025-01-24,"[Maps, YouTube, youtube.com, youtube.com, yout..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,2025-04-15,2025-04-15T02:59:00.0,2025-04-15T11:49:00.0,ENHANCED_CONFIRMED_FINAL,3540.0,20820.0,,,,,...,28.0,92.0,2025-04-15,Jackson,"[{'type': 'assignment', 'course': 'CS6501-006'...","[{'job_title': 'Data Scientist Intern', 'hours...","[{'type': 'Errands', 'description': 'Bought su...",,2025-04-15,"[Maps, Maps, Maps, Maps, Maps, Maps, Maps, you..."
82,2025-04-16,2025-04-16T02:26:00.0,2025-04-16T11:19:00.0,ENHANCED_CONFIRMED,4500.0,17880.0,,,,,...,31.0,96.0,2025-04-16,Jackson,"[{'type': 'class', 'course': 'CS6501-005', 'de...",[],"[{'type': 'Personal Projects', 'description': ...",,2025-04-16,"[youtube.com, youtube.com, youtube.com, Search..."
83,2025-04-17,2025-04-17T00:31:00.0,2025-04-17T10:25:00.0,OFF_WRIST,0.0,0.0,,,,,...,73.0,99.0,2025-04-17,Jackson,"[{'type': 'assignment', 'course': 'CS6501-006'...","[{'job_title': 'Data Scientist Intern', 'hours...","[{'type': 'Errand', 'description': 'Groceries'...",,2025-04-17,"[Maps, Maps, Maps, Maps, Maps, Maps, Maps, Map..."
84,2025-04-18,2025-04-18T04:09:00.0,2025-04-18T11:11:00.0,ENHANCED_CONFIRMED_FINAL,1080.0,13080.0,,,,,...,44.0,99.0,2025-04-18,Jackson,"[{'type': 'assignment', 'course': 'CS6501-006'...","[{'job_title': 'Data Scientist Intern', 'hours...","[{'type': 'Concert', 'description': 'Went to a...",,2025-04-18,"[youtube.com, youtube.com, youtube.com, youtub..."


In [120]:
step_four.to_csv(data_directory + "jackson-data.csv")