In [1]:
# Process FITBIT's json heart rate files
# Usually one activity per file, and creates a dataframe per activity
# Heart rates with low confidence are not included in the dataframe
# If there are no high confidence heart rates, the dataframe is not used

# The process checks that there are no long periods with no activity in the dataframe
# If there is such a period, a new activity dataframe after the silent period is created

# The process also removes short activities 

# reset_indexing of the dataframe is used often to prevent warrnings, see:
# http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
import glob
import pickle

In [3]:
# Settings for high/low confidence, long silent period, short time activity, peak zone

confidence_limit = 0    #0 - low confidence; 1,2,3 - high enough confidence
silent_time_limit = "20 minutes"
short_time_limit = "5 minutes"
peak_zone_limit=135    # My peak zone limit

# Functions

In [4]:
def processHeartRate(df):
    
    df['Date'] = pd.to_datetime(df['dateTime']).dt.date
    df['Time'] = pd.to_datetime(df['dateTime']).dt.time
    
    # Copying the dict values to the dataframe, based on:
    # https://stackoverflow.com/questions/29681906/python-pandas-dataframe-from-series-of-dict
    
    temp_df = pd.DataFrame(list(df['value']))
    df['BPM']=temp_df['bpm']
    df['Confidence']=temp_df['confidence']
 
    # Cleans the data frame from low confidence heart rates
    # The if clause prevents removing all items and returning an empty df
    # The dfExists flag indicates that there is no valid dataframe
    # The reset_index create a new continuos index starting with zero   
     
    dfTemp=df[df[:]['Confidence']>confidence_limit]
    
    if len(dfTemp)>1:
        dfExists=True
        dfNew=dfTemp.reset_index(drop=True)
    else:
        #print('File with only low confidnece readings',df.iloc[0]['Date'],end=' ')
        dfExists=False
        dfNew=df
    
    return dfExists, dfNew

In [5]:
def calculateSummery(df,fileName):
    
    Date=df.iloc[0]['Date'] # Assumes a single date along the dataframe
    
    Duration= datetime.combine( Date, df['Time'].max() ) - datetime.combine( Date,df['Time'].min() )
    MeanBPM=round(df['BPM'].mean(),0)
    MaxBPM=df['BPM'].max()
    
    peak1=df['BPM']>peak_zone_limit
    peak2=peak1.value_counts()
    try:
        PeakZone=peak2[True]
    except KeyError:
        PeakZone=0
        
    String= str(df.iloc[0]['Date']) + ' ' + str(Duration) + ' ' + str(MeanBPM)   
    Summary = {'string': String, 'date': Date, 'meanBPM': MeanBPM, 'maxBPM': MaxBPM,\
               'duration':Duration, 'peakZone': PeakZone, 'fileName': fileName}

    return Summary

In [6]:
# Each json heart rate file covers a day; this function identifies days with more than one activity
# Uses a hurisitc, that there is no data for more than 20 minutes, to divide into a new activity

def identifySilentTime(df):
    
    breakExists=False
    j=0
    for i in range(len(df)-1):
        delta=df.iloc[i+1]['dateTime'] - df.iloc[i]['dateTime']
        if (delta>pd.Timedelta(silent_time_limit)):
            #print('BREAK ',i,df.iloc[i]['Time'],df.iloc[i+1]['Time'])
            breakExists=True
            j=i
            break
    return breakExists,j

# Process

In [7]:
files=glob.glob("DATA\heart_rate*.json")
print(len(files))

293


In [8]:
start=0          # File number to start from 
end=293             # Process up to this file

In [9]:
filesToRead=files[start:end]
print('files',len(files),'start',start,'end',end, 'files to read',len(filesToRead))

files 293 start 0 end 293 files to read 293


In [10]:
DFs=[]
Summaries=[]

print('Process monitoring, file numbers: - standard df, / time break dfs, no df *')
readNewFile=True
i=0
while (i< len(filesToRead)) or (not readNewFile): 
        
    if (readNewFile):
        fileName=filesToRead[i]
        df0=pd.read_json(fileName)
        i+=1 # Next file 
        
    dfExists, df = processHeartRate(df0)
    
    if (dfExists):
                
        breakExists,breakIndex=identifySilentTime(df)
        if (breakExists):
            df1=df[0:breakIndex+1]
            df0=df[breakIndex+1:].reset_index(drop=True)
            readNewFile=False
            print(i-1,end='/')
        else: # (NOT breakExists)
            df1=df
            readNewFile=True
            print(i-1,end='-')
        # IF (breakExists) ends
        
        DFs.append(df1)
        summary = calculateSummery(df1,fileName)
        Summaries.append(summary)
    
    else: # (NOT dfExists)
        print(i-1,end='*')
        readNewFile=True
    # IF (dfExists) ends
    
# WHILE ends 
print('\nlen(DFs),len(Summaries)',len(DFs),len(Summaries))

Process monitoring, file numbers: - standard df, / time break dfs, no df *
0/0*1/1/1/1*2/2/2-3/3/3/3/3-4/4-5-6/6-7/7/7-8/8-9/9-10/10-11/11*12/12-13-14/14-15/15-16-17-18-19/19-20-21-22/22-23/23/23-24/24*25-26/26-27/27-28/28-29-30-31/31-32-33/33-34-35-36/36-37-38-39/39-40/40-41/41/41-42/42-43/43/43-44-45-46/46-47/47-48-49-50-51/51-52/52*53*54-55-56-57-58-59*60/60-61-62-63/63*64-65/65*66-67-68-69-70-71/71-72-73/73-74/74-75/75/75/75/75-76-77-78-79/79-80-81/81-82-83/83-84/84-85-86-87/87-88-89-90-91*92/92/92-93-94/94/94*95/95/95-96-97-98-99-100/100-101/101-102*103/103-104/104/104-105*106*107*108*109*110-111/111-112/112/112/112/112*113-114-115-116/116/116-117-118/118/118-119/119/119-120-121*122/122-123-124/124/124/124-125-126-127-128-129/129-130*131-132*133*134-135*136/136/136/136/136-137*138*139-140/140-141*142-143*144-145*146-147/147*148-149/149-150-151-152/152-153*154-155*156-157-158-159-160*161-162*163/163-164/164/164*165/165/165/165/165/165/165-166/166/166-167*168/168-169/169-170*171-172

In [11]:
# Remove short sequences

for idx, d in enumerate(Summaries):
    delta=d['duration']
    if (delta<pd.Timedelta(short_time_limit)):
        print(idx,end=' ')
        del Summaries[idx]
print('\n Summaries',len(Summaries))

for idx, df in enumerate(DFs):
    Date=df.iloc[0]['Date']
    delta=datetime.combine( Date, df['Time'].max() ) - datetime.combine( Date,df['Time'].min() )
    if (delta<pd.Timedelta(short_time_limit)):
        print(idx,end=' ')
        del DFs[idx]
print('\n DFs',len(DFs))

2 9 11 15 19 23 25 30 37 42 43 45 56 59 62 67 99 113 114 125 139 144 154 155 156 162 163 172 175 179 181 192 193 194 196 197 199 201 207 214 215 221 230 239 243 247 250 251 252 256 261 268 273 281 288 293 298 300 303 304 312 322 327 331 346 348 354 
 Summaries 357
2 9 11 15 19 23 25 30 37 42 43 45 56 59 62 67 99 113 114 125 139 144 154 155 156 162 163 172 175 179 181 192 193 194 196 197 199 201 207 214 215 221 230 239 243 247 250 251 252 256 261 268 273 281 288 293 298 300 303 304 312 322 327 331 346 348 354 
 DFs 357


In [12]:
pickle.dump(DFs, open('DFs.pkl', 'wb'))
pickle.dump(Summaries, open('Summaries.pkl', 'wb'))