In [1]:
import pandas as pd
import datetime as dt
import numpy as np
import shutil
import os
from pathlib import Path

import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
p = Path.home()
cwd = Path.cwd()
readme = cwd / 'README.md'
activities_csv_downloaded = str(p / 'Downloads' / 'Activities.csv')
activities_csv_raw = str(cwd / 'data' / '01_raw' / 'Activities.csv')
activities_csv_processed = str(cwd / 'data' / '02_processed' / 'Activities_processed.csv')

In [4]:
activities_df_downloaded = pd.read_csv(activities_csv_downloaded)
activities_df_raw = pd.read_csv(activities_csv_raw)
activities_df_raw = pd.concat([activities_df_downloaded, activities_df_raw]).drop_duplicates(subset='Date')
activities_df_raw.to_csv(activities_csv_raw)
os.remove(activities_csv_downloaded)

In [42]:
def clean_and_convert(df):
    df = clean_time(df)
    df = cols_to_float(df)
    return df

def clean_run_pace(df):
    #df.best_pace = df.best_pace.apply(lambda x: dt.datetime.strptime(x, '%M:%S'))
    df.best_pace = pd.to_timedelta(df.best_pace)
    #df.best_pace = df.best_pace.dt.time
    df.avg_pace = pd.to_timedelta(df.avg_pace)
    #df.avg_pace = df.avg_pace.apply(lambda x: dt.datetime.strptime(x, '%M:%S'))
    #df.avg_pace = df.avg_pace.dt.time
    return df

def clean_bike_pace(df):
    df.avg_pace = df.avg_pace.astype(float)
    return df

def download_to_raw():
    activities_df_downloaded = pd.read_csv(activities_csv_downloaded)
    activities_df_raw = pd.read_csv(activities_csv_raw)
    activities_df_raw = pd.concat([activities_df_downloaded, activities_df_raw]).drop_duplicates(subset='Date')
    activities_df_raw.to_csv(activities_csv_raw, index=False)
    os.remove(activities_csv_downloaded)

def prep_df(df):
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    df.drop(df.columns[17:], axis=1, inplace=True)
    df.drop(['favorite'], axis=1, inplace=True)
    df.replace('--', '0', inplace=True)
    df.date = pd.to_datetime(df.date)
    df.set_index('date', inplace=True)
    return df

def clean_time(df):
    df.time = df.time.apply(lambda x: x.split('.')[0])
    df.time = df.time.apply(lambda x: dt.datetime.strptime(x, '%H:%M:%S'))
    df.time = df.time.dt.time
    return df

def cols_to_float(df):
    df.calories = df.calories.str.replace(',', '').astype(float)
    df.iloc[:, 5:10] = df.iloc[:, 5:10].replace('--', '0')
    df.iloc[:, 5:10] = df.iloc[:, 5:10].astype(float)
    return df

def split_run_and_bike(df):
    run = df[df.activity_type == 'Running']
    bike = df[df.activity_type == 'Cycling']
    return run, bike

In [43]:
data = pd.read_csv(activities_csv_raw)
data = prep_df(data)
data = clean_and_convert(data)
run, bike = split_run_and_bike(data)
run = clean_run_pace(run)
bike = clean_bike_pace(bike)

In [44]:
type(data.calories[2])

numpy.float64

In [60]:
dataraw.Time = dataraw.Time.apply(lambda x: x.split('.')[0])

In [62]:
dataraw['timedelta'] = pd.to_timedelta(dataraw.Time)

In [88]:
rawrun = dataraw[dataraw['Activity Type'] == 'Running']

In [90]:
dataraw.Time = pd.to_timedelta(dataraw.Time)

In [87]:
dataraw.columns

Index(['Activity Type', 'Date', 'Favorite', 'Title', 'Distance', 'Calories',
       'Time', 'Avg HR', 'Max HR', 'Aerobic TE', 'Avg Run Cadence',
       'Max Run Cadence', 'Avg Pace', 'Best Pace', 'Elev Gain', 'Elev Loss',
       'Avg Stride Length', 'Avg Vertical Ratio', 'Avg Vertical Oscillation',
       'Training Stress Score®', 'Grit', 'Flow', 'Climb Time', 'Bottom Time',
       'Min Temp', 'Surface Interval', 'Decompression', 'Best Lap Time',
       'Number of Laps', 'Max Temp', 'Unnamed: 0', 'Unnamed: 0.1',
       'Total Strokes', 'Total Reps', 'Total Sets', 'timedelta',
       'calories per minute'],
      dtype='object')

In [74]:
dataraw['Calories'][2] / (dataraw['timedelta'][2].seconds / 60)

17.91416705122289

In [86]:
dataraw.timedelta.dt.seconds / 60 / 60

0      0.373889
1      0.611389
2      0.601944
3      0.790833
4      0.428333
         ...   
344    0.164722
345    4.080556
346    0.485556
347    0.974167
348    1.086944
Name: timedelta, Length: 349, dtype: float64

In [77]:
dataraw['calories per minute'] = dataraw['Calories'] / (dataraw.timedelta.dt.seconds / 60)

In [81]:
dataraw.sort_values(by='calories per minute', ascending=False)

Unnamed: 0.2,Activity Type,Date,Favorite,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,...,Best Lap Time,Number of Laps,Max Temp,Unnamed: 0,Unnamed: 0.1,Total Strokes,Total Reps,Total Sets,timedelta,calories per minute
57,Running,2020-08-15 18:57:52,False,Portland Running,1.01,142.0,00:07:37,173,181,2.4,...,00:03.33,2,0.0,53.0,51.0,0,0,0,0 days 00:07:37,18.643326
348,Running,2018-09-05 18:07:48,False,Portland Running,6.01,1205.0,01:05:13,171,197,5.0,...,00:00.00,7,0.0,344.0,342.0,0,0,0,0 days 01:05:13,18.476872
42,Running,2020-09-19 10:32:09,False,Portland Running,4.01,669.0,00:36:29,168,187,4.7,...,00:05.18,5,0.0,38.0,38.0,,,,0 days 00:36:29,18.337140
14,Running,2020-10-27 14:07:59,False,Portland Running,5.02,854.0,00:46:44,162,176,3.9,...,00:08.66,6,0.0,,,,,,0 days 00:46:44,18.273894
36,Running,2020-09-30 14:30:18,False,Portland Running,3.11,493.0,00:27:00,166,183,3.7,...,00:59.25,4,0.0,32.0,32.0,,,,0 days 00:27:00,18.259259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,Cycling,2020-02-22 10:51:52,False,Portland Cycling,40.22,614.0,02:59:27,80,159,1.7,...,01:08.15,9,0.0,176.0,174.0,0,0,0,0 days 02:59:27,3.421566
128,Cycling,2020-04-19 13:36:23,False,Portland Cycling,0.05,2.0,00:00:36,87,98,0.0,...,00:00.00,1,0.0,124.0,122.0,0,0,0,0 days 00:00:36,3.333333
150,Indoor Running,2020-03-08 19:35:20,False,Indoor Running,0.05,1.0,00:00:19,100,112,0.0,...,00:00.00,1,0.0,146.0,144.0,0,0,0,0 days 00:00:19,3.157895
334,Strength Training,2019-04-10 12:30:19,False,Strength,0.00,4.0,00:01:23,94,102,0.0,...,00:00.00,1,0.0,330.0,328.0,0,1070,5,0 days 00:01:23,2.891566


In [65]:
dataraw.replace('--', '0', inplace=True)
dataraw.Calories = dataraw.Calories.str.replace(',', '').astype(float)

In [67]:
dataraw['cals per minute'] = dataraw.Calories / dataraw.timedelta.

AttributeError: 'TimedeltaProperties' object has no attribute 'minute'

In [51]:
run['time'][4].minute

58

In [76]:
run.index[:1]

DatetimeIndex(['2020-11-10 15:12:40'], dtype='datetime64[ns]', name='date', freq=None)

In [84]:
run.columns

Index(['activity_type', 'title', 'distance', 'calories', 'time', 'avg_hr',
       'max_hr', 'aerobic_te', 'avg_run_cadence', 'max_run_cadence',
       'avg_pace', 'best_pace', 'elev_gain', 'elev_loss', 'avg_stride_length'],
      dtype='object')

In [104]:
run.avg_pace[2].strftime('%M:%S')

'08:53'

In [87]:
fig2 = px.scatter(run, y='avg_pace', hover_data=['distance', 'calories'])
fig2.show()

In [94]:
#y = [8, 8.5, 9, 8.5, 8, 9, 8, 8.5, 9, 10]

run_slice = run.iloc[:30]
x = run.index[:10]
y = run.avg_pace[:10].values
fig = px.scatter(data_frame=run_slice, 
                y='avg_pace', 
                hover_data=['distance', 'calories'], 
                labels=dict(avg_pace='Average Pace', date='Date')
                )

fig.show()

In [219]:
last_15_weeks = pd.DataFrame(data[data.activity_type == 'Running'].resample('W-MON').distance.sum().tail(15))

In [7]:
activity_choice_last_5 = data[data.activity_type == 'Running'].head(5)

In [16]:
activity_choice_last_5[['distance', 'calories', 'time', 'avg_pace']].copy()

Unnamed: 0_level_0,distance,calories,time,avg_pace
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-11-10 15:12:40,3.11,463,00:25:42,8:16
2020-11-08 16:14:17,6.26,1011,00:58:11,9:17
2020-11-06 14:33:08,4.35,703,00:38:38,8:53
2020-11-04 12:16:30,4.0,638,00:35:02,8:45
2020-11-03 14:17:58,4.01,623,00:35:40,8:54


In [15]:
bike.shape[0]

177

In [17]:
run[['avg_pace']].

Unnamed: 0_level_0,avg_pace
date,Unnamed: 1_level_1
2020-11-10 15:12:40,00:08:16
2020-11-08 16:14:17,00:09:17
2020-11-06 14:33:08,00:08:53
2020-11-04 12:16:30,00:08:45
2020-11-03 14:17:58,00:08:54
...,...
2018-09-20 18:03:06,00:09:49
2018-09-15 20:31:46,00:21:15
2018-09-10 18:09:12,00:12:47
2018-09-08 08:08:11,00:11:40


In [66]:
# I believe you're not supposed to have mixed datatypes in a dataframe.  that being said, if we do end up splitting the biking and running activities to multiple dataframes, we'll have to calculate the pace ourselves.  who am I even talking to right now? 
# final_df[final_df.activity_type == 'Cycling'].iloc[:, 10:12] = final_df[final_df.activity_type == 'Cycling'].iloc[:, 10:12].astype(float)
# final_df[final_df.activity_type == 'Running'].iloc[:, 10:12] = final_df[final_df.activity_type == 'Running'].iloc[:, 10:12].apply(lambda x: dt.datetime.strptime(x, '%M:%S'))

# 

### Checking for days I biked multiple times.  Every day I commuted....other days I did there and back rides....so that's a fair amount

In [None]:
# the keep=False keeps all the duplicates, instead of the first or the last dup
# Syntax 1
bike[bike.date.dt.date.duplicated(keep=False)]

# Syntax 2
#bike[bike.duplicated(subset=['date'], keep=False)].sort_index()

In [None]:
#run.resample('D').distance.sum()
last_15_weeks = pd.DataFrame(run.resample('W-MON').distance.sum().tail(15))

run.set_index("date", inplace=True)
last_15_weeks = pd.DataFrame(run.resample("W-MON").distance.sum().tail(15))