In [40]:
import pandas as pd
import datetime as dt
import numpy as np
import shutil
import os
from pathlib import Path

import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Base directories: the user's home folder and the current working directory.
p, cwd = Path.home(), Path.cwd()
readme = cwd.joinpath('README.md')

# CSV locations, kept as plain strings: the copy in ~/Downloads, the raw copy
# under data/01_raw, and the processed file under data/02_processed.
activities_csv_downloaded = str(p.joinpath('Downloads', 'Activities.csv'))
activities_csv_raw = str(cwd.joinpath('data', '01_raw', 'Activities.csv'))
activities_csv_processed = str(cwd.joinpath('data', '02_processed', 'Activities_processed.csv'))

In [184]:
# Merge the downloaded export into the accumulated raw CSV, keeping one row per
# 'Date' value. The downloaded frame is concatenated first, so its row is the
# one kept when both files contain the same 'Date'.
activities_df_downloaded = pd.read_csv(activities_csv_downloaded)
activities_df_raw = pd.read_csv(activities_csv_raw)
activities_df_raw = pd.concat([activities_df_downloaded, activities_df_raw]).drop_duplicates(subset='Date')
# index=False: the positional index carries no information, and writing it
# would add a new 'Unnamed: 0' column to the raw file on every round trip.
activities_df_raw.to_csv(activities_csv_raw, index=False)
# The download is removed only after the merged file has been written.
os.remove(activities_csv_downloaded)

In [74]:
    # Read the processed activities with 'date' parsed as the index, then run
    # the shared clean-up over the whole frame.
    data = clean_and_convert(
        pd.read_csv(activities_csv_processed, parse_dates=True, index_col='date')
    )
    # Separate the two activity types and normalise each one's pace column.
    run, bike = split_run_and_bike(data)
    run, bike = clean_run_pace(run), clean_bike_pace(bike)

In [76]:
# Display a one-element slice of `run`'s index (its first label).
run.index[:1]

DatetimeIndex(['2020-11-10 15:12:40'], dtype='datetime64[ns]', name='date', freq=None)

In [84]:
# Display the column labels available on `run`.
run.columns

Index(['activity_type', 'title', 'distance', 'calories', 'time', 'avg_hr',
       'max_hr', 'aerobic_te', 'avg_run_cadence', 'max_run_cadence',
       'avg_pace', 'best_pace', 'elev_gain', 'elev_loss', 'avg_stride_length'],
      dtype='object')

In [87]:
# Scatter of average pace for every row in `run`, plotted against the frame's
# index; distance and calories are added to the hover tooltip.
fig2 = px.scatter(
    run,
    y='avg_pace',
    hover_data=['distance', 'calories'],
    title='Average running pace per activity',
    labels={'avg_pace': 'Average pace'},
)
fig2.show()

In [80]:
# Minute component of the third index label of `run`. Read straight from
# `run.index` so this cell does not depend on a helper variable that is only
# assigned further down the notebook.
run.index[2].minute

33

In [88]:
# First ten rows of `run`, in the frame's existing order.
run_slice = run.iloc[:10]
# `x` and `y` are not used by the plot below; they are kept because another
# cell in this notebook reads `x`.
x = run.index[:10]
y = run.avg_pace[:10].values
fig = px.scatter(
    data_frame=run_slice,
    y='avg_pace',
    hover_data=['distance', 'calories'],
    title='Average pace, first 10 rows of run',
    labels={'avg_pace': 'Average pace'},
)

fig.show()

TypeError: scatter() got an unexpected keyword argument 'data'

In [73]:
def clean_and_convert(df):
    """Run the time clean-up and then the float conversion on ``df``.

    ``df`` is passed through ``clean_time`` and that result is handed to
    ``cols_to_float``; whatever ``cols_to_float`` returns is returned.
    """
    return cols_to_float(clean_time(df))

def clean_run_pace(df):
    """Parse the running pace columns from ``'M:SS'`` text into datetimes.

    ``best_pace`` and ``avg_pace`` are parsed with the ``'%M:%S'`` format, so
    ``'8:16'`` becomes ``1900-01-01 00:08:16`` (format-only parsing supplies
    the 1900-01-01 date). Values that do not match the format -- for example
    the ``'0'`` placeholder that ``prep_df`` substitutes for ``'--'`` --
    become ``NaT`` instead of aborting the whole conversion with a
    ``ValueError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``best_pace`` and ``avg_pace`` columns. Both columns are
        replaced on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both pace columns converted.
    """
    for column in ('best_pace', 'avg_pace'):
        # Vectorised parse; errors='coerce' maps unparseable entries to NaT.
        df[column] = pd.to_datetime(df[column], format='%M:%S', errors='coerce')
    return df

def clean_bike_pace(df):
    """Cast the ``avg_pace`` column of ``df`` to float on ``df`` itself and return ``df``."""
    pace_as_float = df['avg_pace'].astype(float)
    df['avg_pace'] = pace_as_float
    return df

def download_to_raw(downloaded_csv=None, raw_csv=None):
    """Merge a downloaded activities export into the raw CSV, then delete the download.

    Both files are read, the downloaded rows are placed ahead of the existing
    raw rows, and only the first row for each ``'Date'`` value is kept (so the
    downloaded row wins on a clash). The merged frame overwrites the raw file
    and the downloaded file is then removed.

    Parameters
    ----------
    downloaded_csv : str or path-like, optional
        Export to merge in. Defaults to the module-level
        ``activities_csv_downloaded``.
    raw_csv : str or path-like, optional
        Raw file to update. Defaults to the module-level
        ``activities_csv_raw``.

    Returns
    -------
    None
    """
    if downloaded_csv is None:
        downloaded_csv = activities_csv_downloaded
    if raw_csv is None:
        raw_csv = activities_csv_raw

    downloaded = pd.read_csv(downloaded_csv)
    existing = pd.read_csv(raw_csv)
    merged = pd.concat([downloaded, existing]).drop_duplicates(subset='Date')

    # index=False: writing the positional index would add a new 'Unnamed: 0'
    # column to the raw file every time it is read back and rewritten.
    merged.to_csv(raw_csv, index=False)

    # Delete the download only once the merged file has been written.
    os.remove(downloaded_csv)

def prep_df(df):
    """Normalise a raw activities frame in place and index it by date.

    All steps are applied to ``df`` itself: column labels are lower-cased with
    spaces turned into underscores; every column from position 17 onward is
    dropped, followed by ``favorite``; each ``'--'`` value becomes ``'0'``;
    ``date`` is parsed to datetimes and becomes the index.

    Returns the same (mutated) ``df``.
    """
    snake_case_labels = df.columns.str.lower().str.replace(' ', '_')
    df.columns = snake_case_labels

    # Two separate drops: first the positional tail, then the named column.
    trailing_labels = df.columns[17:]
    df.drop(columns=trailing_labels, inplace=True)
    df.drop(columns=['favorite'], inplace=True)

    df.replace(to_replace='--', value='0', inplace=True)

    df['date'] = pd.to_datetime(df['date'])
    df.set_index(keys='date', inplace=True)
    return df

def clean_time(df):
    """Convert the ``time`` column from ``'HH:MM:SS[.fraction]'`` text to ``datetime.time``.

    Anything after the first ``'.'`` is discarded before parsing with
    ``'%H:%M:%S'``. The conversion is done in a single pass that yields
    ``datetime.time`` objects directly, so an empty frame is returned
    unchanged instead of failing on the ``.dt`` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``time`` column. The column is replaced on ``df``
        itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``time`` holding ``datetime.time`` values.

    Raises
    ------
    ValueError
        If a value does not match ``'%H:%M:%S'`` once the fraction is removed.
    """
    def _to_time(raw):
        # Keep only the part before the first '.', then parse it.
        whole_seconds = raw.split('.')[0]
        return dt.datetime.strptime(whole_seconds, '%H:%M:%S').time()

    df['time'] = df['time'].apply(_to_time)
    return df

def cols_to_float(df):
    """Convert ``calories`` and the columns at positions 5-9 to float.

    ``calories`` has thousands separators (``','``) removed before the cast,
    and the result is stored back on the frame. The columns at positions
    5 through 9 have ``'--'`` replaced with ``'0'`` and are reassigned by
    label, so they end up with a float dtype rather than staying object
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``calories`` column and at least ten columns. The
        affected columns are replaced on ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the converted columns.

    Raises
    ------
    ValueError
        If a value in an affected column cannot be interpreted as a float.
    """
    # astype(str) lets the comma stripping also work when the column was
    # already read as numeric.
    calories_text = df['calories'].astype(str).str.replace(',', '', regex=False)
    df['calories'] = calories_text.replace('--', '0').astype(float)

    # Assign by label rather than through iloc so the new float dtype replaces
    # the old columns instead of being written into them.
    numeric_cols = df.columns[5:10]
    df[numeric_cols] = df[numeric_cols].replace('--', '0').astype(float)
    return df

def split_run_and_bike(df):
    """Split ``df`` into its running rows and its cycling rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``activity_type`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(run, bike)``: the rows whose ``activity_type`` is ``'Running'`` and
        ``'Cycling'`` respectively. Rows of any other type appear in neither.
        Both are explicit copies, so assigning columns on them afterwards
        does not raise ``SettingWithCopyWarning``.
    """
    run = df[df.activity_type == 'Running'].copy()
    bike = df[df.activity_type == 'Cycling'].copy()
    return run, bike

In [219]:
# Sum running distance into weekly 'W-MON' bins and keep the last 15 bins as a
# one-column frame.
last_15_weeks = (
    data.loc[data.activity_type == 'Running']
    .resample('W-MON')['distance']
    .sum()
    .tail(15)
    .to_frame()
)

In [7]:
# First five running rows, in the frame's existing row order.
activity_choice_last_5 = data.loc[data['activity_type'] == 'Running'].iloc[:5]

In [16]:
# Display an independent copy limited to the four summary columns.
activity_choice_last_5.loc[:, ['distance', 'calories', 'time', 'avg_pace']].copy()

Unnamed: 0_level_0,distance,calories,time,avg_pace
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-11-10 15:12:40,3.11,463,00:25:42,8:16
2020-11-08 16:14:17,6.26,1011,00:58:11,9:17
2020-11-06 14:33:08,4.35,703,00:38:38,8:53
2020-11-04 12:16:30,4.0,638,00:35:02,8:45
2020-11-03 14:17:58,4.01,623,00:35:40,8:54


In [15]:
# Number of rows in `bike`.
len(bike)

177

In [17]:
# Display the avg_pace column of `run` as a one-column frame. The dangling
# attribute access that made this cell a syntax error has been removed.
run[['avg_pace']]

Unnamed: 0_level_0,avg_pace
date,Unnamed: 1_level_1
2020-11-10 15:12:40,00:08:16
2020-11-08 16:14:17,00:09:17
2020-11-06 14:33:08,00:08:53
2020-11-04 12:16:30,00:08:45
2020-11-03 14:17:58,00:08:54
...,...
2018-09-20 18:03:06,00:09:49
2018-09-15 20:31:46,00:21:15
2018-09-10 18:09:12,00:12:47
2018-09-08 08:08:11,00:11:40


In [66]:
# I believe you're not supposed to have mixed datatypes within a single DataFrame column. That said, if we do end up splitting the biking and running activities into separate dataframes, we'll have to calculate the pace ourselves. Who am I even talking to right now?
# final_df[final_df.activity_type == 'Cycling'].iloc[:, 10:12] = final_df[final_df.activity_type == 'Cycling'].iloc[:, 10:12].astype(float)
# final_df[final_df.activity_type == 'Running'].iloc[:, 10:12] = final_df[final_df.activity_type == 'Running'].iloc[:, 10:12].apply(lambda x: dt.datetime.strptime(x, '%M:%S'))

# 

### Checking for days I biked multiple times. That includes every day I commuted, plus other days when I did there-and-back rides, so that's a fair amount.

In [None]:
# Rows of `bike` that share a calendar day with at least one other row.
# `date` is the index of this frame, not a column, so the day is derived from
# the index: normalize() resets each timestamp to midnight.
# keep=False marks every member of a duplicated group, instead of leaving the
# first or the last occurrence unmarked.
# Syntax 1
bike[bike.index.normalize().duplicated(keep=False)]

# Syntax 2
#bike[pd.Index(bike.index.date).duplicated(keep=False)].sort_index()

In [None]:
# Sum running distance into weekly 'W-MON' bins and keep the last 15 bins as a
# one-column frame. `run` is already indexed by `date`, so it can be resampled
# directly; calling set_index("date") again would raise a KeyError.
last_15_weeks = run.resample('W-MON')['distance'].sum().tail(15).to_frame()