In [43]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [2]:
# Path of the activities CSV export, relative to the notebook's working directory.
ACTIVITIES_CSV = './data/activities_20201001.csv'

# Load the raw activity log; the bare name on the last line renders the frame.
df = pd.read_csv(ACTIVITIES_CSV)
df

Unnamed: 0,Activity Type,Date,Favorite,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,...,Grit,Flow,Climb Time,Bottom Time,Min Temp,Surface Interval,Decompression,Best Lap Time,Number of Laps,Max Temp
0,Running,2020-10-01 23:42:43,False,West Coast,12.01,628,01:19:02,165,186,3.7,...,0,0,1:19:02,0:00,29,0:00,No,00:02.02,13,0
1,Running,2020-09-28 23:08:06,False,West Coast,12.01,674,01:22:05,170,184,3.9,...,0,0,1:22:05,0:00,30,0:00,No,00:02.06,13,0
2,Other,2020-09-27 19:55:18,False,Table Tennis,0.00,612,01:55:53,140,180,3.1,...,0,0,1:55:53,0:00,28,0:00,No,01:55:53.43,1,0
3,Running,2020-09-24 23:40:59,False,West Coast,8.01,470,00:54:08,171,185,3.6,...,0,0,54:08,0:00,30,0:00,No,00:02.43,9,0
4,Other,2020-09-23 19:59:19,False,Table Tennis,0.00,873,01:51:55,156,171,3.6,...,0,0,1:51:55,0:00,31,0:00,No,01:51:55.17,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,Running,2017-12-20 18:45:50,False,Marina Bay,3.95,--,00:23:55,178,194,--,...,0,0,23:55,0:00,0,0:00,No,00:00.00,1,0
366,Running,2017-12-08 18:20:38,False,Marina Bay,5.02,--,00:29:17,--,--,--,...,0,0,29:17,0:00,0,0:00,No,00:00.00,1,0
367,Running,2017-12-05 18:17:43,False,Marina Bay,4.61,--,00:28:19,--,--,--,...,0,0,28:19,0:00,0,0:00,No,00:00.00,1,0
368,Running,2017-11-30 18:16:51,False,Marina Bay,3.21,--,00:19:14,--,--,--,...,0,0,19:14,0:00,0,0:00,No,00:00.00,1,0


In [3]:
# Inspect the column labels available in the loaded frame.
df.columns

Index(['Activity Type', 'Date', 'Favorite', 'Title', 'Distance', 'Calories',
       'Time', 'Avg HR', 'Max HR', 'Aerobic TE', 'Avg Run Cadence',
       'Max Run Cadence', 'Avg Pace', 'Best Pace', 'Elev Gain', 'Elev Loss',
       'Avg Stride Length', 'Avg Vertical Ratio', 'Avg Vertical Oscillation',
       'Avg Ground Contact Time', 'Avg GCT Balance', 'Avg Run Cadence.1',
       'Max Run Cadence.1', 'Training Stress Score®', 'Grit', 'Flow',
       'Climb Time', 'Bottom Time', 'Min Temp', 'Surface Interval',
       'Decompression', 'Best Lap Time', 'Number of Laps', 'Max Temp'],
      dtype='object')

In [4]:
# Keep only the columns used by the rest of the notebook and turn the '--'
# placeholder (how the raw export marks missing values) into np.nan.

relevant_columns = [
    'Activity Type', 'Date', 'Title', 'Distance', 'Calories', 'Time', 'Avg HR',
    'Max HR', 'Aerobic TE', 'Avg Run Cadence', 'Avg Pace', 'Elev Gain',
    'Elev Loss', 'Avg Stride Length', 'Min Temp'
]

# DataFrame.replace substitutes exact '--' matches in a single vectorised call.
# It supersedes the element-wise applymap(lambda ...), which runs a Python
# function per cell and is deprecated since pandas 2.1.
df = df[relevant_columns].replace('--', np.nan)

df

Unnamed: 0,Activity Type,Date,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,Avg Run Cadence,Avg Pace,Elev Gain,Elev Loss,Avg Stride Length,Min Temp
0,Running,2020-10-01 23:42:43,West Coast,12.01,628,01:19:02,165,186,3.7,154,6:35,38,21,0.98,29
1,Running,2020-09-28 23:08:06,West Coast,12.01,674,01:22:05,170,184,3.9,153,6:50,30,28,0.95,30
2,Other,2020-09-27 19:55:18,Table Tennis,0.00,612,01:55:53,140,180,3.1,8,,,,0.00,28
3,Running,2020-09-24 23:40:59,West Coast,8.01,470,00:54:08,171,185,3.6,154,6:46,17,16,0.96,30
4,Other,2020-09-23 19:59:19,Table Tennis,0.00,873,01:51:55,156,171,3.6,8,,,,0.00,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,Running,2017-12-20 18:45:50,Marina Bay,3.95,,00:23:55,178,194,,,6:03,89,120,0.00,0
366,Running,2017-12-08 18:20:38,Marina Bay,5.02,,00:29:17,,,,,5:50,111,107,0.00,0
367,Running,2017-12-05 18:17:43,Marina Bay,4.61,,00:28:19,,,,,6:08,69,97,0.00,0
368,Running,2017-11-30 18:16:51,Marina Bay,3.21,,00:19:14,,,,,6:00,9,43,0.00,0


In [5]:
# Check the dtype pandas currently holds for each column of `df`.
df.dtypes

Activity Type         object
Date                  object
Title                 object
Distance             float64
Calories              object
Time                  object
Avg HR                object
Max HR                object
Aerobic TE            object
Avg Run Cadence       object
Avg Pace              object
Elev Gain             object
Elev Loss             object
Avg Stride Length    float64
Min Temp               int64
dtype: object

In [47]:
# Give every column an explicit dtype:
#   * labels            -> category
#   * numeric readings  -> float64 (float so np.nan can mark missing values;
#                          casts to nullable 'Int64' were left disabled here)
#   * 'Date'            -> datetime64
#   * 'Time'            -> timedelta64

categorical_cols = ['Activity Type', 'Title']
float_cols = [
    'Calories', 'Avg HR', 'Max HR', 'Aerobic TE',
    'Avg Run Cadence', 'Elev Gain', 'Elev Loss',
]

col_types = {
    **dict.fromkeys(categorical_cols, 'category'),
    **dict.fromkeys(float_cols, 'float64'),
}

# One chain: cast the mapped columns, then parse the two time-like columns.
# assign() on an existing column keeps that column's position in the frame.
df = df.astype(col_types).assign(
    Date=lambda frame: pd.to_datetime(frame['Date']),
    Time=lambda frame: pd.to_timedelta(frame['Time']),
)

df.dtypes

Activity Type               category
Date                  datetime64[ns]
Title                       category
Distance                     float64
Calories                     float64
Time                 timedelta64[ns]
Avg HR                       float64
Max HR                       float64
Aerobic TE                   float64
Avg Run Cadence              float64
Avg Pace                      object
Elev Gain                    float64
Elev Loss                    float64
Avg Stride Length            float64
Min Temp                       int64
dtype: object

In [48]:
# Distinct values present in the 'Activity Type' column.
set(df['Activity Type'])

{'Cycling',
 'Hiking',
 'Indoor Cycling',
 'Indoor Running',
 'Other',
 'Running',
 'Treadmill Running'}

In [49]:
# Subset of `df` holding only the running-type activities, re-indexed from 0,
# with 'Avg Pace' parsed from text into a timedelta.
running_types = ['Indoor Running', 'Running', 'Treadmill Running']
is_run = df['Activity Type'].isin(running_types)

running_df = (
    df.loc[is_run]
    .reset_index(drop=True)
    # The column name contains a space, so it is passed via dict unpacking.
    .assign(**{'Avg Pace': lambda runs: pd.to_timedelta(runs['Avg Pace'])})
)

running_df

Unnamed: 0,Activity Type,Date,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,Avg Run Cadence,Avg Pace,Elev Gain,Elev Loss,Avg Stride Length,Min Temp
0,Running,2020-10-01 23:42:43,West Coast,12.01,628.0,0 days 01:19:02,165.0,186.0,3.7,154.0,0 days 00:06:35,38.0,21.0,0.98,29
1,Running,2020-09-28 23:08:06,West Coast,12.01,674.0,0 days 01:22:05,170.0,184.0,3.9,153.0,0 days 00:06:50,30.0,28.0,0.95,30
2,Running,2020-09-24 23:40:59,West Coast,8.01,470.0,0 days 00:54:08,171.0,185.0,3.6,154.0,0 days 00:06:46,17.0,16.0,0.96,30
3,Running,2020-09-19 23:28:36,West Coast,2.05,97.0,0 days 00:10:11,171.0,193.0,2.5,168.0,0 days 00:04:58,8.0,8.0,1.20,30
4,Running,2020-09-16 23:22:58,West Coast,12.01,664.0,0 days 01:19:10,165.0,177.0,3.6,157.0,0 days 00:06:36,40.0,27.0,0.97,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,Running,2017-12-20 18:45:50,Marina Bay,3.95,,0 days 00:23:55,178.0,194.0,,,0 days 00:06:03,89.0,120.0,0.00,0
230,Running,2017-12-08 18:20:38,Marina Bay,5.02,,0 days 00:29:17,,,,,0 days 00:05:50,111.0,107.0,0.00,0
231,Running,2017-12-05 18:17:43,Marina Bay,4.61,,0 days 00:28:19,,,,,0 days 00:06:08,69.0,97.0,0.00,0
232,Running,2017-11-30 18:16:51,Marina Bay,3.21,,0 days 00:19:14,,,,,0 days 00:06:00,9.0,43.0,0.00,0


In [52]:
# Scatter of average pace against distance for the running activities.
#
# Plotly axes are linear/log/date/category only; there is no timedelta axis
# type. 'Avg Pace' is therefore converted to float minutes for plotting rather
# than being handed over as raw timedelta values. The pace is per unit of
# 'Distance' (the unit itself is not stated in the data shown here).
pace_col = 'Avg Pace (min)'
pace_plot_df = running_df.assign(
    **{pace_col: running_df['Avg Pace'].dt.total_seconds() / 60}
)

fig = px.scatter(
    pace_plot_df,
    x='Distance',
    y=pace_col,
    title='Average pace vs. distance (running activities)',
)
fig.show()

In [14]:
# Activities whose 'Title' is 'Table Tennis', re-indexed from 0.
# NOTE(review): this cell's execution count (14) is lower than that of the
# cells above it, and its saved output still shows 'Time' as plain strings.
# Re-run it after the dtype-conversion cell (Restart & Run All) so the output
# reflects the current `df`.
is_table_tennis = df['Title'] == 'Table Tennis'
table_tennis_df = df.loc[is_table_tennis].reset_index(drop=True)

table_tennis_df

Unnamed: 0,Activity Type,Date,Title,Distance,Calories,Time,Avg HR,Max HR,Aerobic TE,Avg Run Cadence,Avg Pace,Elev Gain,Elev Loss,Avg Stride Length,Min Temp
0,Other,2020-09-27 19:55:18,Table Tennis,0.0,612.0,01:55:53,140.0,180.0,3.1,8.0,,,,0.0,28
1,Other,2020-09-23 19:59:19,Table Tennis,0.0,873.0,01:51:55,156.0,171.0,3.6,8.0,,,,0.0,31
2,Other,2020-09-20 19:59:32,Table Tennis,0.0,781.0,01:56:59,146.0,174.0,3.3,7.0,,,,0.0,29
3,Other,2020-09-13 20:06:34,Table Tennis,0.0,758.0,01:48:50,151.0,174.0,3.4,8.0,,,,0.0,27
4,Other,2020-09-06 20:01:04,Table Tennis,0.0,790.0,01:51:54,154.0,174.0,3.6,10.0,,,,0.0,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,Other,2018-04-28 15:07:56,Table Tennis,0.0,496.0,00:52:35,150.0,174.0,4.0,9.0,,,,0.0,31
126,Other,2018-04-21 15:57:45,Table Tennis,0.0,771.0,01:43:48,138.0,166.0,3.6,7.0,,,,0.0,29
127,Other,2018-04-08 13:12:22,Table Tennis,0.0,813.0,01:47:37,143.0,163.0,3.1,9.0,,,,0.0,29
128,Other,2018-03-31 21:09:09,Table Tennis,0.0,281.0,00:58:13,111.0,144.0,0.8,6.0,,,,0.0,28
