In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.preprocessing import scale

import warnings
warnings.filterwarnings('ignore')

In [83]:
filename = 'datasets/Activities.csv'

# All columns available in the export (kept for reference):
# cols = ['Activity Type', 'Date', 'Title', 'Distance', 'Calories',
#        'Time', 'Avg HR', 'Max HR', 'Aerobic TE', 'Avg Run Cadence',
#        'Max Run Cadence', 'Avg Pace', 'Best Pace', 'Elev Gain', 'Elev Loss',
#        'Avg Stride Length', 'Avg Vertical Ratio', 'Avg Vertical Oscillation',
#        'Avg Ground Contact Time', 'Avg GCT Balance']

# CSV headers to load ...
cols = ['Activity Type', 'Date', 'Title', 'Distance', 'Calories',
       'Time', 'Avg HR', 'Avg Run Cadence', 'Avg Pace']

# ... and the short snake_case name for each, position by position.
colnames = ['type', 'date', 'title', 'distance', 'calories',
       'time', 'avg_hr', 'avg_cadence', 'avg_pace']

# '--' marks a missing value in the file and ',' is the thousands separator.
df = pd.read_csv(filename, parse_dates=['Date'], usecols=cols,
                na_values='--', thousands=',')

# `usecols` ignores the order of the list and returns columns in file order,
# so assigning `df.columns = colnames` positionally would mislabel columns if
# the file order ever differed from `cols`. Select and rename by name instead.
df = df[cols].rename(columns=dict(zip(cols, colnames)))

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 9 columns):
type           705 non-null object
date           705 non-null datetime64[ns]
title          705 non-null object
distance       705 non-null float64
calories       686 non-null float64
time           705 non-null object
avg_hr         687 non-null float64
avg_cadence    593 non-null float64
avg_pace       693 non-null object
dtypes: datetime64[ns](1), float64(4), object(4)
memory usage: 49.6+ KB


In [84]:
# Keep only the columns used for labelling runs. The explicit .copy() makes
# this an independent frame, so later cells can add columns to it without
# writing into a slice of the full activity table (the SettingWithCopy case).
df = df[['type', 'title', 'date', 'distance', 'time']].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 5 columns):
type        705 non-null object
title       705 non-null object
date        705 non-null datetime64[ns]
distance    705 non-null float64
time        705 non-null object
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 27.6+ KB


In [86]:
# Number of recorded activities per activity type, most frequent first.
df.loc[:, 'type'].value_counts()

running              560
indoor_cycling        60
treadmill_running     46
cycling               24
walking                8
elliptical             4
other                  3
Name: type, dtype: int64

In [95]:
# Distance (same unit as the `distance` column) at or above which an activity
# is flagged as a long run even when its title does not say so.
LONG_RUN_MIN_DISTANCE = 13

# Work on an independent copy so the new columns are never written into a
# slice of another frame.
df = df.copy()

# Lower-case the titles so the keyword matching below is case-insensitive.
df['title'] = df['title'].str.lower()
titles = df['title']

# One boolean flag per keyword found in the title. `na=False` makes a missing
# title count as "no match" instead of NaN, which would otherwise be truthy
# when the flags are tested with a plain `if`.
df['easy'] = titles.str.contains('easy', na=False)
df['tempo'] = titles.str.contains('tempo', na=False)
df['interval'] = titles.str.contains('interval', na=False)
df['long'] = (titles.str.contains('long', na=False)
              | (df['distance'] >= LONG_RUN_MIN_DISTANCE))
# 'recover' matches both "recover" and "recovery".
df['recovery'] = titles.str.contains('recover', na=False)

def categorize_run(x):
    """Pick a single category label for one activity row.

    Parameters
    ----------
    x : mapping-like row (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide the flags 'tempo', 'interval', 'recovery', 'long' and
        'easy'; they are looked up in that order until one is truthy.

    Returns
    -------
    str
        The name of the first truthy flag in the precedence order above,
        or an empty string when none of the flags is set.
    """
    # Ordered by precedence: the first truthy flag decides the label.
    precedence = ('tempo', 'interval', 'recovery', 'long', 'easy')
    for label in precedence:
        if x[label]:
            return label
    return ''

# Label every activity row with its category ('' when no flag is set).
df['category'] = df.apply(categorize_run, axis=1)

# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0,type,title,date,distance,time,easy,tempo,interval,long,recovery,category
0,running,easy run after sick,2018-10-12 21:10:12,5.04,34:49,True,False,False,False,False,easy
1,running,lat phrao run,2018-10-09 23:21:25,5.09,31:26,False,False,False,False,False,
2,running,lat phrao run,2018-10-07 22:21:38,11.01,1:08:30,False,False,False,False,False,
3,treadmill_running,treadmill,2018-10-06 21:24:32,2.40,30:38,False,False,False,False,False,
4,treadmill_running,treadmill,2018-10-05 21:01:16,5.07,32:19,False,False,False,False,False,
5,treadmill_running,treadmill,2018-10-04 20:51:56,3.13,30:26,False,False,False,False,False,
6,running,lat phrao run,2018-10-03 20:25:54,7.85,43:03,False,False,False,False,False,
7,running,easy run,2018-10-02 20:05:12,5.04,32:14,True,False,False,False,False,easy
8,running,chatuchak run,2018-09-30 16:08:15,4.76,29:16,False,False,False,False,False,
9,running,1600x2 800x2 run,2018-09-26 20:50:53,6.80,37:35,False,False,False,False,False,


### Export to manually tag running categories

In [96]:
# Write the flagged activities (row index included) to CSV as the starting
# point for manual tagging.
labels_path = 'output/activity_labels_init.csv'
df.to_csv(labels_path, encoding='utf-8')