# Strava Data Project
----

**Goals:**
- Develop a script to clean up the data and surface different charts and statistics
    - Totals, Avgs, Time series, Locations, Fastest segments, Longest Runs, All over
        - Speed, Distance, Time, Altitude, Calories, Activity Type
- Create dashboard using d3.js and other front end tools (html, css)
    - Focus on design and dynamics
- Create scrollable/slideshow on webpage to increase interactivity
    - Surface all the major statistics in a nice fashion
    - Dashboard at the end
- Reach Task: Create a live webpage where people can upload their 'activities.csv' data from strava and see their information afterwards
    - Questions
        - Data collections from stangers?
        - Making webpage constantly active?

**Results:**
- Python script to ingest and process data files
- html webpage to show all the information
- Website url for public access

**Future:**
- Weave in whoop data?

In [45]:
import pandas as pd
import os
import datetime

In [46]:
os.getcwd()

'/Users/Ty/Code/Pets/Strava/StravaDash/exploration'

In [47]:
data = pd.read_csv('../data/activities.csv')

In [48]:
df0 = data[['Activity ID', 'Activity Date', 'Activity Name', 'Activity Type', 'Elapsed Time',
           'Elapsed Time.1', 'Distance', 'Distance.1', 'Max Heart Rate', 'Relative Effort',
           'Moving Time', 'Max Speed', 'Average Speed', 'Elevation Gain',
           'Elevation Loss', 'Elevation Low', 'Elevation High', 'Max Grade', 'Average Grade',
           'Average Heart Rate', 'Calories']]
df0.head(2)

Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Elapsed Time,Elapsed Time.1,Distance,Distance.1,Max Heart Rate,Relative Effort,...,Max Speed,Average Speed,Elevation Gain,Elevation Loss,Elevation Low,Elevation High,Max Grade,Average Grade,Average Heart Rate,Calories
0,4551300481,"Jan 1, 2021, 10:56:31 PM",Afternoon Run,Run,612,612.0,1.61,1609.400024,,,...,,2.629739,,,,,,0.0,,
1,4555170880,"Jan 2, 2021, 3:52:42 PM",Morning Run,Run,647,647.0,1.69,1698.300049,,,...,7.0,2.844724,28.823198,26.423201,58.400002,77.400002,8.7,0.141318,,


In [49]:
df0.columns = [i.lower().replace(' ', '_') for i in df0.columns]

In [50]:
df0.iloc[597]

activity_id                         6058411633
activity_date         Oct 3, 2021, 11:45:12 AM
activity_name                   Maine Marathon
activity_type                              Run
elapsed_time                             13595
elapsed_time.1                         13595.0
distance                                 42.94
distance.1                             42942.5
max_heart_rate                             NaN
relative_effort                            NaN
moving_time                            13549.0
max_speed                                  5.4
average_speed                         3.169422
elevation_gain                      256.183472
elevation_loss                       256.28299
elevation_low                              2.0
elevation_high                       51.599998
max_grade                                 10.2
average_grade                        -0.000233
average_heart_rate                         NaN
calories                                   NaN
Name: 597, dt

In [51]:
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   activity_id         660 non-null    int64  
 1   activity_date       660 non-null    object 
 2   activity_name       660 non-null    object 
 3   activity_type       660 non-null    object 
 4   elapsed_time        660 non-null    int64  
 5   elapsed_time.1      660 non-null    float64
 6   distance            660 non-null    float64
 7   distance.1          660 non-null    float64
 8   max_heart_rate      344 non-null    float64
 9   relative_effort     344 non-null    float64
 10  moving_time         660 non-null    float64
 11  max_speed           656 non-null    float64
 12  average_speed       660 non-null    float64
 13  elevation_gain      656 non-null    float64
 14  elevation_loss      311 non-null    float64
 15  elevation_low       311 non-null    float64
 16  elevatio

In [52]:
df = df0.copy()

#df['time_of_day'] = [i.split(',')[2].strip() for i in df['activity_date']]
#df['date_time'] = datetime.datetime.strptime(df['activity_date'], "%b %d, %Y %H:%M:%S%p")
df['date_time'] = pd.to_datetime(df['activity_date']) - pd.Timedelta(hours=4)
df['hour_of_day'] = df['date_time'].dt.hour
df['month_of_year'] = df['date_time'].dt.month

df['distance_miles'] = df['distance'] / 1.609
df['time_minutes'] = df['elapsed_time'] / 60
df['max_speed_mph'] = df['max_speed'] * 2.237
df['average_speed_mph'] = df['average_speed'] * 2.237

df['elevation_gain_ft'] = df['elevation_gain'] * 3.28084


In [53]:
df.head(3)

Unnamed: 0,activity_id,activity_date,activity_name,activity_type,elapsed_time,elapsed_time.1,distance,distance.1,max_heart_rate,relative_effort,...,average_heart_rate,calories,date_time,hour_of_day,month_of_year,distance_miles,time_minutes,max_speed_mph,average_speed_mph,elevation_gain_ft
0,4551300481,"Jan 1, 2021, 10:56:31 PM",Afternoon Run,Run,612,612.0,1.61,1609.400024,,,...,,,2021-01-01 18:56:31,18,1,1.000622,10.2,,5.882725,
1,4555170880,"Jan 2, 2021, 3:52:42 PM",Morning Run,Run,647,647.0,1.69,1698.300049,,,...,,,2021-01-02 11:52:42,11,1,1.050342,10.783333,15.659,6.363647,94.564302
2,4560941785,"Jan 3, 2021, 2:52:02 PM",Morning Run,Run,633,633.0,1.64,1646.58728,168.0,28.0,...,168.0,,2021-01-03 10:52:02,10,1,1.019267,10.55,11.4087,5.979571,92.280133


In [54]:
df = df.rename(columns={'distance':'distance_km', 'elapsed_time.1':'time_sec'})

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   activity_id         660 non-null    int64         
 1   activity_date       660 non-null    object        
 2   activity_name       660 non-null    object        
 3   activity_type       660 non-null    object        
 4   elapsed_time        660 non-null    int64         
 5   time_sec            660 non-null    float64       
 6   distance_km         660 non-null    float64       
 7   distance.1          660 non-null    float64       
 8   max_heart_rate      344 non-null    float64       
 9   relative_effort     344 non-null    float64       
 10  moving_time         660 non-null    float64       
 11  max_speed           656 non-null    float64       
 12  average_speed       660 non-null    float64       
 13  elevation_gain      656 non-null    float64       

In [56]:
df.columns

Index(['activity_id', 'activity_date', 'activity_name', 'activity_type',
       'elapsed_time', 'time_sec', 'distance_km', 'distance.1',
       'max_heart_rate', 'relative_effort', 'moving_time', 'max_speed',
       'average_speed', 'elevation_gain', 'elevation_loss', 'elevation_low',
       'elevation_high', 'max_grade', 'average_grade', 'average_heart_rate',
       'calories', 'date_time', 'hour_of_day', 'month_of_year',
       'distance_miles', 'time_minutes', 'max_speed_mph', 'average_speed_mph',
       'elevation_gain_ft'],
      dtype='object')

In [57]:
cols_to_keep = ['activity_id', 'date_time', 'activity_name', 'activity_type',
                'time_sec', 'distance_km', 'max_heart_rate', 'max_speed',
                'average_speed', 'elevation_gain', 'average_heart_rate',
                'calories', 'date_time', 'hour_of_day', 'month_of_year',
                'distance_miles', 'time_minutes', 'max_speed_mph', 'average_speed_mph',
                'elevation_gain_ft']
out_df = df[cols_to_keep]
out_df.head(2)

Unnamed: 0,activity_id,date_time,activity_name,activity_type,time_sec,distance_km,max_heart_rate,max_speed,average_speed,elevation_gain,average_heart_rate,calories,date_time.1,hour_of_day,month_of_year,distance_miles,time_minutes,max_speed_mph,average_speed_mph,elevation_gain_ft
0,4551300481,2021-01-01 18:56:31,Afternoon Run,Run,612.0,1.61,,,2.629739,,,,2021-01-01 18:56:31,18,1,1.000622,10.2,,5.882725,
1,4555170880,2021-01-02 11:52:42,Morning Run,Run,647.0,1.69,,7.0,2.844724,28.823198,,,2021-01-02 11:52:42,11,1,1.050342,10.783333,15.659,6.363647,94.564302


In [58]:
df['activity_type'].value_counts().reset_index()

Unnamed: 0,index,activity_type
0,Run,322
1,Workout,275
2,Weight Training,42
3,Ride,16
4,Rowing,3
5,Yoga,2


In [59]:
# Split off activity dfs
activity_dfs = {}

for a in df['activity_type'].unique():
    filt = df[df['activity_type'] == a]
    activity_dfs[a] = filt

----
### Top Statistics:

- Num Activities
- Top activity
- Activity counts (Pie?)
- Total Activity Time
- Total Activity Distance

In [61]:
num_activities = len(df)
top_activity = df['activity_type'].value_counts().reset_index()['index'].iloc[0]

active_counts_table = df['activity_type'].value_counts().reset_index()
active_counts_table.columns = ['Activity', 'Count']
#active_counts_table.to_csv('value_cnts.csv')

total_time_seconds = sum(df['time_sec'])
total_time_minutes = sum(df['time_minutes'])

total_miles = sum(df['distance_miles'])

### Speed and distance activities

In [None]:
# Group activity types


In [70]:
if 'Run' in df['activity_type'].unique():
    runs = activity_dfs['Run'] 
    run_dist_total_miles = sum(runs['distance_miles']) # recalc distance m
    run_speed_avg_mph = runs['average_speed_mph'].mean()
    run_count = len(runs) # calendar icon, animate filling up square behind it to reach %
    run_total_elevation = sum(runs['elevation_gain'].dropna()) # confirm measure
    run_hr_avg = runs['average_heart_rate'].mean()
    run_hr_max = max(runs['max_heart_rate'].dropna())
    run_cals_total = sum(runs['calories'].dropna()) # pizzas burned
    run_time_total = sum(runs['time_minutes']) # days running, times watched all of the skywalker saga
    
if 'Ride' in df['activity_type'].unique():
    pass

In [72]:
run_time_total

14282.63333333334

### Workout activities