In [36]:
import pandas as pd
import requests
import urllib3
import streamlit as st

import login as login

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

auth_url = 'https://www.strava.com/oauth/token'
activities_url = 'https://www.strava.com/api/v3/athlete/activities'
gear_url = 'https://www.strava.com/api/v3/gear/{id}'

payload = {
    'client_id': f'{login.client_id}',
    'client_secret': f'{login.client_secret}',
    'refresh_token': f'{login.refresh_token}',
    'grant_type': 'refresh_token',
    'f': 'json'
}

res = requests.post(auth_url, data=payload, verify=False)
access_token = res.json()['access_token']

header = {'Authorization': 'Bearer ' + access_token}

# Strava API only allows 200 results per page. This function loops thorugh until all results are collected
def get_activities_data():
    '''This function gets all activities data from Strava API'''
    # set value of page to start at page 1
    page = 1
    # create an empty list to store all data
    data = []
    # set new_results to True to start the loop
    new_results = True
    while new_results:
        # requests one page at a time (200 results)
        get_activities = requests.get(activities_url, headers=header, params={'per_page': 200, 'page': page}).json()
        # feeback
        print(f"Fetching page {page}")
        print(f"Number of activities fetched: {len(get_activities)}")
        # if there are no results, the loop will stop
        new_results = get_activities
        # add the results to the data list
        data.extend(get_activities)
        # increment the page number
        page += 1
        
        if page > 20:
            print('Stopping after 20 pages to avoid excessive API calls')
            break
        
    return pd.json_normalize(data)
        
# get all activities data
activities = get_activities_data()

# convert meters to miles
activities.distance = (activities.distance / 1609.34).round(2)
# convert to mph
activities.average_speed = (activities.average_speed * 2.23694).round(2)
activities.max_speed = (activities.max_speed * 2.23694).round(2)
# convert to feet
activities.total_elevation_gain = (activities.total_elevation_gain * 3.28084).round(2)
activities.elev_high = (activities.elev_high * 3.28084).round(2)
activities.elev_low = (activities.elev_low * 3.28084).round(2)

activities_df = pd.DataFrame(activities)

# get distinct gear id's
gear_list = activities_df['gear_id'].unique()

gear_list = gear_list[~pd.isnull(gear_list)]

def get_gear_data(gear_list):
    '''This fuunction gets gear data from Strava API
    
    Args:
        gear_list (array): List of distinct gear ids
        
        Returns:
            data (JSON): JSON data of gear
        '''
    # create empty list to store gear data
    data = []
    # loop through gear_list and get gear data
    for gear_id in gear_list:
        get_gear = requests.get(gear_url.format(id=gear_id), headers=header).json()
        data.append(get_gear)
    return pd.json_normalize(data)

# get all geat data
gear = get_gear_data(gear_list)

# convert meters to miles
gear.distance = gear.distance / 1609.34

gear = gear.drop(columns=['converted_distance'])

# create base dataframe joining activity and gear data
pre_df = pd.merge(activities_df, gear, how='left', left_on='gear_id', right_on='id', suffixes=('_activity', '_gear')).drop(columns='id_gear')

# convert moving_time and elapsed time to H% M% S% format
pre_df['moving_time'] = pd.to_timedelta(pre_df['moving_time'], unit='s')
pre_df['elapsed_time'] = pd.to_timedelta(pre_df['elapsed_time'], unit='s')

# convert start_date and start_date_local to datetime
pre_df['start_date'] = pd.to_datetime(pd.to_datetime(pre_df['start_date']).dt.strftime('%Y-%m-%d %H:%M:%S'))
pre_df['start_date_local'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m-%d %H:%M:%S'))

# add start time for analysis and in am/pm format
pre_df['start_time_local_24h'] = pd.to_datetime(pre_df['start_date_local']).dt.time
pre_df['start_time_local_12h'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime("%I:%M %p")

# add day of week
pre_df['day_of_week'] = pd.to_datetime(pre_df['start_date_local']).dt.day_name()

# add month
pre_df['month'] = pd.to_datetime(pre_df['start_date_local']).dt.month_name()

# add month year
pre_df['month_year'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m'))

# add month year name
pre_df['month_year_name'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime('%b %Y')

# add year label
pre_df['year'] = pd.to_datetime(pre_df['start_date_local']).dt.year

# add year label
pre_df['year'] = pd.to_datetime(pre_df['start_date_local']).dt.year

df = pre_df.copy()

Fetching page 1
Number of activities fetched: 200
Fetching page 2
Number of activities fetched: 200
Fetching page 3
Number of activities fetched: 200
Fetching page 4
Number of activities fetched: 17
Fetching page 5
Number of activities fetched: 0


In [37]:
def get_strava_data() -> pd.DataFrame:
    '''This function builds the dataframe from Strava API data. It is used to then cache the dataframe for faster loading in the Streamlit app.
    
    Returns:
        pre_df (DataFrame): DataFrame of activities and gear data'''
    
    # Strava API only allows 200 results per page. This function loops through until all results are collected
    def get_activities_data() -> pd.DataFrame:
        '''This function gets all activities data from Strava API
        
        Returns:
            data (DataFrame): Normalized JSON data of activities'''
            
        # set value of page to start at page 1
        page = 1
        # create an empty list to store all data
        data = []
        # set new_results to True to start the loop
        new_results = True
        while new_results:
            # requests one page at a time (200 results)
            get_activities = requests.get(activities_url, headers=header, params={'per_page': 200, 'page': page}).json()
            # feedback
            print(f"Fetching page {page}")
            print(f"Number of activities fetched: {len(get_activities)}")
            # if there are no results, the loop will stop
            new_results = get_activities
            # add the results to the data list
            data.extend(get_activities)
            # increment the page number
            page += 1

            if page > 20:
                print('Stopping after 20 pages to avoid excessive API calls')
                break
            
        return pd.json_normalize(data)
            
    # get all activities data
    activities = get_activities_data()

    # convert meters to miles
    activities.distance = (activities.distance / 1609.34).round(2)
    # convert to mph
    activities.average_speed = (activities.average_speed * 2.23694).round(2)
    activities.max_speed = (activities.max_speed * 2.23694).round(2)
    # convert to feet
    activities.total_elevation_gain = (activities.total_elevation_gain * 3.28084).round(2)
    activities.elev_high = (activities.elev_high * 3.28084).round(2)
    activities.elev_low = (activities.elev_low * 3.28084).round(2)

    activities_df = pd.DataFrame(activities)

    # get distinct gear id's
    gear_id_list = activities_df['gear_id'].unique()
    gear_id_list = gear_id_list[~pd.isnull(gear_id_list)]

    def get_gear_data(gear_list: list) -> pd.DataFrame:
        '''This function gets gear data from Strava API
        
        Args:
            gear_list (array): List of distinct gear ids
            
            Returns:
                data (DataFrame): Normalized JSON data of gear'''
            
        # create empty list to store gear data
        data = []
        # loop through gear_list and get gear data
        for gear_id in gear_list:
            get_gear = requests.get(gear_url.format(id=gear_id), headers=header).json()
            data.append(get_gear)
        return pd.json_normalize(data)

    # get all gear data
    gear = get_gear_data(gear_id_list)

    # convert meters to miles
    gear.distance = gear.distance / 1609.34

    gear = gear.drop(columns=['converted_distance'])

    ##### DATA CLEANING AND TRANSFORMATION #####
    # create base dataframe joining activity and gear data
    pre_df = pd.merge(activities_df,
                    gear, 
                    how='left',
                    left_on='gear_id',
                    right_on='id',
                    suffixes=('_activity', '_gear')).drop(columns='id_gear')

    # convert moving_time and elapsed time to H% M% S% format
    pre_df['moving_time'] = pd.to_timedelta(pd.to_datetime(pre_df['moving_time'], unit='s').dt.strftime('%H:%M:%S'))
    pre_df['elapsed_time'] = pd.to_timedelta(pd.to_datetime(pre_df['elapsed_time'], unit='s').dt.strftime('%H:%M:%S'))

    # convert start_date and start_date_local to datetime
    pre_df['start_date'] = pd.to_datetime(pd.to_datetime(pre_df['start_date']).dt.strftime('%Y-%m-%d %H:%M:%S'))
    pre_df['start_date_local'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m-%d %H:%M:%S'))

    # add start time for analysis and in am/pm format
    pre_df['start_time_local_24h'] = pd.to_datetime(pre_df['start_date_local']).dt.time
    pre_df['start_time_local_12h'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime("%I:%M %p")

    # add day of week
    pre_df['day_of_week'] = pd.to_datetime(pre_df['start_date_local']).dt.day_name()

    # add month
    pre_df['month'] = pd.to_datetime(pre_df['start_date_local']).dt.month_name()

    # add month year
    pre_df['month_year'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m'))
    
    # add month year name
    pre_df['month_year_name'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime('%b %Y')

    # add year label
    pre_df['year'] = pd.to_datetime(pre_df['start_date_local']).dt.year
    
    return pre_df

df = get_strava_data()

Fetching page 1
Number of activities fetched: 200
Fetching page 2
Number of activities fetched: 200
Fetching page 3
Number of activities fetched: 200
Fetching page 4
Number of activities fetched: 17
Fetching page 5
Number of activities fetched: 0


In [38]:
df.to_csv('strava_data.csv', index=False)

In [39]:
df_read = pd.read_csv('data/strava_data.csv')

In [40]:
type(df_read['start_date_local'])

pandas.core.series.Series

In [4]:
df['month_year'] = pd.to_datetime(pd.to_datetime(df['start_date_local']).dt.strftime('%Y-%m'))

In [5]:
df

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id_activity,...,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,month,month_year,month_year_name,year
0,2,Evening Run,6.01,0 days 00:58:25,0 days 01:00:13,218.50,Run,Run,,14034808781,...,250.0,,,18:54:10,06:54 PM,Sunday,March,2025-03-01,Mar 2025,2025
1,2,Morning Run,6.00,0 days 01:03:56,0 days 01:05:46,228.35,Run,Run,,14025560419,...,250.0,,,09:49:56,09:49 AM,Saturday,March,2025-03-01,Mar 2025,2025
2,2,Afternoon Run,8.17,0 days 01:22:00,0 days 01:22:08,292.65,Run,Run,,13968170136,...,250.0,,,14:58:53,02:58 PM,Sunday,March,2025-03-01,Mar 2025,2025
3,2,Evening Run,4.01,0 days 00:39:54,0 days 00:40:12,183.07,Run,Run,,13931770836,...,250.0,,,18:39:57,06:39 PM,Wednesday,March,2025-03-01,Mar 2025,2025
4,2,Afternoon Run,4.01,0 days 00:40:04,0 days 00:43:34,164.70,Run,Run,,13901597211,...,250.0,,,13:23:48,01:23 PM,Sunday,March,2025-03-01,Mar 2025,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,2,Afternoon Run,0.98,0 days 00:08:32,0 days 00:08:48,54.79,Run,Run,,3903866794,...,,,,17:02:13,05:02 PM,Friday,August,2020-08-01,Aug 2020,2020
612,2,Evening Run,1.01,0 days 00:10:53,0 days 00:10:56,201.12,Run,Run,,3903866817,...,,,,18:41:31,06:41 PM,Thursday,August,2020-08-01,Aug 2020,2020
613,2,Evening Run,1.02,0 days 00:09:25,0 days 00:09:25,51.51,Run,Run,,3903866790,...,,,,18:38:24,06:38 PM,Friday,July,2020-07-01,Jul 2020,2020
614,2,Morning Walk,1.26,0 days 00:22:22,0 days 00:22:22,30.18,Walk,Walk,,3391765082,...,,,,10:42:38,10:42 AM,Sunday,May,2020-05-01,May 2020,2020


In [6]:
pd.DataFrame(df.sort_values(by='month_year').groupby('month_year').size())

Unnamed: 0_level_0,0
month_year,Unnamed: 1_level_1
2020-05-01,2
2020-07-01,1
2020-08-01,11
2020-09-01,15
2020-10-01,11
2020-11-01,5
2020-12-01,4
2021-01-01,8
2021-02-01,2
2021-03-01,14


In [7]:
# max date
max_date = pd.to_datetime(df['start_date_local']).dt.strftime('%Y-%m-%d %I:%M %p').max()

# distict activity type list
act_type_filter = df['type'].value_counts().index.tolist()
act_type_filter = [activity if activity in ['Run', 'Hike', 'Walk', 'Ride'] else 'Other' for activity in act_type_filter]
act_type_filter = list(dict.fromkeys(act_type_filter))
act_type_filter.insert(0, 'All')
# distinct year list
year_filter = sorted(df['year'].unique().tolist(), reverse=True)
year_filter.insert(0, 'All')
year_filter.insert(1, 'Rolling 12 Months')
# rolling 12 mo variable
today = pd.to_datetime(max_date)
rolling_12_months = today - pd.DateOffset(months=12)

In [8]:
def df_query_builder(act_type_selection, year_selection, gear_selection=None):
    
    conditions = []
    
    # activity type filter
    if act_type_selection == 'All':
        conditions.append("type != 'None'")
    elif act_type_selection == 'Other':
        conditions.append("type not in @highlighted_activities")
    else:
        conditions.append("type == @act_type_selection")

    # year filter
    if year_selection == 'All':
        conditions.append("year != 'None'")
    elif year_selection == 'Rolling 12 Months':
        conditions.append("start_date_local >= @rolling_12_months")
    else:
        conditions.append("year == year_selection")
        
    # TODO gear filter

    query = ' and '.join(conditions)
    
    return df.query(query)

In [9]:
act_type_selection = 'Hike'
year_selection = 2025

In [10]:
conditions = []

if act_type_selection == 'All':
    conditions.append("type != 'None'")
elif act_type_selection == 'Other':
    conditions.append("type not in @highlighted_activities")
else:
    conditions.append("type == @act_type_selection")

In [11]:
conditions

['type == @act_type_selection']

In [12]:
type(pd.to_datetime(df['start_date_local']).dt.month_name())

pandas.core.series.Series

In [13]:
df['month'] = pd.to_datetime(df['start_date_local']).dt.strftime('%B')

In [14]:
df

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id_activity,...,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,month,month_year,month_year_name,year
0,2,Evening Run,6.01,0 days 00:58:25,0 days 01:00:13,218.50,Run,Run,,14034808781,...,250.0,,,18:54:10,06:54 PM,Sunday,March,2025-03-01,Mar 2025,2025
1,2,Morning Run,6.00,0 days 01:03:56,0 days 01:05:46,228.35,Run,Run,,14025560419,...,250.0,,,09:49:56,09:49 AM,Saturday,March,2025-03-01,Mar 2025,2025
2,2,Afternoon Run,8.17,0 days 01:22:00,0 days 01:22:08,292.65,Run,Run,,13968170136,...,250.0,,,14:58:53,02:58 PM,Sunday,March,2025-03-01,Mar 2025,2025
3,2,Evening Run,4.01,0 days 00:39:54,0 days 00:40:12,183.07,Run,Run,,13931770836,...,250.0,,,18:39:57,06:39 PM,Wednesday,March,2025-03-01,Mar 2025,2025
4,2,Afternoon Run,4.01,0 days 00:40:04,0 days 00:43:34,164.70,Run,Run,,13901597211,...,250.0,,,13:23:48,01:23 PM,Sunday,March,2025-03-01,Mar 2025,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,2,Afternoon Run,0.98,0 days 00:08:32,0 days 00:08:48,54.79,Run,Run,,3903866794,...,,,,17:02:13,05:02 PM,Friday,August,2020-08-01,Aug 2020,2020
612,2,Evening Run,1.01,0 days 00:10:53,0 days 00:10:56,201.12,Run,Run,,3903866817,...,,,,18:41:31,06:41 PM,Thursday,August,2020-08-01,Aug 2020,2020
613,2,Evening Run,1.02,0 days 00:09:25,0 days 00:09:25,51.51,Run,Run,,3903866790,...,,,,18:38:24,06:38 PM,Friday,July,2020-07-01,Jul 2020,2020
614,2,Morning Walk,1.26,0 days 00:22:22,0 days 00:22:22,30.18,Walk,Walk,,3391765082,...,,,,10:42:38,10:42 AM,Sunday,May,2020-05-01,May 2020,2020


In [15]:
df['month_year'] = pd.to_datetime(df['start_date_local']).dt.to_period('M').dt.start_time.dt.date

In [16]:
type(df['month_year'])

pandas.core.series.Series

In [35]:
temp_df = df.query("type == 'Run' and year == 2024").groupby(['brand_name', 'name_gear'], sort=False).agg({'upload_id': 'count'}).reset_index().rename(columns={'upload_id': 'Activities'})
temp_df = temp_df.sort_values(by='Activities', ascending=False)
temp_df

Unnamed: 0,brand_name,name_gear,Activities
1,Altra,Altra Lone Peak 6,41
0,Merrell,Merrell Vapor Glove 5,39
2,Altra,Altra Escalante 4,14


In [32]:
temp_df

In [22]:
temp_df['Time'].dt.total_seconds() / 3600

0    9.706944
1    5.969722
2    8.522778
Name: Time, dtype: float64

In [89]:
df.query("type == 'Run' and year == 2025") \
    .assign(month_year=pd.to_datetime(df['start_date_local']).dt.to_period('M').dt.to_timestamp()) \
    .sort_values(by='month_year') \
    .groupby('month_year_name', as_index=False) \
    .size()

Unnamed: 0,month_year_name,size
0,Feb 2025,7
1,Jan 2025,10
2,Mar 2025,9


In [123]:
df

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id_activity,...,brand_name,model_name,description,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,year
0,2,Evening Run,4.01,0 days 00:39:54,00:40:12,183.07,Run,Run,,13931770836,...,Merrell,Vapor Glove 5,,250.0,,,18:39:57,06:39 PM,Wednesday,2025
1,2,Afternoon Run,4.01,0 days 00:40:04,00:43:34,164.70,Run,Run,,13901597211,...,Merrell,Vapor Glove 5,,250.0,,,13:23:48,01:23 PM,Sunday,2025
2,2,Afternoon Run,4.06,0 days 00:38:52,00:38:56,177.17,Run,Run,,13884288515,...,Merrell,Vapor Glove 5,,250.0,,,17:48:36,05:48 PM,Friday,2025
3,2,Evening Run,2.36,0 days 00:21:18,00:21:18,147.64,Run,Run,,13866802472,...,Merrell,Vapor Glove 5,,250.0,,,19:21:29,07:21 PM,Wednesday,2025
4,2,Afternoon Run,7.09,0 days 01:09:15,01:09:32,320.21,Run,Run,,13838427132,...,Merrell,Vapor Glove 5,,250.0,,,17:22:17,05:22 PM,Sunday,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608,2,Afternoon Run,0.98,0 days 00:08:32,00:08:48,54.79,Run,Run,,3903866794,...,,,,,,,17:02:13,05:02 PM,Friday,2020
609,2,Evening Run,1.01,0 days 00:10:53,00:10:56,201.12,Run,Run,,3903866817,...,,,,,,,18:41:31,06:41 PM,Thursday,2020
610,2,Evening Run,1.02,0 days 00:09:25,00:09:25,51.51,Run,Run,,3903866790,...,,,,,,,18:38:24,06:38 PM,Friday,2020
611,2,Morning Walk,1.26,0 days 00:22:22,00:22:22,30.18,Walk,Walk,,3391765082,...,,,,,,,10:42:38,10:42 AM,Sunday,2020
