In [53]:
import requests
import urllib3
import streamlit as st
import pandas as pd
import warnings

from meteostat import Point, Hourly, units
from concurrent.futures import ThreadPoolExecutor

warnings.simplefilter(action='ignore', category=FutureWarning)

##### STRAVA API DATA EXTRACTION ####
# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

auth_url = 'https://www.strava.com/oauth/token'

payload = {
    'client_id': st.secrets['client_id'],
    'client_secret': st.secrets['client_secret'],
    'refresh_token': st.secrets['refresh_token'],
    'grant_type': 'refresh_token',
    'f': 'json'
}

res = requests.post(auth_url, data=payload, verify=False)
access_token = res.json()['access_token']

header = {'Authorization': 'Bearer ' + access_token}

def get_strava_data() -> pd.DataFrame:
    '''This function builds the dataframe from Strava API data. It is used to then cache the dataframe for faster loading in the Streamlit app.
    
    Returns:
        pre_df (DataFrame): DataFrame of activities and gear data'''
        
    # Strava API only allows 200 results per page. This function loops through until all results are collected
    def get_activities_data() -> pd.DataFrame:
        '''This function gets all activities data from Strava API
        
        Returns:
            data (DataFrame): Normalized JSON data of activities'''
            
        # set the URL for the Strava API
        activities_url = 'https://www.strava.com/api/v3/athlete/activities'
        # set value of page to start at page 1
        page = 1
        # create an empty list to store all data
        data = []
        # set new_results to True to start the loop
        new_results = True
        
        while new_results:
            # requests one page at a time (200 results)
            get_activities = requests.get(activities_url, headers=header, params={'per_page': 200, 'page': page}).json()
            # feedback
            print(f"Fetching page {page}")
            print(f"Number of activities fetched: {len(get_activities)}")
            # if there are no results, the loop will stop
            new_results = get_activities
            # add the results to the data list
            data.extend(get_activities)
            # increment the page number
            page += 1

            if page > 20:
                print('Stopping after 20 pages to avoid excessive API calls')
                # TODO add backup csv file to load if the API breaks
                break
            
        return pd.read_pickle('./data/activity_data_backup.pkl')
            
    # get all activities data
    activities = get_activities_data()
    
    # convert meters to miles
    activities.distance = (activities.distance / 1609.34).round(2)
    # convert to mph
    activities.average_speed = (activities.average_speed * 2.23694).round(2)
    activities.max_speed = (activities.max_speed * 2.23694).round(2)
    # convert to feet
    activities.total_elevation_gain = (activities.total_elevation_gain * 3.28084).round(2)
    activities.elev_high = (activities.elev_high * 3.28084).round(2)
    activities.elev_low = (activities.elev_low * 3.28084).round(2)

    activities_df = pd.DataFrame(activities)
    
    def add_weather_data(df: pd.DataFrame, max_workers=30) -> pd.DataFrame:
        '''This function gets weather data from Meteostat and adds it onto the activities DataFrame
        
        Args:
            df (DataFrame): Activities data frame that uses latitude, longitude, and timestamps to get weather data
            max_worker (int): Number of threads to use in the multi-threading process
            
        Returns:
            df (DataFrame): Original df with weatehr data appended'''
            
        def get_weather(row):
            '''This function takes the latitude, longitude, and timestamp for each row and calls the Meteostat API for data
            
            Args:
                row: The row in the DataFrame used in the parent function
                
            Returns:
                weather_data (dict): The temperature and relative humidity of the row's activity as a dictionary'''
            
            # get the location of the activity
            location = Point(row['start_latitude'], row['start_longitude'])
            # get the time of the activity
            timestamp = pd.to_datetime(row['start_date_local'])
            # only use the hour it started
            start = end = timestamp.replace(tzinfo=None, minute=0, second=0, microsecond=0)

            # call meteostat API
            try:
                data = Hourly(location, start, end)
                data = data.convert(units.imperial).fetch()
                if not data.empty:
                    # only get the first row of data
                    weather = data[['temp', 'rhum']].iloc[0]
                    return {'temp': weather['temp'], 'rhum': weather['rhum']}
                else:
                    return {'temp': None, 'rhum': None}
            except Exception as e:
                print(f"Error fetching weather for {timestamp}: {e}")
                return {'temp': None, 'rhum': None}
            
        # separate the latitude and longitude from the activity data
        df[['start_latitude', 'start_longitude']] = pd.DataFrame(df['start_latlng'].tolist(), index=df.index)

        # multi-threading so the function can call the API and iterate through rows faster
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            weather_data = list(executor.map(get_weather, [row for _, row in df.iterrows()]))

        # get the weatehr data and concat the two DataFrames
        weather_df = pd.DataFrame(weather_data)
        Hourly.clear_cache()
        return pd.concat([df.reset_index(drop=True), weather_df.reset_index(drop=True)], axis=1)
    
    activities_df = add_weather_data(activities_df)

    # get distinct gear id's
    gear_id_list = activities_df['gear_id'].unique()
    gear_id_list = gear_id_list[~pd.isnull(gear_id_list)]

    def get_gear_data(gear_list: list) -> pd.DataFrame:
        '''This function gets gear data from Strava API
        
        Args:
            gear_list (array): List of distinct gear ids
            
            Returns:
                data (DataFrame): Normalized JSON data of gear'''
        # set the URL for the Strava API
        gear_url = 'https://www.strava.com/api/v3/gear/{id}'
        # create empty list to store gear data
        data = []
        # loop through gear_list and get gear data
        for gear_id in gear_list:
            get_gear = requests.get(gear_url.format(id=gear_id), headers=header).json()
            data.append(get_gear)
        return pd.json_normalize(data)
    
    # get all gear data
    gear = get_gear_data(gear_id_list)

    # convert meters to miles
    gear.distance = gear.distance / 1609.34

    gear = gear.drop(columns=['converted_distance'])

    ##### DATA CLEANING AND TRANSFORMATION #####
    # create base dataframe joining activity and gear data
    pre_df = pd.merge(activities_df,
                    gear, 
                    how='left',
                    left_on='gear_id',
                    right_on='id',
                    suffixes=('_activity', '_gear')).drop(columns='id_gear')

    # convert moving_time and elapsed time to H% M% S% format
    pre_df['moving_time'] = pd.to_timedelta(pd.to_datetime(pre_df['moving_time'], unit='s').dt.strftime('%H:%M:%S'))
    pre_df['elapsed_time'] = pd.to_timedelta(pd.to_datetime(pre_df['elapsed_time'], unit='s').dt.strftime('%H:%M:%S'))

    # convert start_date and start_date_local to datetime
    pre_df['start_date'] = pd.to_datetime(pd.to_datetime(pre_df['start_date']).dt.strftime('%Y-%m-%d %H:%M:%S'))
    pre_df['start_date_local'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m-%d %H:%M:%S'))

    # add start time for analysis and in am/pm format
    pre_df['start_time_local_24h'] = pd.to_datetime(pre_df['start_date_local']).dt.time
    pre_df['start_time_local_12h'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime("%I:%M %p")

    # add day of week
    pre_df['day_of_week'] = pd.to_datetime(pre_df['start_date_local']).dt.day_name()

    # add month
    pre_df['month'] = pd.to_datetime(pre_df['start_date_local']).dt.month_name()

    # add month year
    pre_df['month_year'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m'))
    
    # add month year name
    pre_df['month_year_name'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime('%b %Y')

    # add year label
    pre_df['year'] = pd.to_datetime(pre_df['start_date_local']).dt.year
    
    pre_df.drop(columns=['start_latlng', 'end_latlng', 'start_latitude', 'start_longitude'], inplace=True)
    
    return pre_df

In [54]:
test_data = get_strava_data()

Fetching page 1
Number of activities fetched: 200
Fetching page 2
Number of activities fetched: 200
Fetching page 3
Number of activities fetched: 200
Fetching page 4
Number of activities fetched: 25
Fetching page 5
Number of activities fetched: 0


In [None]:
data = get_strava_data()
data.to_csv('data/data_backup.csv', index=False)

In [17]:
data

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,id_activity,start_date,...,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,month,month_year,month_year_name,year
0,2,Evening Walk,0.86,0 days 00:23:59,0 days 00:26:32,118.77,Walk,Walk,14228235599,2025-04-19 23:46:55,...,300.0,,,19:46:55,07:46 PM,Saturday,April,2025-04-01,Apr 2025,2025
1,2,Evening Run,4.00,0 days 00:42:11,0 days 00:45:45,167.32,Run,Run,14228235172,2025-04-18 23:32:51,...,250.0,,,19:32:51,07:32 PM,Friday,April,2025-04-01,Apr 2025,2025
2,2,Afternoon Hike,1.55,0 days 01:02:04,0 days 01:14:12,395.67,Hike,Hike,14216995896,2025-04-18 19:26:21,...,300.0,,,15:26:21,03:26 PM,Friday,April,2025-04-01,Apr 2025,2025
3,2,Evening Run,4.37,0 days 00:43:21,0 days 00:43:26,273.62,Run,Run,14179770912,2025-04-14 22:31:35,...,250.0,,,18:31:35,06:31 PM,Monday,April,2025-04-01,Apr 2025,2025
4,2,Afternoon Run,8.01,0 days 01:23:13,0 days 01:32:30,341.21,Run,Run,14171324956,2025-04-13 21:38:21,...,250.0,,,17:38:21,05:38 PM,Sunday,April,2025-04-01,Apr 2025,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,2,Afternoon Run,0.98,0 days 00:08:32,0 days 00:08:48,54.79,Run,Run,3903866794,2020-08-07 21:02:13,...,,,,17:02:13,05:02 PM,Friday,August,2020-08-01,Aug 2020,2020
621,2,Evening Run,1.01,0 days 00:10:53,0 days 00:10:56,201.12,Run,Run,3903866817,2020-08-06 22:41:31,...,,,,18:41:31,06:41 PM,Thursday,August,2020-08-01,Aug 2020,2020
622,2,Evening Run,1.02,0 days 00:09:25,0 days 00:09:25,51.51,Run,Run,3903866790,2020-07-31 22:38:24,...,,,,18:38:24,06:38 PM,Friday,July,2020-07-01,Jul 2020,2020
623,2,Morning Walk,1.26,0 days 00:22:22,0 days 00:22:22,30.18,Walk,Walk,3391765082,2020-05-03 14:42:38,...,,,,10:42:38,10:42 AM,Sunday,May,2020-05-01,May 2020,2020


In [32]:
test_df = pd.read_csv('data/data_backup.csv')

In [None]:
test_df['moving_time'] = pd.to_timedelta(test_df['moving_time'])
test_df['elapsed_time'] = pd.to_timedelta(test_df['elapsed_time'])
test_df['start_date'] = pd.to_datetime(test_df['start_date'])
test_df['start_date_local'] = pd.to_datetime(test_df['start_date_local'])
test_df['month_year'] = pd.to_datetime(test_df['start_date_local'])
test_df['start_time_local_24h'] = pd.to_datetime(test_df['start_time_local_24h']).dt.time

In [34]:
test_df

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,id_activity,start_date,...,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,month,month_year,month_year_name,year
0,2,Evening Walk,0.86,0 days 00:23:59,0 days 00:26:32,118.77,Walk,Walk,14228235599,2025-04-19 23:46:55,...,300.0,,,19:46:55,07:46 PM,Saturday,April,2025-04-19 19:46:55,Apr 2025,2025
1,2,Evening Run,4.00,0 days 00:42:11,0 days 00:45:45,167.32,Run,Run,14228235172,2025-04-18 23:32:51,...,250.0,,,19:32:51,07:32 PM,Friday,April,2025-04-18 19:32:51,Apr 2025,2025
2,2,Afternoon Hike,1.55,0 days 01:02:04,0 days 01:14:12,395.67,Hike,Hike,14216995896,2025-04-18 19:26:21,...,300.0,,,15:26:21,03:26 PM,Friday,April,2025-04-18 15:26:21,Apr 2025,2025
3,2,Evening Run,4.37,0 days 00:43:21,0 days 00:43:26,273.62,Run,Run,14179770912,2025-04-14 22:31:35,...,250.0,,,18:31:35,06:31 PM,Monday,April,2025-04-14 18:31:35,Apr 2025,2025
4,2,Afternoon Run,8.01,0 days 01:23:13,0 days 01:32:30,341.21,Run,Run,14171324956,2025-04-13 21:38:21,...,250.0,,,17:38:21,05:38 PM,Sunday,April,2025-04-13 17:38:21,Apr 2025,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,2,Afternoon Run,0.98,0 days 00:08:32,0 days 00:08:48,54.79,Run,Run,3903866794,2020-08-07 21:02:13,...,,,,17:02:13,05:02 PM,Friday,August,2020-08-07 17:02:13,Aug 2020,2020
621,2,Evening Run,1.01,0 days 00:10:53,0 days 00:10:56,201.12,Run,Run,3903866817,2020-08-06 22:41:31,...,,,,18:41:31,06:41 PM,Thursday,August,2020-08-06 18:41:31,Aug 2020,2020
622,2,Evening Run,1.02,0 days 00:09:25,0 days 00:09:25,51.51,Run,Run,3903866790,2020-07-31 22:38:24,...,,,,18:38:24,06:38 PM,Friday,July,2020-07-31 18:38:24,Jul 2020,2020
623,2,Morning Walk,1.26,0 days 00:22:22,0 days 00:22:22,30.18,Walk,Walk,3391765082,2020-05-03 14:42:38,...,,,,10:42:38,10:42 AM,Sunday,May,2020-05-03 10:42:38,May 2020,2020


In [50]:
def get_activities_data() -> pd.DataFrame:
    '''This function gets all activities data from Strava API
    
    Returns:
        data (DataFrame): Normalized JSON data of activities'''
        
    # set the URL for the Strava API
    activities_url = 'https://www.strava.com/api/v3/athlete/activities'
    # set value of page to start at page 1
    page = 1
    # create an empty list to store all data
    data = []
    # set new_results to True to start the loop
    new_results = True
    
    while new_results:
        # requests one page at a time (200 results)
        get_activities = requests.get(activities_url, headers=header, params={'per_page': 200, 'page': page}).json()
        # feedback
        print(f"Fetching page {page}")
        print(f"Number of activities fetched: {len(get_activities)}")
        # if there are no results, the loop will stop
        new_results = get_activities
        # add the results to the data list
        data.extend(get_activities)
        # increment the page number
        page += 1

        if page > 20:
            print('Stopping after 20 pages to avoid excessive API calls')
            # TODO add backup csv file to load if the API breaks
            break
        
    return pd.json_normalize(data)

In [57]:
backup_activity_data = get_activities_data()

Fetching page 1
Number of activities fetched: 200
Fetching page 2
Number of activities fetched: 200
Fetching page 3
Number of activities fetched: 200
Fetching page 4
Number of activities fetched: 25
Fetching page 5
Number of activities fetched: 0


In [58]:
backup_activity_data.to_pickle('data/activity_data_backup.pkl')

In [39]:
strava_data

Unnamed: 0,resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,id,start_date,...,map.id,map.summary_polyline,map.resource_state,workout_type,device_watts,kilojoules,average_cadence,average_watts,max_watts,weighted_average_watts
0,2,Evening Walk,1388.4,1439,1592,36.2,Walk,Walk,14228235599,2025-04-19T23:46:55Z,...,a14228235599,ejiuFferfMYa@Yo@COKWGi@]q@E_@@@}@yBO}@?KC[Di@@...,2,,,,,,,
1,2,Evening Run,6443.6,2531,2745,51.0,Run,Run,14228235172,2025-04-18T23:32:51Z,...,a14228235172,uw|sFtw}kMm@aCa@sBe@uAEEs@L_@DSAgAWwAg@ICo@@gA...,2,,,,,,,
2,2,Afternoon Hike,2497.2,3724,4452,120.6,Hike,Hike,14216995896,2025-04-18T19:26:21Z,...,a14216995896,wqvsFdleiMKKOAIIKg@OU[QEACBGGME]g@EQA?BBOKE@WQ...,2,,,,,,,
3,2,Evening Run,7027.7,2601,2606,83.4,Run,Run,14179770912,2025-04-14T22:31:35Z,...,a14179770912,ug|sFxr~kMj@lA^j@Xp@t@tAjAjCr@nA`ArBb@t@FNNh@`...,2,,,,,,,
4,2,Afternoon Run,12886.5,4993,5550,104.0,Run,Run,14171324956,2025-04-13T21:38:21Z,...,a14171324956,qw|sF|w}kMa@oBMa@S}@u@iCIAs@LyAIi@LIAiE{AoCgAW...,2,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,2,Afternoon Run,1573.2,512,528,16.7,Run,Run,3903866794,2020-08-07T21:02:13Z,...,a3903866794,avusFnkjiMHZl@v@BFAFo@`Ao@vAs@~@cA~@iArA_@n@Ud...,2,,,,,,,
621,2,Evening Run,1621.6,653,656,61.3,Run,Run,3903866817,2020-08-06T22:41:31Z,...,a3903866817,{musFhheiMEX@`@Jr@V~AHhACPIJQF_@J]Ns@Ru@j@_@J[...,2,,,,,,,
622,2,Evening Run,1641.1,565,565,15.7,Run,Run,3903866790,2020-07-31T22:38:24Z,...,a3903866790,kmvsFlujiMt@z@Vl@hAbB\|@\t@l@v@`@b@~@tAVf@Rf@d...,2,,,,,,,
623,2,Morning Walk,2022.3,1342,1342,9.2,Walk,Walk,3391765082,2020-05-03T14:42:38Z,...,a3391765082,_wvsFvhliMHBVh@JG@KNUAGHCBRCPDVDL@ZFLJbBFb@`@r...,2,,,,,,,


In [40]:
from_csv = pd.read_csv('./data/activity_data_backup.csv')