In [13]:
import pandas as pd
import requests
import urllib3
import streamlit as st

import login as login

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

auth_url = 'https://www.strava.com/oauth/token'
gear_url = 'https://www.strava.com/api/v3/gear/{id}'

payload = {
    'client_id': f'{login.client_id}',
    'client_secret': f'{login.client_secret}',
    'refresh_token': f'{login.refresh_token}',
    'grant_type': 'refresh_token',
    'f': 'json'
}

res = requests.post(auth_url, data=payload, verify=False)
access_token = res.json()['access_token']

header = {'Authorization': 'Bearer ' + access_token}

# Strava API only allows 200 results per page. This function loops thorugh until all results are collected
def get_activities_data():
    '''This function gets all activities data from Strava API
    
    Args:
        None
        
    Returns:
        data (JSON): JSON data of activities'''
    # set the URL for the activities endpoint
    activities_url = 'https://www.strava.com/api/v3/athlete/activities'
    # set value of page to start at page 1
    page = 1
    # create an empty list to store all data
    data = []
    # set new_results to True to start the loop
    new_results = True
    while new_results:
        # requests one page at a time (200 results)
        get_activities = requests.get(activities_url, headers=header, params={'per_page': 200, 'page': page}).json()
        # feeback
        print(f"Fetching page {page}")
        print(f"Number of activities fetched: {len(get_activities)}")
        # if there are no results, the loop will stop
        new_results = get_activities
        # add the results to the data list
        data.extend(get_activities)
        # increment the page number
        page += 1
        
        if page > 20:
            print('Stopping after 20 pages to avoid excessive API calls')
            break
        
    return pd.json_normalize(data)
        
# get all activities data
activities = get_activities_data()

# convert meters to miles
activities.distance = (activities.distance / 1609.34).round(2)
# convert to mph
activities.average_speed = (activities.average_speed * 2.23694).round(2)
activities.max_speed = (activities.max_speed * 2.23694).round(2)
# convert to feet
activities.total_elevation_gain = (activities.total_elevation_gain * 3.28084).round(2)
activities.elev_high = (activities.elev_high * 3.28084).round(2)
activities.elev_low = (activities.elev_low * 3.28084).round(2)

activities_df = pd.DataFrame(activities)

# get distinct gear id's
gear_list = activities_df['gear_id'].unique()

gear_list = gear_list[~pd.isnull(gear_list)]

def get_gear_data(gear_list):
    '''This fuunction gets gear data from Strava API
    
    Args:
        gear_list (array): List of distinct gear ids
        
        Returns:
            data (JSON): JSON data of gear
        '''
    # create empty list to store gear data
    data = []
    # loop through gear_list and get gear data
    for gear_id in gear_list:
        get_gear = requests.get(gear_url.format(id=gear_id), headers=header).json()
        data.append(get_gear)
    return pd.json_normalize(data)

# get all geat data
gear = get_gear_data(gear_list)

# convert meters to miles
gear.distance = gear.distance / 1609.34

gear = gear.drop(columns=['converted_distance'])

# create base dataframe joining activity and gear data
pre_df = pd.merge(activities_df, gear, how='left', left_on='gear_id', right_on='id', suffixes=('_activity', '_gear')).drop(columns='id_gear')

# convert moving_time and elapsed time to H% M% S% format
pre_df['moving_time'] = pd.to_timedelta(pre_df['moving_time'], unit='s')
pre_df['elapsed_time'] = pd.to_timedelta(pre_df['elapsed_time'], unit='s')

# convert start_date and start_date_local to datetime
pre_df['start_date'] = pd.to_datetime(pd.to_datetime(pre_df['start_date']).dt.strftime('%Y-%m-%d %H:%M:%S'))
pre_df['start_date_local'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m-%d %H:%M:%S'))

# add start time for analysis and in am/pm format
pre_df['start_time_local_24h'] = pd.to_datetime(pre_df['start_date_local']).dt.time
pre_df['start_time_local_12h'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime("%I:%M %p")

# add day of week
pre_df['day_of_week'] = pd.to_datetime(pre_df['start_date_local']).dt.day_name()

# add month
pre_df['month'] = pd.to_datetime(pre_df['start_date_local']).dt.month_name()

# add month year
pre_df['month_year'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m'))

# add month year name
pre_df['month_year_name'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime('%b %Y')

# add year label
pre_df['year'] = pd.to_datetime(pre_df['start_date_local']).dt.year

# add year label
pre_df['year'] = pd.to_datetime(pre_df['start_date_local']).dt.year

df = pre_df.copy()

Fetching page 1
Number of activities fetched: 200
Fetching page 2
Number of activities fetched: 200
Fetching page 3
Number of activities fetched: 200
Fetching page 4
Number of activities fetched: 22
Fetching page 5
Number of activities fetched: 0


In [16]:
df[['start_latitude', 'start_longitude']] = pd.DataFrame(df['start_latlng'].tolist(), index=df.index)

In [18]:
import warnings

from meteostat import Point, Hourly, units
from concurrent.futures import ThreadPoolExecutor

warnings.simplefilter(action='ignore', category=FutureWarning)

def add_weather_data(df: pd.DataFrame, max_workers=20) -> pd.DataFrame:
            '''This function gets weather data from Meteostat and adds it onto the activities DataFrame
            
            Args:
                df (DataFrame): Activities data frame that uses latitude, longitude, and timestamps to get weather data
                max_worker (int): Number of threads to use in the multi-threading process
                
            Returns:
                df (DataFrame): Original df with weatehr data appended'''
                
            def get_weather(row):
                '''This function takes the latitude, longitude, and timestamp for each row and calls the Meteostat API for data
                
                Args:
                    row: The row in the DataFrame used in the parent function
                    
                Returns:
                    weather_data (dict): The temperature and relative humidity of the row's activity as a dictionary'''
                
                # separate the latitude and longitude from the activity data
                # get the location of the activity
                location = Point(row['start_latitude'], row['start_longitude'])
                # get the time of the activity
                timestamp = pd.to_datetime(row['start_date_local'])
                # only use the hour it started
                start = end = timestamp.replace(tzinfo=None, minute=0, second=0, microsecond=0)

                # call meteostat API
                try:
                    data = Hourly(location, start, end)
                    data = data.convert(units.imperial).fetch()
                    if not data.empty:
                        # only get the first row of data
                        weather = data[['temp', 'rhum']].iloc[0]
                        return {'temp': weather['temp'], 'rhum': weather['rhum']}
                    else:
                        return {'temp': None, 'rhum': None}
                except Exception as e:
                    print(f"Error fetching weather for {timestamp}: {e}")
                    return {'temp': None, 'rhum': None}
                
            df[['start_latitude', 'start_longitude']] = pd.DataFrame(df['start_latlng'].tolist(), index=df.index)

            # multi-threading so the function can call the API and iterate through rows faster
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                weather_data = list(executor.map(get_weather, [row for _, row in df.iterrows()]))

            # get the weatehr data and concat the two DataFrames
            weather_df = pd.DataFrame(weather_data)
            return pd.concat([df.reset_index(drop=True), weather_df.reset_index(drop=True)], axis=1)

In [19]:
activities_df = add_weather_data(df)

In [None]:
activities_df

In [None]:
def add_weather_data(df: pd.DataFrame, max_workers=20) -> pd.DataFrame:
    '''This function gets weather data from Meteostat and adds it onto the activities DataFrame
    
    Args:
        df (DataFrame): Activities data frame that uses latitude, longitude, and timestamps to get weather data
        max_worker (int): Number of threads to use in the multi-threading process
        
    Returns:
        df (DataFrame): Original df with weatehr data appended'''
        
    def get_weather(row):
        '''This function takes the latitude, longitude, and timestamp for each row and calls the Meteostat API for data
        
        Args:
            row: The row in the DataFrame used in the parent function
            
        Returns:
            weather_data (dict): The temperature and relative humidity of the row's activity as a dictionary'''
            
        # get the location of the activity
        location = Point(row['start_latitude'], row['start_longitude'])
        # get the time of the activity
        timestamp = pd.to_datetime(row['start_date_local'])
        # only use the hour it started
        start = end = timestamp.replace(tzinfo=None, minute=0, second=0, microsecond=0)

        # call meteostat API
        try:
            data = Hourly(location, start, end)
            data = data.convert(units.imperial).fetch()
            if not data.empty:
                # only get the first row of data
                weather = data[['temp', 'rhum']].iloc[0]
                return {'temp': weather['temp'], 'rhum': weather['rhum']}
            else:
                return {'temp': None, 'rhum': None}
        except Exception as e:
            print(f"Error fetching weather for {timestamp}: {e}")
            return {'temp': None, 'rhum': None}

    # multi-threading so the function can call the API and iterate through rows faster
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        weather_data = list(executor.map(get_weather, [row for _, row in df.iterrows()]))

    # get the weatehr data and concat the two DataFrames
    weather_df = pd.DataFrame(weather_data)
    return pd.concat([df.reset_index(drop=True), weather_df.reset_index(drop=True)], axis=1)

In [10]:
activities_df = add_weather_data(activities_df)

In [11]:
activities_df

Unnamed: 0,resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id,...,average_cadence,average_watts,max_watts,weighted_average_watts,start_latitude,start_longitude,temp,rhum,temp.1,rhum.1
0,2,Evening Run,4.37,2601,2606,273.62,Run,Run,,14179770912,...,,,,,40.127953,-75.524761,64.0,38.0,64.0,38.0
1,2,Afternoon Run,8.01,4993,5550,341.21,Run,Run,,14171324956,...,,,,,40.128811,-75.524271,57.2,49.0,57.2,49.0
2,2,Afternoon Hike,5.82,9649,10019,1524.28,Hike,Hike,,14159607492,...,,,,,40.209286,-75.796622,41.0,86.0,41.0,86.0
3,2,Evening Run,2.34,1349,1353,156.17,Run,Run,,14159607178,...,,,,,40.127958,-75.524613,52.0,74.0,52.0,74.0
4,2,Evening Run,4.01,2333,2481,148.95,Run,Run,,14101976311,...,,,,,40.128737,-75.524267,54.0,72.0,54.0,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,2,Afternoon Run,0.98,512,528,54.79,Run,Run,,3903866794,...,,,,,40.093294,-75.097036,77.0,79.0,77.0,79.0
618,2,Evening Run,1.01,653,656,201.12,Run,Run,,3903866817,...,,,,,40.091981,-75.070921,78.1,71.0,78.1,71.0
619,2,Evening Run,1.02,565,565,51.51,Run,Run,,3903866790,...,,,,,40.097020,-75.098625,78.1,76.0,78.1,76.0
620,2,Morning Walk,1.26,1342,1342,30.18,Walk,Walk,,3391765082,...,,,,,40.098561,-75.106831,57.0,87.0,57.0,87.0


In [None]:
activities_df

In [111]:
activities_df[~activities_df['start_latlng'].apply(lambda x: isinstance(x, list) and len(x) == 2)]

Unnamed: 0,resource_state,name,distance,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id,...,athlete.resource_state,map.id,map.summary_polyline,map.resource_state,device_watts,kilojoules,average_cadence,average_watts,max_watts,weighted_average_watts
31,2,Evening Ride,0.0,2528,2528,0.0,Ride,Ride,10.0,13304390290,...,1,a13304390290,,2,,,,,,
96,2,Peloton - 30 min Just Ride,8.17,1799,1799,0.0,Ride,Ride,10.0,11956281062,...,1,a11956281062,,2,True,191.1,52.0,106.2,215.0,108.0
155,2,30 min Oregon Ride with Emma Lovewell,8.16,1800,1800,0.0,Ride,Ride,,10877785670,...,1,a10877785670,,2,True,206.2,61.0,114.5,504.0,134.0
168,2,30 min HIIT & Hills Ride with Ben Alldis,8.8,1800,1800,0.0,Ride,Ride,,10617577342,...,1,a10617577342,,2,True,230.0,70.4,127.8,255.0,136.0
169,2,30 min HIIT & Hills Ride with Camila Ramón,8.6,1800,1800,0.0,Ride,Ride,,10604849038,...,1,a10604849038,,2,True,227.8,71.7,126.6,468.0,147.0
170,2,45 min HIIT & Hills Ride with Camila Ramón,12.94,2701,2701,0.0,Ride,Ride,,10591055155,...,1,a10591055155,,2,True,332.2,67.1,123.0,435.0,136.0
171,2,45 min HIIT & Hills Ride with Kendall Toole,11.48,2479,2479,0.0,Ride,Ride,,10573036300,...,1,a10573036300,,2,True,286.3,69.8,115.5,266.0,127.0
172,2,45 min HIIT & Hills Ride with Kendall Toole,12.48,2700,2700,0.0,Ride,Ride,,10546991803,...,1,a10546991803,,2,True,308.4,69.6,114.2,416.0,125.0
174,2,30 min HIIT & Hills Ride with Hannah Frankson,7.82,1800,1800,0.0,Ride,Ride,,10528374121,...,1,a10528374121,,2,True,182.6,75.0,101.4,412.0,116.0
176,2,45 min HIIT & Hills Ride with Kendall Toole,12.18,2700,2700,0.0,Ride,Ride,,10501940239,...,1,a10501940239,,2,True,286.6,68.9,106.1,204.0,114.0


In [110]:
activities_df['latitude'] = df['start_latlng'].apply(lambda x: x[0])

IndexError: list index out of range

In [None]:
activity_id_list = activities_df['id'].unique()
streams_url = 'https://www.strava.com/api/v3/activities/{id}/streams?keys=temp&key_by_type=true'

In [103]:
streams_url = 'https://www.strava.com/api/v3/activities/{id}/streams?keys=temp&key_by_type=true&access_token={access_token}'

In [108]:
def get_weather_data(activities_df):
    import datetime
    
    activity_time = activities_df['start_date_local']
    latitude = activities_df['start_latitude']
    longitude = activities_df['start_longitude']
    
    timestamp = int(datetime.fromisoformat(activity_time).timestamp())
    
    api_key = '5ae57afe11a0842ce174726ddbd8e67f'
    url = f'https://api.openweathermap.org/data/3.0/onecall/timemachine'
    params = {
        'lat': latitude,
        'lon': longitude,
        'dt': timestamp,
        'appid': api_key,
        'units': 'metric'  # or 'imperial'
    }

    weather_data = requests.get(url, params=params).json()
    return pd.json_normalize(weather_data)

In [109]:
weather_data = get_weather_data(activities_df)

KeyError: 'start_latitude'

In [85]:
activity_id_list

array([14179770912, 14171324956, 14159607492, 14159607178, 14101976311,
       14082616039, 14034808781, 14025560419, 13968170136, 13931770836,
       13901597211, 13884288515, 13866802472, 13838427132, 13819421295,
       13791919418, 13773562339, 13754433602, 13719038524, 13698446713,
       13634060999, 13585885674, 13567948439, 13513567302, 13488377310,
       13462055474, 13451415128, 13398803954, 13383971557, 13329496412,
       13320185007, 13304390290, 13276707143, 13268391937, 13241023325,
       13203763599, 13189758388, 13170261399, 13164372357, 13138821040,
       13117729068, 13116672941, 13110013744, 13075684158, 13062852018,
       13017990711, 13005548081, 12976663188, 12976663496, 12925490056,
       12925490097, 12909412425, 12871533607, 12855979172, 12817694787,
       12817698860, 12800844829, 12769194616, 12761707354, 12751989838,
       12744495741, 12730188343, 12703516948, 12688136969, 12657127714,
       12629303032, 12624811419, 12601916733, 12592879074, 12584

In [106]:
requests.get(streams_url.format(id=9420664195, access_token=access_token)).json()

{'distance': {'data': [3.2,
   3.9,
   4.7,
   5.4,
   6.1,
   6.9,
   7.6,
   8.3,
   9.0,
   10.9,
   12.7,
   14.5,
   14.5,
   17.9,
   19.6,
   21.2,
   22.9,
   24.8,
   26.7,
   28.6,
   33.8,
   39.1,
   40.8,
   42.5,
   44.1,
   44.1,
   47.8,
   47.8,
   47.8,
   51.8,
   55.3,
   58.8,
   61.6,
   64.5,
   67.3,
   69.9,
   72.6,
   74.3,
   76.0,
   77.7,
   80.0,
   82.3,
   84.5,
   84.5,
   88.4,
   93.2,
   97.9,
   102.7,
   104.9,
   107.0,
   110.5,
   114.0,
   117.4,
   120.8,
   124.2,
   127.3,
   130.4,
   133.5,
   133.5,
   133.5,
   136.6,
   136.6,
   136.6,
   139.3,
   141.5,
   143.7,
   146.9,
   150.0,
   153.2,
   158.0,
   162.8,
   166.0,
   169.2,
   172.5,
   176.4,
   180.3,
   183.3,
   186.4,
   189.4,
   193.2,
   197.0,
   199.5,
   202.0,
   204.4,
   206.5,
   208.6,
   210.7,
   213.6,
   216.5,
   218.8,
   221.2,
   223.5,
   226.7,
   229.9,
   232.1,
   234.3,
   236.5,
   240.1,
   243.8,
   246.2,
   248.6,
   251.0,
   254.8,
   258

In [None]:
data

In [80]:
data = []
for activity_id in activity_id_list:
        streams = requests.get(streams_url.format(id=activity_id), headers=header).json()
        data.append(streams)
        
pd.json_normalize(data)


Unnamed: 0,distance.data,distance.series_type,distance.original_size,distance.resolution,time.data,time.series_type,time.original_size,time.resolution,message,errors
0,"[0.0, 2.6, 5.1, 8.5, 11.9, 14.5, 17.1, 19.7, 2...",distance,2602.0,high,,,,,,
1,"[0.0, 1.3, 2.6, 3.9, 5.1, 6.4, 7.7, 10.7, 13.6...",distance,4990.0,high,,,,,,
2,"[0.0, 0.0, 0.4, 1.0, 1.5, 2.1, 2.7, 3.6, 4.7, ...",distance,10010.0,high,,,,,,
3,"[0.0, 1.7, 3.4, 5.0, 6.7, 8.4, 10.9, 13.4, 15....",distance,1351.0,high,,,,,,
4,"[0.0, 1.1, 2.2, 3.4, 4.5, 5.6, 7.4, 9.1, 10.9,...",distance,2335.0,high,,,,,,
...,...,...,...,...,...,...,...,...,...,...
617,,,,,,,,,Rate Limit Exceeded,"[{'resource': 'Application', 'field': 'overall..."
618,,,,,,,,,Rate Limit Exceeded,"[{'resource': 'Application', 'field': 'overall..."
619,,,,,,,,,Rate Limit Exceeded,"[{'resource': 'Application', 'field': 'overall..."
620,,,,,,,,,Rate Limit Exceeded,"[{'resource': 'Application', 'field': 'overall..."


In [55]:
streams_df = pd.DataFrame(data)

In [56]:
streams_df

Unnamed: 0,0,1
0,"{'type': 'distance', 'data': [0.0, 2.6, 5.1, 8...",
1,"{'type': 'distance', 'data': [0.0, 1.3, 2.6, 3...",
2,"{'type': 'distance', 'data': [0.0, 0.0, 0.4, 1...",
3,"{'type': 'distance', 'data': [0.0, 1.7, 3.4, 5...",
4,"{'type': 'distance', 'data': [0.0, 1.1, 2.2, 3...",
...,...,...
617,message,errors
618,message,errors
619,message,errors
620,message,errors


In [49]:
streams_df = get_streams_data(activity_id_list)

In [50]:
streams_df

Unnamed: 0,0,1
0,"{'type': 'distance', 'data': [0.0, 2.6, 5.1, 8...",
1,"{'type': 'distance', 'data': [0.0, 1.3, 2.6, 3...",
2,"{'type': 'distance', 'data': [0.0, 0.0, 0.4, 1...",
3,"{'type': 'distance', 'data': [0.0, 1.7, 3.4, 5...",
4,"{'type': 'distance', 'data': [0.0, 1.1, 2.2, 3...",
...,...,...
617,message,errors
618,message,errors
619,message,errors
620,message,errors


In [None]:
def get_strava_data() -> pd.DataFrame:
    '''This function builds the dataframe from Strava API data. It is used to then cache the dataframe for faster loading in the Streamlit app.
    
    Returns:
        pre_df (DataFrame): DataFrame of activities and gear data'''
    
    with st.status('Downloading Data...', expanded=True) as status:
        
        # Strava API only allows 200 results per page. This function loops through until all results are collected
        def get_activities_data() -> pd.DataFrame:
            '''This function gets all activities data from Strava API
            
            Returns:
                data (DataFrame): Normalized JSON data of activities'''
                
            # set the URL for the Strava API
            activities_url = 'https://www.strava.com/api/v3/athlete/activities'
            # set value of page to start at page 1
            page = 1
            # create an empty list to store all data
            data = []
            # set new_results to True to start the loop
            new_results = True
            
            st.write('Fetching Activities...')
            
            while new_results:
                # requests one page at a time (200 results)
                get_activities = requests.get(activities_url, headers=header, params={'per_page': 200, 'page': page}).json()
                # feedback
                print(f"Fetching page {page}")
                print(f"Number of activities fetched: {len(get_activities)}")
                # if there are no results, the loop will stop
                new_results = get_activities
                # add the results to the data list
                data.extend(get_activities)
                # increment the page number
                page += 1

                if page > 20:
                    print('Stopping after 20 pages to avoid excessive API calls')
                    break
                
            return pd.json_normalize(data)
              
        # get all activities data
        activities = get_activities_data()

        st.write('Assembling Activity Data...')
        
        # convert meters to miles
        activities.distance = (activities.distance / 1609.34).round(2)
        # convert to mph
        activities.average_speed = (activities.average_speed * 2.23694).round(2)
        activities.max_speed = (activities.max_speed * 2.23694).round(2)
        # convert to feet
        activities.total_elevation_gain = (activities.total_elevation_gain * 3.28084).round(2)
        activities.elev_high = (activities.elev_high * 3.28084).round(2)
        activities.elev_low = (activities.elev_low * 3.28084).round(2)

        activities_df = pd.DataFrame(activities)

        # get distinct gear id's
        gear_id_list = activities_df['gear_id'].unique()
        gear_id_list = gear_id_list[~pd.isnull(gear_id_list)]

        def get_gear_data(gear_list: list) -> pd.DataFrame:
            '''This function gets gear data from Strava API
            
            Args:
                gear_list (array): List of distinct gear ids
                
                Returns:
                    data (DataFrame): Normalized JSON data of gear'''
            # set the URL for the Strava API
            gear_url = 'https://www.strava.com/api/v3/gear/{id}'
            # create empty list to store gear data
            data = []
            # loop through gear_list and get gear data
            for gear_id in gear_list:
                get_gear = requests.get(gear_url.format(id=gear_id), headers=header).json()
                data.append(get_gear)
            return pd.json_normalize(data)

        # get all gear data
        gear = get_gear_data(gear_id_list)

        # convert meters to miles
        gear.distance = gear.distance / 1609.34

        gear = gear.drop(columns=['converted_distance'])
        
        # get distinct activity id's
        activity_id_list = activities_df['id_activity'].unique()
        
        def get_streams_data(activity_id_list: list) -> pd.DataFrame:
            '''This function gets streams data from Strava API
            
            Args:
                upload_id (int): Activity ID
                
            Returns:
                data (JSON): JSON data of streams'''
              # set the URL for the Strava API 
            streams_url = 'https://www.strava.com/api/v3/activities/{id}/streams?keys=&key_by_type='
            
            data = []
            
            for activity_id in activity_id_list:
                streams = requests.get(streams_url.format(id=activity_id), headers=header).json()
                data.append(streams)
            return pd.json_normalize(data)

        ##### DATA CLEANING AND TRANSFORMATION #####
        # create base dataframe joining activity and gear data
        pre_df = pd.merge(activities_df,
                        gear, 
                        how='left',
                        left_on='gear_id',
                        right_on='id',
                        suffixes=('_activity', '_gear')).drop(columns='id_gear')

        # convert moving_time and elapsed time to H% M% S% format
        pre_df['moving_time'] = pd.to_timedelta(pd.to_datetime(pre_df['moving_time'], unit='s').dt.strftime('%H:%M:%S'))
        pre_df['elapsed_time'] = pd.to_timedelta(pd.to_datetime(pre_df['elapsed_time'], unit='s').dt.strftime('%H:%M:%S'))

        # convert start_date and start_date_local to datetime
        pre_df['start_date'] = pd.to_datetime(pd.to_datetime(pre_df['start_date']).dt.strftime('%Y-%m-%d %H:%M:%S'))
        pre_df['start_date_local'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m-%d %H:%M:%S'))

        # add start time for analysis and in am/pm format
        pre_df['start_time_local_24h'] = pd.to_datetime(pre_df['start_date_local']).dt.time
        pre_df['start_time_local_12h'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime("%I:%M %p")

        # add day of week
        pre_df['day_of_week'] = pd.to_datetime(pre_df['start_date_local']).dt.day_name()

        # add month
        pre_df['month'] = pd.to_datetime(pre_df['start_date_local']).dt.month_name()

        # add month year
        pre_df['month_year'] = pd.to_datetime(pd.to_datetime(pre_df['start_date_local']).dt.strftime('%Y-%m'))
        
        # add month year name
        pre_df['month_year_name'] = pd.to_datetime(pre_df['start_date_local']).dt.strftime('%b %Y')

        # add year label
        pre_df['year'] = pd.to_datetime(pre_df['start_date_local']).dt.year
        
        pre_df.drop(columns=['start_latlng', 'end_latlng'], inplace=True)
    
        status.update(label='Data is Served!', state='complete', expanded=False)
        
    return pre_df


Fetching page 1
Number of activities fetched: 200
Fetching page 2
Number of activities fetched: 200
Fetching page 3
Number of activities fetched: 200
Fetching page 4
Number of activities fetched: 20
Fetching page 5
Number of activities fetched: 0


In [4]:
refresh_datetime = pd.Timestamp.now()
refresh_datetime

Timestamp('2025-04-12 21:17:42.199365')

In [7]:
refresh_datetime = pd.Timestamp.now()
refresh_datetime = refresh_datetime.strftime('%Y-%m-%d %I:%M %p')

In [None]:
df

In [None]:
df.to_csv('strava_data.csv', index=False)

In [None]:
df_read = pd.read_csv('data/strava_data.csv')

In [None]:
type(df_read['start_date_local'])

pandas.core.series.Series

In [None]:
df['month_year'] = pd.to_datetime(pd.to_datetime(df['start_date_local']).dt.strftime('%Y-%m'))

In [None]:
df

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id_activity,...,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,month,month_year,month_year_name,year
0,2,Evening Run,6.01,0 days 00:58:25,0 days 01:00:13,218.50,Run,Run,,14034808781,...,250.0,,,18:54:10,06:54 PM,Sunday,March,2025-03-01,Mar 2025,2025
1,2,Morning Run,6.00,0 days 01:03:56,0 days 01:05:46,228.35,Run,Run,,14025560419,...,250.0,,,09:49:56,09:49 AM,Saturday,March,2025-03-01,Mar 2025,2025
2,2,Afternoon Run,8.17,0 days 01:22:00,0 days 01:22:08,292.65,Run,Run,,13968170136,...,250.0,,,14:58:53,02:58 PM,Sunday,March,2025-03-01,Mar 2025,2025
3,2,Evening Run,4.01,0 days 00:39:54,0 days 00:40:12,183.07,Run,Run,,13931770836,...,250.0,,,18:39:57,06:39 PM,Wednesday,March,2025-03-01,Mar 2025,2025
4,2,Afternoon Run,4.01,0 days 00:40:04,0 days 00:43:34,164.70,Run,Run,,13901597211,...,250.0,,,13:23:48,01:23 PM,Sunday,March,2025-03-01,Mar 2025,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,2,Afternoon Run,0.98,0 days 00:08:32,0 days 00:08:48,54.79,Run,Run,,3903866794,...,,,,17:02:13,05:02 PM,Friday,August,2020-08-01,Aug 2020,2020
612,2,Evening Run,1.01,0 days 00:10:53,0 days 00:10:56,201.12,Run,Run,,3903866817,...,,,,18:41:31,06:41 PM,Thursday,August,2020-08-01,Aug 2020,2020
613,2,Evening Run,1.02,0 days 00:09:25,0 days 00:09:25,51.51,Run,Run,,3903866790,...,,,,18:38:24,06:38 PM,Friday,July,2020-07-01,Jul 2020,2020
614,2,Morning Walk,1.26,0 days 00:22:22,0 days 00:22:22,30.18,Walk,Walk,,3391765082,...,,,,10:42:38,10:42 AM,Sunday,May,2020-05-01,May 2020,2020


In [None]:
pd.DataFrame(df.sort_values(by='month_year').groupby('month_year').size())

Unnamed: 0_level_0,0
month_year,Unnamed: 1_level_1
2020-05-01,2
2020-07-01,1
2020-08-01,11
2020-09-01,15
2020-10-01,11
2020-11-01,5
2020-12-01,4
2021-01-01,8
2021-02-01,2
2021-03-01,14


In [None]:
# max date
max_date = pd.to_datetime(df['start_date_local']).dt.strftime('%Y-%m-%d %I:%M %p').max()

# distict activity type list
act_type_filter = df['type'].value_counts().index.tolist()
act_type_filter = [activity if activity in ['Run', 'Hike', 'Walk', 'Ride'] else 'Other' for activity in act_type_filter]
act_type_filter = list(dict.fromkeys(act_type_filter))
act_type_filter.insert(0, 'All')
# distinct year list
year_filter = sorted(df['year'].unique().tolist(), reverse=True)
year_filter.insert(0, 'All')
year_filter.insert(1, 'Rolling 12 Months')
# rolling 12 mo variable
today = pd.to_datetime(max_date)
rolling_12_months = today - pd.DateOffset(months=12)

In [8]:
def df_query_builder(act_type_selection, year_selection, gear_selection=None):
    
    conditions = []
    
    # activity type filter
    if act_type_selection == 'All':
        conditions.append("type != 'None'")
    elif act_type_selection == 'Other':
        conditions.append("type not in @highlighted_activities")
    else:
        conditions.append("type == @act_type_selection")

    # year filter
    if year_selection == 'All':
        conditions.append("year != 'None'")
    elif year_selection == 'Rolling 12 Months':
        conditions.append("start_date_local >= @rolling_12_months")
    else:
        conditions.append("year == year_selection")
        
    # TODO gear filter

    query = ' and '.join(conditions)
    
    return df.query(query)

In [None]:
act_type_selection = 'Hike'
year_selection = 2025

In [None]:
conditions = []

if act_type_selection == 'All':
    conditions.append("type != 'None'")
elif act_type_selection == 'Other':
    conditions.append("type not in @highlighted_activities")
else:
    conditions.append("type == @act_type_selection")

In [None]:
conditions

['type == @act_type_selection']

In [None]:
type(pd.to_datetime(df['start_date_local']).dt.month_name())

pandas.core.series.Series

In [None]:
df['month'] = pd.to_datetime(df['start_date_local']).dt.strftime('%B')

In [None]:
df

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id_activity,...,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,month,month_year,month_year_name,year
0,2,Evening Run,6.01,0 days 00:58:25,0 days 01:00:13,218.50,Run,Run,,14034808781,...,250.0,,,18:54:10,06:54 PM,Sunday,March,2025-03-01,Mar 2025,2025
1,2,Morning Run,6.00,0 days 01:03:56,0 days 01:05:46,228.35,Run,Run,,14025560419,...,250.0,,,09:49:56,09:49 AM,Saturday,March,2025-03-01,Mar 2025,2025
2,2,Afternoon Run,8.17,0 days 01:22:00,0 days 01:22:08,292.65,Run,Run,,13968170136,...,250.0,,,14:58:53,02:58 PM,Sunday,March,2025-03-01,Mar 2025,2025
3,2,Evening Run,4.01,0 days 00:39:54,0 days 00:40:12,183.07,Run,Run,,13931770836,...,250.0,,,18:39:57,06:39 PM,Wednesday,March,2025-03-01,Mar 2025,2025
4,2,Afternoon Run,4.01,0 days 00:40:04,0 days 00:43:34,164.70,Run,Run,,13901597211,...,250.0,,,13:23:48,01:23 PM,Sunday,March,2025-03-01,Mar 2025,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,2,Afternoon Run,0.98,0 days 00:08:32,0 days 00:08:48,54.79,Run,Run,,3903866794,...,,,,17:02:13,05:02 PM,Friday,August,2020-08-01,Aug 2020,2020
612,2,Evening Run,1.01,0 days 00:10:53,0 days 00:10:56,201.12,Run,Run,,3903866817,...,,,,18:41:31,06:41 PM,Thursday,August,2020-08-01,Aug 2020,2020
613,2,Evening Run,1.02,0 days 00:09:25,0 days 00:09:25,51.51,Run,Run,,3903866790,...,,,,18:38:24,06:38 PM,Friday,July,2020-07-01,Jul 2020,2020
614,2,Morning Walk,1.26,0 days 00:22:22,0 days 00:22:22,30.18,Walk,Walk,,3391765082,...,,,,10:42:38,10:42 AM,Sunday,May,2020-05-01,May 2020,2020


In [None]:
df['month_year'] = pd.to_datetime(df['start_date_local']).dt.to_period('M').dt.start_time.dt.date

In [None]:
type(df['month_year'])

pandas.core.series.Series

In [19]:
temp_df = df.query("type == 'Run' and year == 2024")
#temp_df = temp_df.sort_values(by='Activities', ascending=False)
temp_df

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,id_activity,start_date,...,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,month,month_year,month_year_name,year
33,2,Afternoon Run,6.00,0 days 00:53:21,0 days 00:53:33,340.55,Run,Run,13203763599,2024-12-27 21:00:42,...,250.0,,,16:00:42,04:00 PM,Friday,December,2024-12-01,Dec 2024,2024
34,2,Afternoon Run,6.01,0 days 00:54:00,0 days 00:54:17,350.39,Run,Run,13189758388,2024-12-25 20:52:02,...,250.0,,,15:52:02,03:52 PM,Wednesday,December,2024-12-01,Dec 2024,2024
35,2,Afternoon Run,4.01,0 days 00:35:59,0 days 00:36:12,152.23,Run,Run,13170261399,2024-12-22 18:08:53,...,250.0,,,13:08:53,01:08 PM,Sunday,December,2024-12-01,Dec 2024,2024
36,2,Afternoon Run,4.21,0 days 00:39:08,0 days 00:39:08,150.92,Run,Run,13164372357,2024-12-21 21:24:40,...,250.0,,,16:24:40,04:24 PM,Saturday,December,2024-12-01,Dec 2024,2024
37,2,Evening Run,4.12,0 days 00:38:49,0 days 00:38:53,263.78,Run,Run,13138821040,2024-12-17 23:08:54,...,250.0,,,18:08:54,06:08 PM,Tuesday,December,2024-12-01,Dec 2024,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,2,Afternoon Run,1.06,0 days 00:08:59,0 days 00:10:06,38.71,Run,Run,10782347431,2024-02-12 22:34:13,...,300.0,,,17:34:13,05:34 PM,Monday,February,2024-02-01,Feb 2024,2024
161,2,Afternoon Run,4.02,0 days 00:35:52,0 days 00:36:37,154.86,Run,Run,10730957918,2024-02-09 22:37:00,...,300.0,,,17:37:00,05:37 PM,Friday,February,2024-02-01,Feb 2024,2024
162,2,Afternoon Run,4.03,0 days 00:36:07,0 days 00:36:21,153.54,Run,Run,10704817389,2024-02-05 22:30:51,...,300.0,,,17:30:51,05:30 PM,Monday,February,2024-02-01,Feb 2024,2024
164,2,Afternoon Run,4.01,0 days 00:34:50,0 days 00:35:07,165.35,Run,Run,10683678247,2024-02-02 22:16:52,...,300.0,,,17:16:52,05:16 PM,Friday,February,2024-02-01,Feb 2024,2024


In [44]:
temp_df = df.groupby(['brand_name', 'name_gear', 'retired']).agg(
    Total_Activities=('upload_id', 'count'),
    Total_Distance=('distance_activity', 'sum'),
    Max_Distance=('distance_activity', 'max'),
    Total_Elevation=('total_elevation_gain', 'sum'),
    Max_Elevation=('total_elevation_gain', 'max'),
    Total_Time=('moving_time', 'sum'),
    Max_Time=('moving_time', 'max'),
    First_Activity_Date=('start_date_local', 'min'),
    Last_Activity_Date=('start_date_local', 'max')).reset_index().sort_values(by=['retired', 'Last_Activity_Date'], ascending=[True, False]).round(2)
temp_df.set_index('brand_name', inplace=True)
temp_df.columns = temp_df.columns.str.replace('_', ' ').str.title()

temp_df

Unnamed: 0_level_0,Name Gear,Retired,Total Activities,Total Distance,Max Distance,Total Elevation,Max Elevation,Total Time,Max Time,First Activity Date,Last Activity Date
brand_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Altra,Altra Lone Peak 9 Waterproof Mid Hiking Boots,False,1,5.82,5.82,1524.28,1524.28,0 days 02:40:49,0 days 02:40:49,2025-04-12 13:55:17,2025-04-12 13:55:17
Merrell,Merrell Vapor Glove 5,False,70,326.38,11.01,16865.46,936.35,2 days 04:52:31,0 days 01:45:23,2024-07-23 20:16:21,2025-04-06 18:06:46
Altra,Altra Lone Peak 6,False,74,294.59,8.73,25712.58,2277.56,2 days 11:47:42,0 days 03:39:37,2023-08-31 19:35:22,2024-11-17 11:15:31
Tern,Tern D7i,False,19,163.65,20.1,5213.27,543.96,0 days 16:52:38,0 days 01:54:03,2024-01-12 13:50:32,2024-10-20 11:38:57
Altra,Altra Escalante 4,False,14,49.04,7.51,1858.26,267.06,0 days 07:35:39,0 days 01:11:20,2024-06-03 17:29:56,2024-07-19 20:30:47
Altra,Altra lone peak 5,True,97,353.31,8.12,29762.11,1713.25,2 days 23:04:54,0 days 03:11:54,2021-12-26 11:29:13,2023-12-08 16:39:04
Altra,Altra Escalate,True,82,311.8,9.43,17275.3,1283.46,2 days 09:09:27,0 days 03:13:40,2022-02-21 15:32:56,2023-10-22 18:15:28
Altra,Altra LONE PEAK ALL-WTHR MID Hiking boots,True,42,262.48,13.72,56642.41,3459.97,4 days 13:24:21,0 days 05:54:38,2021-06-20 11:23:15,2023-09-13 09:35:15
Altra,Altra Olympus,True,8,16.2,4.03,1434.38,312.34,0 days 02:36:01,0 days 00:36:18,2023-03-21 19:04:19,2023-04-12 19:15:37
Altra,Altra Lone Peak 4.5,True,34,152.24,8.02,12934.07,1630.25,1 days 01:54:50,0 days 02:25:25,2021-08-30 17:34:38,2021-12-19 15:31:05


In [14]:
import plotly.express as px

In [22]:
px.box(temp_df, x='name_gear', y='distance_activity', color='name_gear', points='all')

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [32]:
temp_df

In [22]:
temp_df['Time'].dt.total_seconds() / 3600

0    9.706944
1    5.969722
2    8.522778
Name: Time, dtype: float64

In [89]:
df.query("type == 'Run' and year == 2025") \
    .assign(month_year=pd.to_datetime(df['start_date_local']).dt.to_period('M').dt.to_timestamp()) \
    .sort_values(by='month_year') \
    .groupby('month_year_name', as_index=False) \
    .size()

Unnamed: 0,month_year_name,size
0,Feb 2025,7
1,Jan 2025,10
2,Mar 2025,9


In [123]:
df

Unnamed: 0,resource_state_activity,name_activity,distance_activity,moving_time,elapsed_time,total_elevation_gain,type,sport_type,workout_type,id_activity,...,brand_name,model_name,description,notification_distance,frame_type,weight,start_time_local_24h,start_time_local_12h,day_of_week,year
0,2,Evening Run,4.01,0 days 00:39:54,00:40:12,183.07,Run,Run,,13931770836,...,Merrell,Vapor Glove 5,,250.0,,,18:39:57,06:39 PM,Wednesday,2025
1,2,Afternoon Run,4.01,0 days 00:40:04,00:43:34,164.70,Run,Run,,13901597211,...,Merrell,Vapor Glove 5,,250.0,,,13:23:48,01:23 PM,Sunday,2025
2,2,Afternoon Run,4.06,0 days 00:38:52,00:38:56,177.17,Run,Run,,13884288515,...,Merrell,Vapor Glove 5,,250.0,,,17:48:36,05:48 PM,Friday,2025
3,2,Evening Run,2.36,0 days 00:21:18,00:21:18,147.64,Run,Run,,13866802472,...,Merrell,Vapor Glove 5,,250.0,,,19:21:29,07:21 PM,Wednesday,2025
4,2,Afternoon Run,7.09,0 days 01:09:15,01:09:32,320.21,Run,Run,,13838427132,...,Merrell,Vapor Glove 5,,250.0,,,17:22:17,05:22 PM,Sunday,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608,2,Afternoon Run,0.98,0 days 00:08:32,00:08:48,54.79,Run,Run,,3903866794,...,,,,,,,17:02:13,05:02 PM,Friday,2020
609,2,Evening Run,1.01,0 days 00:10:53,00:10:56,201.12,Run,Run,,3903866817,...,,,,,,,18:41:31,06:41 PM,Thursday,2020
610,2,Evening Run,1.02,0 days 00:09:25,00:09:25,51.51,Run,Run,,3903866790,...,,,,,,,18:38:24,06:38 PM,Friday,2020
611,2,Morning Walk,1.26,0 days 00:22:22,00:22:22,30.18,Walk,Walk,,3391765082,...,,,,,,,10:42:38,10:42 AM,Sunday,2020
