# Strava scraper

This is a first attempt at scraping all cycling routes posted on Strava that are located in **Spain**. We only need to download the *gpx* files.

In [1]:
#Importing libraries.

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [5]:
#Initializing our webdriver (Chrome).
#`driver` is the browser session that the Strava cells below navigate and query.

driver = webdriver.Chrome()

In [44]:
#Entering a ride.
#The trailing number is the activity id; the bulk loop further down builds its URLs
#the same way (base URL + id).

driver.get('https://www.strava.com/activities/5798394213')

In [8]:
#Clicking the three-dot menu.
#find_element(By.XPATH, ...) is used because the find_element_by_* helpers were removed in Selenium 4.3.
#The absolute XPath is tied to the current page layout, so it may need updating if Strava changes it.

menu = driver.find_element(By.XPATH, '/html/body/div[2]/div[3]/nav/div/div')
menu.click()

In [9]:
#Downloading the gpx: clicking the third entry of the menu opened in the previous cell.
#find_element(By.XPATH, ...) is used because the find_element_by_* helpers were removed in Selenium 4.3.

gpx = driver.find_element(By.XPATH, '/html/body/div[2]/div[3]/nav/div/div/ul/li[3]')
gpx.click()

In [39]:
#Obtaining the ride type. This will only match "Bicicleta" if it's a bike ride.
#Only the last 9 characters of the heading text are kept, which is the length of "Bicicleta".
#find_element(By.XPATH, ...) is used because the find_element_by_* helpers were removed in Selenium 4.3.

ridetype = driver.find_element(By.XPATH, '//*[@id="heading"]/header/h2/span').text[-9:]
ridetype

'Bicicleta'

In [28]:
#Obtaining the location text shown in the activity heading.
#find_element(By.XPATH, ...) is used because the find_element_by_* helpers were removed in Selenium 4.3.

location = driver.find_element(By.XPATH, '//*[@id="heading"]/div/div/div[1]/div/div/span').text
location

'Londres, Reino Unido'

In [None]:
#Naming schema for download links: the activity URL followed by /export_gpx.
#This cell only records the pattern as a bare string; nothing is requested here.

'http://strava.com/activities/5798394100/export_gpx'

Names of Spanish CCAAs (autonomous communities) or other territories, as they are written on **Strava**:

España
Asturias
Cataluña
Madrid
Aragón
La Rioja


In [84]:
#Place names, spelled the way Strava shows them, that mark a ride as one we want.
#A ride is kept when its location text contains any of these.

names = [
    'España',
    'Asturias',
    'Cataluña',
    'Madrid',
    'Aragón',
    'La Rioja',
    'Andalucía',
    'Comunidad Valenciana',
    'Andorra',
]

In [82]:
#Probing a run of consecutive activity ids and downloading the gpx of every bike ride
#whose location text contains one of the entries in `names`.

base_url = 'https://www.strava.com/activities/'
base_number = 3981642097
ride_type_xpath = '//*[@id="heading"]/header/h2/span'
location_xpath = '//*[@id="heading"]/div/div/div[1]/div/div/span'

start = time.time()
skipped = 0 #Activities that raised while being inspected.

for offset in range(10):
    url = base_url + str(base_number + offset)
    try:
        time.sleep(0.1)
        driver.get(url)
        time.sleep(0.1)
        #The last 9 characters of the heading read "Bicicleta" only for bike rides.
        if driver.find_element(By.XPATH, ride_type_xpath).text[-9:] != 'Bicicleta':
            continue
        ride_location = driver.find_element(By.XPATH, location_xpath).text
        #any() stops at the first matching name, so a location that contains several of
        #them (for instance both a region and 'España') triggers a single download
        #instead of one per match, which would waste the daily download limit.
        if any(name in ride_location for name in names):
            driver.get(url + '/export_gpx')
            time.sleep(0.1)
    except Exception:
        #Best effort: any failure (e.g. the heading elements not being found) skips this id.
        #Catching Exception instead of using a bare except keeps the cell interruptible.
        skipped += 1

stop = time.time()
duration = (stop - start) / 60
print('Minutes:', duration)
print('Skipped:', skipped)

Minutes: 8.989719223976135


While this worked, **Strava**'s daily download limit is too strict for our purposes. Therefore we will be trying our luck with **Garmin Connect** instead.

# Garmin Connect scraper

**Garmin Connect** offers a map of user-submitted routes which we can easily filter. Those routes can be accessed by searching for the location (town name). Let's try downloading the *gpx* files for some routes to see if **Garmin** has implemented scraping protection on its website.

In [2]:
#Initializing our webdriver (Chrome).
#NOTE(review): this rebinds `driver`. If the Strava cells above were run in the same kernel,
#nothing here closes that earlier Chrome session.

driver = webdriver.Chrome()

In [3]:
#Accessing Garmin Connect. At this point we will have to login manually.
#No cell in the notebook enters credentials, so log in through the opened browser window
#before running the next cell.

driver.get('https://connect.garmin.com/modern/')

In [12]:
#Accessing the activity map (the courses page), where the search box and the route links
#used by the following cells are looked up.

driver.get('https://connect.garmin.com/modern/courses')

At this stage I'm setting the activity filters manually, since we're only testing the viability of large-scale *gpx* scraping.

In [13]:
#Clicking on the search box so that it can receive the town name typed in the next cell.
#find_element(By.XPATH, ...) is used because the find_element_by_* helpers were removed in Selenium 4.3.

searchbox = driver.find_element(By.XPATH, '//*[@id="pageContainer"]/div/div[2]/div[1]/div[1]/input')
searchbox.click()

In [14]:
#Typing a town name and hitting enter.
#The short pause sits between typing and submitting; presumably it gives the page time to
#register the text - TODO confirm whether it is still needed.

searchbox.send_keys('Mataró')
time.sleep(0.5)
searchbox.send_keys(Keys.ENTER)

In [15]:
#Accessing the routes displayed on the current map: every element with class "course-link".
#find_elements(By.CLASS_NAME, ...) is used because the find_elements_by_* helpers were removed in Selenium 4.3.

routes = driver.find_elements(By.CLASS_NAME, 'course-link')

In [16]:
#Accessing a single route to check that its href is the course URL.
#Index 10 is an arbitrary pick; it raises IndexError when fewer than 11 routes were found.

routes[10].get_attribute('href')

'https://connect.garmin.com/modern/course/15465472'

In [17]:
#Collecting the href of every route element, in page order, into a list of course links.

route_list = [route.get_attribute('href') for route in routes]

In [19]:
#Inspecting one element of the list: each entry is the course URL taken from a link's href.

route_list[0]

'https://connect.garmin.com/modern/course/53159249'

In [20]:
#Let's open a single route link.
#The next two cells click through this page's menu to reach the download entry.

driver.get(route_list[0])

In [22]:
#Clicking on the dot button to show the download options.
#find_element(By.XPATH, ...) is used because the find_element_by_* helpers were removed in Selenium 4.3.

options = driver.find_element(By.XPATH, '//*[@id="main-card"]/div/div[4]/div[1]/div/button')
options.click()

In [23]:
#Locating the download button and clicking. Success!
#find_element(By.XPATH, ...) is used because the find_element_by_* helpers were removed in Selenium 4.3.

download = driver.find_element(By.XPATH, '//*[@id="main-card"]/div/div[4]/div[1]/div/ul/li[2]/a')
download.click()

## Creating a function to download all routes in our list

Now that we've demonstrated the viability of downloading a *gpx* file, let's try to download all routes from our list until we hit a limit.

In [25]:
#We'll begin by creating a loop that performs the desired operation.
#`start` is the wall-clock reference for the "Seconds:" figure printed at the end of this cell.

start = time.time()

#XPaths of the two controls that have to be clicked on a course page.
OPTIONS_BUTTON_XPATH = '//*[@id="main-card"]/div/div[4]/div[1]/div/button'
DOWNLOAD_LINK_XPATH = '//*[@id="main-card"]/div/div[4]/div[1]/div/ul/li[2]/a'


def _click_when_ready(xpath, max_attempts):
    """Find the element at `xpath` and click it, retrying while that fails.

    Returns True as soon as a click succeeds and False after `max_attempts`
    failed tries.
    """
    for _ in range(max_attempts):
        try:
            element = driver.find_element(By.XPATH, xpath)
            time.sleep(0.2)
            element.click()
            return True
        except Exception:
            #Element missing or not clickable yet: wait and look it up again.
            #Exception (not a bare except) lets KeyboardInterrupt stop the cell.
            time.sleep(0.3)
    return False


def gpx_downloader(link, max_attempts=100, max_page_retries=3):
    """Open a course page and click through its menu to download the gpx.

    Uses the notebook-level `driver`. The options button is clicked first and
    then the download entry of the menu it reveals.

    Parameters
    ----------
    link : str
        URL of the course page.
    max_attempts : int, optional
        How many times each control is looked up and clicked before giving up.
    max_page_retries : int, optional
        How many extra times the page load is retried (5 seconds apart) when
        `driver.get` raises.

    Returns
    -------
    None
        When the page cannot be loaded or a control never becomes clickable, a
        "Skipping" message is printed and the link is abandoned, so that a
        single broken link cannot stall the whole run.
    """
    for _ in range(max_page_retries + 1):
        try:
            driver.get(link) #Accessing the route.
            break
        except Exception:
            time.sleep(5)
    else:
        print('Skipping, page failed to load:', link)
        return

    for xpath in (OPTIONS_BUTTON_XPATH, DOWNLOAD_LINK_XPATH):
        if not _click_when_ready(xpath, max_attempts):
            print('Skipping, control never became clickable:', link)
            return
            
#Running the downloader over every collected course link, one after another.
for route_link in route_list:
    gpx_downloader(route_link)

#Reporting how long the whole batch took, in whole seconds.
stop = time.time()
duration = stop - start
print('Seconds:', int(duration))

Seconds: 208


In [27]:
#Packing the loop into a function that we can use on lists of links.

def downloader(link_list, max_attempts=100, max_page_retries=3):
    """Download the gpx file of every course link in `link_list`.

    For each link the course page is opened with the notebook-level `driver`,
    the options button is clicked and then the download entry of the menu it
    reveals. The total elapsed time is printed as ``Seconds: <int>``.

    Parameters
    ----------
    link_list : iterable of str
        Course page URLs, such as the hrefs collected in `route_list`.
    max_attempts : int, optional
        How many times each control is looked up and clicked before the link
        is abandoned.
    max_page_retries : int, optional
        How many extra times a page load is retried (5 seconds apart) when
        `driver.get` raises.

    Returns
    -------
    None
        Links that cannot be processed are reported with a "Skipping" message
        and do not stop the remaining downloads.
    """
    options_xpath = '//*[@id="main-card"]/div/div[4]/div[1]/div/button'
    download_xpath = '//*[@id="main-card"]/div/div[4]/div[1]/div/ul/li[2]/a'

    def click_when_ready(xpath):
        #Look the element up and click it, retrying a bounded number of times.
        for _ in range(max_attempts):
            try:
                element = driver.find_element(By.XPATH, xpath)
                time.sleep(0.2)
                element.click()
                return True
            except Exception:
                #Exception (not a bare except) lets KeyboardInterrupt stop the run.
                time.sleep(0.3)
        return False

    def gpx_downloader(link):
        #Open the page, then click the options button followed by the download entry.
        for _ in range(max_page_retries + 1):
            try:
                driver.get(link) #Accessing the route.
                break
            except Exception:
                time.sleep(5)
        else:
            print('Skipping, page failed to load:', link)
            return
        for xpath in (options_xpath, download_xpath):
            if not click_when_ready(xpath):
                print('Skipping, control never became clickable:', link)
                return

    start = time.time()

    for link in link_list:
        gpx_downloader(link)

    duration = time.time() - start
    print('Seconds:', int(duration))

In [28]:
#Testing the function on the course links gathered earlier; it prints the elapsed seconds.

downloader(route_list)

Seconds: 267
