# Scrape data

In [1]:
import json
import re
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from lxml.html import parse
from tqdm.notebook import tqdm

from helper import *

In [2]:
# create one shared session; every request below goes through it
s = requests.Session()

# browser-like request headers sent to www.parkrun.com.au
headers = {
    'Host': 'www.parkrun.com.au',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
    # 'br' is deliberately not advertised: requests only decodes Brotli when an
    # optional brotli package is installed, and an undecoded body would reach
    # the HTML parsers as garbage
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-GPC': '1',
    'TE': 'Trailers'
}

# add headers
s.headers.update(headers)

In [3]:
# get names of all Australian parkruns
url = 'https://www.parkrun.com.au/results/courserecords/'
result = s.get(url, headers=headers)
# stop here on an HTTP error instead of handing an error page to the parser
result.raise_for_status()

# read_html wants a file-like object (literal HTML strings are deprecated);
# the event names are the first column of the first table on the page
parkruns = pd.read_html(StringIO(result.text))[0].iloc[:, 0].rename('Event')
parkruns.head()

0        Airlie Beach
1    Albert Melbourne
2      Alberton Ascot
3      Albury Wodonga
4       Aldinga Beach
Name: Event, dtype: object

In [14]:
# Map each event's url name to its full display name (and back), then save the
# forward mapping for downstream analysis.
non_lowercase = re.compile('[^a-z]')

# url names that differ from the "lower-case letters only" form of the full name
url_name_overrides = {
    'frankstonnatureconservationreserve': 'frankstonnatureconservationres',
}

fullname = {}
for parkrun in parkruns:
    stripped = non_lowercase.sub('', parkrun.lower())
    url_name = url_name_overrides.get(stripped, stripped)
    fullname[url_name] = parkrun

# reverse lookup: full display name -> url name
fullname_inv = dict(zip(fullname.values(), fullname.keys()))

with open('../data/fullname.json', 'w') as f:
    json.dump(fullname, f)

In [4]:
# Map every parkrun to its state for later analysis, reusing the cached JSON
# file when one is already on disk.
states_path = Path('../data/states.json')

if not states_path.exists():
    # nothing cached yet: query the full address for each parkrun name
    addresses = query_address(parkruns)

    # reduce each address to a state; per the helper's contract get_state asks
    # for manual input when no state can be found (not verifiable from here)
    locations = {
        name: get_state(name, full_address)
        for name, full_address in addresses.items()
    }

    # save for later usage
    with states_path.open('w') as f:
        json.dump(locations, f)

else:
    with states_path.open('r') as f:
        locations = json.load(f)

In [5]:
# grab age and gender categories from a random parkrun (St Peters)
url = 'https://www.parkrun.com.au/stpeters/results/agecategoryrecords/'

# stream the body so lxml can parse it directly from the raw response; the
# context manager releases the connection as soon as parsing is finished
with s.get(url, headers=headers, stream=True) as result:
    result.raise_for_status()
    # make the raw stream undo the transfer compression before lxml reads it
    result.raw.decode_content = True
    tree = parse(result.raw)

# hrefs of the links in the first cell of every table-body row
categories = tree.xpath('//tbody/tr/td[1]/a/@href')

# an empty list would only surface later as an obscure pd.concat error
if not categories:
    raise RuntimeError(f'No age categories found at {url}')

In [15]:
# find people's PBs per agegroup for every parkrun and save as csv
for parkrun in tqdm(parkruns):

    # look up the url form of the parkrun name
    url_name = fullname_inv[parkrun]
    pr_path = Path(f'../data/{url_name}.csv')

    # check if data has already been scraped
    if pr_path.exists():
        # TODO save as parquet as well
        continue

    # one frame per category that returned a results table
    category_frames = []

    # leave=False clears each inner bar so finished events do not pile up
    for category in tqdm(categories, leave=False):
        url = f'https://www.parkrun.com.au/{url_name}/results/agecategoryrecords/{category}'
        age_results = s.get(url, headers=headers)
        try:
            # read_html wants a file-like object (literal HTML strings are deprecated)
            tables = pd.read_html(StringIO(age_results.text))
            df = tables[0].drop(columns=['Rank', 'Club'])
        except Exception:
            # Exception rather than a bare except: a kernel interrupt
            # (KeyboardInterrupt) must still be able to stop the scrape
            print('No data retrieved for %s in %s' % (category, parkrun))
            continue
        # the category label is the part of the href after '='
        df['category'] = category.split('=')[1]
        category_frames.append(df)

    # pd.concat raises on an empty list; skip the event instead of aborting the run
    if not category_frames:
        print('No data retrieved for %s' % parkrun)
        continue

    dfs_ = pd.concat(category_frames)

    # TODO set dtypes before saving
    dfs_.to_csv(pr_path, index=False)

  0%|          | 0/455 [00:00<?, ?it/s]