# Scrape data

In [None]:
import json
import re
from io import StringIO
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from lxml.html import parse
from tqdm.notebook import tqdm

In [None]:
def query_address(parkruns):
    """Geocode every parkrun name and return its full address.

    Each name is looked up as ``"<name>, Australia"`` through geopy's
    Nominatim geocoder.

    Args:
        parkruns: Iterable of parkrun names.

    Returns:
        dict mapping each parkrun name to the address string of the geocoding
        result, or to ``None`` when the geocoder returned no result.
    """
    from geopy.geocoders import Nominatim
    from geopy.extra.rate_limiter import RateLimiter

    geolocator = Nominatim(user_agent="my_email@my_server.com")
    # Build the limiter ONCE. It spaces calls relative to the previous call
    # made through the same limiter object, so creating a fresh one per
    # iteration would never actually delay anything.
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

    locations = {}
    for parkrun in tqdm(parkruns):
        location = geocode(f'{parkrun}, Australia')
        locations[parkrun] = location.address if location is not None else None

    return locations


def get_state(parkrun, address):
    """Return the abbreviated Australian state/territory for an address.

    Args:
        parkrun: Name of the parkrun; only used in the manual-input prompt.
        address: Full address string, or ``None`` if geocoding failed.

    Returns:
        The state abbreviation (e.g. ``'NSW'``). If no state name is found in
        ``address`` (or ``address`` is ``None``), whatever the user types at
        the interactive prompt is returned instead.
    """
    state_dict = {
        'Australian Capital Territory': 'ACT',
        'New South Wales': 'NSW',
        'Northern Territory': 'NT',
        'Queensland': 'QLD',
        'South Australia': 'SA',
        'Tasmania': 'TAS',
        'Victoria': 'VIC',
        'Western Australia': 'WA',
    }

    if address is not None:
        # Prefer a whole comma-separated component, scanning from the right:
        # the state sits near the end of the address, whereas a street or park
        # called e.g. "Victoria Gardens" sits near the start and must not win.
        for component in reversed(address.split(',')):
            abbreviation = state_dict.get(component.strip())
            if abbreviation is not None:
                return abbreviation

        # Fallback for addresses that are not cleanly comma-separated:
        # plain substring search, first matching state wins.
        for state_name, abbreviation in state_dict.items():
            if state_name in address:
                return abbreviation

    return input(f"Couldn't identify state for {parkrun}\nPlease look up what state {parkrun} is in:\n")

In [None]:
# Browser-like headers sent with every request to the parkrun site.
# NOTE(review): 'br' is advertised in Accept-Encoding - confirm a Brotli
# decoder is installed, otherwise compressed responses may not be decodable.
headers = {
    'Host': 'www.parkrun.com.au',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-GPC': '1',
    'TE': 'Trailers'
}

# One shared session for all requests, with the headers installed as defaults.
s = requests.Session()
s.headers.update(headers)

In [None]:
# get names of all Australian parkruns
url = 'https://www.parkrun.com.au/results/courserecords/'
result = s.get(url, headers=headers)
# Fail loudly on an HTTP error (e.g. the site blocking the scraper) instead of
# trying to parse an error page as the results table.
result.raise_for_status()
# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas versions.
# The first column of the first table on the page holds the event names.
parkruns = pd.read_html(StringIO(result.text))[0].iloc[:, 0]
parkruns

0          Airlie Beach
1      Albert Melbourne
2        Albury Wodonga
3         Aldinga Beach
4          Altona Beach
             ...       
443             Yeppoon
444              Yokine
445           You Yangs
446               Yowie
447            Zillmere
Name: (Unnamed: 0_level_0, Event), Length: 448, dtype: object

In [None]:
# Load the parkrun -> state mapping from the cache file if it exists;
# otherwise build it by geocoding every parkrun and cache the result.
if Path('states.json').exists():
    with open('states.json', 'r') as f:
        locations = json.load(f)

    # Make the entry reachable under the typographic-apostrophe spelling too
    # (presumably the spelling used in the scraped parkrun names - verify).
    # Guarded so a cache file without the entry does not raise KeyError.
    if 'Nolen\'s Park' in locations:
        locations['Nolen’s Park'] = locations['Nolen\'s Park']

else:
    # query full address from parkrun name
    addresses = query_address(parkruns)

    # extract state from address; if the state can't be found, request manual input
    locations = {
        parkrun: get_state(parkrun, address)
        for parkrun, address in addresses.items()
    }

    # save for later usage
    with open('states.json', 'w') as f:
        json.dump(locations, f)

In [None]:
# grab age and gender categories
# Build the URL slug the same way the per-parkrun scraping loop does
# (lower-case, letters only) rather than putting the display name
# (e.g. "Airlie Beach") straight into the URL.
first_parkrun = re.sub('[^a-z]', '', parkruns[0].lower())
url = f'https://www.parkrun.com.au/{first_parkrun}/results/agecategoryrecords/'

result = s.get(url, headers=headers, stream=True)
# Stop here on an HTTP error; parsing an error page would silently yield an
# empty category list and the scraping loop would then collect nothing.
result.raise_for_status()
# Have the raw stream decompress the body before lxml reads from it.
result.raw.decode_content = True
tree = parse(result.raw)

# hrefs of the links in the first column of the table body
categories = tree.xpath('//tbody/tr/td[1]/a/@href')

In [None]:
# find people's PBs per agegroup for every parkrun and save as csv
non_letters = re.compile('[^a-z]')  # compiled once, reused for every parkrun

# to_csv does not create missing directories, so make sure the target exists
Path('data').mkdir(parents=True, exist_ok=True)

for parkrun in tqdm(parkruns):

    # URL slug: lower-cased name with everything but a-z removed
    url_name = non_letters.sub('', parkrun.lower())
    # this event is special-cased to a shortened slug
    if url_name == 'frankstonnatureconservationreserve':
        url_name = 'frankstonnatureconservationres'

    pr_path = Path(f'data/{url_name}.csv')

    # already scraped on a previous run
    if pr_path.exists():
        continue

    dfs_ = []

    for category in tqdm(categories):
        url = f'https://www.parkrun.com.au/{url_name}/results/agecategoryrecords/{category}'
        age_results = s.get(url, headers=headers)
        try:
            # StringIO: literal HTML strings are deprecated in recent pandas
            df = pd.read_html(StringIO(age_results.text))[0]
        except Exception:
            # Best effort: skip this category. `Exception` rather than a bare
            # `except` so interrupting the kernel still stops the loop.
            # (Message, in Russian: "ERROR - operation ended early.
            # Parkrun has temporarily blocked [us].")
            print('ОШИБКА - операция завершилась досрочно. Паркран временно заблокировал.')
            continue
        df['category'] = category.split('=')[1]
        dfs_.append(df)

    if not dfs_:
        # Nothing was scraped for this parkrun. pd.concat would raise on an
        # empty list, and writing no file means it is retried on the next run.
        continue

    dfs_ = pd.concat(dfs_)
    dfs_['parkrun'] = parkrun
    dfs_['state'] = locations[parkrun]

    dfs_.to_csv(pr_path, index=False)

  0%|          | 0/448 [00:00<?, ?it/s]