In [1]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
from threading import Timer
import pandas as pd
import os
import json
import time

## Define data locations

In [2]:
# Relative path of the JSON file the demo cell loads the park -> URL mapping from
loc_park_urls = os.path.join(
    'auxiliary',
    'parks.json',
)

## Generally useful functions

In [3]:
def create_start_times(freq = '5min', num_days = 365, first_hour = 9, last_hour = 22):
    """Build the list of timestamps at which a scrape should be started.

    The schedule begins tomorrow (relative to the local clock) and covers
    ``num_days`` consecutive days. On every day, timestamps are generated from
    ``first_hour``:00 up to and including ``last_hour``:00, spaced by ``freq``.

    Parameters
    ----------
    freq : str
        Pandas frequency string giving the spacing between two scrapes
        within one day (e.g. ``'5min'``).
    num_days : int
        Number of consecutive days to schedule.
    first_hour, last_hour : int
        Hour of the first and of the last possible scrape on each day.

    Returns
    -------
    list of pandas.Timestamp
        All start times in chronological order.
    """
    # Adding a timedelta rolls over month and year boundaries correctly;
    # ``now.replace(day = now.day + 1)`` raises ValueError on the last day
    # of a month.
    tomorrow = datetime.today() + timedelta(days = 1)
    start_day = tomorrow.replace(hour = 0, minute = 0, second = 0, microsecond = 0)
    days = pd.date_range(start_day, periods = num_days, freq = '1D')

    start_times = list()
    for day in days:
        log_times = pd.date_range(
            start = day.replace(hour = first_hour),
            end = day.replace(hour = last_hour),
            freq = freq  # honour the caller's frequency instead of a hard-coded one
        )
        start_times.extend(log_times)
    return start_times

def scrape_wait_times(start_time, park, url):
    """Download the wait-time page of one park and store its ride table as CSV.

    The page at ``url`` is fetched and the ``<table id="rides">`` element is
    parsed. For every ride the name, the wait time in minutes (0 when the
    cell contains no digits) and the state text are collected and written to
    ``data/<park>/<YYYYmmdd_HHMM>_<park>.csv``.

    Parameters
    ----------
    start_time : datetime-like
        Scheduled time of this scrape; only used for logging and for the
        file name (must support ``strftime``).
    park : str
        Park name, used as folder name and file name suffix.
    url : str
        Address of the page that holds the rides table.

    Returns
    -------
    None
        The result is written to disk. Nothing is written when the page has
        no rides table.
    """
    print(f"Scraping {park} at {start_time}")
    # Without a timeout a stalled connection would block this thread forever.
    page = requests.get(url, timeout = 30)
    soup = BeautifulSoup(page.content, "html.parser")

    table_waiting = soup.find('table', id = 'rides')
    if table_waiting is None:
        # Page layout changed or an error page was served; nothing to parse.
        print(f"Found no rides table for {park} at {start_time}")
        return

    # The state cells carry a class 'state state_<i>'; use the first i in
    # 1..9 that yields any cells.
    states = list()
    for i in range(1, 10):
        states = table_waiting.find_all('td', class_ = f'state state_{i}')
        if states:
            break
    else:
        print(f"Found no valid table for state state_i for {park}")

    names = table_waiting.find_all('td', class_ = 'name')
    waittimes = table_waiting.find_all('td', class_ = 'waittime')

    rows = list()
    # zip stops at the shortest of the three cell lists
    for name_html, waittime_html, state_html in zip(names, waittimes, states):
        digits = re.sub("[^0-9]", "", waittime_html.getText())
        rows.append({
            "name" : name_html.getText(),
            "waittime (min)" : int(digits) if digits else 0,
            "state" : state_html.getText()
        })

    df = pd.DataFrame(rows)

    foldername = os.path.join("data", park)
    filename = start_time.strftime(f"%Y%m%d_%H%M_{park}.csv")
    # exist_ok avoids the check-then-create race between concurrently
    # running scrape threads that target the same folder.
    os.makedirs(foldername, exist_ok = True)
    df.to_csv(os.path.join(foldername, filename))
    return

## Demo

In [None]:
# Read all urls
with open(loc_park_urls, "r") as park_url_file:
    park_urls = json.load(park_url_file)

# Select start times
start_times = create_start_times(freq = '5min')

# Slots that are missed by more than this many seconds are skipped, so the
# CSV files are never labelled with a time far from the actual scrape.
max_lateness_seconds = 60

def run_schedule(start_times, park_urls):
    """Wait for each start time in turn and launch one scrape per park.

    A single scheduler thread sleeps between slots; creating one Timer per
    slot and park up front would keep tens of thousands of threads alive.
    """
    for start_time in start_times:
        # total_seconds() includes the day component, .seconds does not.
        num_seconds = (start_time - datetime.now()).total_seconds()
        if num_seconds < -max_lateness_seconds:
            continue
        if num_seconds > 0:
            time.sleep(num_seconds)
        for park, url in park_urls.items():
            # Scrape the parks concurrently so a slow site does not delay the others.
            t = Timer(0, scrape_wait_times, [start_time, park, url])
            t.start()

# Run the schedule in a background thread so the cell returns immediately.
scheduler = Timer(0, run_schedule, [start_times, park_urls])
scheduler.start()