In [4]:
# Headers passed as `headers=` to the requests.get calls in this notebook.
# The User-Agent imitates a desktop Chrome browser -- presumably because the
# site rejects the default python-requests agent (TODO confirm).
req_headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/104.0.0.0 Safari/537.36"
    )
}

In [9]:
import requests
import re

# Region codes substituted into the observations index URL template.
region_codes = ("nsw", "nt", "qld", "sa", "tas", "vic", "wa")

# One index page URL per region, in the order listed above.
obs_index_urls = []
for region in region_codes:
    obs_index_urls.append(
        f"http://www.bom.gov.au/{region}/observations/{region}all.shtml"
    )

# Show the URLs one per line, followed by how many were built.
print("\n".join(obs_index_urls), "\n", len(obs_index_urls))

http://www.bom.gov.au/nsw/observations/nswall.shtml
http://www.bom.gov.au/nt/observations/ntall.shtml
http://www.bom.gov.au/qld/observations/qldall.shtml
http://www.bom.gov.au/sa/observations/saall.shtml
http://www.bom.gov.au/tas/observations/tasall.shtml
http://www.bom.gov.au/vic/observations/vicall.shtml
http://www.bom.gov.au/wa/observations/waall.shtml 
 7


In [11]:
# Scrape every regional index page for the relative links to individual
# station observation pages and collect them in `obs_page_urls`.
obs_page_urls = []

# Matches /products/ID[A-Z]60801/ID[A-Z]60801.9[0-9]{4}.shtml
# The dots are escaped so they only match a literal "."; unescaped, "." would
# match any character and could accept malformed paths.
station_page_re = re.compile(
    r"/products/ID[A-Z]60801/ID[A-Z]60801\.9[0-9]{4}\.shtml"
)

for url in obs_index_urls:
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    r = requests.get(url, headers=req_headers, timeout=10)
    if r.status_code != 200:
        # Report the failing page and stop; later pages are not attempted.
        print(r.status_code, url)
        break

    matches = station_page_re.findall(r.text)
    if not matches:
        # A 200 page without any station links is unexpected: dump it for
        # inspection and stop.
        print(r.status_code, url)
        print(r.text)
        break

    obs_page_urls.extend(matches)

print("\n".join(obs_page_urls), "\n", len(obs_page_urls))

/products/IDN60801/IDN60801.94596.shtml
/products/IDN60801/IDN60801.94599.shtml
/products/IDN60801/IDN60801.94573.shtml
/products/IDN60801/IDN60801.94592.shtml
/products/IDN60801/IDN60801.94598.shtml
/products/IDN60801/IDN60801.95571.shtml
/products/IDN60801/IDN60801.95570.shtml
/products/IDN60801/IDN60801.94572.shtml
/products/IDN60801/IDN60801.94582.shtml
/products/IDN60801/IDN60801.94589.shtml
/products/IDN60801/IDN60801.95729.shtml
/products/IDN60801/IDN60801.94789.shtml
/products/IDN60801/IDN60801.94785.shtml
/products/IDN60801/IDN60801.94783.shtml
/products/IDN60801/IDN60801.94799.shtml
/products/IDN60801/IDN60801.95784.shtml
/products/IDN60801/IDN60801.95771.shtml
/products/IDN60801/IDN60801.94782.shtml
/products/IDN60801/IDN60801.95767.shtml
/products/IDN60801/IDN60801.95772.shtml
/products/IDN60801/IDN60801.94650.shtml
/products/IDN60801/IDN60801.95774.shtml
/products/IDN60801/IDN60801.95754.shtml
/products/IDN60801/IDN60801.94758.shtml
/products/IDN60801/IDN60801.95747.shtml


In [19]:
# Turn each relative station page path (e.g. "/products/X/X.94596.shtml")
# into the URL of the matching JSON feed under /fwo/.
urls = []
for page_path in obs_page_urls:
    # Drop everything from the last "." onwards (the file extension).
    stem = page_path.rpartition(".")[0]
    # Discard the first two "/"-separated pieces (the empty string before the
    # leading slash and the first directory) and keep the rest of the path.
    tail = "/".join(stem.split("/")[2:])
    urls.append("http://www.bom.gov.au/fwo/" + tail + ".json")

print("\n".join(urls), "\n", len(urls))

http://www.bom.gov.au/fwo/IDN60801/IDN60801.94596.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94599.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94573.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94592.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94598.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.95571.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.95570.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94572.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94582.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94589.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.95729.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94789.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94785.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94783.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94799.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.95784.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.95771.json
http://www.bom.gov.au/fwo/IDN60801/IDN60801.94782.json
http://www

In [22]:
import concurrent.futures
import json
import os
import time

import requests

# Settings for the concurrent download further down.
TIMEOUT = 5  # forwarded to requests.get as its timeout (seconds)
CONNECTIONS = 10  # max_workers for the thread pool
out = []  # not referenced by any of the visible cells


def load_url(url, timeout, out_dir="C:\\Code\\data-sciencey\\bom_json_obs\\"):
    """Fetch one JSON document and save it to disk.

    Args:
        url: Address to request. Its last "/"-separated segment is used as
            the output file name.
        timeout: Forwarded to ``requests.get`` as the request timeout.
        out_dir: Directory the file is written into. Defaults to the location
            this notebook originally hard-coded, so existing two-argument
            calls behave as before.

    Returns:
        ``(status_code, url)`` when the request completed, whether or not the
        status was 200. For a 200 response this means the body was parsed and
        written successfully.
        ``(exception_type_string, status_code, url)`` when a 200 response
        could not be parsed as JSON or the file could not be written.

    Raises:
        Any exception raised by ``requests.get`` itself (for example on a
        timeout) is not caught here and propagates to the caller.
    """
    r = requests.get(url, headers=req_headers, timeout=timeout)
    if r.status_code != 200:
        # Nothing to save; just report the status.
        return (r.status_code, url)

    try:
        data = r.json()
        target = os.path.join(out_dir, url.split("/")[-1])
        with open(target, "w") as f:
            json.dump(data, f)
    except Exception as e:
        # Best-effort: report the failure type instead of raising so that one
        # bad document does not abort a batch of downloads.
        return (str(type(e)), r.status_code, url)
    return (r.status_code, url)


# Try the downloader on the first URL only and show what it returns.
first_result = load_url(urls[0], TIMEOUT)
print(first_result)

(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94596.json')


In [23]:
# Download every URL on a thread pool and print each outcome as it completes.
with concurrent.futures.ThreadPoolExecutor(max_workers=CONNECTIONS) as executor:
    time1 = time.time()
    # Keep a real future -> URL mapping so that a failed download can be
    # attributed to the URL that caused it.
    future_to_url = {
        executor.submit(load_url, url, TIMEOUT): url for url in urls
    }
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            outcome = future.result()
        except Exception as exc:
            # load_url lets request errors (e.g. timeouts) propagate; report
            # the exception type together with the URL it belongs to.
            outcome = (str(type(exc)), future_to_url[future])
        print(outcome)
    time2 = time.time()

print(f"Took {time2-time1:.2f} s")

(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94596.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94599.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94573.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94592.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94598.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94582.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.95570.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.95571.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94589.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.95729.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94572.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94789.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94785.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.94799.json')
(200, 'http://www.bom.gov.au/fwo/IDN60801/IDN60801.95784.json')
(200, 'http://www.bom.gov.au/fwo/IDN6080

In [24]:
import genson

# Feed every downloaded observation file into one genson SchemaBuilder and
# write the resulting schema next to the data files.
obs_dir = "C:\\Code\\data-sciencey\\bom_json_obs\\"

s = genson.SchemaBuilder()
for url in urls:
    # Files are named after the last path segment of the URL they came from.
    file_name = url.split("/")[-1]
    with open(obs_dir + file_name, "r") as f:
        data = json.load(f)
        s.add_object(data)

with open(obs_dir + "bom.schema.json", "w") as f:
    # Round-trip through to_json() to get a plain dict, then pretty-print it.
    schema_text = s.to_json()
    schema = json.loads(schema_text)
    json.dump(schema, f, indent=4)