In [6]:
import asyncio
import json

import httpx
import pandas as pd
from tqdm.notebook import tqdm

from onebuilding_scraper.utils import (
    download_zip_and_unzip,
    generate_paths,
    make_row_dict,
)

root_url = "https://climate.onebuilding.org"
# Only 10 connections may be open at once, so a queued request can wait a long
# time for a free slot in the pool. Keep the other timeouts at 10 s but let the
# pool wait last up to 10800 s (3 h).
pool_timeout = httpx.Timeout(10.0, pool=10800)
# The timeout must be handed to the client explicitly; without `timeout=` the
# client falls back to httpx's defaults and `pool_timeout` has no effect.
client = httpx.AsyncClient(
    base_url=root_url,
    limits=httpx.Limits(max_connections=10),
    timeout=pool_timeout,
)


In [7]:
fetch = True
if fetch:
    files = await generate_paths(client)
    with open("paths.json", "w") as f:
        json.dump(sorted(files), f, indent=4)
else:
    with open("paths.json") as f:
        files = json.load(f)

100%|██████████| 7/7 [00:00<00:00, 66.94it/s]
100%|██████████| 263/263 [00:09<00:00, 27.37it/s]


In [3]:
hop_size = 1000
all_exit_codes = []
for ix in tqdm(range(0, len(files), hop_size)):
    exit_codes = [
        e
        for e in await asyncio.gather(*[
            download_zip_and_unzip(file) for file in files[ix : ix + hop_size]
        ])
    ]
    all_exit_codes.extend(exit_codes)
    errors_fetching = len([e for e in exit_codes if e[0] == -1])
    pulled = len([e for e in exit_codes if e[0] == 0])
    other = len([e for e in exit_codes if e[0] != 0 and e[0] != -1])
    print(f"Errors: {errors_fetching}, Pulled: {pulled}, Other: {other}")

  0%|          | 0/87 [00:00<?, ?it/s]

Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 988, Other: 12
Errors: 0, Pulled: 993, Other: 7
Errors: 0, Pulled: 990, Other: 10
Errors: 0, Pulled: 987, Other: 13
Errors: 0, Pulled: 982, Other: 18
Errors: 0, Pulled: 996, Other: 4
Errors: 0, Pulled: 990, Other: 10
Errors: 0, Pulled: 982, Other: 18
Errors: 1, Pulled: 997, Other: 2
Errors: 1, Pulled: 995, Other: 4
Errors: 0, Pulled: 988, Other: 12
Errors: 0, Pulled: 997, Other: 3
Errors: 0, Pulled: 988, Other: 12
Errors: 1, Pulled: 991, Other: 8
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 45, Pulled: 945, Other: 10
Errors: 0, Pulled: 997, Other: 3
Errors: 0, Pulled: 997, Other: 3
Errors: 0, Pulled: 997, Other: 3
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 998

In [4]:
# Number of results whose status code (first element) is anything other than 0.
count = sum(1 for result in all_exit_codes if result[0] != 0)
count

556

In [5]:
# Build one metadata row per result whose status code is 0, then assemble the
# rows into a single DataFrame.
all_rows = []
failed_rows = []  # (payload, exception) pairs for rows that could not be built
for result in tqdm(all_exit_codes, total=len(all_exit_codes)):
    if result[0] != 0:
        continue
    try:
        # result[1] is whatever download_zip_and_unzip returned alongside the
        # status code -- presumably the extracted file's path; TODO confirm.
        all_rows.append(make_row_dict(result[1]))
    except Exception as err:  # best effort: keep going, but remember the failure
        failed_rows.append((result[1], err))

# Build the frame once, after the loop. The previous `if i % 1000:` check was
# true for every index that is NOT a multiple of 1000, so the frame was rebuilt
# on almost every iteration, and `df` was never defined for 0 or 1 results.
df = pd.DataFrame(all_rows)
if failed_rows:
    print(f"Skipped {len(failed_rows)} results that make_row_dict could not parse")

  0%|          | 0/86361 [00:00<?, ?it/s]

In [6]:
# Write the metadata table to CSV, leaving out the DataFrame index column.
df.to_csv(path_or_buf="epw_metadata_wkt.csv", index=False)