In [1]:
import asyncio
import json
import os
import re
import shutil
from pathlib import Path

import httpx
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm as atqdm
from tqdm.notebook import tqdm

root_url = "https://climate.onebuilding.org"
# Only 10 connections are pooled while many requests are queued at once, so a
# request may wait a long time for a free connection. The pool timeout is
# therefore far larger than the 10 s used for the other timeout phases.
pool_timeout = httpx.Timeout(10.0, pool=10800)
# The timeout must be handed to the client explicitly; otherwise httpx falls
# back to its default timeouts and the configuration above has no effect.
client = httpx.AsyncClient(
    base_url=root_url,
    limits=httpx.Limits(max_connections=10),
    timeout=pool_timeout,
)


async def get_subregions(url: str):
    """Collect the sub-region pages linked from a region index page.

    The page at ``url`` is fetched with the shared ``client``. For every
    ``<td>`` element the first ``<a>`` inside it is inspected, and its ``href``
    is kept when it ends with ``html``.

    Args:
        url: Location of the region page, passed to ``client.get`` as-is.

    Returns:
        list[str]: POSIX-style paths, each ``href`` joined onto the parent
        directory of ``url``.
    """
    res = await client.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    parent = Path(url).parent
    regions = []
    for cell in soup.find_all("td"):
        a = cell.find("a")
        # Anchors without an href attribute are skipped rather than raising
        # KeyError, which would abort every request gathered alongside this one.
        href = a.get("href") if a else None
        if href and href.endswith("html"):
            regions.append((parent / href).as_posix())

    return regions


async def get_file_list(url: str):
    """List the zip archives linked from a sub-region page.

    The page at ``url`` is fetched with the shared ``client`` and the table
    whose ``summary`` attribute is "file table" is searched for links whose
    ``href`` matches the zip pattern.

    Args:
        url: Location of the sub-region page, passed to ``client.get`` as-is.

    Returns:
        list[str]: POSIX-style paths, each ``href`` joined onto the parent
        directory of ``url``. Empty when the page has no file table.
    """
    res = await client.get(url)
    soup = BeautifulSoup(res.content, "html.parser")
    # The listing is identified by its summary attribute, not by a CSS class.
    table = soup.find("table", summary="file table")
    if table is None:
        # A page without a file table contributes no files instead of raising
        # AttributeError and aborting the whole crawl.
        return []
    parent = Path(url).parent
    a_tags = table.find_all("a", href=re.compile(r".*\.zip"))
    return [(parent / tag["href"]).as_posix() for tag in a_tags]


# Root folder for downloaded archives. Set the ONEBUILDING_OUTPUT_DIR environment
# variable to relocate it; the previous hard-coded location remains the default.
output_dir = Path(os.environ.get("ONEBUILDING_OUTPUT_DIR", "D:/onebuilding"))

    
def make_row_dict(path):
    """Build one metadata record from an EPW file's first line and file name.

    Only the first line of the file is read and split on commas. City,
    province and country come from fields 1-3; latitude, longitude and time
    zone come from the fourth-, third- and second-to-last fields. Everything
    else is derived from the file stem.

    Args:
        path: Location of the EPW file; stored unchanged under ``"path"``.

    Returns:
        dict: The record. ``wmo`` is the six digits following the last
        matching dot in the stem (or None). ``year`` is set when the stem
        holds exactly one 19xx/20xx year, ``start_year``/``end_year`` when it
        holds exactly two; otherwise those entries are None.
    """
    with open(path, "r") as f:
        header_fields = f.readline().split(",")

    city = header_fields[1]
    province = header_fields[2]
    country = header_fields[3]
    lat, lon = float(header_fields[-4]), float(header_fields[-3])
    tz = float(header_fields[-2])

    name = Path(path).stem
    lowered = name.lower()

    # Greedy prefix: the last ".dddddd" occurrence in the stem wins.
    wmo_match = re.match(r".*\.(\d{6})", name)
    wmo = wmo_match.group(1) if wmo_match is not None else None

    # Four-digit years starting with 19 or 20 that are not part of a longer number.
    years = re.findall(r"(?<![0-9])(?:20|19)\d{2}(?![0-9])", name)
    year = start_year = end_year = None
    if len(years) == 2:
        start_year, end_year = int(years[0]), int(years[1])
    elif len(years) == 1:
        year = int(years[0])

    return {
        "name": name,
        "location": f"POINT({lon} {lat})",
        "path": path,
        "country": country,
        "province": province,
        "city": city,
        "lat": lat,
        "lon": lon,
        "wmo": wmo,
        "tz": tz,
        "TM3": "tmy3" in lowered,
        "TMx": "tmyx" in lowered,
        "year": year,
        "start_year": start_year,
        "end_year": end_year,
    }




async def download_zip_and_unzip(file):
    """Download one zip archive and unpack it below ``output_dir``.

    For ``file`` the archive is saved as ``output_dir / file`` and unpacked
    into a sibling folder named after the archive's stem, after which the zip
    is deleted. When the expected ``<stem>.epw`` already exists in that folder
    nothing is downloaded.

    Args:
        file: Location of the zip archive, passed to ``client.get`` as-is and
            also used as the relative output path.

    Returns:
        tuple: ``(0, out_epw)`` when the EPW was already present or the archive
        was unpacked; ``(-1, exc)`` when the request failed or
        ``raise_for_status`` rejected the response; ``(-2, exc)`` when writing
        or unpacking the archive failed.
    """
    out_zip = output_dir / Path(file)
    out_folder = out_zip.parent / out_zip.stem
    out_epw = out_folder / f"{out_zip.stem}.epw"

    if out_epw.exists():
        # Already unpacked: just yield briefly to the event loop.
        await asyncio.sleep(0.01)
        return (0, out_epw)

    out_zip.parent.mkdir(parents=True, exist_ok=True)
    try:
        res = await client.get(file)
        # Report an HTTP error as a fetch failure instead of saving the error
        # page as a zip and failing later while unpacking.
        res.raise_for_status()
    except Exception as e:
        return (-1, e)

    try:
        with open(out_zip, "wb") as f:
            f.write(res.content)
        out_folder.mkdir(parents=True, exist_ok=True)
        shutil.unpack_archive(out_zip, out_folder)
        out_zip.unlink()
    except Exception as e:
        # Clean up partial results. The folder may not exist yet if the failure
        # happened while writing the zip, so removal must not raise.
        out_zip.unlink(missing_ok=True)
        shutil.rmtree(out_folder, ignore_errors=True)
        return (-2, e)

    return (0, out_epw)



In [2]:
fetch = False
if fetch:
    home = await client.get("/default.html")
    soup = BeautifulSoup(home.content, "html.parser")
    # find all a tags with hrefs that start with "WMO_REGION_"
    regions = list({a["href"] for a in soup.find_all("a", href=re.compile(r"WMO_Region_"))})
    subregion_promises = [get_subregions(region) for region in regions]
    subregions = [r for region in await atqdm.gather(*subregion_promises) for r in region]
    file_promises = [get_file_list(subregion) for subregion in subregions]
    files = [f for subregion in await atqdm.gather(*file_promises) for f in subregion]
    with open("paths.json", 'w') as f:
        json.dump(files, f)
else:
    with open("paths.json", 'r') as f:
        files = json.load(f)

In [3]:
hop_size = 1000
all_exit_codes = []
for ix in tqdm(range(0, len(files), hop_size)):
    exit_codes = [e for e in await asyncio.gather(*[download_zip_and_unzip(file) for file in files[ix:ix+hop_size]])]
    all_exit_codes.extend(exit_codes)
    errors_fetching = len([e for e in exit_codes if e[0] == -1])
    pulled = len([e for e in exit_codes if e[0] == 0])
    other = len([e for e in exit_codes if e[0] != 0 and e[0] != -1])
    print(f"Errors: {errors_fetching}, Pulled: {pulled}, Other: {other}")

  0%|          | 0/87 [00:00<?, ?it/s]

Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 988, Other: 12
Errors: 0, Pulled: 993, Other: 7
Errors: 0, Pulled: 990, Other: 10
Errors: 0, Pulled: 987, Other: 13
Errors: 0, Pulled: 982, Other: 18
Errors: 0, Pulled: 996, Other: 4
Errors: 0, Pulled: 990, Other: 10
Errors: 0, Pulled: 982, Other: 18
Errors: 1, Pulled: 997, Other: 2
Errors: 1, Pulled: 995, Other: 4
Errors: 0, Pulled: 988, Other: 12
Errors: 0, Pulled: 997, Other: 3
Errors: 0, Pulled: 988, Other: 12
Errors: 1, Pulled: 991, Other: 8
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 1000, Other: 0
Errors: 45, Pulled: 945, Other: 10
Errors: 0, Pulled: 997, Other: 3
Errors: 0, Pulled: 997, Other: 3
Errors: 0, Pulled: 997, Other: 3
Errors: 0, Pulled: 1000, Other: 0
Errors: 0, Pulled: 998

In [4]:
# Number of downloads that did not finish with exit code 0.
count = sum(1 for e in all_exit_codes if e[0] != 0)
count

556

In [5]:
# Parse the header of every successfully unpacked EPW file into one metadata row.
all_rows = []
failed_rows = []  # (path, exception) for files whose header could not be parsed
for code, payload in tqdm(all_exit_codes):
    if code != 0:
        continue
    try:
        all_rows.append(make_row_dict(payload))
    except Exception as err:
        # Keep the failure for inspection instead of silently dropping it.
        failed_rows.append((payload, err))

# Build the frame once after the loop. The former `if i % 1000:` check rebuilt
# it on almost every iteration and could leave out the final row.
df = pd.DataFrame(all_rows)
print(f"Parsed: {len(all_rows)}, Failed: {len(failed_rows)}")

  0%|          | 0/86361 [00:00<?, ?it/s]

In [6]:
# Write the metadata table to CSV, omitting the DataFrame index column.
df.to_csv("epw_metadata_wkt.csv", index=False)