In [1]:
import boto3
import os
from pathlib import Path
import numpy as np
import pandas as pd
import json

from utils.constants import get_tmass_idx

from concurrent.futures import ThreadPoolExecutor

from tqdm.notebook import tqdm

In [2]:
# Where the experiment lives remotely (bucket + key prefix) and locally.
bucket = "ml-for-bem"
experiment_name = "validation/v2"
local_dir = Path("data", "temp", "validation", "v2")

# Single S3 client shared by every upload in this notebook.
client = boto3.client("s3")


def open_and_load_json(path):
    """Read the file at ``path`` and return its parsed JSON content.

    Args:
        path: Location of the JSON file; anything accepted by ``open``.

    Returns:
        The decoded JSON document, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding so
    # the same file parses identically on every machine.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

In [3]:
"""
Get the template file keys
"""
# Only *.json entries are considered, so a stray non-JSON file in the template
# folder can never be picked up as "the first template".
template_files = list((local_dir / "template").glob("*.json"))
if not template_files:
    raise FileNotFoundError(
        f"No template json files found in {local_dir / 'template'}"
    )
# NOTE(review): the keys are read from a single template file; this assumes
# every template shares the same keys -- confirm.
template = open_and_load_json(template_files[0])
template_keys = [k for k in template if k != "core_depth"]

"""
Get a list of all the feature files and open them
"""
feature_files = list((local_dir / "features").glob("*.json"))
idf_files = list((local_dir / "idf").glob("*.idf"))

# Each feature file holds a list of records; flatten them into one list.
features = []
with ThreadPoolExecutor(max_workers=8) as executor:
    res = list(
        tqdm(executor.map(open_and_load_json, feature_files), total=len(feature_files))
    )
    for records in res:
        features.extend(records)

"""
Convert the features to a dataframe and do some light post-processing
"""
print("Post-processing features...")
features = pd.DataFrame(features)
features = features.set_index(keys=["building_id", "name"]).sort_index()

# Convert each thermal-mass column in place with get_tmass_idx.  The RoofMass
# result must land in RoofMass: writing it to FacadeMass would clobber the
# facade values and leave RoofMass unconverted.
for mass_col in ("FacadeMass", "RoofMass"):
    features[mass_col] = features[mass_col].apply(get_tmass_idx)

# set roof/ground adiabatic: flag is 1 on the top floor / ground floor, else 0
roof_mask = features.floor == features.n_floors - 1
ground_mask = features.floor == 0
features["roof_2_footprint"] = roof_mask.astype("int64")
features["ground_2_footprint"] = ground_mask.astype("int64")

# drop core depth (a no-op when the column is absent) and fix the misspelled
# source column name
features = features.drop(columns=["core_depth"], errors="ignore")
features = features.rename(columns={"total_perimiter_length": "total_perimeter_length"})

# add some additional features
zone_facade_area = features.zone_edge_length * features.height
zone_perim_floor_area_to_facade_area = features.zone_perimeter_area / zone_facade_area
zone_core_area_to_perimeter_area = features.core_area / features.zone_perimeter_area
building_perimeter_area = features.footprint_area - features.core_area
building_core_area_to_perimeter_area = features.core_area / building_perimeter_area
building_facade_area = features.total_perimeter_length * features.height
features["building_perimeter_area_per_one_floor"] = building_perimeter_area
features["building_core_area_to_perimeter_area"] = building_core_area_to_perimeter_area
features["building_facade_area_per_one_floor"] = building_facade_area
features["zone_facade_area"] = zone_facade_area
features["zone_perim_floor_area_to_facade_area"] = zone_perim_floor_area_to_facade_area
features["zone_core_area_to_perimeter_area"] = zone_core_area_to_perimeter_area

# sort columns alphabetically, then move the shading columns to the end in
# numeric order (shading_0, shading_1, ...) -- a plain string sort would put
# shading_10 before shading_2
features = features[sorted(features.columns)]
n_shading_cols = sum("shading_" in col for col in features.columns)
shading_cols = [f"shading_{i}" for i in range(n_shading_cols)]
features = features[
    [col for col in features.columns if col not in shading_cols] + shading_cols
]

"""
Aggregate building level features which don't vary
"""
# The first row of each building is taken as representative for these columns.
building_features = features.groupby(level="building_id").first()[
    ["total_perimeter_length", "gfa", "footprint_area", "n_floors", "core_area"]
    + template_keys
    + [col for col in features.columns if "building" in col]
]

# NOTE(review): this first check is one-sided -- it passes whenever the total
# weight does not exceed the row count, so it cannot catch a deficit.  Left
# unchanged because the intended invariant is unclear; the per-building check
# below is the one that pins the weights.
assert (features["weight"].sum() - len(features)) < 1e-6
# Within every building the weights must sum to 1.
assert (features.groupby("building_id")["weight"].sum() - 1).abs().max() < 1e-6

"""
Save to HDF
"""
# Both tables go into the same file under different keys.
hdf_path = local_dir / "features.hdf"
features.to_hdf(hdf_path, key="shoeboxes")
building_features.to_hdf(hdf_path, key="buildings")

"""
Upload to S3
"""
print("Uploading to S3...")
client.upload_file(
    Filename=str(hdf_path),
    Bucket=bucket,
    Key=f"{experiment_name}/features.hdf",
)
print("Done.")

  0%|          | 0/10000 [00:00<?, ?it/s]

Post-processing features...
Uploading to S3...
Done.


In [4]:
def upload_file_to_s3(path):
    """Upload one local file to the experiment's location in the S3 bucket.

    The object key is ``<experiment_name>/<parent folder name>/<file name>``,
    so only the immediate parent folder of ``path`` is kept in the key.

    Args:
        path: Local file to upload; must expose ``.parent.name`` and ``.name``
            (e.g. a ``pathlib.Path``).
    """
    object_key = "/".join((experiment_name, path.parent.name, path.name))
    client.upload_file(Filename=str(path), Bucket=bucket, Key=object_key)


# One (message, files) pair per artifact type.  Driving all uploads through a
# single loop guarantees the progress-bar total is always the length of the
# list actually being uploaded (the idf bar used to be sized by the template
# list).
upload_batches = [
    ("Uploading feature json files...", feature_files),
    ("Uploading template json files...", template_files),
    ("Uploading idf files...", idf_files),
]

for message, paths in upload_batches:
    print(message)
    with ThreadPoolExecutor(max_workers=8) as executor:
        # list(...) drains the lazy map so every upload finishes (and any
        # exception surfaces) before moving on to the next batch.
        list(tqdm(executor.map(upload_file_to_s3, paths), total=len(paths)))
print("Done.")

Uploading feature json files...


  0%|          | 0/10000 [00:00<?, ?it/s]

Uploading template json files...


  0%|          | 0/10000 [00:00<?, ?it/s]

Uploading idf files...


  0%|          | 0/10000 [00:00<?, ?it/s]

Done.
