In [62]:
import boto3
import pandas as pd
import json
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm

# --- Remote (S3) side -------------------------------------------------------
bucket = "ml-for-bem"
# Key prefix of the experiment; sub-keys are appended by plain string
# concatenation elsewhere, so the trailing slash matters.
experiment_root = "idf/batch/validation_v1/"
s3 = boto3.client('s3')

# --- Local side -------------------------------------------------------------
# Downloaded objects are written under this directory, keyed by their S3 key.
data_root = Path("data", "temp", "validate")

# True  -> rebuild the combined geometry/parameter files from the individual
#          S3 objects and re-upload them.
# False -> download the previously combined files from S3.
should_gather = False

# list objects in bucket using paginator
def get_files_in_experiment(experiment):
    """Return every object key in ``bucket`` whose key starts with ``experiment``.

    Args:
        experiment: Key prefix passed to the ``list_objects_v2`` paginator.

    Returns:
        list of key strings, in the order the paginator yields them. Pages
        without a (non-empty) ``Contents`` entry contribute nothing.
    """
    print("Listing files for experiment: ", experiment, "...")
    pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=experiment)
    return [
        entry['Key']
        for page in pages
        for entry in (page.get('Contents') or [])
    ]

def download(file):
    """Download one S3 object from ``bucket`` to its mirror path under ``data_root``.

    Args:
        file: S3 object key. It is also used as the path of the local copy,
            relative to ``data_root``.

    Returns:
        pathlib.Path of the downloaded local file.
    """
    filepath = data_root / file
    # The key may contain "/" separators; make sure the matching local
    # directories exist before boto3 writes the file.
    filepath.parent.mkdir(parents=True, exist_ok=True)
    # boto3 documents ``Filename`` as a str, so convert explicitly (as the
    # upload_file calls in this notebook do) rather than relying on
    # path-like support in the installed boto3 version.
    s3.download_file(bucket, file, str(filepath))
    return filepath

def download_and_open_json(file):
    """Download the S3 object ``file`` and return its parsed JSON content.

    Args:
        file: S3 object key, forwarded to ``download``.

    Returns:
        Whatever ``json.load`` produces for the downloaded file.
    """
    local_copy = download(file)
    with open(local_copy, 'r') as handle:
        parsed = json.load(handle)
    return parsed

def download_and_open_hdf(file):
    """Download the S3 object ``file`` and load it with ``pandas.read_hdf``.

    Args:
        file: S3 object key, forwarded to ``download``.

    Returns:
        The object ``pd.read_hdf`` reads from the downloaded file (no ``key``
        is passed).
    """
    return pd.read_hdf(download(file))

def get_weather(val_experiment):
    """Download every object under ``<experiment_root><val_experiment>/epw``.

    Args:
        val_experiment: Name of the sub-experiment folder under
            ``experiment_root`` (e.g. a city name).

    Returns:
        list of local paths, one per listed key, as returned by ``download``
        and in listing order.
    """
    epw_prefix = experiment_root + val_experiment + "/epw"
    epw_keys = get_files_in_experiment(epw_prefix)
    # Fetch in parallel; executor.map keeps results in input order.
    with ThreadPoolExecutor(max_workers=8) as pool:
        progress = tqdm(pool.map(download, epw_keys), total=len(epw_keys))
        local_paths = list(progress)
    return local_paths

In [27]:
# List the per-building input objects for each category of the experiment.
# The listing calls run in the order geometry -> parameters -> template.
geometry_files, parameter_files, template_files = [
    get_files_in_experiment(experiment_root + category)
    for category in ("geometry", "parameters", "template")
]

Listing files for experiment:  idf/batch/validation_v1/geometry ...
Listing files for experiment:  idf/batch/validation_v1/parameters ...
Listing files for experiment:  idf/batch/validation_v1/template ...


In [55]:
# Build `geometry`: either reuse the combined JSON already stored on S3, or
# rebuild it from the individual geometry objects and refresh the S3 copy.
if not should_gather:
    print("Using cached geometry from s3...")
    geometry = download_and_open_json(experiment_root + "geometry.json")
else:
    print("Gathering individual geometry files...")
    with ThreadPoolExecutor(max_workers=8) as pool:
        loaded = tqdm(pool.map(download_and_open_json, geometry_files), total=len(geometry_files))
        geometry = list(loaded)

    # Write the combined list locally, then publish it as the cached copy.
    combined_path = data_root / "geometry.json"
    with open(combined_path, 'w') as out:
        json.dump(geometry, out)

    # upload geometry to s3
    s3.upload_file(str(combined_path), bucket, experiment_root + "geometry.json")

Using cached geometry from s3...


In [56]:
# Build `parameters`: either reuse the combined HDF file already stored on S3,
# or rebuild it from the individual parameter objects and refresh the S3 copy.
if not should_gather:
    print("Using cached parameters from s3...")
    parameters_fp = download(experiment_root + "sb_parameters.hdf")
    parameters = pd.read_hdf(parameters_fp, key='df')
else:
    print("Gathering individual parameter files from s3...")
    with ThreadPoolExecutor(max_workers=8) as pool:
        frames = list(tqdm(pool.map(download_and_open_hdf, parameter_files), total=len(parameter_files)))
    parameters = pd.concat(frames, ignore_index=True, axis=0)

    # Object-dtype columns are cast to numbers, except the ones listed here,
    # which are left untouched. 'floor' becomes int, everything else float.
    keep_as_object = ['name', 'building_id', 'facade_normal', 'idf_path', 'vert']
    for col in parameters.columns:
        if parameters[col].dtype != 'object':
            continue
        if col in keep_as_object:
            continue
        target_dtype = 'int' if col == 'floor' else 'float'
        parameters[col] = parameters[col].astype(target_dtype)

    # save and upload to s3
    combined_path = data_root / "parameters.hdf"
    parameters.to_hdf(combined_path, key='df')
    s3.upload_file(str(combined_path), bucket, experiment_root + "sb_parameters.hdf")

Using cached parameters from s3...


In [63]:
# Download the weather files for one location and keep the first one as the
# EPW file to use.
weather_experiment = "seattle"
weather_files = get_weather(weather_experiment)
# An empty listing would otherwise surface as a bare "list index out of range";
# fail with a message that says which S3 prefix was empty.
if not weather_files:
    raise FileNotFoundError(
        f"No weather files found under '{experiment_root}{weather_experiment}/epw' "
        f"in bucket '{bucket}'"
    )
epw_fp = weather_files[0]

Listing files for experiment:  idf/batch/validation_v1/seattle/epw ...


  0%|          | 0/1 [00:00<?, ?it/s]

[WindowsPath('data/temp/validate/idf/batch/validation_v1/seattle/epw/USA_WA_Seattle-Tacoma.Intl.AP.727930_TMYx.2007-2021.epw')]