In [1]:
import boto3
import os
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

# S3 location of the experiment processed by this notebook.
bucket = "ml-for-bem"
experiment_name = "single-climate-zone/v1/nyc"

# S3 client used by the listing, download and upload cells.
client = boto3.client("s3")

In [5]:
# List every object under the experiment's errors/ prefix.
# `files` collects the full S3 keys; `error_ids` collects the id parsed from
# each key (the base file name up to its first ".").
paginator = client.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket, Prefix=experiment_name + "/errors/")

error_ids = []
files = []
for page in pages:
    # list_objects_v2 omits the "Contents" key when a page holds no objects
    # (e.g. nothing was written under errors/), so fall back to an empty list
    # instead of raising KeyError.
    for obj in page.get('Contents', []):
        key = obj['Key']
        files.append(key)
        error_ids.append(key.split('/')[-1].split('.')[0])

In [7]:
# Get a list of all the files in the /monthly/ folder.
# Reuses the `paginator` created in the errors-listing cell.

pages = paginator.paginate(Bucket=bucket, Prefix=experiment_name + "/monthly/")

monthly_files = []
for page in pages:
    # list_objects_v2 omits the "Contents" key when a page holds no objects,
    # so fall back to an empty list instead of raising KeyError.
    for obj in page.get('Contents', []):
        monthly_files.append(obj['Key'])

In [18]:
# Download every file listed in `monthly_files` to data/hdf5/<s3 key>, read the
# "batch_results" table from each one, and stack them row-wise into `dfs`.
local_root = Path("data/hdf5")

# Keep the experiment's monthly/ directory in place even before any download.
os.makedirs("data/hdf5/" + experiment_name + "/monthly", exist_ok=True)

# Collect the per-file frames under their own name so `dfs` is only ever a
# DataFrame (never a half-filled list if this cell is interrupted).
batch_frames = []
for file in tqdm(monthly_files):
    local_path = local_root / file
    # A key may live in a sub-folder below monthly/; create its directory so
    # download_file does not fail on a missing parent.
    local_path.parent.mkdir(parents=True, exist_ok=True)
    client.download_file(bucket, file, str(local_path))
    with pd.HDFStore(str(local_path)) as store:
        batch_frames.append(store["batch_results"])

if not batch_frames:
    # pd.concat would raise a generic ValueError here; say what is missing.
    raise ValueError(
        "No monthly result files found under s3://"
        + bucket + "/" + experiment_name + "/monthly/"
    )

dfs = pd.concat(batch_frames, axis=0)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [6]:
# Append a boolean "error" index level marking rows whose "id" is listed in
# `error_ids`, save the result to monthly.hdf, and preview the new index.

# Re-running this cell would otherwise stack a second "error" level on top of
# the first; drop any existing one so the cell is idempotent.
if "error" in dfs.index.names:
    dfs = dfs.droplevel("error")

ids = dfs.index.get_level_values("id")
error_mask = ids.isin(error_ids)
dfs = dfs.set_index(error_mask, append=True)
# The appended level has no name yet; label it "error".
dfs.index.names = dfs.index.names[:-1] + ["error"]
dfs.to_hdf("data/hdf5/" + experiment_name + "/monthly.hdf", key="batch_results", mode="w")

# Last expression of the cell so the preview is actually displayed.
dfs.index.to_frame(index=False).head()

In [8]:
# Upload the combined monthly.hdf file to S3 under the experiment prefix.
local_hdf_path = "data/hdf5/" + experiment_name + "/monthly.hdf"
remote_hdf_key = experiment_name + "/monthly.hdf"
client.upload_file(local_hdf_path, bucket, remote_hdf_key)