# Summarize files and folders in AWS

This notebook provides an example of how to list the files in an AWS S3 bucket (including a prefix search), group them according to folder definitions, and copy the files last modified in 2025 to a second bucket.

In [38]:
import sys, os

import boto3
import urllib3

import pandas as pd

from tqdm.notebook import tqdm

# Silence urllib3's InsecureRequestWarning: the S3 clients in this notebook are
# created with verify=False, so unverified-HTTPS warnings would otherwise be emitted.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [41]:
# Source side: bucket and key prefix that are listed and downloaded from,
# accessed through the "fathom" AWS profile.
bucket = "fathom-products-flood"
prefix = "flood-map-3"
region = "us-east-1"  # NOTE(review): not passed to either session/client below
s3Session = boto3.Session(profile_name="fathom")
# NOTE(review): verify=False disables TLS certificate verification for this
# client — confirm this is intentional before reusing the notebook elsewhere.
s3client = s3Session.client("s3", verify=False)

# Destination side: bucket and key prefix that files are uploaded to,
# accessed through the "default" AWS profile.
upBucket = "wbg-geography01"
upPrefix = "FATHOM"
uploadSession = boto3.Session(profile_name="default")
uploadClient = uploadSession.client("s3", verify=False)  # TLS verification disabled here as well

# Local staging folder for downloads.
# NOTE(review): absolute machine-specific path; adjust per environment.
out_folder = "C:/WBG/Work/Projects/FATHOM/data"

In [33]:
# Read the model prefixes to process from a text file of AWS CLI commands.
# Only lines that start with "aws" are used; the 4th space-separated token of
# such a line is treated as an S3 URI whose last path segment is the model name.
in_txt = "C:/WBG/Work/Projects/FATHOM/full_layers_commands.txt"

with open(in_txt, "r") as f:
    lines = f.readlines()

all_models = []
for line in lines:
    if line.startswith("aws"):
        # Parse the *current* line. The previous code indexed lines[4], so every
        # entry in all_models was the same model taken from the fifth line.
        s3_uri = line.split(" ")[3]
        # strip() drops the trailing newline when the URI is the last token
        sel_model = s3_uri.strip().split("/")[-1]
        all_models.append(sel_model)

len(all_models)

524

In [34]:
def get_all_files(cPrefix):
    """List every object under ``cPrefix`` in the source bucket as a DataFrame.

    Pages through ``s3client.list_objects_v2`` (module-level ``s3client`` and
    ``bucket``) until the response is no longer truncated.

    Parameters
    ----------
    cPrefix : str
        Key prefix to list.

    Returns
    -------
    pandas.DataFrame
        One row per object, sorted by ``LastModified`` (newest first), holding
        the fields returned by S3 plus:

        * ``folder``     - the key with "/" replaced by "_"
        * ``YEAR``       - year of ``LastModified``
        * ``YEAR_MONTH`` - ``year * 100 + month`` of ``LastModified``

        If nothing matches the prefix, an empty frame with the columns
        ``Key``, ``LastModified``, ``folder``, ``YEAR`` and ``YEAR_MONTH``
        is returned.
    """
    request = {"Bucket": bucket, "Prefix": cPrefix}
    all_res = []
    while True:
        objects = s3client.list_objects_v2(**request)
        # "Contents" is absent from the response when a page has no objects
        all_res.extend(objects.get("Contents", []))
        if not objects.get("IsTruncated"):
            break
        request["ContinuationToken"] = objects["NextContinuationToken"]

    if not all_res:
        # Keep the derived columns present so callers can still filter on them
        return pd.DataFrame(
            columns=["Key", "LastModified", "folder", "YEAR", "YEAR_MONTH"]
        )

    inD = pd.DataFrame(all_res)
    # NOTE(review): the [:] slice keeps the file name, so "folder" is the whole
    # key joined with "_"; confirm whether [:-1] (drop the file name) was meant.
    inD["folder"] = inD["Key"].apply(lambda x: "_".join(x.split("/")[:]))
    # sort_values returns a new frame; the result was previously discarded
    inD = inD.sort_values("LastModified", ascending=False)
    inD["YEAR"] = inD["LastModified"].dt.year
    inD["YEAR_MONTH"] = inD["LastModified"].dt.year * 100 + inD["LastModified"].dt.month
    return inD

In [43]:
# Copy every object last modified in TARGET_YEAR from the source bucket to the
# upload bucket, one model (sub-prefix) at a time. Files are staged in a local
# per-model folder that is removed again once the model has been uploaded.
TARGET_YEAR = 2025
processed = []

for cur_model in tqdm(all_models):
    curD = get_all_files(f"{prefix}/{cur_model}")
    selD = curD[curD["YEAR"] == TARGET_YEAR]

    cur_out_folder = f"{out_folder}/{cur_model}"
    os.makedirs(cur_out_folder, exist_ok=True)

    # Download each selected key, then push it to the destination bucket.
    # NOTE(review): only the file name is kept, so keys in different
    # sub-folders that share a file name map to the same local/remote path.
    for key in tqdm(selD["Key"], total=selD.shape[0]):
        file_name = key.split("/")[-1]
        out_fp = f"{cur_out_folder}/{file_name}"
        if not os.path.exists(out_fp):
            s3client.download_file(bucket, key, out_fp)
        # Upload to new bucket
        uploadClient.upload_file(out_fp, upBucket, f"{upPrefix}/{cur_model}/{file_name}")
        # Remove the staged copy after upload: os.rmdir below only succeeds on
        # an empty directory and previously failed on the leftover downloads.
        os.remove(out_fp)
    os.rmdir(cur_out_folder)
    processed.append(cur_model)


  0%|          | 0/524 [00:00<?, ?it/s]

Completed loop: 0
Completed loop: 1
Completed loop: 2
Completed loop: 3
Completed loop: 4
Completed loop: 5
Completed loop: 6
Completed loop: 7
Completed loop: 8
Completed loop: 9
Completed loop: 10
Completed loop: 11
Completed loop: 12
Completed loop: 13
Completed loop: 14
Completed loop: 15
Completed loop: 16
Completed loop: 17
Completed loop: 18
Completed loop: 19


  0%|          | 0/1946 [00:00<?, ?it/s]

  0%|          | 0/1946 [00:00<?, ?it/s]

In [19]:
# Number of objects per YEAR_MONTH (year * 100 + month of LastModified) in the
# most recently listed model, in ascending YEAR_MONTH order.
curD['YEAR_MONTH'].value_counts().sort_index()

YEAR_MONTH
202402    14032
202403     3273
202508     1946
Name: count, dtype: int64

In [None]:
# Preview the listing of the most recently processed model. `inD` exists only
# inside get_all_files; the notebook-level frame holding its result is `curD`.
curD.head()

In [None]:
# Most recently modified objects first. Uses `curD` because `inD` is local to
# get_all_files and undefined at notebook level; head() keeps the output short.
curD.sort_values("LastModified", ascending=False).head()