In [7]:
import os
import boto3

# Build the session explicitly from the standard AWS credential environment variables.
# The access key and secret are mandatory (a missing one raises KeyError right here);
# the session token only exists for temporary credentials, so it is read with .get()
# and passed as None when absent instead of crashing on long-lived keys.
session = boto3.Session(
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    aws_session_token=os.environ.get("AWS_SESSION_TOKEN"),
)

In [8]:
# S3 client built from the session above; the key listing and every download in this
# notebook go through this one client object.
s3 = session.client("s3")

In [11]:
import threading

In [19]:
MY_BUCKET = "surbv-data-store"

# Collect every key under the "human_eval_files" prefix.
# A single list call returns at most 1000 keys, so go through the paginator to get
# them all; a page carries no "Contents" entry when nothing matches the prefix,
# hence the .get() with an empty default instead of a KeyError.
filenames = [
    obj["Key"]
    for page in s3.get_paginator("list_objects_v2").paginate(
        Bucket=MY_BUCKET, Prefix="human_eval_files"
    )
    for obj in page.get("Contents", [])
]

In [25]:
# Local directory (relative to the notebook's working directory) that is handed to
# download_one_file as the destination for the downloaded files.
OUTPUT_DIR = "human_eval_download"

In [22]:
from tqdm import tqdm

In [28]:
# suppose i want to download all of them in one go -- first attempt: one file at a time

def download_one_file(bucket, output, client, s3_file):
    """Download a single S3 object into the local directory ``output``.

    The first component of the key (the folder name, e.g. "human_eval_files")
    is dropped so it doesn't come in the saved filename. Anything after it is
    kept, so "folder/sub/name.ext" is saved as "<output>/sub/name.ext" rather
    than colliding with other files from the same sub-folder.

    Args:
        bucket: Name of the S3 bucket holding the object.
        output: Local directory to save into; created if it does not exist.
        client: Object exposing ``download_file(Bucket=..., Key=..., Filename=...)``.
        s3_file: Full key of the object to download.

    Raises:
        Whatever ``client.download_file`` raises; nothing is caught here.
    """
    # Strip only the leading folder; a key with no "/" at all is used as-is
    # instead of failing with an IndexError.
    _, sep, relative_key = s3_file.partition("/")
    if not sep:
        relative_key = s3_file

    local_path = os.path.join(output, *relative_key.split("/"))

    # Make sure the destination directory exists before writing. exist_ok keeps
    # this safe when several worker threads get here at the same time.
    parent = os.path.dirname(local_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    client.download_file(Bucket=bucket, Key=s3_file, Filename=local_path)
    

# Sequential baseline: walk every listed key behind a progress bar and fetch the
# files one after another, leaving out any key that contains ".ipynb".
for file in tqdm(filenames):
    if ".ipynb" not in file:
        download_one_file(MY_BUCKET, OUTPUT_DIR, s3, file)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 210/210 [02:59<00:00,  1.17it/s]


In [29]:
# the sequential loop above took about 3 minutes (02:59 according to the progress bar)
# now -- i'll speed it up with threading

from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial

In [31]:
# Pre-bind bucket, output directory and client (positionally, in that order) so the
# resulting callable only needs the S3 key -- convenient for executor.submit.
download_one_file_partial = partial(download_one_file, MY_BUCKET, OUTPUT_DIR, s3)

In [35]:
import time

# Same download as the sequential loop, but fanned out over a thread pool and timed.
stime = time.time()
failed_downloads = []  # keys whose download raised, kept so they can be inspected or retried
with ThreadPoolExecutor(max_workers = 16) as executor:
    # Map every future back to the key it is downloading. Keys containing ".ipynb"
    # are skipped, same as in the sequential loop, so both timings cover the same files.
    futures = {
        executor.submit(download_one_file_partial, file): file
        for file in filenames
        if ".ipynb" not in file
    }
    for future in as_completed(futures):
        error = future.exception()
        if error is not None:
            # not downloaded well -- keep the key and show why, instead of dropping the error
            failed_downloads.append(futures[future])
            print("failed :", futures[future], "->", repr(error))

etime = time.time()

print("done in :", etime - stime)
if failed_downloads:
    # Without this, a run in which downloads failed looks identical to a clean one.
    print("failed downloads :", len(failed_downloads), "of", len(futures))

done in : 11.315793991088867
