In [1]:
import os
import boto3
import botocore
import numpy as np
import pandas as pd

bucket_name = 'alex-datasets'
prefix = 'tpch/sf-1'
chosen_files = ['lineitem.csv', 'part.csv']

s3 = boto3.client('s3')

def download(bucket_name, key, fn):
  try:
    s3.download_file(bucket_name, key, fn)
    print(f'Downloaded {fn}')
  except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == '404':
      print(f'Object {key} does not exist!')
    else:
      raise

# List objects in the specified bucket and prefix
response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# Create directory if it doesn't exist
if not os.path.exists(prefix):
  os.makedirs(prefix)

for obj in response.get('Contents', []):
  key = obj['Key']

  # Ensure we only read the `chosen_file`.
  if key.split('/')[-1] not in chosen_files:
    continue
  filename = os.path.join(prefix, key.split('/')[-1])

  if not os.path.exists(filename):
    download(bucket_name, key, filename)
  else:
    print(f'{filename} already exists.')

Downloaded tpch/sf-1/lineitem.csv
Downloaded tpch/sf-1/part.csv
tpch/sf-1/lineitem.csv already exists.
tpch/sf-1/part.csv already exists.
