# Download files from AWS Registry of Open Data

This notebook downloads files from a particular dataset stored in AWS S3, given its Amazon Resource Name (ARN).

See https://registry.opendata.aws/

The download process relies on the boto3 package. See also:

https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html


P.S. Feel free to adjust the code to your needs.

In [1]:
! pip3 install boto3


[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting boto3
  Downloading boto3-1.21.33-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.4/132.4 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.0-py3-none-any.whl (23 kB)
Collecting botocore<1.25.0,>=1.24.33
  Downloading botocore-1.24.33-py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m1.4

In [8]:
import os
import boto3


In [9]:
from botocore import UNSIGNED
from botocore.client import Config


In [2]:
# Print out the objects' name in a specified bucket, given the s3 resource.
# Optionally, filters specific names

def print_all_objects_name(s3_resource, bucket_name, name_word = ""):
    """Print the key of every object stored in a bucket.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket whose objects are listed.
        name_word: optional substring filter; when non-empty, only keys that
            contain it are printed.
    """
    bucket = s3_resource.Bucket(bucket_name)
    every_key = (entry.key for entry in bucket.objects.all())

    for key in every_key:
        # Drop keys that do not pass the (optional) substring filter.
        if name_word != "" and name_word not in key:
            continue
        print(key)
        

In [3]:
# Print out the objects' name in a sub-folder of a S3 bucket, given the s3 resource.
# Optionally, filters specific names

def print_all_objects_name_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    """Print the keys of the objects in a bucket that start with a prefix.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket whose objects are listed.
        prefix: key prefix handed to ``objects.filter(Prefix=...)``.
        name_word: optional substring filter; when non-empty, only keys that
            contain it are printed.
    """
    bucket = s3_resource.Bucket(bucket_name)
    matching_objects = bucket.objects.filter(Prefix=prefix)

    for entry in matching_objects:
        key = entry.key
        # Drop keys that do not pass the (optional) substring filter.
        if name_word != "" and name_word not in key:
            continue
        print(key)
            

In [4]:
# Download all objects in a sub-folder of a S3 bucket, given the s3 resource.
# Optionally, filters specific names

def download_all_objects_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    """Download the objects under a key prefix, saving each under its base name.

    Every file is written to a relative path made of the last component of
    its key, so objects from different sub-prefixes that share a base name
    overwrite one another.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket to download from.
        prefix: key prefix handed to ``objects.filter(Prefix=...)``.
        name_word: optional substring filter applied to the base name; when
            non-empty, only matching files are downloaded.
    """
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        filename = os.path.basename(obj.key)

        # Keys ending in '/' ("folder" placeholder objects) have no base name;
        # there is nothing to save them as, so skip them instead of asking
        # download_file to write to an empty path.
        if not filename:
            continue

        if name_word == "" or name_word in filename:
            my_bucket.download_file(obj.key, filename)
        

In [5]:
# Download all objects in a sub-folder of an S3 bucket, given the s3 resource.
# Each file is saved under its complete key, with '/' replaced by '|'.
# Optionally, filters specific names

def download_all_objects_in_folder_with_complete_name(s3_resource, bucket_name, prefix, name_word = ""):
    """Download the objects under a key prefix, keeping the key path in the name.

    Each file is written to a relative path built from the directory part of
    its key with '/' replaced by '|', followed by '|' and the base name
    (``a/b/c.csv`` -> ``a|b|c.csv``). Objects with the same base name in
    different sub-prefixes therefore end up in distinct local files.

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket to download from.
        prefix: key prefix handed to ``objects.filter(Prefix=...)``.
        name_word: optional substring filter applied to the base name (not the
            complete name); when non-empty, only matching files are downloaded.
    """
    my_bucket = s3_resource.Bucket(bucket_name)

    for obj in my_bucket.objects.filter(Prefix=prefix):
        path, filename = os.path.split(obj.key)

        # Keys ending in '/' ("folder" placeholder objects) carry no file name,
        # so there is nothing to download.
        if not filename:
            continue

        if path:
            complete_filename = path.replace('/', '|') + '|' + filename
        else:
            # A key without a directory part would otherwise get a stray
            # leading '|'.
            complete_filename = filename

        if name_word == "" or name_word in filename:
            my_bucket.download_file(obj.key, complete_filename)
        

In [6]:
# Download all S3 objects in a specified bucket, given the s3 resource.
# Optionally, filters specific names

def download_all_obects(s3_resource, bucket_name, name_word = ""):
    """Download every object of a bucket, mirroring its keys as relative paths.

    The object key is used verbatim as the local path, so ``a/b/c.csv`` is
    saved as ``a/b/c.csv`` relative to the current working directory. Missing
    local directories are created first.

    (The misspelled function name is kept so existing callers keep working.)

    Args:
        s3_resource: object exposing ``Bucket(name)`` (e.g. a boto3 S3 resource).
        bucket_name: name of the bucket to download from.
        name_word: optional substring filter applied to the whole key; when
            non-empty, only matching objects are downloaded.
    """
    my_bucket = s3_resource.Bucket(bucket_name)

    for s3_object in my_bucket.objects.all():
        filename = s3_object.key

        # Keys ending in '/' ("folder" placeholder objects) are not files.
        if filename.endswith('/'):
            continue

        if name_word == "" or name_word in filename:
            # The download target must live in an existing directory; create
            # the local directories implied by the key before downloading.
            local_dir = os.path.dirname(filename)
            if local_dir:
                os.makedirs(local_dir, exist_ok=True)
            my_bucket.download_file(s3_object.key, filename)
        

In [10]:
# Initiate S3 resource
# The resource is configured with signature_version=UNSIGNED, i.e. requests are
# not signed with AWS credentials -- presumably because the Open Data buckets
# are publicly readable (confirm for the dataset you use).

s3_resource = boto3.resource('s3', config=Config(signature_version=UNSIGNED))


In [11]:
# Set S3 resource name of interest
# This value is passed as `bucket_name` to the listing / download functions.
# NOTE(review): the recorded output of the listing cell further down shows a
# NoSuchBucket error for this value -- verify the bucket name before running.

BUCKET_NAME = 'tcga-2-open' # the name under "Amazon Resource Name (ARN)"


In [12]:
# Particular word in the filename, if it is of interest
# e.g. csv, json, parquet
# The functions apply it as a plain substring test (`name_word in filename`),
# so it matches anywhere in the name, not only the extension.
# An empty string disables the filter.

WORD_IN_FILENAME = 'csv' # to get only the csv data type from the bucket_name 


In [13]:
# List the object keys of BUCKET_NAME.
# Variant without filter (prints every key):
# print_all_objects_name(s3_resource, BUCKET_NAME)

# Variant with filter: only keys containing WORD_IN_FILENAME are printed.
print_all_objects_name(s3_resource, BUCKET_NAME, WORD_IN_FILENAME)


NoSuchBucket: An error occurred (NoSuchBucket) when calling the ListObjects operation: The specified bucket does not exist

In [None]:
# Set out the related path, if it is of interest
# This value is passed as `prefix` to the *_in_folder functions, which forward
# it to `objects.filter(Prefix=...)`.
# NOTE(review): '.../' looks like a placeholder -- replace it with a real key
# prefix of the bucket before running the cells below.

PATH_NAME = '.../'


In [None]:
# List the object keys of BUCKET_NAME that start with PATH_NAME.
# Variant without filter (prints every key under the prefix):
#print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

# Variant with filter: only keys containing WORD_IN_FILENAME are printed.
print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)


In [None]:
# Show the current working directory; the download functions write to relative paths.
! pwd

In [None]:
# List the directory content before downloading.
! ls -la

In [None]:
# Download the objects of BUCKET_NAME whose key starts with PATH_NAME.
# Alternative: every object under the prefix, saved under its base name.
#download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

# Alternative: only names containing WORD_IN_FILENAME, saved under the base name.
#download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

# Active call: only names containing WORD_IN_FILENAME, saved under the complete
# key with '/' replaced by '|'.
download_all_objects_in_folder_with_complete_name(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

In [None]:
# List the directory content again to check what was downloaded.
! ls -la