# Download files from AWS Registry of Open Data

This notebook allows to download files from a particular dataset stored in AWS's S3, given its Amazon Resource Name (ARN)

See https://registry.opendata.aws/

The downloading process relies on the usage of the package boto3. See also

https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html


PS. Feel free to adjust the code

In [1]:
! pip3 install boto3


[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting boto3
  Downloading boto3-1.21.33-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.4/132.4 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.0-py3-none-any.whl (23 kB)
Collecting botocore<1.25.0,>=1.24.33
  Downloading botocore-1.24.33-py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m1.4

In [1]:
import os
import boto3


In [2]:
from botocore import UNSIGNED
from botocore.client import Config


In [3]:
# Print out the objects' name in a specified bucket, given the s3 resource.
# Optionally, filters specific names

def print_all_objects_name(s3_resource, bucket_name, name_word = ""):
    
    # select bucket
    my_bucket = s3_resource.Bucket(bucket_name)
    
    # print out name
    for s3_object in my_bucket.objects.all():
        filename = s3_object.key
        if name_word == "" or name_word in filename:
            print(filename)
        

In [15]:
# Print out the objects' name in a sub-folder of a S3 bucket, given the s3 resource.
# Optionally, filters specific names

def print_all_objects_name_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    
    # select bucket and objects
    my_bucket = s3_resource.Bucket(bucket_name)
    
    objects = my_bucket.objects.filter(Prefix=prefix)
    for obj in objects:
        filename = obj.key
        if name_word == "" or name_word in filename:  
            print(filename)

In [5]:
# Download all objects in a sub-folder of a S3 bucket, given the s3 resource.
# Optionally, filters specific names

def download_all_objects_in_folder(s3_resource, bucket_name, prefix, name_word = ""):
    
    # select bucket and objects
    my_bucket = s3_resource.Bucket(bucket_name)
    
    objects = my_bucket.objects.filter(Prefix=prefix)
    for obj in objects:
        path, filename = os.path.split(obj.key)
        if name_word == "" or name_word in filename:
            my_bucket.download_file(obj.key, filename)
        

In [6]:
# Download all objects in a sub-folder of a S3 bucket, given the s3 resource.
# Optionally, filters specific names

def download_all_objects_in_folder_with_complete_name(s3_resource, bucket_name, prefix, name_word = ""):
    
    # select bucket and objects
    my_bucket = s3_resource.Bucket(bucket_name)
    
    objects = my_bucket.objects.filter(Prefix=prefix)
    for obj in objects:
        path, filename = os.path.split(obj.key)
        complete_filename = path.replace('/', '|') + '|' + filename
        if name_word == "" or name_word in filename:
            my_bucket.download_file(obj.key, complete_filename)

In [7]:
# Download all S3 objects in a specified bucket, given the s3 resource.
# Optionally, filters specific names

def download_all_obects(s3_resource, bucket_name, name_word = ""):
    
    # select bucket
    my_bucket = s3_resource.Bucket(bucket_name)
    
    # download file into current directory
    for s3_object in my_bucket.objects.all():
        filename = s3_object.key
        if name_word == "" or name_word in filename:
            my_bucket.download_file(s3_object.key, filename)
        

In [8]:
# Initiate S3 resource

s3_resource = boto3.resource('s3', config=Config(signature_version=UNSIGNED))


In [9]:
# Set S3 resource name of interest

BUCKET_NAME = 'deutsche-boerse-xetra-pds' # the name under "Amazon Resource Name (ARN)"


In [10]:
# Particular word in the filename, if it is of interest
# e.g. csv, json, parquet

WORD_IN_FILENAME = 'csv' # to get only the csv data type from the bucket_name 


In [11]:
# print_all_objects_name(s3_resource, BUCKET_NAME)

print_all_objects_name(s3_resource, BUCKET_NAME, WORD_IN_FILENAME)


2017-06-17/2017-06-17_BINS_XETR00.csv
2017-06-17/2017-06-17_BINS_XETR01.csv
2017-06-17/2017-06-17_BINS_XETR02.csv
2017-06-17/2017-06-17_BINS_XETR03.csv
2017-06-17/2017-06-17_BINS_XETR04.csv
2017-06-17/2017-06-17_BINS_XETR05.csv
2017-06-17/2017-06-17_BINS_XETR06.csv
2017-06-17/2017-06-17_BINS_XETR07.csv
2017-06-17/2017-06-17_BINS_XETR08.csv
2017-06-17/2017-06-17_BINS_XETR09.csv
2017-06-17/2017-06-17_BINS_XETR10.csv
2017-06-17/2017-06-17_BINS_XETR11.csv
2017-06-17/2017-06-17_BINS_XETR12.csv
2017-06-17/2017-06-17_BINS_XETR13.csv
2017-06-17/2017-06-17_BINS_XETR14.csv
2017-06-17/2017-06-17_BINS_XETR15.csv
2017-06-17/2017-06-17_BINS_XETR16.csv
2017-06-17/2017-06-17_BINS_XETR17.csv
2017-06-17/2017-06-17_BINS_XETR18.csv
2017-06-17/2017-06-17_BINS_XETR19.csv
2017-06-17/2017-06-17_BINS_XETR20.csv
2017-06-17/2017-06-17_BINS_XETR21.csv
2017-06-17/2017-06-17_BINS_XETR22.csv
2017-06-17/2017-06-17_BINS_XETR23.csv
2017-06-18/2017-06-18_BINS_XETR00.csv
2017-06-18/2017-06-18_BINS_XETR01.csv
2017-06-18/2

2017-07-28/2017-07-28_BINS_XETR16.csv
2017-07-28/2017-07-28_BINS_XETR17.csv
2017-07-28/2017-07-28_BINS_XETR18.csv
2017-07-28/2017-07-28_BINS_XETR19.csv
2017-07-28/2017-07-28_BINS_XETR20.csv
2017-07-28/2017-07-28_BINS_XETR21.csv
2017-07-28/2017-07-28_BINS_XETR22.csv
2017-07-28/2017-07-28_BINS_XETR23.csv
2017-07-29/2017-07-29_BINS_XETR00.csv
2017-07-29/2017-07-29_BINS_XETR01.csv
2017-07-29/2017-07-29_BINS_XETR02.csv
2017-07-29/2017-07-29_BINS_XETR03.csv
2017-07-29/2017-07-29_BINS_XETR04.csv
2017-07-29/2017-07-29_BINS_XETR05.csv
2017-07-29/2017-07-29_BINS_XETR06.csv
2017-07-29/2017-07-29_BINS_XETR07.csv
2017-07-29/2017-07-29_BINS_XETR08.csv
2017-07-29/2017-07-29_BINS_XETR09.csv
2017-07-29/2017-07-29_BINS_XETR10.csv
2017-07-29/2017-07-29_BINS_XETR11.csv
2017-07-29/2017-07-29_BINS_XETR12.csv
2017-07-29/2017-07-29_BINS_XETR13.csv
2017-07-29/2017-07-29_BINS_XETR14.csv
2017-07-29/2017-07-29_BINS_XETR15.csv
2017-07-29/2017-07-29_BINS_XETR16.csv
2017-07-29/2017-07-29_BINS_XETR17.csv
2017-07-29/2

2017-09-08/2017-09-08_BINS_XETR08.csv
2017-09-08/2017-09-08_BINS_XETR09.csv
2017-09-08/2017-09-08_BINS_XETR10.csv
2017-09-08/2017-09-08_BINS_XETR11.csv
2017-09-08/2017-09-08_BINS_XETR12.csv
2017-09-08/2017-09-08_BINS_XETR13.csv
2017-09-08/2017-09-08_BINS_XETR14.csv
2017-09-08/2017-09-08_BINS_XETR15.csv
2017-09-08/2017-09-08_BINS_XETR16.csv
2017-09-08/2017-09-08_BINS_XETR17.csv
2017-09-08/2017-09-08_BINS_XETR18.csv
2017-09-08/2017-09-08_BINS_XETR19.csv
2017-09-08/2017-09-08_BINS_XETR20.csv
2017-09-08/2017-09-08_BINS_XETR21.csv
2017-09-08/2017-09-08_BINS_XETR22.csv
2017-09-08/2017-09-08_BINS_XETR23.csv
2017-09-09/2017-09-09_BINS_XETR00.csv
2017-09-09/2017-09-09_BINS_XETR01.csv
2017-09-09/2017-09-09_BINS_XETR02.csv
2017-09-09/2017-09-09_BINS_XETR03.csv
2017-09-09/2017-09-09_BINS_XETR04.csv
2017-09-09/2017-09-09_BINS_XETR05.csv
2017-09-09/2017-09-09_BINS_XETR06.csv
2017-09-09/2017-09-09_BINS_XETR07.csv
2017-09-09/2017-09-09_BINS_XETR08.csv
2017-09-09/2017-09-09_BINS_XETR09.csv
2017-09-09/2

2017-10-20/2017-10-20_BINS_XETR00.csv
2017-10-20/2017-10-20_BINS_XETR01.csv
2017-10-20/2017-10-20_BINS_XETR02.csv
2017-10-20/2017-10-20_BINS_XETR03.csv
2017-10-20/2017-10-20_BINS_XETR04.csv
2017-10-20/2017-10-20_BINS_XETR05.csv
2017-10-20/2017-10-20_BINS_XETR06.csv
2017-10-20/2017-10-20_BINS_XETR07.csv
2017-10-20/2017-10-20_BINS_XETR08.csv
2017-10-20/2017-10-20_BINS_XETR09.csv
2017-10-20/2017-10-20_BINS_XETR10.csv
2017-10-20/2017-10-20_BINS_XETR11.csv
2017-10-20/2017-10-20_BINS_XETR12.csv
2017-10-20/2017-10-20_BINS_XETR13.csv
2017-10-20/2017-10-20_BINS_XETR14.csv
2017-10-20/2017-10-20_BINS_XETR15.csv
2017-10-20/2017-10-20_BINS_XETR16.csv
2017-10-20/2017-10-20_BINS_XETR17.csv
2017-10-20/2017-10-20_BINS_XETR18.csv
2017-10-20/2017-10-20_BINS_XETR19.csv
2017-10-20/2017-10-20_BINS_XETR20.csv
2017-10-20/2017-10-20_BINS_XETR21.csv
2017-10-20/2017-10-20_BINS_XETR22.csv
2017-10-20/2017-10-20_BINS_XETR23.csv
2017-10-21/2017-10-21_BINS_XETR00.csv
2017-10-21/2017-10-21_BINS_XETR01.csv
2017-10-21/2

2017-11-30/2017-11-30_BINS_XETR16.csv
2017-11-30/2017-11-30_BINS_XETR17.csv
2017-11-30/2017-11-30_BINS_XETR18.csv
2017-11-30/2017-11-30_BINS_XETR19.csv
2017-11-30/2017-11-30_BINS_XETR20.csv
2017-11-30/2017-11-30_BINS_XETR21.csv
2017-11-30/2017-11-30_BINS_XETR22.csv
2017-11-30/2017-11-30_BINS_XETR23.csv
2017-12-01/2017-12-01_BINS_XETR00.csv
2017-12-01/2017-12-01_BINS_XETR01.csv
2017-12-01/2017-12-01_BINS_XETR02.csv
2017-12-01/2017-12-01_BINS_XETR03.csv
2017-12-01/2017-12-01_BINS_XETR04.csv
2017-12-01/2017-12-01_BINS_XETR05.csv
2017-12-01/2017-12-01_BINS_XETR06.csv
2017-12-01/2017-12-01_BINS_XETR07.csv
2017-12-01/2017-12-01_BINS_XETR08.csv
2017-12-01/2017-12-01_BINS_XETR09.csv
2017-12-01/2017-12-01_BINS_XETR10.csv
2017-12-01/2017-12-01_BINS_XETR11.csv
2017-12-01/2017-12-01_BINS_XETR12.csv
2017-12-01/2017-12-01_BINS_XETR13.csv
2017-12-01/2017-12-01_BINS_XETR14.csv
2017-12-01/2017-12-01_BINS_XETR15.csv
2017-12-01/2017-12-01_BINS_XETR16.csv
2017-12-01/2017-12-01_BINS_XETR17.csv
2017-12-01/2

2018-01-11/2018-01-11_BINS_XETR08.csv
2018-01-11/2018-01-11_BINS_XETR09.csv
2018-01-11/2018-01-11_BINS_XETR10.csv
2018-01-11/2018-01-11_BINS_XETR11.csv
2018-01-11/2018-01-11_BINS_XETR12.csv
2018-01-11/2018-01-11_BINS_XETR13.csv
2018-01-11/2018-01-11_BINS_XETR14.csv
2018-01-11/2018-01-11_BINS_XETR15.csv
2018-01-11/2018-01-11_BINS_XETR16.csv
2018-01-11/2018-01-11_BINS_XETR17.csv
2018-01-11/2018-01-11_BINS_XETR18.csv
2018-01-11/2018-01-11_BINS_XETR19.csv
2018-01-11/2018-01-11_BINS_XETR20.csv
2018-01-11/2018-01-11_BINS_XETR21.csv
2018-01-11/2018-01-11_BINS_XETR22.csv
2018-01-11/2018-01-11_BINS_XETR23.csv
2018-01-12/2018-01-12_BINS_XETR00.csv
2018-01-12/2018-01-12_BINS_XETR01.csv
2018-01-12/2018-01-12_BINS_XETR02.csv
2018-01-12/2018-01-12_BINS_XETR03.csv
2018-01-12/2018-01-12_BINS_XETR04.csv
2018-01-12/2018-01-12_BINS_XETR05.csv
2018-01-12/2018-01-12_BINS_XETR06.csv
2018-01-12/2018-01-12_BINS_XETR07.csv
2018-01-12/2018-01-12_BINS_XETR08.csv
2018-01-12/2018-01-12_BINS_XETR09.csv
2018-01-12/2

2018-02-22/2018-02-22_BINS_XETR00.csv
2018-02-22/2018-02-22_BINS_XETR01.csv
2018-02-22/2018-02-22_BINS_XETR02.csv
2018-02-22/2018-02-22_BINS_XETR03.csv
2018-02-22/2018-02-22_BINS_XETR04.csv
2018-02-22/2018-02-22_BINS_XETR05.csv
2018-02-22/2018-02-22_BINS_XETR06.csv
2018-02-22/2018-02-22_BINS_XETR07.csv
2018-02-22/2018-02-22_BINS_XETR08.csv
2018-02-22/2018-02-22_BINS_XETR09.csv
2018-02-22/2018-02-22_BINS_XETR10.csv
2018-02-22/2018-02-22_BINS_XETR11.csv
2018-02-22/2018-02-22_BINS_XETR12.csv
2018-02-22/2018-02-22_BINS_XETR13.csv
2018-02-22/2018-02-22_BINS_XETR14.csv
2018-02-22/2018-02-22_BINS_XETR15.csv
2018-02-22/2018-02-22_BINS_XETR16.csv
2018-02-22/2018-02-22_BINS_XETR17.csv
2018-02-22/2018-02-22_BINS_XETR18.csv
2018-02-22/2018-02-22_BINS_XETR19.csv
2018-02-22/2018-02-22_BINS_XETR20.csv
2018-02-22/2018-02-22_BINS_XETR21.csv
2018-02-22/2018-02-22_BINS_XETR22.csv
2018-02-22/2018-02-22_BINS_XETR23.csv
2018-02-23/2018-02-23_BINS_XETR00.csv
2018-02-23/2018-02-23_BINS_XETR01.csv
2018-02-23/2

KeyboardInterrupt: 

In [35]:
# Set out the related path, if it is of interest

PATH_NAME = '2017-06-17/2017-06-17_BINS_XETR01'


In [42]:
# print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)

print_all_objects_name_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)


2017-06-17/2017-06-17_BINS_XETR01.csv


In [17]:
! pwd

/Users/generous/Desktop/pmdb_project


In [45]:
! ls -la

total 1608
drwxr-xr-x   9 generous  staff     288 Apr 21 14:10 [34m.[m[m
drwx------+  8 generous  staff     256 Apr 21 13:05 [34m..[m[m
-rw-r--r--@  1 generous  staff    6148 Apr 21 14:09 .DS_Store
drwxr-xr-x  13 generous  staff     416 Apr 21 14:00 [34m.git[m[m
drwxr-xr-x   3 generous  staff      96 Apr  5 19:23 [34m.ipynb_checkpoints[m[m
-rw-r--r--   1 generous  staff     136 Apr 21 14:09 2017-06-17_BINS_XETR01.csv
-rw-r--r--   1 generous  staff     136 Apr 21 14:09 2017-06-17|2017-06-17_BINS_XETR01.csv
-rw-r--r--   1 generous  staff  453723 Apr 21 14:10 file_fetcher.ipynb
-rw-r--r--   1 generous  staff  351495 Apr  5 12:26 notebook.ipynb


In [43]:
download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME)
#
download_all_objects_in_folder(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

download_all_objects_in_folder_with_complete_name(s3_resource, BUCKET_NAME, PATH_NAME, WORD_IN_FILENAME)

In [44]:
! ls -la

total 1608
drwxr-xr-x   9 generous  staff     288 Apr 21 14:09 [34m.[m[m
drwx------+  8 generous  staff     256 Apr 21 13:05 [34m..[m[m
-rw-r--r--@  1 generous  staff    6148 Apr 21 14:09 .DS_Store
drwxr-xr-x  13 generous  staff     416 Apr 21 14:00 [34m.git[m[m
drwxr-xr-x   3 generous  staff      96 Apr  5 19:23 [34m.ipynb_checkpoints[m[m
-rw-r--r--   1 generous  staff     136 Apr 21 14:09 2017-06-17_BINS_XETR01.csv
-rw-r--r--   1 generous  staff     136 Apr 21 14:09 2017-06-17|2017-06-17_BINS_XETR01.csv
-rw-r--r--   1 generous  staff  453783 Apr 21 14:08 file_fetcher.ipynb
-rw-r--r--   1 generous  staff  351495 Apr  5 12:26 notebook.ipynb


In [31]:
# It is not advisable to use ...

# download_all_files(s3_resource, BUCKET_NAME)
download_all_files(s3_resource, BUCKET_NAME, WORD_IN_FILENAME)

NameError: name 'download_all_files' is not defined

In [1]:
import boto3

s3 = boto3.client('s3')
s3.download_file('deutsche-boerse-xetra-pds', '2018-04-01/2018-04-01_BINS_XETR21.csv')

s3 = boto3.client('s3')
# with open('FILE_NAME', 'wb') as f:
#     s3.download_fileobj('BUCKET_NAME', 'OBJECT_NAME', f)


NameError: name 's3_resource' is not defined

In [62]:
! pip3 install fsspec
! pip3 install s3fs
import pandas as pd  
s3 = boto3.client('s3')
df = pd.read_csv(PATH_NAME)
df.head()
# aws s3 ls --no-sign-request s3://deutsche-boerse-xetra-pds/

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting s3fs
  Downloading s3fs-2022.3.0-py3-none-any.whl (26 kB)
Collecting aiobotocore~=2.2.0
  Downloading aiobotocore-2.2.0.tar.gz (59 kB)
[2K     [90m━━━━━━━━━━

[0m[33m  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
boto3 1.21.33 requires botocore<1.25.0,>=1.24.33, but you have botocore 1.24.21 which is incompatible.[0m[31m
[0mSuccessfully installed aiobotocore-2.2.0 aiohttp-3.8.1 aioitertools-0.10.0 aiosignal-1.2.0 async-timeout-4.0.2 botocore-1.24.21 charset-normal

ValueError: Attempt to open non key-like path: deutsche-boerse-xetra-pds