In [5]:
!mkdir 2019-12-16 2019-12-16/data 2019-12-16/out
# Do the source and output subdirectories exist?

mkdir: 2019-12-16: File exists
mkdir: 2019-12-16/data: File exists
mkdir: 2019-12-16/out: File exists


In [6]:
data = "2019-12-16/data" # Where are the images and metadata tag files?
out  = "2019-12-16/out"  # Where should we put the renamed files and metadata catalog?

In [3]:
# set up logging for requests
import requests
import logging
import http.client

# https://stackoverflow.com/questions/16337511/
http.client.HTTPConnection.debuglevel = 1
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True

In [4]:
# access NARA API for Storis 1957 logbooks
nara_id = "38547962"
api_base = 'https://catalog.archives.gov/api/v1/'
api_url = '{0}?naIds={1}'.format(api_base, nara_id)
res = requests.get(api_url)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443
DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /api/v1/?naIds=38547962 HTTP/1.1" 200 None


send: b'GET /api/v1/?naIds=38547962 HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: application/json;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:06 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: trans

In [5]:
# parse NARA API output for metadata
entry_img_array = res.json().get('opaResponse').get('results').get('result')[0].get('objects').get('object')
digital_directory = entry_img_array[0].get('file').get('@path').split("/")[-2]

# write NARA API output to file for reference
api_output = "{0}/nara_id_{1}.json".format(data, digital_directory, nara_id)
if res.status_code == 200:
    with open(api_output, 'wb') as f:
        f.write(res.content)

In [6]:
# download images
for img_info in entry_img_array: 

    # test for mimetype "image/jpeg"
    # we don't want "application/pdf"
    if img_info.get('file').get('@mime') == "image/jpeg":

        img_name = img_info.get('file').get('@name')
        img_url = img_info.get('file').get('@url')
        img_res = requests.get(img_url)

        # write a single image to file
        local_img_name = "{0}/{1}".format(data, img_name)
        if img_res.status_code == 200:
            with open(local_img_name, 'wb') as img_f:
                img_f.write(img_res.content)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0126.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0126.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:09 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0127.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0127.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:10 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0128.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0128.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:11 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0129.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0129.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:12 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0130.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0130.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:14 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0131.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0131.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:16 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0132.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0132.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:18 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0133.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0133.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:21 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0134.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0134.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:26 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0135.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0135.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa03.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa03
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:27 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443
DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0136.JPG HTTP/1.1" 200 None


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0136.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:29 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0137.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0137.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:30 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0138.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0138.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:31 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443
DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0139.JPG HTTP/1.1" 200 None


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0139.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'
reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa01.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa01
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:32 GMT
header: hnweb: pw04
header: Pragma: no-cache
header: Server: Apache
header: St

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0140.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0140.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:34 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0141.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0141.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:36 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0142.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0142.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:38 GMT
header: hnweb: pw03
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0143.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0143.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:39 GMT
header: hnweb: pw01
header: Pragma: no-cache
header: Server: Apache/2.4.6 (Red Hat)
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0144.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0144.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa04.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa04
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:41 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): catalog.archives.gov:443


send: b'GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0145.JPG HTTP/1.1\r\nHost: catalog.archives.gov\r\nUser-Agent: python-requests/2.22.0\r\nAccept-Encoding: gzip, deflate\r\nAccept: */*\r\nConnection: keep-alive\r\n\r\n'


DEBUG:urllib3.connectionpool:https://catalog.archives.gov:443 "GET /OpaAPI/media/38547962/content/dc-metro/rg-026/559642/2017-01/storis-wmec-38-1957-logbooks/storis-wmec-38-1957-logbooks_0145.JPG HTTP/1.1" 200 None


reply: 'HTTP/1.1 200 OK\r\n'
header: Access-Control-Allow-Credentials: true
header: Access-Control-Allow-Methods: DELETE, HEAD, GET, OPTION, POST, PUT
header: Access-Control-Expose-Headers: JSESSIONID
header: Access-Control-Max-Age: 3600
header: BALANCER_NAME: (null)
header: BALANCER_ROUTE_CHANGED: 1
header: BALANCER_SESSION_ROUTE: (null)
header: BALANCER_SESSION_STICKY: (null)
header: BALANCER_WORKER_NAME: ajp://pa02.aws.nac.nara.gov:8009/OpaAPI
header: BALANCER_WORKER_ROUTE: pa02
header: Cache-Control: no-store, no-cache
header: Content-Type: image/jpeg;charset=UTF-8
header: Date: Mon, 16 Dec 2019 18:55:43 GMT
header: hnweb: pw02
header: Pragma: no-cache
header: Server: Apache
header: Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
header: transfer-encoding: chunked
header: Connection: keep-alive


In [9]:
import csv
import os

with open(os.path.join(data, 'metadata.csv'), mode='w') as metadata_file:
    metadata_writer = csv.writer(metadata_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    metadata_writer.writerow(['archive.host_country', 'USA'])
    metadata_writer.writerow(['document.contact_person', 'Kevin Wood'])
    metadata_writer.writerow(['archive.notes', 'Images available via API at https://catalog.archives.gov/api/v1/38547962'])
    metadata_writer.writerow(['platform.name', 'USCG Storis'])
    metadata_writer.writerow(['document.id_within_archive', '38547962'])
    metadata_writer.writerow(['document.id_within_archive_type', 'NARA ID'])
    metadata_writer.writerow(['document.record_type', "ships' logs"])
    metadata_writer.writerow(['document.accession_to_archive_date', '2016-08-19'])
    metadata_writer.writerow(['document.standardized_region_list', 'north_atlantic'])
    metadata_writer.writerow(['document.start_date', '1957-06-09'])
    metadata_writer.writerow(['document.start_date', '1957-09-30'])
    metadata_writer.writerow(['document.rights_statement', 'CC0 Public Domain'])
    metadata_writer.writerow(['document.notes', ''])

In [2]:
%run -i ../script/rdai
# We run rdai script interactively to obtain function definitions

In [11]:
get_fixed_seq()
# Have we generated a fixed sequence for uuids?
# The global variable fixed_seq needs to be defined prior to calling mint_uuid

In [None]:
import sys
sys.path.append('/glade/u/home/rdadata/lib/python/site-packages')

In [13]:
normalized_catalog = get_normalized_catalog(data)
# We generate a metadata catalog (unnormalized) from the data directory.

catalog = unnormalize_catalog(normalized_catalog)
# We flatten the normalized catalog. 
# Each file in the data directory "has its own entry" in this catalog.
# We'll eventually ignore non-image files.

write_timestamped_catalog(catalog, out)
# We write this version of the metadata catalog to the output directory.

In [14]:
catalog = read_timestamped_catalog(out)
# We read in the most recent version of the metadata catalog from the out directory.

elementary_family = [c for c in catalog if c['media_type'].startswith("image")]
# We create a list of all the entries in the catalog that are image files.

In [17]:
import os
# We'll perform some file renames between the data directory and the out directory.

# We move all the images in the catalog to the output directory.
for member in elementary_family:
    os.rename(member['file_path'], os.path.join(out, member['uuid']))

In [18]:
# Conversely, we move all the images in the catalog back to the data directory.
for member in elementary_family:
    os.rename(os.path.join(out, member['uuid']), member['file_path'])