# Establish S3 Connection

In [1]:
import json # read in dictionary format credentials
import boto3 # connect to aws s3

In [2]:
import numpy as np
import pandas as pd
import time as tm

## Import Credentials

In [3]:
# Parse the JSON-formatted credential file into a dict; later cells read its
# 'access_key', 'secret_key' and 'bucket' entries.
with open("aws_credential.txt", mode='r') as credential_file:
    aws_credential = json.load(credential_file)

## Connect with Credentials

In [4]:
# Step 1: open a session pinned to the us-east-2 region (no named profile).
aws_session = boto3.Session(profile_name=None, region_name='us-east-2')

# Step 2: create the S3 client, authenticating with the explicit key pair
# held in the aws_credential dict.
s3 = aws_session.client(
    's3',
    aws_access_key_id=aws_credential['access_key'],
    aws_secret_access_key=aws_credential['secret_key'],
)

# Access S3 Objects

In [5]:
# A single list_objects call returns at most 1,000 keys and omits the
# 'Contents' entry entirely when the bucket is empty. Walk every page and
# default to an empty list so the count is correct for any bucket size.
file_counts = sum(
    len(page.get('Contents', []))
    for page in s3.get_paginator('list_objects').paginate(Bucket=aws_credential['bucket'])
)
print(f'The bucket currently has {file_counts} objects.')

The bucket currently has 85 objects.


## Metadata into Lists

In [6]:
# Collect the metadata dict of every object in the bucket.
# A single list_objects call is capped at 1,000 keys and has no 'Contents'
# entry for an empty bucket, so iterate over all pages and fall back to an
# empty list. The same 'list_objects' operation is kept so each per-object
# dict has the same keys as before (e.g. 'Owner').
file_contents = [
    object_meta
    for page in s3.get_paginator('list_objects').paginate(Bucket=aws_credential['bucket'])
    for object_meta in page.get('Contents', [])
]

In [7]:
# Distinct metadata field names across all objects: iterating a dict yields
# its keys, so a set union over every dict collects each key once.
list(set().union(*file_contents))

['ChecksumAlgorithm',
 'Size',
 'Owner',
 'Key',
 'LastModified',
 'ETag',
 'StorageClass']

In [8]:
# Pull the 'Key' (object path) out of every metadata dict, in listing order.
object_list = list(map(lambda object_meta: object_meta['Key'], file_contents))

In [9]:
# Keys ending in '/' are the folder entries created in the bucket.
[
    object_key
    for object_key in object_list
    if object_key.endswith('/')
]

['messy_data/', 'resource/']

## Create Data Catalog

In [10]:
# Each 'LastModified' value is a timezone-aware datetime, for example
# datetime.datetime(2025, 3, 8, 4, 27, 27, tzinfo=tzutc()).
# Render it as text: date, hour:minute, and the timezone name.
modified_time_list = [
    object_meta['LastModified'].strftime("%Y-%m-%d, %H:%M %Z")
    for object_meta in file_contents
]

In [11]:
# Preview the first five formatted timestamps to sanity-check the format.
modified_time_list[:5]

['2025-03-08, 04:27 UTC',
 '2025-03-08, 04:28 UTC',
 '2024-03-13, 22:10 UTC',
 '2024-03-13, 22:10 UTC',
 '2024-03-13, 22:10 UTC']

In [12]:
# Raw 'Size' value of every object, in the same order as object_list;
# the catalog cell divides these by 1024 for its 'size(KB)' column.
size_list = list(object_meta['Size'] for object_meta in file_contents)

In [13]:
# Assemble the catalog: one row per object with its key, its size divided by
# 1024 and rounded to two decimals, and its formatted modification time.
# Rows are ordered by modification time first, then by size.
bucket_catalog = (
    pd.DataFrame(
        {
            'object': object_list,
            'size(KB)': (np.array(size_list) / 1024).round(2),
            'modified': modified_time_list,
        }
    )
    .sort_values(by=['modified', 'size(KB)'])
)

In [14]:
# Show the first two rows of the sorted catalog (earliest 'modified' values first).
bucket_catalog.head(2)

Unnamed: 0,object,size(KB),modified
43,opm_202309/DTdate.txt,0.04,"2024-03-13, 22:05 UTC"
45,opm_202309/DTgsegrd.txt,0.07,"2024-03-13, 22:05 UTC"
