# Accessing an AWS S3 bucket & downloading a JSON file



In this notebook we will learn how to access an AWS S3 bucket, download a JSON file from that bucket, and create a dataframe from the JSON file contents. The dataframe will then be used in the additional notebooks in this repo.




In [None]:
!pip install -r requirements.txt

## Imports
We will need to import various packages. They are either built into the notebook image you are running, or were installed in the previous step.

In [48]:
#========================================================================================
# import needed libraries/packages 
#
# Note:  we use boto3 which is a Python SDK for AWS.  It allows you to create,
# configure and manage AWS resources from your Python scripts.  
#========================================================================================
import os
import pandas as pd
import boto3
import botocore
import json
import re

from botocore import UNSIGNED
from botocore.client import Config

In [49]:
#========================================================================================
# Identify the name of your S3 bucket, and then the file you wish to download.
#========================================================================================
# WARNING: never paste real credentials into this notebook or commit them to version
# control. They are read from the standard AWS environment variables instead: export
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting the notebook server.
# An unset variable yields None, in which case boto3 falls back to its default
# credential lookup (shared credentials file, instance role, ...).
key_id = os.environ.get('AWS_ACCESS_KEY_ID')
secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Same values under the names this cell has always defined, for any cell that uses them.
AWS_ACCESS_KEY_ID = key_id
AWS_SECRET_ACCESS_KEY = secret_key

bucket_name     = 'edge-anomaly-detection-bucket-name'
file_name       = 'edge-datalake-bullet--Wed Oct 04 13:56:26 GMT 2023.txt'
new_file_name   = 'edge-datalake-bullet.json'

# The trailing separator is kept so that code appending a file name with '+' keeps working.
local_dest_dir  = os.path.join(os.getcwd(), 'datasets/')
# The download does not create missing directories, so make sure the target exists.
os.makedirs(local_dest_dir, exist_ok=True)

s3 = boto3.client(
    's3',
    aws_access_key_id=key_id,
    aws_secret_access_key=secret_key,
    config=Config(signature_version='s3v4')  # It's usually recommended to use s3v4
)

# Store the object locally under the shorter, space-free name.
s3.download_file(bucket_name, file_name, os.path.join(local_dest_dir, new_file_name))




In [None]:
# Turn the downloaded file into one JSON array and load it into a dataframe.
#
# The download holds many JSON documents in a single file. They may be separated by
# the two-character sequence backslash + 'n' or by real line breaks (JSON Lines);
# both layouts are handled below.

# File paths
input_file_path = os.path.join(local_dest_dir, new_file_name)
output_file_path = os.path.join(local_dest_dir, 'edge-datalake-bullet-clean.json')


def _decode_json_chunk(chunk):
    """Decode one piece of the raw file into a list of JSON values.

    The piece is first parsed as a single JSON document. If that fails, it is
    treated as newline-delimited JSON and every non-blank line is parsed on its
    own. Lines that still cannot be decoded are reported and skipped.

    Returns an empty list for a blank piece.
    """
    if not chunk.strip():
        return []
    try:
        return [json.loads(chunk)]
    except json.JSONDecodeError:
        pass  # not a single document; fall through to line-by-line decoding

    decoded = []
    # Split on '\n' only: str.splitlines() would also break on characters such as
    # U+2028, which may legally appear inside a JSON string.
    for line in chunk.split('\n'):
        if not line.strip():
            continue
        try:
            decoded.append(json.loads(line))
        except json.JSONDecodeError:
            print(f"Failed to decode JSON: {line}")
    return decoded


# Read the data from the input file (JSON text is UTF-8, independent of the platform default)
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()

# Splitting the data into multiple JSON strings.
# NOTE: '\\n' is a literal backslash followed by 'n', not a line break. Pieces that
# still contain several newline-separated documents are split further by
# _decode_json_chunk, so a regular JSON Lines file is not lost.
json_strings = data.split('\\n')

json_objects = []
for json_str in json_strings:
    json_objects.extend(_decode_json_chunk(json_str))

# Write the list of JSON objects as a single JSON array to the output file
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(json_objects, f, indent=4)

df              = pd.read_json(output_file_path)


In [None]:
#========================================================================================
# Show the contents of the dataframe
#========================================================================================
# A bare last expression is rendered by the notebook as a rich table
# instead of the plain-text dump that print() produces.
df