# Accessing an AWS S3 bucket & downloading multiple JSON files



In this notebook we will learn how to access an AWS S3 bucket, download JSON files from that bucket, and create a dataframe from their contents, to be used in the additional notebooks in this repo.




In [None]:
!pip install -r requirements.txt

## Imports
We will need to import various packages. They are either built into the notebook image you are running or were installed in the previous step.

In [48]:
#========================================================================================
# import needed libraries/packages 
#
# Note:  we use boto3 which is a Python SDK for AWS.  It allows you to create,
# configure and manage AWS resources from your Python scripts.  
#========================================================================================
import os
import pandas as pd
import boto3
import botocore
import json
import re

from botocore import UNSIGNED
from botocore.client import Config

This step will take a list of files in the s3 bucket and download them to the local machine.

In [49]:
#========================================================================================
# Identify the name of your S3 bucket, and then the files you wish to download.
#========================================================================================
# WARNING: Never hardcode real credentials in a notebook or commit them to version control.
# The AWS credentials are read from the environment variables of the same name. When a
# variable is unset the value is None, and boto3 then resolves credentials through its
# default provider chain (shared credentials file, instance role, ...).
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')

# S3 bucket information
bucket_name = 'edge-anomaly-detection-bucket-name'
local_dest_dir = os.path.join(os.getcwd(), 'datasets/')

# List of object keys (file names inside the bucket) to download
file_names = [
    'edge-datalake-bullet--Wed Oct 04 13:56:26 GMT 2023.txt',
    # Add more file names here if needed
]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    config=Config(signature_version='s3v4')
)

# download_file does not create missing parent directories, so make sure the
# destination folder exists before the first download.
os.makedirs(local_dest_dir, exist_ok=True)

for file_name in file_names:
    # Build a filesystem-friendly local name: spaces become underscores, colons are
    # dropped, and '.json' is appended so the merge step picks the file up.
    new_file_name = file_name.replace(' ', '_').replace(':', '') + '.json'
    s3.download_file(bucket_name, file_name, os.path.join(local_dest_dir, new_file_name))


This will concatenate the downloaded files into a single JSON file located in the clean_datasets folder.

In [None]:
# Merge every downloaded JSON Lines file into a single JSON array on disk, then load
# that array into a pandas DataFrame named `df`.

local_dest_dir = os.path.join(os.getcwd(), 'datasets/')
clean_dest_dir = os.path.join(os.getcwd(), 'clean_datasets/')


def parse_json_records(text):
    """Extract every JSON document contained in ``text``.

    The text is first split on the literal two-character sequence backslash + 'n'.
    Any chunk that does not decode as a single JSON document is then retried line
    by line on real newline characters, so regular JSON Lines files are handled too.

    Args:
        text: Raw contents of one downloaded file.

    Returns:
        A ``(records, failed_count)`` tuple: the decoded JSON values in input order
        and the number of non-blank fragments that could not be decoded.
    """
    records = []
    failed_count = 0
    for chunk in text.split('\\n'):
        if not chunk.strip():
            continue
        try:
            records.append(json.loads(chunk))
            continue
        except json.JSONDecodeError:
            pass
        # The chunk is not one JSON document; treat it as JSON Lines instead.
        # split('\n') is used rather than splitlines() because splitlines() also
        # breaks on characters that may legally appear inside JSON strings.
        for line in chunk.split('\n'):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                failed_count += 1
    return records, failed_count


# List all files in the 'datasets' directory. Sorting makes the merge order
# deterministic; a missing directory is treated as "nothing downloaded yet".
file_names = sorted(os.listdir(local_dest_dir)) if os.path.isdir(local_dest_dir) else []

# Define the output file path
output_file_path = os.path.join(clean_dest_dir, 'edge-datalake-bullet-clean.json')

# Make sure the output directory exists
os.makedirs(clean_dest_dir, exist_ok=True)

# Collects the JSON objects decoded from all input files
json_objects = []

for file_name in file_names:
    # Only the '.json' files written by the download step are merged
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(local_dest_dir, file_name)
    print(f"Processing file: {file_name}")

    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    records, failed_count = parse_json_records(data)
    json_objects.extend(records)
    # One message per fragment that could not be decoded
    for _ in range(failed_count):
        print(f"Failed to decode JSON in file: {file_name}")

# Check if there are valid JSON objects to write to the output file
if json_objects:
    # Write the list of JSON objects as a single JSON array to the output file
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(json_objects, f, indent=4)
    print("Merging completed.")
else:
    print("No valid JSON data found. The output file was not created.")

# Read the merged JSON file into a pandas DataFrame
if os.path.exists(output_file_path):
    df = pd.read_json(output_file_path)
    # Preview the first rows of the DataFrame
    print(df.head())
else:
    print("The output file does not exist.")

This will validate the contents of the data that has been downloaded.

In [None]:
#========================================================================================
# Display the contents of the dataframe.
# A bare expression on the last line of a cell is rendered with the notebook's rich
# table display instead of the plain-text output produced by print().
#========================================================================================
df