# Create S3 Bucket

In [2]:
import requests
import pandas as pd
import boto3
import sagemaker
from io import StringIO

session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

s3 = boto3.Session().client(service_name="s3", region_name=region)


In [3]:
setup_s3_bucket_passed = False

In [4]:
print("Default bucket: {}".format(bucket))

Default bucket: sagemaker-us-east-1-705927414280


# Verify S3_BUCKET Bucket Creation

In [5]:
%%bash

aws s3 ls s3://${bucket}/

2023-03-17 13:01:57 aws-athena-query-results-705927414280-us-east-1
2023-03-18 02:29:33 gunviolence-kbaum215
2023-03-19 21:59:56 official-gunviolence-kbaum215
2023-03-05 21:10:59 sagemaker-studio-705927414280-q3srkpw595r
2023-03-05 20:36:33 sagemaker-studio-705927414280-vo6gix3myc
2023-03-05 20:42:01 sagemaker-us-east-1-705927414280


In [6]:
from botocore.client import ClientError

response = None

try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, response, e))

{'ResponseMetadata': {'RequestId': 'R9NM4DGV7QX22ES5', 'HostId': 'yLTT42yIrVia/fO+dzIaJyIDLn8FlbvXW5BUKwaMJxIJmTKDbaxHDe2NjvxmSGRUPhcptyOq0Rg=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'yLTT42yIrVia/fO+dzIaJyIDLn8FlbvXW5BUKwaMJxIJmTKDbaxHDe2NjvxmSGRUPhcptyOq0Rg=', 'x-amz-request-id': 'R9NM4DGV7QX22ES5', 'date': 'Sun, 16 Apr 2023 20:41:14 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [7]:
%store setup_s3_bucket_passed


Stored 'setup_s3_bucket_passed' (bool)


In [8]:
%store

Stored variables and their in-db values:
ingest_create_athena_db_passed             -> True
s3_private_path                            -> 's3://sagemaker-us-east-1-705927414280/gun_violenc
s3_private_path_1                          -> 's3://sagemaker-us-east-1-705927414280/census2010_
s3_private_path_2                          -> 's3://sagemaker-us-east-1-705927414280/state_abbre
s3_private_path_3                          -> 's3://sagemaker-us-east-1-705927414280/congress_da
s3_private_path_4                          -> 's3://sagemaker-us-east-1-705927414280/state_incom
s3_private_path_5                          -> 's3://sagemaker-us-east-1-705927414280/cities_data
s3_private_path_6                          -> 's3://sagemaker-us-east-1-705927414280/zipcodes'
s3_private_path_7                          -> 's3://sagemaker-us-east-1-705927414280/adjusted_gr
s3_private_path_8                          -> 's3://sagemaker-us-east-1-705927414280/temp'
s3_private_path_tsv                        

In the next section, we are creating folders and moving files from the Github repo into the S3 folders to create the file paths. As a group, we found that access issues when copying data from the public to the private bucket were insurmountable. The creator of the public bucket could run the code that copied data from public to private, but the other two group members could not. After futile troubleshooting, this workaround was made. The code to copy data still exists in the data preparation notebook, but this serves as a backup in case the user has access issues.

# Create Folders If They Do Not Already Exist

In [9]:
def create_folder(bucket_name, folder_name, parent_folder=None):
    s3 = boto3.client('s3')
    folder_key = f"{parent_folder}{folder_name}/" if parent_folder else f"{folder_name}/"
    s3.put_object(Bucket=bucket_name, Key=folder_key)

def get_existing_folders(bucket_name):
    s3 = boto3.client('s3')
    existing_folders = set()
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            folder = obj['Key'].split('/')[0]
            existing_folders.add(folder)
    return existing_folders

def create_folders_if_not_exists(bucket_name, folder_names, parent_folder=None):
    existing_folders = get_existing_folders(bucket_name)
    for folder_name in folder_names:
        if parent_folder:
            full_folder_name = f"{parent_folder}{folder_name}"
        else:
            full_folder_name = folder_name

        if full_folder_name not in existing_folders:
            create_folder(bucket_name, folder_name, parent_folder)
            print(f"Created folder: {full_folder_name}")
        else:
            print(f"Folder {full_folder_name} already exists")

bucket_name = bucket 
folder_names = [
    'gun_violence_data',
    'census2010_data',
    'state_abbrev_data',
    'congress_data',
    'state_income_data',
    'cities_data',
    'modeling_data',
    'train',
    'validation',
    'test'
]

create_folders_if_not_exists(bucket_name, folder_names)

Folder gun_violence_data already exists
Folder census2010_data already exists
Folder state_abbrev_data already exists
Folder congress_data already exists
Folder state_income_data already exists
Folder cities_data already exists
Folder modeling_data already exists
Folder train already exists
Folder validation already exists
Folder test already exists


#  Put The Files Into Their Appropriate Folders

In [10]:
import io

def process_and_upload_to_s3(url, bucket_name, folder_name):
    # Download the CSV from the GitHub URL
    response = requests.get(url)
    csv_content = response.content.decode("utf-8")

    # Read the content of the CSV file and create a DataFrame
    df = pd.read_csv(io.StringIO(csv_content))

    # Move the 'target_class' column to the first position
    cols = df.columns.tolist()
    cols.insert(0, cols.pop(cols.index('target_class')))
    df = df[cols]

    # Convert the modified DataFrame to a CSV string without the header row
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, header=False, index=False)

    # Upload the CSV string to the S3 bucket
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket_name, Key=f"{folder_name}/data.csv", Body=csv_buffer.getvalue())


# Define the GitHub URLs and local output paths for the train, validation, and test files
train_path ="https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/df_train.csv"
validation_path = "https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/df_validation.csv"
test_path ="https://raw.githubusercontent.com/vivianndo/ads508_gunviolence/main/generated_data/df_test.csv"
# Define the GitHub URLs and S3 folder names for the train, validation, and test files
file_data = [
    {
        "url": train_path,
        "folder_name": "train"
    },
    {
        "url": validation_path,
        "folder_name": "validation"
    },
    {
        "url": test_path,
        "folder_name": "test"
    }
]

# Process and upload the modified train, validation, and test files to the S3 bucket
for file_info in file_data:
    process_and_upload_to_s3(file_info["url"], bucket_name, file_info["folder_name"])

In [11]:
def check_file_exists_in_folder(bucket_name, folder_name, file_name):
    s3 = boto3.client('s3')
    key_prefix = f"{folder_name}/{file_name}"
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=key_prefix)

    for obj in response.get('Contents', []):
        if obj['Key'] == key_prefix:
            return True
    return False

def upload_file_to_folder(bucket_name, folder_name, file_path, file_name):
    if not check_file_exists_in_folder(bucket_name, folder_name, file_name):
        s3 = boto3.client('s3')
        key = f"{folder_name}/{file_name}"
        s3.upload_file(file_path, bucket_name, key)
        print(f"Uploaded {file_name} to {folder_name}")
    else:
        print(f"File {file_name} already exists in {folder_name}")
    

# Upload the files to the specified folders
upload_file_to_folder(bucket_name, 'gun_violence_data', '../raw_data/gun_violence.csv', 'gun_violence.csv')
upload_file_to_folder(bucket_name, 'census2010_data', '../raw_data/sub_est2018_all.csv', 'sub_est2018_all.csv')
upload_file_to_folder(bucket_name, 'state_abbrev_data', '../raw_data/state_abbrev_map.csv', 'state_abbrev_map.csv')
upload_file_to_folder(bucket_name, 'congress_data', '../raw_data/Congress_2013-2018.csv', 'Congress_2013-2018.csv')
upload_file_to_folder(bucket_name, 'state_income_data', '../raw_data/all_states_income.csv', 'all_states_income.csv')
upload_file_to_folder(bucket_name, 'cities_data', '../raw_data/uscities.csv', 'uscities.csv')
upload_file_to_folder(bucket_name, 'modeling_data', '../generated_data/data_for_modeling.csv', 'data_for_modeling.csv')

# Upload the train file
upload_file_to_folder(bucket_name, 'train', '../train/df_train.csv', 'data.csv')

# Upload the validation file
upload_file_to_folder(bucket_name, 'validation', '../validation/df_validation.csv', 'data.csv')

# Upload the test file
upload_file_to_folder(bucket_name, 'test', '../test/df_test.csv', 'data.csv')

File gun_violence.csv already exists in gun_violence_data
File sub_est2018_all.csv already exists in census2010_data
File state_abbrev_map.csv already exists in state_abbrev_data
File Congress_2013-2018.csv already exists in congress_data
File all_states_income.csv already exists in state_income_data
File uscities.csv already exists in cities_data
File data_for_modeling.csv already exists in modeling_data
File data.csv already exists in train
File data.csv already exists in validation
File data.csv already exists in test


# Release Resources

In [12]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [13]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>