# Libraries

In [2]:
import sagemaker
import boto3
from datetime import datetime
from io import StringIO
import pandas as pd

In [3]:
# Name of the bucket that holds the raw data files
RD_Bucket = 'policedatasetbucket'

# SageMaker session, its default bucket and the notebook's execution role
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region of the default boto3 session, plus both flavours of S3 handle:
# the low-level client and the object-oriented resource
region = boto3.Session().region_name
s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')

# Echo the resolved settings so the run is self-describing
print('Bucket - > ', bucket)
print('Role - > ', role)
print('Region - > ', region)
print('S3 - > ', s3)
print('S3 Resource - > ', s3_resource)

Bucket - >  sagemaker-us-east-1-859074047513
Role - >  arn:aws:iam::859074047513:role/LabRole
Region - >  us-east-1
S3 - >  <botocore.client.S3 object at 0x7fd72c606690>
S3 Resource - >  s3.ServiceResource()


# Prepare landing zone

In [4]:
# Verify that a bucket exists and create it when it does not
def verify_create_bucket(bucket_name):
    """Ensure an S3 bucket named ``bucket_name`` exists, creating it if needed.

    Existence is decided from the names returned by ``s3.list_buckets()``.
    When the name is not found, the bucket is created through the same
    client (``s3.create_bucket``) using the *requested* name, so the helper
    works for any bucket and does not rely on a shell or on IPython's ``!``
    syntax.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to look for and, if absent, to create.

    Returns
    -------
    None
        Progress is reported with ``print``.

    Notes
    -----
    Errors raised by the S3 client (for example when the name is already
    taken or the caller lacks permission) are not caught and propagate to
    the caller.
    """
    known_names = {entry['Name'] for entry in s3.list_buckets().get('Buckets', [])}

    if bucket_name in known_names:
        print(f"The {bucket_name} bucket exists.")
        return

    print(f"The {bucket_name} bucket does not exist, creating")

    # S3 rejects an explicit LocationConstraint of us-east-1, so the
    # constraint is only sent for other regions.
    client_region = s3.meta.region_name
    if client_region and client_region != 'us-east-1':
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': client_region},
        )
    else:
        s3.create_bucket(Bucket=bucket_name)

In [5]:
# Make sure the raw-data bucket named by RD_Bucket is in place before any upload
verify_create_bucket(RD_Bucket)

The policedatasetbucket bucket exists.


# Save Files to bucket

## Load calls

In [6]:
# Download one year's calls-for-service file
def Get_Data(year):
    """Read the calls-for-service CSV published for ``year``.

    Parameters
    ----------
    year : int or str
        Year inserted into the file name
        ``pd_calls_for_service_<year>_datasd.csv`` on seshat.datasd.org.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from that URL.
    """
    return pd.read_csv(
        f"https://seshat.datasd.org/pd/pd_calls_for_service_{year}_datasd.csv"
    )

In [7]:
# Every calendar year from 2018 up to and including the current one
Years = [calendar_year for calendar_year in range(2018, datetime.now().year + 1)]
Years

[2018, 2019, 2020, 2021, 2022, 2023]

In [8]:
# Fetch each year listed in Years and store it as its own CSV object in RD_Bucket
for year in Years:
    year_df = Get_Data(year)

    # Render the frame as CSV text in memory, without the index column
    csv_buffer = StringIO()
    year_df.to_csv(csv_buffer, index=False)

    # Object key pattern: SDPD_Calls_<year>.csv
    file = f'SDPD_Calls_{year}.csv'
    s3_resource.Object(RD_Bucket, file).put(Body=csv_buffer.getvalue())
    print(file, 'loaded in', RD_Bucket, 'bucket')

SDPD_Calls_2018.csv loaded in policedatasetbucket bucket
SDPD_Calls_2019.csv loaded in policedatasetbucket bucket
SDPD_Calls_2020.csv loaded in policedatasetbucket bucket
SDPD_Calls_2021.csv loaded in policedatasetbucket bucket
SDPD_Calls_2022.csv loaded in policedatasetbucket bucket
SDPD_Calls_2023.csv loaded in policedatasetbucket bucket


## Load lookup tables and RIPA stops data

In [9]:
def URL_2_Bucket(url,file_name):
    """Copy a CSV from ``url`` into the ``RD_Bucket`` bucket as ``<file_name>.csv``.

    The source is parsed with ``pd.read_csv(..., low_memory=False)`` and
    written back out as CSV text without the index column, so the stored
    object is the re-serialised frame rather than the raw download.

    Parameters
    ----------
    url : str or file-like
        Anything ``pd.read_csv`` accepts as its first argument.
    file_name : str
        Object key without extension; ``'.csv'`` is appended.

    Returns
    -------
    None
        A confirmation line is printed after the upload.
    """
    # With no target given, to_csv hands back the CSV text directly
    csv_text = pd.read_csv(url, low_memory=False).to_csv(index=False)

    object_key = file_name + '.csv'
    s3_resource.Object(RD_Bucket, object_key).put(Body=csv_text)
    print(object_key, 'loaded in', RD_Bucket, 'bucket')
    

In [10]:
# Source CSVs on seshat.datasd.org. All four are fetched over HTTPS so the
# downloads are consistent with each other and with the yearly call files.
type_url = "https://seshat.datasd.org/pd/pd_cfs_calltypes_datasd.csv"         # call types
dipo_url = "https://seshat.datasd.org/pd/pd_dispo_codes_datasd.csv"           # disposition codes
ripa_stops_url = "https://seshat.datasd.org/pd/ripa_stops_datasd.csv"         # RIPA stops
ripa_stops_dic = "https://seshat.datasd.org/pd/ripa_stops_dictionary_datasd.csv"  # RIPA stops dictionary

In [11]:
# Upload each source URL under its target object name, in this order
for source_url, target_name in [
    (type_url, 'Type'),
    (dipo_url, 'Dispo'),
    (ripa_stops_url, 'Ripa_Stops'),
    (ripa_stops_dic, 'Ripa_Stops_Dic'),
]:
    URL_2_Bucket(source_url, target_name)

Type.csv loaded in policedatasetbucket bucket
Dispo.csv loaded in policedatasetbucket bucket
Ripa_Stops.csv loaded in policedatasetbucket bucket
Ripa_Stops_Dic.csv loaded in policedatasetbucket bucket
