In [2]:
import sagemaker
import boto3
from datetime import datetime
from io import StringIO
import pandas as pd

from pyathena import connect

In [3]:
# Session-wide handles used by the rest of the notebook.
sess = sagemaker.Session()  # Initialize the SageMaker session

bucket = sess.default_bucket()  # Default SageMaker bucket; used below for table data paths and Athena staging
RD_Bucket = 'policedatasetbucket'  # Raw-data bucket name (only referenced by the commented-out upload helper)
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
s3 = boto3.client('s3')  # Low-level S3 client (used by verify_create_bucket)
s3_resource = boto3.resource('s3')  # Object-oriented S3 resource (used by the commented-out upload cells)


# Echo the resolved configuration so a reader can confirm which account/region the run targets.
print('Bucket - > ',bucket)
print('Role - > ',role)
print('Region - > ',region)
print('S3 - > ',s3)
print('S3 Resource - > ',s3_resource)

Bucket - >  sagemaker-us-east-1-859074047513
Role - >  arn:aws:iam::859074047513:role/LabRole
Region - >  us-east-1
S3 - >  <botocore.client.S3 object at 0x7f7c5cd1abd0>
S3 Resource - >  s3.ServiceResource()


In [4]:
def verify_create_bucket(bucket_name):
    """Ensure the S3 bucket ``bucket_name`` exists, creating it when it is missing.

    Uses the notebook-level ``s3`` client. Prints which branch was taken and
    returns ``None``.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to look for (and create if absent).
    """
    existing_names = {entry['Name'] for entry in s3.list_buckets()['Buckets']}
    if bucket_name in existing_names:
        print(f"The {bucket_name} bucket exists.")
        return

    print(f"The {bucket_name} bucket does not exist, creating")
    # Create the bucket that was asked for. The previous version interpolated the
    # loop variable instead of ``bucket_name`` and relied on an IPython-only
    # shell escape.
    region_name = s3.meta.region_name
    if region_name and region_name != 'us-east-1':
        # Outside us-east-1 S3 requires an explicit LocationConstraint.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region_name},
        )
    else:
        s3.create_bucket(Bucket=bucket_name)

In [5]:
verify_create_bucket(bucket)

The sagemaker-us-east-1-859074047513 bucket exists.


In [6]:
#function to download yearly data
# def Get_Data(year):
#     url = f"https://seshat.datasd.org/pd/pd_calls_for_service_{year}_datasd.csv"
#     df = pd.read_csv(url)
#     return df

In [7]:
# List of years since 2018
# Years = list(range(2018,datetime.now().year+1))
# Years

In [8]:
#iterate over years
# for year in Years:
#     year_df = Get_Data(year)
#     csv_buffer = StringIO()
#     year_df.to_csv(csv_buffer, index=False)
#     file = 'SDPD_Calls_' + str(year) + '.csv'
#     s3_resource.Object(bucket,file).put(Body=csv_buffer.getvalue())
#     print(file,'loaded in',bucket,'bucket')

In [9]:
# def URL_2_Bucket(url,file_name):
#     df = pd.read_csv(url, low_memory=False)
#     csv_buffer = StringIO()
#     df.to_csv(csv_buffer, index=False)
#     file = file_name + '.csv'
#     s3_resource.Object(RD_Bucket,file).put(Body=csv_buffer.getvalue())
#     print(file,'loaded in',RD_Bucket,'bucket')
    

In [10]:
# type_url = f"http://seshat.datasd.org/pd/pd_cfs_calltypes_datasd.csv"
# dipo_url = f"http://seshat.datasd.org/pd/pd_dispo_codes_datasd.csv"
# ripa_stops_url = "https://seshat.datasd.org/pd/ripa_stops_datasd.csv"
# ripa_stops_dic = "https://seshat.datasd.org/pd/ripa_stops_dictionary_datasd.csv"

In [11]:
# URL_2_Bucket(type_url,'Type')
# URL_2_Bucket(dipo_url,'Dispo')
# URL_2_Bucket(ripa_stops_url,'Ripa_Stops')
# URL_2_Bucket(ripa_stops_dic,'Ripa_Stops_Dic')

In [12]:
ingest_create_athena_db_passed = False

# Create Athena Database

In [13]:
database_name = "sd_police_db"

In [14]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [15]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)


In [16]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS sd_police_db


In [17]:
# Run the CREATE DATABASE statement through a PyAthena cursor: DDL returns no rows,
# so there is nothing for pandas to load (pandas is imported once at the top of the notebook).
with conn.cursor() as cursor:
    cursor.execute(statement)

In [18]:
# Show databases
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,default
1,dsoaws
2,sd_police_db


# Drop Database if Needed

In [19]:
# drop_db_name = "dsoaw"
# drop_db_query = f"DROP DATABASE IF EXISTS {drop_db_name}"

# df_show = pd.read_sql(drop_db_query, conn)
# df_show.head()

# Drop Table if Needed

In [20]:
# drop_table_name = "table_sd_ripa" 

# drop_table_query = f"DROP TABLE IF EXISTS {database_name}.{drop_table_name}"

# df_show1 = pd.read_sql(drop_table_query, conn)
# df_show1.head()

In [21]:
# Flip the ingest flag once the database name appears anywhere in df_show.
# NOTE(review): this relies on df_show still holding the SHOW DATABASES result;
# later cells reassign df_show to SHOW TABLES output, so run this cell in order.
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [22]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


# S3 Bucket Folder Information for 2021

In [23]:
s3_private_path_csv = "s3://{}/policedatasetsd/csv".format(bucket)
print(s3_private_path_csv)

s3://sagemaker-us-east-1-859074047513/policedatasetsd/csv


In [24]:
table_name = "table_sd_2021"

In [25]:
# DDL for the 2021 calls-for-service table: an Athena external table over the CSV
# files under s3_private_path_csv. Fields are comma-delimited, empty fields are
# declared as NULL, and the first line of each file is skipped as a header.
# date_time is declared as a string here and parsed into a datetime later in pandas.
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name, s3_private_path_csv)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2021(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [26]:
# Run the CREATE TABLE statement for 2021 through a PyAthena cursor: DDL returns no rows,
# so there is nothing for pandas to load (pandas is imported once at the top of the notebook).
with conn.cursor() as cursor:
    cursor.execute(statement)

# Verify Tables in a Specified Database

In [27]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)

df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021


In [28]:
# Load the full 2021 calls table into pandas.
# The SHOW TABLES statement is no longer re-run first: its result was discarded,
# so it only cost an extra Athena query.
df_2021 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name}", conn)

# S3 Bucket Folder Information for 2022

In [29]:
s3_private_path_csv1 = "s3://{}/policedatasetsd22/csv".format(bucket)
print(s3_private_path_csv1)

s3://sagemaker-us-east-1-859074047513/policedatasetsd22/csv


In [30]:
table_name1 = "table_sd_2022"

In [31]:
# DDL for the 2022 calls-for-service table: same column layout as the 2021 table,
# pointing at the CSV files under s3_private_path_csv1. Fields are comma-delimited,
# empty fields are declared as NULL, and the header line of each file is skipped.
statement_1 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name1, s3_private_path_csv1)

print(statement_1)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2022(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd22/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [32]:
# Run the CREATE TABLE statement for 2022 through a PyAthena cursor: DDL returns no rows,
# so there is nothing for pandas to load (pandas is imported once at the top of the notebook).
with conn.cursor() as cursor:
    cursor.execute(statement_1)

In [33]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022


In [34]:
# Load the full 2022 calls table into pandas.
# The CREATE TABLE statement is no longer re-submitted first: the table already
# exists at this point and the result of that call was discarded.
df_2022 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name1}", conn)

# S3 Bucket Folder Information for 2023

In [35]:
s3_private_path_csv2 = "s3://{}/policedatasetsd23/csv".format(bucket)
print(s3_private_path_csv2)

s3://sagemaker-us-east-1-859074047513/policedatasetsd23/csv


In [36]:
table_name2 = "table_sd_2023"

In [53]:
# DDL for the 2023 calls-for-service table: same column layout as the 2021/2022
# tables, pointing at the CSV files under s3_private_path_csv2.
# NOTE(review): unlike the 2021/2022 statements, this one also sets
# 'serialization.null.format'='' in TBLPROPERTIES — confirm whether the difference is intended.
statement_2 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1',
                'serialization.null.format'='')""".format(database_name, table_name2, s3_private_path_csv2)

print(statement_2)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2023(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd23/csv'
TBLPROPERTIES ('skip.header.line.count'='1',
                'serialization.null.format'='')


In [54]:
# Run the CREATE TABLE statement for 2023 through a PyAthena cursor: DDL returns no rows,
# so there is nothing for pandas to load (pandas is imported once at the top of the notebook).
with conn.cursor() as cursor:
    cursor.execute(statement_2)

In [55]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023


In [56]:
# Load the full 2023 calls table into pandas.
# The CREATE TABLE statement is no longer re-submitted first: the table already
# exists at this point and the result of that call was discarded.
df_2023 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name2}", conn)

# S3 Bucket Folder Information for RIPA Dataset

In [41]:
s3_private_path_csv_ripa = "s3://{}/ripapolicedatasetsd/csv".format(bucket)
print(s3_private_path_csv_ripa)

s3://sagemaker-us-east-1-859074047513/ripapolicedatasetsd/csv


In [42]:
table_name3 = "table_sd_ripa"

In [59]:
# DDL for the RIPA stops table: an Athena external table over the CSV files under
# s3_private_path_csv_ripa. Unlike the yearly calls tables, this one is read with
# OpenCSVSerde (comma separator) and declares every column as a string.
# The first line of each file is skipped as a header.
statement_3 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
`stop_id` string,
`ori` string,
`agency` string,
`exp_years` string,
`date_stop` string,
`time_stop` string,
`stopduration` string,
`stop_in_response_to_cfs` string,
`office_assignment_key` string,
`assignment` string,
`intersection` string,
`address_block` string,
`land_mark` string,
`address_street` string,
`highway_exit` string,
`isschool` string,
`school_name` string,
`address_city` string,
`beat` string,
`beat_name` string,
`pid` string,
`isstudent` string,
`perceived_limited_english` string,
`perceived_age` string,
`perceived_gender` string,
`gender_nonconforming` string,
`gend` string,
`gend_nc` string,
`perceived_lgbt` string
) 

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = ",")
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1',
               'serialization.null.format'='')""".format(database_name, table_name3, s3_private_path_csv_ripa)


print(statement_3)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_ripa(
`stop_id` string,
`ori` string,
`agency` string,
`exp_years` string,
`date_stop` string,
`time_stop` string,
`stopduration` string,
`stop_in_response_to_cfs` string,
`office_assignment_key` string,
`assignment` string,
`intersection` string,
`address_block` string,
`land_mark` string,
`address_street` string,
`highway_exit` string,
`isschool` string,
`school_name` string,
`address_city` string,
`beat` string,
`beat_name` string,
`pid` string,
`isstudent` string,
`perceived_limited_english` string,
`perceived_age` string,
`perceived_gender` string,
`gender_nonconforming` string,
`gend` string,
`gend_nc` string,
`perceived_lgbt` string
) 

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = ",")
LOCATION 's3://sagemaker-us-east-1-859074047513/ripapolicedatasetsd/csv'
TBLPROPERTIES ('skip.header.line.count'='1',
               'serialization.null.format'='')


In [60]:
# Run the CREATE TABLE statement for RIPA through a PyAthena cursor: DDL returns no rows,
# so there is nothing for pandas to load (pandas is imported once at the top of the notebook).
with conn.cursor() as cursor:
    cursor.execute(statement_3)

In [61]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023
3,table_sd_ripa


In [62]:
# Load the full RIPA stops table into pandas.
# The CREATE TABLE statement is no longer re-submitted first: the table already
# exists at this point and the result of that call was discarded.
ripa_df = pd.read_sql(f"SELECT * FROM {database_name}.{table_name3}", conn)

In [65]:
ripa_df.isnull().sum()

stop_id                      0
ori                          0
agency                       0
exp_years                    0
date_stop                    0
time_stop                    0
stopduration                 0
stop_in_response_to_cfs      0
office_assignment_key        0
assignment                   0
intersection                 1
address_block                1
land_mark                    1
address_street               2
highway_exit                 2
isschool                     2
school_name                  2
address_city                 2
beat                         2
beat_name                    2
pid                          2
isstudent                    2
perceived_limited_english    2
perceived_age                2
perceived_gender             2
gender_nonconforming         2
gend                         2
gend_nc                      2
perceived_lgbt               2
dtype: int64

# Join The Three datasets 2021, 2022, 2023

In [None]:
# Shape (rows, columns) of each yearly calls frame before they are combined.
# Each message names its year; previously all three lines printed the same label.
for year, yearly_df in (("2021", df_2021), ("2022", df_2022), ("2023", df_2023)):
    print(f'shape of the {year} police dataframe {yearly_df.shape}')

In [None]:
# Stack the three yearly tables row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each year's
# own row labels repeat in the combined frame, so label-based lookups are ambiguous.
join_df = pd.concat([df_2021, df_2022, df_2023], axis=0, ignore_index=True)
join_df

In [None]:
# Data types 
join_df.dtypes

In [None]:
from datetime import datetime  # NOTE(review): `datetime` is not used in this cell and is already imported in the first cell

# Parse the date_time column (declared as string in the Athena tables) into pandas
# datetimes. This overwrites the column on join_df in place, so earlier displays of
# join_df show the pre-conversion values.
join_df["date_time"] = pd.to_datetime(join_df["date_time"])
join_df

In [None]:
join_df.dtypes

In [None]:
join_df.sort_values(by="date_time", ascending = True)

In [None]:
# get unique values of the address_road_primary
join_df["address_road_primary"].nunique()

In [None]:
join_df.info()

Our first file contains 1,168,014 rows and 14 columns.<br>
Of the 14 columns, 4 are int type, 9 are string/object type, and 1 is datetime type.


In [None]:
join_df.isna().sum()

In [None]:
from tqdm import tqdm

def _coerce_mode_label(value):
    """Return ``value`` as an ``int`` when that conversion is lossless, otherwise unchanged."""
    try:
        as_int = int(value)
    except (TypeError, ValueError, OverflowError):
        return value
    # A non-integral float (e.g. 3.7) would be truncated to a value that does not
    # occur in the data, so keep it as-is.
    if isinstance(value, float) and as_int != value:
        return value
    return as_int


def Data_Quality_Report(df):
    """Build a per-column data-quality summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order) with:

        * ``Count``   - number of non-null values
        * ``% Miss.`` - fraction of null values, rounded to 4 decimals
        * ``Card.``   - number of distinct non-null values
        * ``Mode`` / ``Mode Freq.`` / ``Mode %`` - most frequent value, its count,
          and its share of all rows
        * ``2nd Mode`` / ``2nd Mode Freq.`` / ``2nd Mode %`` - the same for the
          second most frequent value

        Mode fields are NaN for a column with no non-null values, and the
        2nd-mode fields are NaN for a column with a single distinct value.
        Fractions are left unformatted; apply ``result.style.format`` at display
        time if percentages or thousands separators are wanted.
    """
    mode_columns = ['Mode', 'Mode Freq.', 'Mode %',
                    '2nd Mode', '2nd Mode Freq.', '2nd Mode %']
    n_rows = len(df)

    # Collect one record per column and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
    records = []
    for col in tqdm(df.columns):
        freq = df[col].value_counts()  # sorted by descending frequency, nulls excluded
        if freq.empty:
            continue  # all-null column: no mode to report
        record = {'Feature': col,
                  'Mode': _coerce_mode_label(freq.index[0]),
                  'Mode Freq.': freq.iloc[0],
                  'Mode %': freq.iloc[0] / n_rows}
        if len(freq) > 1:
            record['2nd Mode'] = _coerce_mode_label(freq.index[1])
            record['2nd Mode Freq.'] = freq.iloc[1]
            record['2nd Mode %'] = freq.iloc[1] / n_rows
        records.append(record)

    freqDF = pd.DataFrame(records, columns=['Feature'] + mode_columns).set_index('Feature')

    # Counts, null fraction and cardinality; all three share df's column order.
    summary = pd.DataFrame({
        'Count': df.count(),
        '% Miss.': (df.isnull().sum() / df.shape[0]).round(4),
        'Card.': df.nunique(),
    })

    # Left join keeps every column of df, including those without mode information.
    return summary.join(freqDF)

In [None]:
DQR_Calls = Data_Quality_Report(join_df)
DQR_Calls

## **Basic Statistical Concepts**

* **Mean**: The mean is one of the measures of central tendency. Simply put, the mean is the average of the values in the given set. The observed values are totaled and divided by the total number of observations to determine the mean.
If $x_i$ is the $i^{th}$ observation, then the mean of all $x_i$ for $1 \leq i \leq n$, denoted by $\bar x$, is given as

$$ \bar{x} = \sum_{i=1}^{n}\frac{x_i}{n} $$


* **Variance**: Variance is a measure of variation. It is calculated by averaging the squared deviations from the mean.
The degree of spread in your data set is indicated by variation. The greater the spread of the data, the greater the variance in proportion to the mean.
Here's the formula for variance of a sample.

$$S^2 = \frac{\sum_{i=1}^{n}(x_i-\bar x)^2}{n-1}$$


* **Standard Deviation**: The standard deviation is a measure that shows how much variation (i.e., spread or dispersion) exists from the mean. The standard deviation represents a "typical" departure from the mean. It is a popular measure of variability since it returns to the data set's original units of measurement.
Here's the formula for standard deviation of a sample.

$$S = \sqrt{\frac{\sum_{i=1}^{n}(x_i-\bar x)^2}{n-1}}$$