In [62]:
import sagemaker
import boto3
from datetime import datetime
from io import StringIO
import pandas as pd
import numpy as np

from pyathena import connect

In [3]:
sess = sagemaker.Session() #Intialzie session

bucket = sess.default_bucket()
RD_Bucket = 'policedatasetbucket' #Raw Data
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
s3 = boto3.client('s3') # Create an S3 client
s3_resource = boto3.resource('s3') # Create an S3 resource


print('Bucket - > ',bucket)
print('Role - > ',role)
print('Region - > ',region)
print('S3 - > ',s3)
print('S3 Resource - > ',s3_resource)

Bucket - >  sagemaker-us-east-1-859074047513
Role - >  arn:aws:iam::859074047513:role/LabRole
Region - >  us-east-1
S3 - >  <botocore.client.S3 object at 0x7f2c25ef92d0>
S3 Resource - >  s3.ServiceResource()


In [4]:
# Function to verify if bucket exist, if not create
def verify_create_bucket(bucket_name):
    response = s3.list_buckets()
    for bucket in response['Buckets']:
        if bucket['Name'] == bucket_name:
            print(f"The {bucket_name} bucket exists.")
            break
    else:
        print(f"The {bucket_name} bucket does not exist, creating")
        !aws s3 mb s3://{bucket}/

In [5]:
verify_create_bucket(bucket)

The sagemaker-us-east-1-859074047513 bucket exists.


In [6]:
#function to download yearly data
# def Get_Data(year):
#     url = f"https://seshat.datasd.org/pd/pd_calls_for_service_{year}_datasd.csv"
#     df = pd.read_csv(url)
#     return df

In [7]:
# List of years since 2018
# Years = list(range(2018,datetime.now().year+1))
# Years

In [8]:
#iterate over years
# for year in Years:
#     year_df = Get_Data(year)
#     csv_buffer = StringIO()
#     year_df.to_csv(csv_buffer, index=False)
#     file = 'SDPD_Calls_' + str(year) + '.csv'
#     s3_resource.Object(bucket,file).put(Body=csv_buffer.getvalue())
#     print(file,'loaded in',bucket,'bucket')

In [9]:
# def URL_2_Bucket(url,file_name):
#     df = pd.read_csv(url, low_memory=False)
#     csv_buffer = StringIO()
#     df.to_csv(csv_buffer, index=False)
#     file = file_name + '.csv'
#     s3_resource.Object(RD_Bucket,file).put(Body=csv_buffer.getvalue())
#     print(file,'loaded in',RD_Bucket,'bucket')
    

In [10]:
# type_url = f"http://seshat.datasd.org/pd/pd_cfs_calltypes_datasd.csv"
# dipo_url = f"http://seshat.datasd.org/pd/pd_dispo_codes_datasd.csv"
# ripa_stops_url = "https://seshat.datasd.org/pd/ripa_stops_datasd.csv"
# ripa_stops_dic = "https://seshat.datasd.org/pd/ripa_stops_dictionary_datasd.csv"

In [11]:
# URL_2_Bucket(type_url,'Type')
# URL_2_Bucket(dipo_url,'Dispo')
# URL_2_Bucket(ripa_stops_url,'Ripa_Stops')
# URL_2_Bucket(ripa_stops_dic,'Ripa_Stops_Dic')

In [12]:
ingest_create_athena_db_passed = False

# Create Athena Database

In [13]:
database_name = "sd_police_db"

In [14]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [15]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)


In [16]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS sd_police_db


In [17]:
import pandas as pd

pd.read_sql(statement, conn)

In [18]:
# Show databases
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,default
1,dsoaws
2,sd_police_db


# Drop Database if Needed

In [19]:
# drop_db_name = "dsoaw"
# drop_db_query = f"DROP DATABASE IF EXISTS {drop_db_name}"

# df_show = pd.read_sql(drop_db_query, conn)
# df_show.head()

# Drop Table if Needed

In [20]:
# drop_table_name = "table_sd_ripa" 

# drop_table_query = f"DROP TABLE IF EXISTS {database_name}.{drop_table_name}"

# df_show1 = pd.read_sql(drop_table_query, conn)
# df_show1.head()

In [21]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [22]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


# S3 Bucket Folder Information for 2021

In [23]:
s3_private_path_csv = "s3://{}/policedatasetsd/csv".format(bucket)
print(s3_private_path_csv)

s3://sagemaker-us-east-1-859074047513/policedatasetsd/csv


In [24]:
table_name = "table_sd_2021"

In [25]:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name, s3_private_path_csv)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2021(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [26]:
import pandas as pd

pd.read_sql(statement, conn)

# Varify Tables in a Specified Database

In [27]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)

df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023
3,table_sd_ripa


In [28]:
pd.read_sql(statement, conn)

df_2021 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name}", conn)

# S3 Bucket Folder Information for 2022

In [29]:
s3_private_path_csv1 = "s3://{}/policedatasetsd22/csv".format(bucket)
print(s3_private_path_csv1)

s3://sagemaker-us-east-1-859074047513/policedatasetsd22/csv


In [30]:
table_name1 = "table_sd_2022"

In [31]:
# SQL statement to execute
statement_1 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(database_name, table_name1, s3_private_path_csv1)

print(statement_1)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2022(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd22/csv'
TBLPROPERTIES ('skip.header.line.count'='1')


In [32]:
import pandas as pd

pd.read_sql(statement_1, conn)

In [33]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023
3,table_sd_ripa


In [34]:
pd.read_sql(statement_1, conn)

df_2022 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name1}", conn)

# S3 Bucket Folder Information for 2023

In [35]:
s3_private_path_csv2 = "s3://{}/policedatasetsd23/csv".format(bucket)
print(s3_private_path_csv2)

s3://sagemaker-us-east-1-859074047513/policedatasetsd23/csv


In [36]:
table_name2 = "table_sd_2023"

In [37]:
# SQL statement to execute
statement_2 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1',
                'serialization.null.format'='')""".format(database_name, table_name2, s3_private_path_csv2)

print(statement_2)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_2023(
incident_num string,
date_time string,
day_of_week int,
address_number_primary int,
address_dir_primary string,
address_road_primary string,
address_sfx_primary string,
address_dir_intersecting string,
address_road_intersecting string,
address_sfx_intersecting string,
call_type string,
disposition string,
beat int,
priority int) 

ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' NULL DEFINED AS '' LOCATION 's3://sagemaker-us-east-1-859074047513/policedatasetsd23/csv'
TBLPROPERTIES ('skip.header.line.count'='1',
                'serialization.null.format'='')


In [38]:
import pandas as pd

pd.read_sql(statement_2, conn)

In [39]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023
3,table_sd_ripa


In [40]:
pd.read_sql(statement_2, conn)

df_2023 = pd.read_sql(f"SELECT * FROM {database_name}.{table_name2}", conn)

# S3 Bucket Folder Information for RIPA Dataset

In [41]:
s3_private_path_csv_ripa = "s3://{}/ripapolicedatasetsd/csv".format(bucket)
print(s3_private_path_csv_ripa)

s3://sagemaker-us-east-1-859074047513/ripapolicedatasetsd/csv


In [42]:
table_name3 = "table_sd_ripa"

In [43]:
# SQL statement to execute
statement_3 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
`stop_id` string,
`ori` string,
`agency` string,
`exp_years` string,
`date_stop` string,
`time_stop` string,
`stopduration` string,
`stop_in_response_to_cfs` string,
`office_assignment_key` string,
`assignment` string,
`intersection` string,
`address_block` string,
`land_mark` string,
`address_street` string,
`highway_exit` string,
`isschool` string,
`school_name` string,
`address_city` string,
`beat` string,
`beat_name` string,
`pid` string,
`isstudent` string,
`perceived_limited_english` string,
`perceived_age` string,
`perceived_gender` string,
`gender_nonconforming` string,
`gend` string,
`gend_nc` string,
`perceived_lgbt` string
) 

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = ",")
LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1',
               'serialization.null.format'='')""".format(database_name, table_name3, s3_private_path_csv_ripa)


print(statement_3)

CREATE EXTERNAL TABLE IF NOT EXISTS sd_police_db.table_sd_ripa(
`stop_id` string,
`ori` string,
`agency` string,
`exp_years` string,
`date_stop` string,
`time_stop` string,
`stopduration` string,
`stop_in_response_to_cfs` string,
`office_assignment_key` string,
`assignment` string,
`intersection` string,
`address_block` string,
`land_mark` string,
`address_street` string,
`highway_exit` string,
`isschool` string,
`school_name` string,
`address_city` string,
`beat` string,
`beat_name` string,
`pid` string,
`isstudent` string,
`perceived_limited_english` string,
`perceived_age` string,
`perceived_gender` string,
`gender_nonconforming` string,
`gend` string,
`gend_nc` string,
`perceived_lgbt` string
) 

ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = ",")
LOCATION 's3://sagemaker-us-east-1-859074047513/ripapolicedatasetsd/csv'
TBLPROPERTIES ('skip.header.line.count'='1',
               'serialization.null.format'='')


In [44]:
import pandas as pd

pd.read_sql(statement_3, conn)

In [45]:
database_name = "sd_police_db"
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,table_sd_2021
1,table_sd_2022
2,table_sd_2023
3,table_sd_ripa


In [46]:
pd.read_sql(statement_3, conn)

ripa_df = pd.read_sql(f"SELECT * FROM {database_name}.{table_name3}", conn)

# Join The Three datasets 2021, 2022, 2023

### Checking the data types, removing missing values, duplicated values

In [48]:
# shape of the dataframes
print(f'shape of the dataframe police dataframe {df_2021.shape}')
print(f'shape of the dataframe police dataframe {df_2022.shape}')
print(f'shape of the dataframe police dataframe {df_2023.shape}')

shape of the dataframe police dataframe (568947, 14)
shape of the dataframe police dataframe (499256, 14)
shape of the dataframe police dataframe (99811, 14)


In [49]:
# concatenate the tables 
join_df = pd.concat([df_2021, df_2022, df_2023], axis=0)
join_df

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
0,E21060046805,2021-06-28 07:36:28,2,5200,,52ND,PL,,,,SELENF,O,826,3
1,E21060046806,2021-06-28 07:36:33,2,3400,,SPORTS ARENA,BLV,,,,459A,K,611,3
2,E21060046807,2021-06-28 07:37:42,2,6200,,MADELINE,ST,,,,586,O,821,2
3,E21060046808,2021-06-28 07:38:22,2,1700,,02ND,AVE,,,,FU,O,529,2
4,E21060046809,2021-06-28 07:40:14,2,9200,,AERO,DR,,,,COURT,K,311,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99806,E23030022386,2023-03-15 23:25:50,4,2500,,IMPERIAL,AVE,,,,INFO,K,512,2
99807,E23030022400,2023-03-15 23:44:05,4,800,,27TH (SB),ST,,,,5150,K,724,1
99808,E23030022401,2023-03-15 23:44:23,4,14700,,VALLE DEL SUR,CT,,,,459A,CAN,937,2
99809,E23030022404,2023-03-15 23:46:26,4,400,,17TH,ST,,,,SELENF,K,521,3


In [50]:
# Data types 
join_df.dtypes

incident_num                 object
date_time                    object
day_of_week                   int64
address_number_primary        int64
address_dir_primary          object
address_road_primary         object
address_sfx_primary          object
address_dir_intersecting     object
address_road_intersecting    object
address_sfx_intersecting     object
call_type                    object
disposition                  object
beat                          int64
priority                      int64
dtype: object

In [51]:
from datetime import datetime

join_df["date_time"] = pd.to_datetime(join_df["date_time"])
join_df

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
0,E21060046805,2021-06-28 07:36:28,2,5200,,52ND,PL,,,,SELENF,O,826,3
1,E21060046806,2021-06-28 07:36:33,2,3400,,SPORTS ARENA,BLV,,,,459A,K,611,3
2,E21060046807,2021-06-28 07:37:42,2,6200,,MADELINE,ST,,,,586,O,821,2
3,E21060046808,2021-06-28 07:38:22,2,1700,,02ND,AVE,,,,FU,O,529,2
4,E21060046809,2021-06-28 07:40:14,2,9200,,AERO,DR,,,,COURT,K,311,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99806,E23030022386,2023-03-15 23:25:50,4,2500,,IMPERIAL,AVE,,,,INFO,K,512,2
99807,E23030022400,2023-03-15 23:44:05,4,800,,27TH (SB),ST,,,,5150,K,724,1
99808,E23030022401,2023-03-15 23:44:23,4,14700,,VALLE DEL SUR,CT,,,,459A,CAN,937,2
99809,E23030022404,2023-03-15 23:46:26,4,400,,17TH,ST,,,,SELENF,K,521,3


In [52]:
join_df.dtypes

incident_num                         object
date_time                    datetime64[ns]
day_of_week                           int64
address_number_primary                int64
address_dir_primary                  object
address_road_primary                 object
address_sfx_primary                  object
address_dir_intersecting             object
address_road_intersecting            object
address_sfx_intersecting             object
call_type                            object
disposition                          object
beat                                  int64
priority                              int64
dtype: object

In [53]:
join_df.sort_values(by="date_time", ascending = True)

Unnamed: 0,incident_num,date_time,day_of_week,address_number_primary,address_dir_primary,address_road_primary,address_sfx_primary,address_dir_intersecting,address_road_intersecting,address_sfx_intersecting,call_type,disposition,beat,priority
13411,E21010000001,2021-01-01 00:00:11,6,5000,,CHAPARRAL,WAY,,,,AU1,W,326,1
13412,E21010000002,2021-01-01 00:00:14,6,500,,05TH,AVE,,,,1186,CAN,523,3
13413,E21010000003,2021-01-01 00:00:26,6,3600,,38TH,ST,,,,AU1,DUP,839,1
13414,E21010000004,2021-01-01 00:00:31,6,2700,,WORDEN,ST,,,,AU1,W,613,1
13415,E21010000006,2021-01-01 00:01:20,6,800,,SAN DIEGO,PL,,,,INFO,W,121,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99806,E23030022386,2023-03-15 23:25:50,4,2500,,IMPERIAL,AVE,,,,INFO,K,512,2
99807,E23030022400,2023-03-15 23:44:05,4,800,,27TH (SB),ST,,,,5150,K,724,1
99808,E23030022401,2023-03-15 23:44:23,4,14700,,VALLE DEL SUR,CT,,,,459A,CAN,937,2
99809,E23030022404,2023-03-15 23:46:26,4,400,,17TH,ST,,,,SELENF,K,521,3


In [55]:
join_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168014 entries, 0 to 99810
Data columns (total 14 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   incident_num               1168014 non-null  object        
 1   date_time                  1168014 non-null  datetime64[ns]
 2   day_of_week                1168014 non-null  int64         
 3   address_number_primary     1168014 non-null  int64         
 4   address_dir_primary        55064 non-null    object        
 5   address_road_primary       1167818 non-null  object        
 6   address_sfx_primary        1067996 non-null  object        
 7   address_dir_intersecting   0 non-null        object        
 8   address_road_intersecting  196743 non-null   object        
 9   address_sfx_intersecting   0 non-null        object        
 10  call_type                  1166864 non-null  object        
 11  disposition                1164029 non-

Our first file contains 1,168,014 rows and 14 columns.</br>
Out of 14 columns 4 are int type, 8 are strings/object, and 1 is datetime type.</br>
Out of 14 columns 7 columns have null/missing values. 


In [56]:
# check null values
join_df.isna().sum()

incident_num                       0
date_time                          0
day_of_week                        0
address_number_primary             0
address_dir_primary          1112950
address_road_primary             196
address_sfx_primary           100018
address_dir_intersecting     1168014
address_road_intersecting     971271
address_sfx_intersecting     1168014
call_type                       1150
disposition                     3985
beat                               0
priority                           0
dtype: int64

In [57]:
from tqdm import tqdm

def Data_Quality_Report(df):

    #Initial table
    freqDF = pd.DataFrame(columns=['Feature',
                                   'Mode',
                                   'Mode Freq.',
                                   'Mode %',
                                   '2nd Mode',
                                   '2nd Mode Freq.',
                                   '2nd Mode %'])
    for col in tqdm(df.columns):
        try:
            #print(col)
            freq = df[col].value_counts()
            freqdf = freq.to_frame()
            fRow = freqdf.iloc[0]
            #try:
            secRow = freqdf.iloc[1]
            #except:
            #secRow = 0
            fPrct = fRow[0] / len(df[col])
            #try:
            secPrct = secRow[0] / len(df[col])
            #except:
                #secPrct = 0
            try:
                mode1 = int(fRow.name)
            except:
                mode1 = fRow.name
            try:
                mode2 = int(secRow.name)
            except:
                try:
                    mode2 = secRow.name
                except:
                    mode2 = 0
            freqDF = freqDF.append({'Feature':col,
                                    'Mode':mode1,
                                    'Mode Freq.':fRow[0],
                                    'Mode %':fPrct,\
                                    '2nd Mode':mode2,
                                    '2nd Mode Freq.':secRow[0],
                                    '2nd Mode %':secPrct},
                                    ignore_index=True)
        except:
            pass

    freqDF = freqDF.set_index('Feature')

    #Nulls, Counts, Cardinality
    NUllFeatures = round(df.isnull().sum() / df.shape[0],4)\
          .sort_values(ascending=False)
    Count = df.count()
    uni = df.nunique()

    #Formating
    NUllFeatures.to_frame(name="% Miss.")
    Count.to_frame(name="Count")
    uni.to_frame()
    result = pd.concat([Count, NUllFeatures,uni], axis=1)
    result.columns =["Count","% Miss.","Card."]
    result = pd.concat([result, freqDF], axis=1)
    """
    result = result.style.format({'% Miss.': "{:.1%}",
                         'Mode %': "{:.0%}",
                         '2nd Mode %': "{:.0%}",
                         'Count': "{:,}",
                         'Card.': "{:,}",
                         'Mode Freq.': "{:,}",
                        '2nd Mode Freq.': "{:,}"})"""
    return result

In [58]:
DQR_Calls = Data_Quality_Report(join_df)
DQR_Calls

100%|██████████| 14/14 [00:02<00:00,  4.92it/s]


Unnamed: 0,Count,% Miss.,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %
incident_num,1168014,0.0,1168014,E22100019997,1.0,8.561541e-07,E22100007472,1.0,8.561541e-07
date_time,1168014,0.0,1157922,2021-08-17 15:02:57,3.0,2.568462e-06,2021-09-04 20:14:47,3.0,2.568462e-06
day_of_week,1168014,0.0,7,5,175015.0,0.1498398,6,173878.0,0.1488664
address_number_primary,1168014,0.0,262,0,212027.0,0.1815278,1400,24613.0,0.02107252
address_dir_primary,55064,0.9529,9,W,19522.0,0.01671384,S,17418.0,0.01491249
address_road_primary,1167818,0.0002,12704,IMPERIAL,21385.0,0.01830886,05TH,18939.0,0.0162147
address_sfx_primary,1067996,0.0856,42,ST,348278.0,0.2981796,AVE,275917.0,0.2362275
address_dir_intersecting,0,1.0,0,,,,,,
address_road_intersecting,196743,0.8316,6234,MARKET,5374.0,0.004600972,UNIVERSITY,5072.0,0.004342414
address_sfx_intersecting,0,1.0,0,,,,,,


As we can see the `address_sfx_intersecting` and `address_dir_intersecting` have no values and we can drop these columns.</br>
Also the `address_dir_primary` and `address_road_intersecting` have more than 80% missing values; therefore we can consider removing these columns too. 

### Dropping columns with more than 60% missing values

In [64]:
# Removing unnecessary columns
def drop_columns_with_max_missing_values(df):
    mis_var = [x for x in df.columns if df[x].isnull().sum() > 0]
    df[mis_var].isnull().sum()

    limit = np.abs((df.shape[0] * 0.6))
    var_to_be_dropped = [x for x in mis_var if df[x].isnull().sum() > limit]
    print('Columns with more than 60% missing values: \n\n', var_to_be_dropped)

    df.drop(columns=var_to_be_dropped, axis=1, inplace=True)
    return df

df = drop_columns_with_max_missing_values(join_df)

Columns with more than 60% missing values: 

 ['address_dir_primary', 'address_dir_intersecting', 'address_road_intersecting', 'address_sfx_intersecting']


In [65]:
join_df.shape

(1168014, 10)

We dropped the unnecesssary columns from the dataset

In [66]:
# check duplicated values
join_df.duplicated().sum()

0

We don't have any duplicated values in dataset

# RIPA dataset

In [67]:
display(ripa_df)

Unnamed: 0,stop_id,ori,agency,exp_years,date_stop,time_stop,stopduration,stop_in_response_to_cfs,office_assignment_key,assignment,...,beat_name,pid,isstudent,perceived_limited_english,perceived_age,perceived_gender,gender_nonconforming,gend,gend_nc,perceived_lgbt
0,10000,CA0371100,SD,26,2018-07-15,19:31:37,5,0,1,"Patrol, traffic enforcement, field operations",...,Kearney Mesa 313,1,0,0,50,Male,0,1,,No
1,100000,CA0371100,SD,1,2019-02-03,09:00:47,10,1,1,"Patrol, traffic enforcement, field operations",...,Hillcrest 627,1,0,0,35,Male,0,1,,No
2,100000,CA0371100,SD,1,2019-02-03,09:00:47,10,1,1,"Patrol, traffic enforcement, field operations",...,Hillcrest 627,2,0,0,35,Male,0,1,,No
3,100001,CA0371100,SD,2,2019-02-03,08:02:21,40,1,1,"Patrol, traffic enforcement, field operations",...,Chollas Creek 827,1,0,0,60,Male,0,1,,No
4,100002,CA0371100,SD,1,2019-02-03,09:03:05,10,0,1,"Patrol, traffic enforcement, field operations",...,Pacific Beach 122,1,0,0,25,Female,0,2,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653596,456888,CA0371100,SD,1,2021-07-23,12:30:32,10,0,1,"Patrol, traffic enforcement, field operations",...,Midway District 611,1,0,0,25,Male,0,1,,No
653597,456889,CA0371100,SD,15,2021-07-23,09:56:00,10,0,1,"Patrol, traffic enforcement, field operations",...,Kensington 825,1,0,0,33,Female,0,2,,No
653598,45689,CA0371100,SD,12,2018-10-01,22:07:00,140,1,1,"Patrol, traffic enforcement, field operations",...,College East 327,1,0,0,32,Female,0,2,,No
653599,456890,CA0371100,SD,11,2021-07-23,09:10:00,12,0,1,"Patrol, traffic enforcement, field operations",...,Otay Mesa 713,1,0,0,45,Male,0,1,,No


Somtimes null values are not detectable in the dataframe, so there's a possibility that the empty values are not actually null values. Instead, they may be empty string (`''`), whitespace characters, or other non-null values. So, we decided to replace all empty strings in the dataframe with `NaN` values. 

In [72]:
ripa_df.replace('', np.nan, inplace=True)

In [77]:
ripa_df.head(5)

Unnamed: 0,stop_id,ori,agency,exp_years,date_stop,time_stop,stopduration,stop_in_response_to_cfs,office_assignment_key,assignment,...,beat_name,pid,isstudent,perceived_limited_english,perceived_age,perceived_gender,gender_nonconforming,gend,gend_nc,perceived_lgbt
0,10000,CA0371100,SD,26,2018-07-15,19:31:37,5,0,1,"Patrol, traffic enforcement, field operations",...,Kearney Mesa 313,1,0,0,50,Male,0,1,,No
1,100000,CA0371100,SD,1,2019-02-03,09:00:47,10,1,1,"Patrol, traffic enforcement, field operations",...,Hillcrest 627,1,0,0,35,Male,0,1,,No
2,100000,CA0371100,SD,1,2019-02-03,09:00:47,10,1,1,"Patrol, traffic enforcement, field operations",...,Hillcrest 627,2,0,0,35,Male,0,1,,No
3,100001,CA0371100,SD,2,2019-02-03,08:02:21,40,1,1,"Patrol, traffic enforcement, field operations",...,Chollas Creek 827,1,0,0,60,Male,0,1,,No
4,100002,CA0371100,SD,1,2019-02-03,09:03:05,10,0,1,"Patrol, traffic enforcement, field operations",...,Pacific Beach 122,1,0,0,25,Female,0,2,,No


In [80]:
ripa_df.columns

Index(['stop_id', 'ori', 'agency', 'exp_years', 'date_stop', 'time_stop',
       'stopduration', 'stop_in_response_to_cfs', 'office_assignment_key',
       'assignment', 'intersection', 'address_block', 'land_mark',
       'address_street', 'highway_exit', 'isschool', 'school_name',
       'address_city', 'beat', 'beat_name', 'pid', 'isstudent',
       'perceived_limited_english', 'perceived_age', 'perceived_gender',
       'gender_nonconforming', 'gend', 'gend_nc', 'perceived_lgbt'],
      dtype='object')

In [78]:
ripa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653601 entries, 0 to 653600
Data columns (total 29 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   stop_id                    653601 non-null  object
 1   ori                        653601 non-null  object
 2   agency                     653601 non-null  object
 3   exp_years                  653601 non-null  object
 4   date_stop                  653601 non-null  object
 5   time_stop                  653601 non-null  object
 6   stopduration               653601 non-null  object
 7   stop_in_response_to_cfs    653601 non-null  object
 8   office_assignment_key      653601 non-null  object
 9   assignment                 653601 non-null  object
 10  intersection               68880 non-null   object
 11  address_block              580059 non-null  object
 12  land_mark                  67 non-null      object
 13  address_street             625475 non-null  

### Check null values

In [81]:
missing_values = [x for x in ripa_df.columns if ripa_df[x].isnull().sum()>0]
ripa_df[missing_values].isnull().sum()

intersection                 584721
address_block                 73542
land_mark                    653534
address_street                28126
highway_exit                 648010
isschool                          2
school_name                  653132
address_city                      3
beat                              3
beat_name                         3
pid                               2
isstudent                         2
perceived_limited_english         2
perceived_age                     2
perceived_gender                206
gender_nonconforming              2
gend                              2
gend_nc                      653222
perceived_lgbt                    2
dtype: int64

In [82]:
DQR_RIPA = Data_Quality_Report(ripa_df)
DQR_RIPA

100%|██████████| 29/29 [00:04<00:00,  5.95it/s]


Unnamed: 0,Count,% Miss.,Card.,Mode,Mode Freq.,Mode %,2nd Mode,2nd Mode Freq.,2nd Mode %
stop_id,653601,0.0,574368,174011,52.0,8e-05,184085,48.0,7.3e-05
ori,653601,0.0,1,,,,,,
agency,653601,0.0,1,,,,,,
exp_years,653601,0.0,41,1,256905.0,0.393061,3,49912.0,0.076365
date_stop,653601,0.0,1645,2020-02-12,799.0,0.001222,2019-05-23,793.0,0.001213
time_stop,653601,0.0,83477,16:00:00,1772.0,0.002711,10:00:00,1492.0,0.002283
stopduration,653601,0.0,398,10,157652.0,0.241205,15,77367.0,0.11837
stop_in_response_to_cfs,653601,0.0,2,0,584869.0,0.894841,1,68732.0,0.105159
office_assignment_key,653601,0.0,10,1,610968.0,0.934772,10,20167.0,0.030855
assignment,653601,0.0,10,"Patrol, traffic enforcement, field operations",610968.0,0.934772,Other,20167.0,0.030855


### Dropping columns with more than 60% missing values

In [83]:
# Removing unnecessary columns
drop_columns_with_max_missing_values(ripa_df)

Columns with more than 60% missing values: 

 ['intersection', 'land_mark', 'highway_exit', 'school_name', 'gend_nc']


Unnamed: 0,stop_id,ori,agency,exp_years,date_stop,time_stop,stopduration,stop_in_response_to_cfs,office_assignment_key,assignment,...,beat,beat_name,pid,isstudent,perceived_limited_english,perceived_age,perceived_gender,gender_nonconforming,gend,perceived_lgbt
0,10000,CA0371100,SD,26,2018-07-15,19:31:37,5,0,1,"Patrol, traffic enforcement, field operations",...,313,Kearney Mesa 313,1,0,0,50,Male,0,1,No
1,100000,CA0371100,SD,1,2019-02-03,09:00:47,10,1,1,"Patrol, traffic enforcement, field operations",...,627,Hillcrest 627,1,0,0,35,Male,0,1,No
2,100000,CA0371100,SD,1,2019-02-03,09:00:47,10,1,1,"Patrol, traffic enforcement, field operations",...,627,Hillcrest 627,2,0,0,35,Male,0,1,No
3,100001,CA0371100,SD,2,2019-02-03,08:02:21,40,1,1,"Patrol, traffic enforcement, field operations",...,827,Chollas Creek 827,1,0,0,60,Male,0,1,No
4,100002,CA0371100,SD,1,2019-02-03,09:03:05,10,0,1,"Patrol, traffic enforcement, field operations",...,122,Pacific Beach 122,1,0,0,25,Female,0,2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
653596,456888,CA0371100,SD,1,2021-07-23,12:30:32,10,0,1,"Patrol, traffic enforcement, field operations",...,611,Midway District 611,1,0,0,25,Male,0,1,No
653597,456889,CA0371100,SD,15,2021-07-23,09:56:00,10,0,1,"Patrol, traffic enforcement, field operations",...,825,Kensington 825,1,0,0,33,Female,0,2,No
653598,45689,CA0371100,SD,12,2018-10-01,22:07:00,140,1,1,"Patrol, traffic enforcement, field operations",...,327,College East 327,1,0,0,32,Female,0,2,No
653599,456890,CA0371100,SD,11,2021-07-23,09:10:00,12,0,1,"Patrol, traffic enforcement, field operations",...,713,Otay Mesa 713,1,0,0,45,Male,0,1,No


Great! we dropped the columns that included more than 60% missing values. 

In [85]:
ripa_df.isnull().sum()

stop_id                          0
ori                              0
agency                           0
exp_years                        0
date_stop                        0
time_stop                        0
stopduration                     0
stop_in_response_to_cfs          0
office_assignment_key            0
assignment                       0
address_block                73542
address_street               28126
isschool                         2
address_city                     3
beat                             3
beat_name                        3
pid                              2
isstudent                        2
perceived_limited_english        2
perceived_age                    2
perceived_gender               206
gender_nonconforming             2
gend                             2
perceived_lgbt                   2
dtype: int64

### change the data types of the columns that should be an integer or float

In [75]:
integer_columns = ["stop_id", "exp_years", "stopduration", "stop_in_response_to_cfs", "office_assignment_key", "address_block", "isschool", "beat", "pid", "isstudent", "perceived_limited_english", "perceived_age", \
                   "gender_nonconforming", "gend", "gend_nc"]

In [76]:
# function to convert the data type of selected columns
def convert_columns_to_integer(df, columns):
    df[columns] = df[columns].astype(int)
    return df

In [None]:
convert_columns_to_integer(ripa_df, 

## **Basic Statistical Concepts**

* **Mean**: The mean is one of the measures of central tendency. Simply put, the mean is the average of the values in the given set. The observed values are totaled and divided by the total number of observations to determine the mean.
If $x_i$ is $i^{th}$ observation then mean of all $x_i$ ranging from $ 1\leq i\leq n $ denoted by $\bar x$ is given as

$$ \bar{x} = \sum_{i=1}^{n}\frac{x_i}{n} $$


* **Variance**: Variance is a measure of variation. It is calculated by averaging the squared deviations from the mean.
The degree of spread in your data set is indicated by variation. The greater the spread of the data, the greater the variance in proportion to the mean.
Here's the formula for variance of a sample.

$$S^2 = \frac{\sum_{i=1}^{n}(x_i-\bar x)^2}{n-1}$$


* **Standard Deviation**: The standard deviation is a measure that shows how much variation (such as spread, dispersion, and spread) exists from the mean. The standard deviation represents a "typical" departure from the mean. It is a popular measure of variability since it returns to the data set's original units of measurement.
Here's the formula for standard deviation of a sample.

$$S = \sqrt \frac{\sum_{i=1}^{n}(x_i-\bar x)^2}{n-1}$$