# Pre-EDA Cleaning for Scripts

In [4]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# Load the processed CSV. dtype=object disables pandas' per-column type
# inference, so every column arrives as generic Python objects and must be
# converted explicitly by the cleaning steps.
df = pd.read_csv('./processed_data/PROCESSED_0_to_5.csv', dtype=object)

In [7]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,ObjectId_x,StopID,stopDate,stopTime,stopDuration,stopInResponseToCFS,blockNumber,street,intersection,highwayExit,...,reasonForStopcode,reasonForStopCodeText,ObjectId_x.2,actionTakenDuringStop,personSearchConsentGiven,propertySearchConsentGiven,ObjectId_y.2,resultOfStop,resultOfStopcode,resultOfStopcodeText
0,1,5875,2019/01/01 00:00:00,2020/06/26 07:41:59,5,0,200.0,E First St,,,...,54153,21453(D) VC - PED AGAINST RED LIGHT/ETC (I) 54153,17,Curbside detention,,,47,Citation for infraction,54153.0,21453(D) VC - PED AGAINST RED LIGHT/ETC (I) 54153
1,2,5782,2019/01/01 00:00:00,2020/06/26 00:04:59,5,1,5900.0,Long Beach Blvd,,,...,38021,273(A) PC - ILLEGAL PAY FOR ADOPTION (M) 38021,3,Curbside detention,,,1,Field interview card completed,,
2,3,5876,2019/01/01 00:00:00,2020/06/26 18:17:00,4,0,6600.0,Butler Ave,,,...,54106,22350 VC - UNSAFE SPEED:PREVAIL COND (I) 54106,22,,,,51,No Action,,
3,4,5877,2019/01/01 00:00:00,2020/06/26 18:25:35,10,1,900.0,E anaheim,,,...,0,,24,,,,55,No Action,,
4,5,5785,2019/01/01 00:00:00,2020/06/26 01:04:00,16,0,4800.0,Vista street,,,...,65002,65002 ZZ - LOCAL ORDINANCE VIOL (I) 65002,8,,,,5,Citation for infraction,65000.0,65000 ZZ - LOCAL ORDINANCE VIOL (M) 65000


### Clean date and time columns:

In [17]:
def clean_date_time(df):
    """Convert the ``stopDate`` and ``stopTime`` columns of ``df`` in place.

    ``stopDate`` is parsed with ``pd.to_datetime`` and reduced to its calendar
    date (``datetime.date``); ``stopTime`` is parsed the same way and reduced
    to its time of day (``datetime.time``).

    The function is safe to call more than once on the same frame: a column
    whose non-null values are already ``date`` / ``time`` objects is left
    untouched. Without that guard a second call fails, because
    ``pd.to_datetime`` cannot parse ``datetime.time`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``stopDate`` and ``stopTime`` columns. Modified in
        place.

    Returns
    -------
    None
    """
    # Local import: the notebook's import cell does not bring in ``datetime``.
    import datetime

    def _convert(column, target_type, accessor):
        values = df[column]
        present = values.dropna()
        # Exact type check on purpose: datetime.datetime / Timestamp are
        # subclasses of datetime.date and still need to be reduced.
        if len(present) > 0 and all(type(v) is target_type for v in present):
            return values
        return getattr(pd.to_datetime(values).dt, accessor)

    df['stopDate'] = _convert('stopDate', datetime.date, 'date')
    df['stopTime'] = _convert('stopTime', datetime.time, 'time')

In [22]:
# Convert stopDate / stopTime on the shared frame. The function assigns the
# columns on df itself and returns nothing, so there is no output here.
clean_date_time(df)

In [23]:
# Re-display the first rows to check the converted date and time columns.
df.head()

Unnamed: 0,ObjectId_x,StopID,stopDate,stopTime,stopDuration,stopInResponseToCFS,blockNumber,street,intersection,highwayExit,...,reasonForStopcode,reasonForStopCodeText,ObjectId_x.2,actionTakenDuringStop,personSearchConsentGiven,propertySearchConsentGiven,ObjectId_y.2,resultOfStop,resultOfStopcode,resultOfStopcodeText
0,1,5875,2019-01-01,07:41:59,5,0,200.0,E First St,,,...,54153,21453(D) VC - PED AGAINST RED LIGHT/ETC (I) 54153,17,Curbside detention,,,47,Citation for infraction,54153.0,21453(D) VC - PED AGAINST RED LIGHT/ETC (I) 54153
1,2,5782,2019-01-01,00:04:59,5,1,5900.0,Long Beach Blvd,,,...,38021,273(A) PC - ILLEGAL PAY FOR ADOPTION (M) 38021,3,Curbside detention,,,1,Field interview card completed,,
2,3,5876,2019-01-01,18:17:00,4,0,6600.0,Butler Ave,,,...,54106,22350 VC - UNSAFE SPEED:PREVAIL COND (I) 54106,22,,,,51,No Action,,
3,4,5877,2019-01-01,18:25:35,10,1,900.0,E anaheim,,,...,0,,24,,,,55,No Action,,
4,5,5785,2019-01-01,01:04:00,16,0,4800.0,Vista street,,,...,65002,65002 ZZ - LOCAL ORDINANCE VIOL (I) 65002,8,,,,5,Citation for infraction,65000.0,65000 ZZ - LOCAL ORDINANCE VIOL (M) 65000


### Remove excess ObjectId columns:

In [24]:
def remove_objectId(df):
    """Drop, in place, every column of ``df`` whose label contains 'ObjectId'.

    The match is a regex search over the column labels, so suffixed variants
    such as ``ObjectId_x`` or ``ObjectId_y.2`` are removed as well. A frame
    with no matching columns is left unchanged. Returns ``None``.
    """
    object_id_labels = df.filter(regex='ObjectId').columns.tolist()
    df.drop(columns=object_id_labels, inplace=True)

In [25]:
# Drop every column whose name contains 'ObjectId' from df (in place), then
# preview the slimmer frame.
remove_objectId(df)
df.head()

Unnamed: 0,StopID,stopDate,stopTime,stopDuration,stopInResponseToCFS,blockNumber,street,intersection,highwayExit,landMark,...,reasonForStop,reasonForStopDetail,reasonForStopcode,reasonForStopCodeText,actionTakenDuringStop,personSearchConsentGiven,propertySearchConsentGiven,resultOfStop,resultOfStopcode,resultOfStopcodeText
0,5875,2019-01-01,07:41:59,5,0,200.0,E First St,,,,...,Traffic Violation,Moving Violation,54153,21453(D) VC - PED AGAINST RED LIGHT/ETC (I) 54153,Curbside detention,,,Citation for infraction,54153.0,21453(D) VC - PED AGAINST RED LIGHT/ETC (I) 54153
1,5782,2019-01-01,00:04:59,5,1,5900.0,Long Beach Blvd,,,,...,Reasonable Suspicion,Other Reasonable Suspicion of a crime,38021,273(A) PC - ILLEGAL PAY FOR ADOPTION (M) 38021,Curbside detention,,,Field interview card completed,,
2,5876,2019-01-01,18:17:00,4,0,6600.0,Butler Ave,,,,...,Traffic Violation,Moving Violation,54106,22350 VC - UNSAFE SPEED:PREVAIL COND (I) 54106,,,,No Action,,
3,5877,2019-01-01,18:25:35,10,1,900.0,E anaheim,,,,...,Knowledge of outstanding arrest warrant/wanted...,,0,,,,,No Action,,
4,5785,2019-01-01,01:04:00,16,0,4800.0,Vista street,,,,...,Reasonable Suspicion,Other Reasonable Suspicion of a crime,65002,65002 ZZ - LOCAL ORDINANCE VIOL (I) 65002,,,,Citation for infraction,65000.0,65000 ZZ - LOCAL ORDINANCE VIOL (M) 65000


In [26]:
# List the column labels that remain on df.
df.columns

Index(['StopID', 'stopDate', 'stopTime', 'stopDuration', 'stopInResponseToCFS',
       'blockNumber', 'street', 'intersection', 'highwayExit', 'landMark',
       'city', 'isSchool', 'schoolName', 'PID', 'isStudent', 'perceivedAge',
       'perceivedGender', 'genderNonconforming', 'perceivedLgbt',
       'perceivedLimitedEnglish', 'perceivedRace',
       'perceivedOrKnownDisability', 'reasonForStop', 'reasonForStopDetail',
       'reasonForStopcode', 'reasonForStopCodeText', 'actionTakenDuringStop',
       'personSearchConsentGiven', 'propertySearchConsentGiven',
       'resultOfStop', 'resultOfStopcode', 'resultOfStopcodeText'],
      dtype='object')

### Drop other unneeded columns:

In [27]:
def drop_others(df, columns=None):
    """Drop columns that are not needed downstream from ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    columns : list of str, optional
        Column labels to drop. Defaults to ``highwayExit``, ``isSchool``,
        ``schoolName`` and ``isStudent``.

    Returns
    -------
    None

    Notes
    -----
    Labels that are not present in ``df`` are skipped rather than raising
    ``KeyError``, so the function can be re-run on an already-cleaned frame.
    """
    if columns is None:
        columns = ['highwayExit', 'isSchool', 'schoolName', 'isStudent']
    # errors='ignore' makes a repeated call a no-op instead of a KeyError.
    df.drop(columns=list(columns), inplace=True, errors='ignore')

In [28]:
# Drop the unneeded columns from df in place, then list what is left.
drop_others(df)
df.columns

Index(['StopID', 'stopDate', 'stopTime', 'stopDuration', 'stopInResponseToCFS',
       'blockNumber', 'street', 'intersection', 'landMark', 'city', 'PID',
       'perceivedAge', 'perceivedGender', 'genderNonconforming',
       'perceivedLgbt', 'perceivedLimitedEnglish', 'perceivedRace',
       'perceivedOrKnownDisability', 'reasonForStop', 'reasonForStopDetail',
       'reasonForStopcode', 'reasonForStopCodeText', 'actionTakenDuringStop',
       'personSearchConsentGiven', 'propertySearchConsentGiven',
       'resultOfStop', 'resultOfStopcode', 'resultOfStopcodeText'],
      dtype='object')

In [37]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82543 entries, 0 to 82542
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   StopID                      82543 non-null  object
 1   stopDate                    82543 non-null  object
 2   stopTime                    82543 non-null  object
 3   stopDuration                82543 non-null  object
 4   stopInResponseToCFS         82543 non-null  object
 5   blockNumber                 52070 non-null  object
 6   street                      64978 non-null  object
 7   intersection                30895 non-null  object
 8   landMark                    10 non-null     object
 9   city                        82543 non-null  object
 10  PID                         82543 non-null  object
 11  perceivedAge                82543 non-null  object
 12  perceivedGender             82117 non-null  object
 13  genderNonconforming         82543 non-null  ob

In [39]:
# Preview the first five rows of the cleaned frame.
df.head(5)

Unnamed: 0,StopID,stopDate,stopTime,stopDuration,stopInResponseToCFS,blockNumber,street,intersection,landMark,city,...,reasonForStop,reasonForStopDetail,reasonForStopcode,reasonForStopCodeText,actionTakenDuringStop,personSearchConsentGiven,propertySearchConsentGiven,resultOfStop,resultOfStopcode,resultOfStopcodeText
0,5875,2019-01-01,07:41:59,5,0,200.0,E First St,,,LONG BEACH,...,Traffic Violation,Moving Violation,54153,21453(D) VC - PED AGAINST RED LIGHT/ETC (I) 54153,Curbside detention,,,Citation for infraction,54153.0,21453(D) VC - PED AGAINST RED LIGHT/ETC (I) 54153
1,5782,2019-01-01,00:04:59,5,1,5900.0,Long Beach Blvd,,,LONG BEACH,...,Reasonable Suspicion,Other Reasonable Suspicion of a crime,38021,273(A) PC - ILLEGAL PAY FOR ADOPTION (M) 38021,Curbside detention,,,Field interview card completed,,
2,5876,2019-01-01,18:17:00,4,0,6600.0,Butler Ave,,,LONG BEACH,...,Traffic Violation,Moving Violation,54106,22350 VC - UNSAFE SPEED:PREVAIL COND (I) 54106,,,,No Action,,
3,5877,2019-01-01,18:25:35,10,1,900.0,E anaheim,,,LONG BEACH,...,Knowledge of outstanding arrest warrant/wanted...,,0,,,,,No Action,,
4,5785,2019-01-01,01:04:00,16,0,4800.0,Vista street,,,LONG BEACH,...,Reasonable Suspicion,Other Reasonable Suspicion of a crime,65002,65002 ZZ - LOCAL ORDINANCE VIOL (I) 65002,,,,Citation for infraction,65000.0,65000 ZZ - LOCAL ORDINANCE VIOL (M) 65000


In [44]:
# Count perceivedRace once per distinct (StopID, PID, perceivedRace)
# combination instead of once per row.
# NOTE(review): presumably the same person appears on several rows because of
# the merged action/result tables -- confirm against the upstream join.
df[['StopID', 'PID', 'perceivedRace']].drop_duplicates()['perceivedRace'].value_counts()

Hispanic                         16716
Black                            12472
White                            10972
Asian                             3150
Middle Eastern or South Asian     1804
Pacific Islander                  1700
Native American                   1301
Name: perceivedRace, dtype: int64