# Data Cleaning and Feature Engineering

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

df=pd.read_csv('Terry_Stops.csv')
df.head()

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,26 - 35,-1,20150000001920,36224,Field Contact,,7595,1978,M,White,...,05:17:00,-,-,-,,N,N,-,-,-
1,-,-1,20140000120677,92317,Arrest,,7500,1984,M,Black or African American,...,11:32:00,-,-,-,SOUTH PCT 1ST W - ROBERT,N,N,South,O,O2
2,-,-1,20150000001463,28806,Field Contact,,5670,1965,M,White,...,07:59:00,-,-,-,,N,N,-,-,-
3,-,-1,20150000001516,29599,Field Contact,,4844,1961,M,White,...,19:12:00,-,-,-,,N,-,-,-,-
4,-,-1,20150000001670,32260,Field Contact,,7539,1963,M,White,...,04:55:00,-,-,-,,N,N,-,-,-


In [2]:
#First drop columns that are not useful for prediction
df.drop(columns=['GO / SC Num', 'Terry Stop ID', 'Arrest Flag', 'Frisk Flag', 'Sector', 
                 'Beat', 'Reported Time', 'Initial Call Type', 'Final Call Type', 'Officer Squad'], inplace=True)

In [3]:
df

Unnamed: 0,Subject Age Group,Subject ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Reported Date,Call Type,Precinct
0,26 - 35,-1,Field Contact,,7595,1978,M,White,Asian,Male,2015-04-17T00:00:00,-,-
1,-,-1,Arrest,,7500,1984,M,Black or African American,Asian,Male,2015-10-16T00:00:00,-,South
2,-,-1,Field Contact,,5670,1965,M,White,-,-,2015-03-19T00:00:00,-,-
3,-,-1,Field Contact,,4844,1961,M,White,White,Male,2015-03-21T00:00:00,-,-
4,-,-1,Field Contact,,7539,1963,M,White,-,-,2015-04-01T00:00:00,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48503,56 and Above,25788807481,Arrest,-,8450,1992,M,Hispanic or Latino,White,Male,2021-06-26T00:00:00,911,East
48504,56 and Above,25793767417,Field Contact,-,8808,1990,F,White,White,Male,2021-06-29T00:00:00,-,South
48505,56 and Above,25797226393,Field Contact,-,8739,1987,M,Two or More Races,Black or African American,Male,2021-06-30T00:00:00,ONVIEW,North
48506,56 and Above,25931661234,Field Contact,-,8517,1990,M,White,White,Male,2021-07-14T00:00:00,-,-


#### Subject Age Group

Subjects are binned into perceived age groups. Because of the small number of null values, I will remove these rows from the dataset.

In [4]:
df['Subject Age Group'].value_counts()

26 - 35         16165
36 - 45         10342
18 - 25          9672
46 - 55          6275
56 and Above     2468
1 - 17           1990
-                1596
Name: Subject Age Group, dtype: int64

In [5]:
df = df[df['Subject Age Group'] != '-']

#### Subject ID

This column indicates whether or not the subject presented identification to the officer. If ID was presented, the subject is identified according to their ID number in the police database. If ID is not presented, this value is recorded as -1. For simplicity, I will convert this column to boolean--True (1) indicating that the subject presented identification, and False (0) meaning they did not.

In [6]:
df.loc[df['Subject ID'] > 0, 'Subject ID'] = 1

In [7]:
df.loc[df['Subject ID'] < 0, 'Subject ID'] = 0

#### Stop Resolution

'Offense Report' and 'Citation/Infraction' will be condensed into a single category, 'Citation', and 'Referred for Prosecution' will be rolled into the 'Arrest' category. I will also rename 'Field Contact' to 'No Action' to make it more clear what the result of the stop was.

In [8]:
df['Stop Resolution'].value_counts()

Field Contact               18817
Offense Report              15367
Arrest                      11834
Referred for Prosecution      719
Citation / Infraction         175
Name: Stop Resolution, dtype: int64

In [9]:
df['Stop Resolution'].replace('Referred for Prosecution', 'Arrest', inplace=True)
df['Stop Resolution'].replace('Citation / Infraction', 'Offense Report', inplace=True)

df['Stop Resolution'].replace('Field Contact', 'No Action', inplace=True)
df['Stop Resolution'].replace('Offense Report', 'Citation', inplace=True)

df.loc[df['Stop Resolution'] == 'No Action', 'Actionable Stop'] = 0
df.loc[df['Stop Resolution'] != 'No Action', 'Actionable Stop'] = 1

In [10]:
df['Actionable Stop'].value_counts()

1.0    28095
0.0    18817
Name: Actionable Stop, dtype: int64

#### Weapon Type

Rather than separate by weapon type, I will create a column to track whether or not a weapon was found during the stop.

In [11]:
df['Weapon Type'].value_counts()

None                                    31601
-                                       12476
Lethal Cutting Instrument                1456
Knife/Cutting/Stabbing Instrument         694
Handgun                                   287
Firearm Other                              93
Blunt Object/Striking Implement            91
Club, Blackjack, Brass Knuckles            48
Firearm                                    44
Mace/Pepper Spray                          31
Other Firearm                              26
Firearm (unk type)                         15
Taser/Stun Gun                             10
Club                                        9
None/Not Applicable                         9
Fire/Incendiary Device                      7
Rifle                                       6
Shotgun                                     3
Automatic Handgun                           2
Personal Weapons (hands, feet, etc.)        2
Blackjack                                   1
Brass Knuckles                    

In [12]:
df.loc[df['Weapon Type'] == 'None', 'Weapon'] = 0
df.loc[df['Weapon Type'] != 'None', 'Weapon'] = 1
df.loc[df['Weapon Type'] == '-', 'Weapon'] = 0

In [13]:
df.drop(columns=['Weapon Type'], inplace=True)

In [14]:
df['Weapon'].value_counts()

0.0    44077
1.0     2835
Name: Weapon, dtype: int64

#### Officer YOB

I chose to bin the data based on generation to simplify categorization.

In [15]:
df['Officer YOB'].value_counts()

1986    3280
1987    2969
1984    2703
1991    2671
1992    2456
1985    2445
1990    2295
1988    2135
1989    2026
1982    1837
1983    1685
1979    1498
1993    1498
1981    1417
1995    1272
1971    1184
1978    1145
1976    1007
1977     998
1994     920
1973     906
1980     814
1996     706
1967     705
1968     590
1970     564
1969     549
1974     535
1975     523
1997     472
1962     449
1964     432
1972     414
1965     412
1963     236
1958     215
1961     206
1966     180
1959     167
1960     128
1998      57
1954      44
1957      43
1953      33
1900      31
1955      21
1956      17
1948      10
1949       5
1952       4
1946       2
1951       1
Name: Officer YOB, dtype: int64

In [16]:
df['Officer Age Group'] = pd.cut(df['Officer YOB'],[1899, 1945, 1964, 1980, 1997, 2000], labels = ['Pre-Boomer','Baby Boomer', 'Gen X', 'Millenial', 'Gen Z'])

In [17]:
df['Officer Age Group'].value_counts()

Millenial      32787
Gen X          12024
Baby Boomer     2013
Gen Z             57
Pre-Boomer        31
Name: Officer Age Group, dtype: int64

In [18]:
df.drop(columns=['Officer YOB'], inplace=True)

#### Officer Gender

Convert to 1/0 binary where 1 = male and 0 = female.

In [19]:
df['Officer Gender'].replace('F', '0', inplace=True)
df['Officer Gender'].replace('M', '1', inplace=True)
df['Officer Gender'].replace('N', '0', inplace=True)

In [20]:
df['Officer Gender'].value_counts()

1    41503
0     5409
Name: Officer Gender, dtype: int64

#### Officer Race, Subject Perceived Race, and Subject Perceived Gender

Each of these features are using multiple responses to classify unknown or not reported data. For each feature, I have condensed categories to use a single category for unknown values.

In [21]:
df['Officer Race'].value_counts()

White                            35429
Two or More Races                 2715
Hispanic or Latino                2701
Asian                             2020
Black or African American         1803
Not Specified                     1460
Nat Hawaiian/Oth Pac Islander      440
American Indian/Alaska Native      313
Unknown                             31
Name: Officer Race, dtype: int64

In [22]:
df['Officer Race'].replace('Unknown', 'Not Specified', inplace=True)

In [23]:
df['Subject Perceived Race'].value_counts()

White                                        23273
Black or African American                    14062
Unknown                                       2506
Hispanic                                      1659
Asian                                         1558
-                                             1471
American Indian or Alaska Native              1365
Multi-Racial                                   801
Other                                          150
Native Hawaiian or Other Pacific Islander       67
Name: Subject Perceived Race, dtype: int64

In [24]:
df['Subject Perceived Race'].replace('-', 'Unknown', inplace=True)

In [25]:
df['Subject Perceived Gender'].value_counts()

Male                                                         37210
Female                                                        9553
Unable to Determine                                            108
Unknown                                                         19
-                                                               16
Gender Diverse (gender non-conforming and/or transgender)        6
Name: Subject Perceived Gender, dtype: int64

In [26]:
df['Subject Perceived Gender'].replace('-', 'Unknown', inplace=True)

#### Reported Date

Using this column I engineer several features -- year, day of the week, and month -- to help track trends over time.

In [27]:
#convert column to datetime to allow date-based calculations
df['Reported Date'] = pd.to_datetime(df['Reported Date'])

In [28]:
#create day of week column and map numeric values to day names
day_map={0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday',5: 'Saturday', 6: 'Sunday'} 
df['Day of Week']=df['Reported Date'].dt.weekday.map(day_map)

#create year column
df['Year'] = df['Reported Date'].dt.year

#create month column and map numeric values to month names
month_map = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 
             7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
df['Month'] = df['Reported Date'].dt.month.map(month_map)

In [29]:
df.drop(columns='Reported Date', inplace=True)

#### Call Type

In [30]:
df['Call Type'].value_counts()

911                              21400
-                                12773
ONVIEW                            9072
TELEPHONE OTHER, NOT 911          3311
ALARM CALL (NOT POLICE ALARM)      346
TEXT MESSAGE                         9
SCHEDULED EVENT (RECURRING)          1
Name: Call Type, dtype: int64

In [31]:
df['Call Type'].replace('-', 'No Call', inplace=True)
df['Call Type'].replace('ALARM CALL (NOT POLICE ALARM)', 'Other Call (Not 911)', inplace=True)
df['Call Type'].replace('TEXT MESSAGE', 'Other Call (Not 911)', inplace=True)
df['Call Type'].replace('SCHEDULED EVENT (RECURRING)', 'Other Call (Not 911)', inplace=True)
df['Call Type'].replace('TELEPHONE OTHER, NOT 911', 'Other Call (Not 911)', inplace=True)

#### Precinct

In [32]:
df['Precinct'].value_counts()

West         11578
North        10341
-             9544
East          6194
South         5601
Southwest     2244
SouthWest     1171
Unknown        183
OOJ             35
FK ERROR        21
Name: Precinct, dtype: int64

In [33]:
df['Precinct'].replace('-', 'Unknown', inplace=True)
df['Precinct'].replace('SouthWest', 'Southwest', inplace=True)

In [34]:
df = df[df['Precinct'] != 'OOJ']

In [35]:
df = df[df['Precinct'] != 'FK ERROR']

#### Review Updated Data

At this point, I believe the data is clean enough to begin the base model.

In [36]:
df.head()

Unnamed: 0,Subject Age Group,Subject ID,Stop Resolution,Officer ID,Officer Gender,Officer Race,Subject Perceived Race,Subject Perceived Gender,Call Type,Precinct,Actionable Stop,Weapon,Officer Age Group,Day of Week,Year,Month
0,26 - 35,0,No Action,7595,1,White,Asian,Male,No Call,Unknown,0.0,0.0,Gen X,Friday,2015,April
1218,1 - 17,0,No Action,7726,1,White,White,Female,No Call,Unknown,0.0,0.0,Millenial,Sunday,2015,May
1598,1 - 17,0,Arrest,7715,1,White,American Indian or Alaska Native,Male,No Call,East,1.0,0.0,Millenial,Sunday,2015,September
1599,1 - 17,0,No Action,7745,0,Not Specified,Unknown,Male,No Call,Unknown,0.0,0.0,Millenial,Sunday,2015,April
1600,1 - 17,0,No Action,7634,1,White,Black or African American,Male,No Call,Unknown,0.0,0.0,Gen X,Sunday,2015,April


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46856 entries, 0 to 48507
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Subject Age Group         46856 non-null  object  
 1   Subject ID                46856 non-null  int64   
 2   Stop Resolution           46856 non-null  object  
 3   Officer ID                46856 non-null  object  
 4   Officer Gender            46856 non-null  object  
 5   Officer Race              46856 non-null  object  
 6   Subject Perceived Race    46856 non-null  object  
 7   Subject Perceived Gender  46856 non-null  object  
 8   Call Type                 46856 non-null  object  
 9   Precinct                  46856 non-null  object  
 10  Actionable Stop           46856 non-null  float64 
 11  Weapon                    46856 non-null  float64 
 12  Officer Age Group         46856 non-null  category
 13  Day of Week               46856 non-null  obje

In [38]:
df.drop(columns=['Officer ID', 'Precinct'], inplace=True)

In [39]:
#save updated data to csv file
df.to_csv('Cleaned Data.csv')

In [40]:
df = df.loc[df['Officer ID'] != '-']
df = df.loc[df['Officer ID'] != '-9']
#df = df.loc[df['Precinct'] != 'Unknown']

KeyError: 'Officer ID'

In [None]:
precinct_df = df.loc[:, ['Officer ID', 'Precinct']]

In [None]:
precinct_df.sort_values('Officer ID')

In [None]:
precinct_df['Precinct'].replace('Unknown', np.nan, inplace=True)

In [None]:
precinct_df.info()

In [None]:
df.info()