# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw CSV into `df`; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('../data/fatal_encounters.csv')

In [4]:
# (rows, columns) of the raw frame.
df.shape

(29275, 28)

In [5]:
# List the raw column names.
df.columns

Index(['Unique ID', 'Name', 'Age', 'Gender', 'Race', 'Race with imputations',
       'Imputation probability', 'URL of image (PLS NO HOTLINKS)',
       'Date of injury resulting in death (month/day/year)',
       'Location of injury (address)', 'Location of death (city)', 'State',
       'Location of death (zip code)', 'Location of death (county)',
       'Full Address', 'Latitude', 'Longitude', 'Agency or agencies involved',
       'Cause of death', 'Brief description',
       'Dispositions/Exclusions INTERNAL USE, NOT FOR ANALYSIS',
       'Intended use of force (Developing)', 'Supporting document link',
       'Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS',
       'Unnamed: 24', 'Unnamed: 25', 'Unique ID formula',
       'Unique identifier (redundant)'],
      dtype='object')

In [6]:
# Remove the two unnamed columns together with the redundant
# unique-identifier column.
df = df.drop(
    labels=['Unnamed: 24', 'Unnamed: 25', 'Unique identifier (redundant)'],
    axis='columns',
)

In [7]:
# Frequency of each cause-of-death label in the raw data.
df['Cause of death'].value_counts()

Gunshot                              20719
Vehicle                               6058
Tasered                                917
Medical emergency                      386
Asphyxiated/Restrained                 291
Drowned                                183
Drug overdose                          182
Beaten/Bludgeoned with instrument      176
Undetermined                           103
Fell from a height                      77
Other                                   58
Stabbed                                 49
Burned/Smoke inhalation                 37
Chemical agent/Pepper spray             34
Pursuit                                  2
Name: Cause of death, dtype: int64

In [7]:
# Frequency of each county of death.
df['Location of death (county)'].value_counts()

Los Angeles    1235
Cook            617
Harris          570
Maricopa        485
Orange          448
               ... 
Hoke              1
Norman            1
Cortland          1
Bosque            1
Lampasas          1
Name: Location of death (county), Length: 1500, dtype: int64

In [8]:
# Restrict the frame to the columns used by the rest of the cleaning steps.
df = df[[
    # identity / demographics
    'Unique ID',
    'Name',
    'Age',
    'Gender',
    'Race',
    'Race with imputations',
    'Imputation probability',
    # when and where
    'Date of injury resulting in death (month/day/year)',
    'Location of injury (address)',
    'Location of death (city)',
    'State',
    'Location of death (zip code)',
    'Location of death (county)',
    # circumstances
    'Agency or agencies involved',
    'Cause of death',
    'Brief description',
    'Intended use of force (Developing)',
]]

In [9]:
# Use the 'Unique ID' column as the row index (the column itself is removed
# from the data columns by set_index).
df = df.set_index('Unique ID')

In [10]:
# Examine the distribution of the reported 'Race' labels, including the
# 'Race unspecified' placeholder.
df['Race'].value_counts() 

European-American/White    9816
Race unspecified           8248
African-American/Black     6499
Hispanic/Latino            3912
Asian/Pacific Islander      447
Native American/Alaskan     302
Middle Eastern               50
Name: Race, dtype: int64

In [11]:
# Build a single "City, County" label. The result is NaN wherever either the
# city or the county is missing.
df['Location of Death (City,County)'] = (
    df['Location of death (city)']
    .add(', ')
    .add(df['Location of death (county)'])
)

In [12]:
# The separate city and county columns are now covered by the combined
# 'Location of Death (City,County)' column, so remove them.
df = df.drop(columns=['Location of death (city)', 'Location of death (county)'])

In [13]:
# Distribution of the imputed race labels, for comparison with the reported
# 'Race' column.
df['Race with imputations'].value_counts()

European-American/White    13808
African-American/Black      8018
Hispanic/Latino             4801
Race unspecified            1317
Asian/Pacific Islander       538
Native American/Alaskan      302
Middle Eastern                50
HIspanic/Latino                2
Name: Race with imputations, dtype: int64

In [14]:
# Same query as the earlier "Examine race" cell; repeated here directly after
# the imputed-race counts for side-by-side comparison.
df['Race'].value_counts()

European-American/White    9816
Race unspecified           8248
African-American/Black     6499
Hispanic/Latino            3912
Asian/Pacific Islander      447
Native American/Alaskan     302
Middle Eastern               50
Name: Race, dtype: int64

In [15]:
# Binarize gender: 'Male' -> 1, 'Female' -> 0.
# Series.map turns every label that is not a key of the dict into NaN.
df = df.assign(Gender=df['Gender'].map({'Male': 1, 'Female': 0}))

In [16]:
# Keep only rows whose cause of death is not 'Vehicle'. Other labels
# (including the separate 'Pursuit' label) are left untouched.
df = df[df['Cause of death'].ne('Vehicle')]

In [17]:
# Parse the injury date strings into pandas datetimes (format is inferred).
date_col = 'Date of injury resulting in death (month/day/year)'
df[date_col] = pd.to_datetime(df[date_col])

In [18]:
# Exclude rows whose intended-use-of-force label is 'Suicide'.
df = df[df['Intended use of force (Developing)'].ne('Suicide')]

In [19]:
# Drop a row when EITHER the reported race or the imputed race is
# 'Race unspecified'; only rows where both columns carry a real label remain.
df = df[~df[['Race', 'Race with imputations']].eq('Race unspecified').any(axis=1)]

In [20]:
# The imputation probability is not needed downstream; remove it.
df = df.drop('Imputation probability', axis='columns')

In [21]:
# Remove the imputed-race column; only the reported 'Race' column is kept.
# NOTE(review): the original note called the two columns "the same", but their
# value_counts shown earlier differ -- confirm they agree on the rows that
# remain after the 'Race unspecified' filter.
df = df.drop('Race with imputations', axis='columns')

In [22]:
# Remove the street address of the injury location.
df = df.drop('Location of injury (address)', axis='columns')

In [23]:
# Remove the column listing the agency or agencies involved.
df = df.drop('Agency or agencies involved', axis='columns')

In [24]:
# Remove the zip code; city/county and state already describe the location.
df = df.drop('Location of death (zip code)', axis='columns')

In [25]:
# Impute missing gender codes with 1 (male); per the author's note, over 95%
# of the data is male.
# This also applies to rows whose original label was neither 'Male' nor
# 'Female', because the earlier mapping left those as NaN.
df = df.fillna({'Gender': 1})

In [26]:
# Fill missing ages with 25, which the author identifies as the mode age.
# NOTE(review): 25 is hard-coded rather than computed from the data, and the
# dtypes listing below reports 'Age' as object, so this inserts an integer
# into a non-numeric column -- confirm the value and consider converting
# 'Age' to a numeric dtype first.
df['Age'] = df['Age'].fillna(25)

In [27]:
# Check the column dtypes after the drops and imputations.
df.dtypes

Name                                                          object
Age                                                           object
Gender                                                       float64
Race                                                          object
Date of injury resulting in death (month/day/year)    datetime64[ns]
State                                                         object
Cause of death                                                object
Brief description                                             object
Intended use of force (Developing)                            object
Location of Death (City,County)                               object
dtype: object

In [28]:
# Discard every row that still has a missing value in any column
# (fewer than 50 rows in total, per the author's note).
df = df.dropna()

In [29]:
# Distribution of the intended-use-of-force labels that remain after cleaning.
df['Intended use of force (Developing)'].value_counts()

Deadly force              13904
Less-than-lethal force     1300
No                          590
Undetermined                 37
Pursuit                       2
Name: Intended use of force (Developing), dtype: int64

In [30]:
# Re-check the dtypes after dropping the remaining null rows.
df.dtypes

Name                                                          object
Age                                                           object
Gender                                                       float64
Race                                                          object
Date of injury resulting in death (month/day/year)    datetime64[ns]
State                                                         object
Cause of death                                                object
Brief description                                             object
Intended use of force (Developing)                            object
Location of Death (City,County)                               object
dtype: object

In [30]:
# Encode race as an integer code. The three smallest groups share code 3.
# Labels that are not keys of this dict become NaN, so re-running this cell on
# the already-encoded column would blank it out.
df['Race'] = df['Race'].map({
    'European-American/White': 0,
    'African-American/Black': 1,
    'Hispanic/Latino': 2,
    'Asian/Pacific Islander': 3,
    'Native American/Alaskan': 3,
    'Middle Eastern': 3,
})

In [31]:
# Encode the intended use of force as an integer code in a new column:
#   0 = deadly force, 1 = less-than-lethal force,
#   2 = everything else ('No', 'Undetermined', 'Pursuit').
# Labels are stripped of surrounding whitespace before the lookup so that
# 'Undetermined' matches with or without a trailing space. The previous key was
# 'Undetermined ' (trailing space), which leaves an unpadded 'Undetermined'
# label unmapped, i.e. NaN.
df['Intended use of force'] = (
    df['Intended use of force (Developing)']
    .str.strip()
    .map({
        'Deadly force': 0,
        'Less-than-lethal force': 1,
        'No': 2,
        'Undetermined': 2,
        'Pursuit': 2,
    })
)

In [30]:
# Disabled one-hot encoding step (kept for reference; nothing in this cell runs).
# NOTE(review): the original header said "Dummify Race, State, and int", but the
# column list below is 'Cause of death', 'State' and
# 'Location of Death (City,County)'.
#df = pd.get_dummies(df, columns=['Cause of death', 'State',
                        #         'Location of Death (City,County)'])

In [32]:
# Cause-of-death distribution after cleaning; 'Vehicle' no longer appears.
df['Cause of death'].value_counts()

Gunshot                              13993
Tasered                                735
Medical emergency                      289
Asphyxiated/Restrained                 208
Drug overdose                          153
Beaten/Bludgeoned with instrument      140
Drowned                                111
Undetermined                            69
Other                                   48
Fell from a height                      33
Chemical agent/Pepper spray             21
Stabbed                                 17
Burned/Smoke inhalation                 16
Name: Cause of death, dtype: int64

In [35]:
# Write the cleaned frame to 'fatal_encounters_eda.csv' in the notebook's
# current working directory.
# index=False omits the index from the file, so the 'Unique ID' values that
# were moved into the index earlier are not written out.
# NOTE(review): confirm that dropping the ID is intended, and that the output
# belongs in the working directory rather than next to the input in '../data/'.
df.to_csv('fatal_encounters_eda.csv', index=False)