## Preprocessing & City/Neighborhood Encoding
Version 1.5

Felix Chen - flora-concise
* Felix wrote the vast majority of the code; the only exception is the first two lines (the imports).

The notebook version of the `preprocess.py` file

In [2]:
import numpy as np
import pandas as pd

# Install openpyxl as needed
# ^ No longer needed since we are doing .csv now

In [3]:
# The original Excel workbook was exported to .csv beforehand for convenience.
# errors='replace' substitutes any undecodable bytes instead of raising while reading.
with open('police_data.csv', mode='r', errors='replace') as pol_datafile:
    # Column dtypes are left to pandas' inference (no dtype='object' override)
    pol_df = pd.read_csv(pol_datafile)

pol_df

Unnamed: 0,name,age,gender,race,victim_image,date,street_address,city,state,zip,...,prosecutor_head,prosecutor_race,prosecutor_gender,prosecutor_party,prosecutor_term,prosecutor_in_court,prosecutor_special,independent_investigation,prosecutor_url,Unnamed: 61
0,Steven Espinoza,36.0,Male,Hispanic,https://i0.wp.com/iecn.com/wp-content/uploads/...,1/12/2025,N Mountain Ave and 11th St,Upland,CA,91786,...,,,,,,,,,,
1,Jose Evans,42.0,Male,Hispanic,https://wgntv.com/wp-content/uploads/sites/5/2...,1/12/2025,8500 block of Cermak Rd,North Riverside,IL,60546,...,,,,,,,,,,
2,"Benjamin Prowell, Jr.",34.0,Male,Black,https://cache.legacy.net/legacy/images/cobrand...,1/11/2025,10000 block of Crystal Hill Rd,Maumelle,AR,72113,...,,,,,,,,,,
3,Brian Rolstad,43.0,Male,Unknown race,,1/11/2025,900 block of W 23rd St,Los Angeles,CA,90007,...,,,,,,,,,,
4,Devin Shields,23.0,Male,Unknown race,,1/11/2025,2300 block of Waverly Dr,Gary,IN,46404,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14016,Mark Chavez,49.0,Male,Hispanic,http://www.tricitytribuneusa.com/wp-content/up...,1/1/2013,912 Loma Linda Ave.,Farmington,NM,87401,...,,,,,,,,,,
14017,Tyree Bell,31.0,Male,Black,http://content.omaha.com/media/maps/ps/2013/ja...,1/1/2013,3727 N. 42nd St.,Omaha,NE,68111,...,,,,,,,,,,
14018,Andrew L. Closson,21.0,Male,White,http://www.superiortelegram.com/sites/default/...,1/1/2013,U.S. Highway 53,Gordon,WI,54838,...,,,,,,,,,,
14019,Abel Gurrola,26.0,Male,Hispanic,http://www.bakersfieldnow.com/news/local/A-26-...,1/1/2013,720 Terrace Way,Bakersfield,CA,93304,...,,,,,,,,,,


In [4]:
# Count the missing entries in each column
null_sum = pol_df.isna().sum()
null_sum

name                             0
age                            515
gender                           2
race                             0
victim_image                  5633
                             ...  
prosecutor_in_court          13997
prosecutor_special           13858
independent_investigation    13979
prosecutor_url               13893
Unnamed: 61                  14021
Length: 62, dtype: int64

In [5]:
# Drop columns in which more than 30% of the rows are missing.
# The column labels are taken from null_sum itself (instead of positionally masking
# pol_df.columns), and errors='ignore' lets this cell be re-run safely once the
# columns have already been removed.
missing_threshold = len(pol_df) * 0.3
sparse_cols = null_sum[null_sum > missing_threshold].index
pol_df = pol_df.drop(columns=sparse_cols, errors='ignore')
pol_df.shape, pol_df.columns

((14021, 38),
 Index(['name', 'age', 'gender', 'race', 'date', 'street_address', 'city',
        'state', 'zip', 'county', 'agency_responsible', 'ori', 'cause_of_death',
        'circumstances', 'disposition_official', 'officer_charged', 'news_urls',
        'signs_of_mental_illness', 'allegedly_armed', 'wapo_armed',
        'wapo_threat_level', 'wapo_flee', 'geography', 'encounter_type',
        'initial_reason', 'call_for_service', 'tract',
        'hhincome_median_census_tract', 'latitude', 'longitude',
        'pop_total_census_tract', 'pop_white_census_tract',
        'pop_black_census_tract', 'pop_native_american_census_tract',
        'pop_asian_census_tract', 'pop_pacific_islander_census_tract',
        'pop_other_multiple_census_tract', 'pop_hispanic_census_tract'],
       dtype='object'))

In [6]:
# Inspect the dtype of each column that remains after dropping the sparse ones
pol_df.dtypes

name                                  object
age                                  float64
gender                                object
race                                  object
date                                  object
street_address                        object
city                                  object
state                                 object
zip                                   object
county                                object
agency_responsible                    object
ori                                   object
cause_of_death                        object
circumstances                         object
disposition_official                  object
officer_charged                       object
news_urls                             object
signs_of_mental_illness               object
allegedly_armed                       object
wapo_armed                            object
wapo_threat_level                     object
wapo_flee                             object
geography 

### Categorical data

**Columns needing extra care**

In [7]:
# 'city' is made all lower-case here
pol_df['city'] = pol_df['city'].str.lower()

# 'tract' contains an extraneous trailing '.0' from an erroneous conversion.
# The pattern is anchored and the dot escaped so that ONLY a trailing '.0' is removed;
# an unanchored '.0' would also eat digits inside values such as '9501.02', and as a
# regex the bare '.' would match any character.
pol_df['tract'] = pol_df['tract'].astype('string').str.replace(r'\.0$', '', regex=True)

In [8]:
# 'cause_of_death' is conceptually a comma-separated list (e.g. "Gunshot, Taser").
# It is deliberately kept as a single string per row so it can be searched later;
# splitting it into a real list would be: .str.split(', ')
pol_df = pol_df.astype({'cause_of_death': 'string'})

# Rows with a missing cause are not dropped here. If that becomes necessary, see
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
# pol_df.dropna(subset=['cause_of_death'])

pol_df['cause_of_death']

0                                         Gunshot
1                                         Gunshot
2                                  Gunshot, Taser
3                                         Gunshot
4                                         Gunshot
                           ...                   
14016                                     Gunshot
14017                                     Gunshot
14018                                     Gunshot
14019                                     Gunshot
14020    Taser, Beaten, Physical Restraint, Other
Name: cause_of_death, Length: 14021, dtype: string

In [None]:
# Checking for type
# https://stackoverflow.com/questions/152580/whats-the-canonical-way-to-check-for-type-in-python
# NOTE: intentionally disabled. The loop below only collects items from list-valued
# entries, and 'cause_of_death' is kept as a plain string (the split is commented out),
# so it would collect nothing.

# causes = set()
# for lst in pol_df['cause_of_death']:
#     if isinstance(lst, list):
#         for item in lst:
#             causes.add(item)
# causes

set()

**All categorical columns, except those that need to be kept as strings**

In [10]:
# Label-like columns are converted to pandas' 'category' dtype, one mapping entry per column
cat_cols = [
    'gender', 'race', 'city', 'state',
    'zip', 'county', 'agency_responsible', 'tract',
]
pol_df = pol_df.astype(dict.fromkeys(cat_cols, 'category'))
pol_df[cat_cols]

Unnamed: 0,gender,race,city,state,zip,county,agency_responsible,tract
0,Male,Hispanic,upland,CA,91786,San Bernardino,Upland Police Department,823
1,Male,Hispanic,north riverside,IL,60546,Cook,North Riverside Police Department,817900
2,Male,Black,maumelle,AR,72113,Pulaski,Maumelle Police Department,4307
3,Male,Unknown race,los angeles,CA,90007,Los Angeles,Los Angeles Police Department,224410
4,Male,Unknown race,gary,IN,46404,Lake,Gary Police Department,10203
...,...,...,...,...,...,...,...,...
14016,Male,Hispanic,farmington,NM,87401,San Juan,Farmington Police Department,302
14017,Male,Black,omaha,NE,68111,Douglas,Omaha Police Department,5901
14018,Male,White,gordon,WI,54838,Douglas,Douglas County Sheriff's Office,30302
14019,Male,Hispanic,bakersfield,CA,93304,Kern,Bakersfield Police Department,2600


In [11]:
# Confirm that every column listed in cat_cols now has the 'category' dtype
pol_df[cat_cols].dtypes

gender                category
race                  category
city                  category
state                 category
zip                   category
county                category
agency_responsible    category
tract                 category
dtype: object

### Numerical data

In [12]:
# Coordinates are held as float64, age as float32
float64_cols = ['latitude', 'longitude']
float32_cols = ['age']
numeric_dtypes = {
    **dict.fromkeys(float64_cols, 'float64'),
    **dict.fromkeys(float32_cols, 'float32'),
}
pol_df = pol_df.astype(numeric_dtypes)

# Side-by-side view of just the numeric columns, for inspection
temp_nums = pd.concat([pol_df[float64_cols], pol_df[float32_cols]], axis=1)
temp_nums

Unnamed: 0,latitude,longitude,age
0,34.103326,-117.670186,36.0
1,41.850411,-87.835193,42.0
2,34.818509,-92.371546,34.0
3,34.033391,-118.280079,43.0
4,41.596159,-87.366104,23.0
...,...,...,...
14016,36.739514,-108.201029,49.0
14017,41.293110,-95.975491,31.0
14018,46.245268,-91.801031,21.0
14019,35.350508,-119.011261,26.0


In [13]:
# Confirm the dtypes of the numeric columns after casting
temp_nums.dtypes

latitude     float64
longitude    float64
age          float32
dtype: object

In [14]:
# Pair each row's coordinates into a (latitude, longitude) tuple
# https://stackoverflow.com/questions/9758450/pandas-convert-dataframe-to-array-of-tuples/34551914#34551914

pol_df['lat_long'] = list(zip(pol_df['latitude'], pol_df['longitude']))

# Alternative (produces lists rather than tuples):
# pol_df['lat_long'] = pol_df[['latitude', 'longitude']].values.tolist()

pol_df['lat_long']

0        (34.1033261, -117.6701864)
1          (41.850411, -87.8351933)
2         (34.8185095, -92.3715459)
3        (34.0333907, -118.2800794)
4          (41.5961595, -87.366104)
                    ...            
14016      (36.739514, -108.201029)
14017        (41.29311, -95.975491)
14018       (46.245268, -91.801031)
14019      (35.350508, -119.011261)
14020       (44.164679, -94.007459)
Name: lat_long, Length: 14021, dtype: object

In [15]:
# Drop rows without usable coordinates.
# 'lat_long' holds tuples, and a tuple is never treated as null -- not even (nan, nan) --
# so dropna on 'lat_long' cannot remove anything. The check is made on the underlying
# 'latitude' / 'longitude' columns instead.
# The index is renumbered afterwards so it stays a gap-free 0..n-1 range for later cells.
pol_df = pol_df.dropna(subset=['latitude', 'longitude']).reset_index(drop=True)

pol_df.shape

(14021, 39)

### DateTime data

In [16]:
# Convert to DateTime type
# 'date' is split on '/' into its month / day / year parts.
series_date = pol_df['date'].astype('string').str.split('/')

# The parts frame is built on pol_df's own index. Without index=..., it would get a fresh
# 0..n-1 index, and the index-aligned assignment and join below would attach dates to the
# wrong rows (or produce NaT) whenever pol_df's index has gaps, e.g. after rows were dropped.
df_date = pd.DataFrame(series_date.tolist(),
                       columns=['month', 'day', 'year'],
                       index=pol_df.index)

# to_datetime assembles a datetime from the 'year' / 'month' / 'day' columns
pol_df['date'] = pd.to_datetime(df_date)

# Keep the individual parts as extra columns (they remain strings, as produced by the split)
pol_df = pol_df.join(df_date)

pol_df[['date', 'month', 'day', 'year']]

Unnamed: 0,date,month,day,year
0,2025-01-12,1,12,2025
1,2025-01-12,1,12,2025
2,2025-01-11,1,11,2025
3,2025-01-11,1,11,2025
4,2025-01-11,1,11,2025
...,...,...,...,...
14016,2013-01-01,1,1,2013
14017,2013-01-01,1,1,2013
14018,2013-01-01,1,1,2013
14019,2013-01-01,1,1,2013


### String/Textual data

In [17]:
# Free-text fields are stored with pandas' dedicated 'string' dtype
str_cols = ['name', 'street_address']
pol_df = pol_df.astype(dict.fromkeys(str_cols, 'string'))

pol_df[str_cols]

Unnamed: 0,name,street_address
0,Steven Espinoza,N Mountain Ave and 11th St
1,Jose Evans,8500 block of Cermak Rd
2,"Benjamin Prowell, Jr.",10000 block of Crystal Hill Rd
3,Brian Rolstad,900 block of W 23rd St
4,Devin Shields,2300 block of Waverly Dr
...,...,...
14016,Mark Chavez,912 Loma Linda Ave.
14017,Tyree Bell,3727 N. 42nd St.
14018,Andrew L. Closson,U.S. Highway 53
14019,Abel Gurrola,720 Terrace Way


### Current state of cleaned `pol_df` dataframe

In [18]:
# Display the dataframe as it stands after the preprocessing steps above
pol_df

Unnamed: 0,name,age,gender,race,date,street_address,city,state,zip,county,...,pop_black_census_tract,pop_native_american_census_tract,pop_asian_census_tract,pop_pacific_islander_census_tract,pop_other_multiple_census_tract,pop_hispanic_census_tract,lat_long,month,day,year
0,Steven Espinoza,36.0,Male,Hispanic,2025-01-12,N Mountain Ave and 11th St,upland,CA,91786,San Bernardino,...,,,,,,,"(34.1033261, -117.6701864)",1,12,2025
1,Jose Evans,42.0,Male,Hispanic,2025-01-12,8500 block of Cermak Rd,north riverside,IL,60546,Cook,...,,,,,,,"(41.850411, -87.8351933)",1,12,2025
2,"Benjamin Prowell, Jr.",34.0,Male,Black,2025-01-11,10000 block of Crystal Hill Rd,maumelle,AR,72113,Pulaski,...,,,,,,,"(34.8185095, -92.3715459)",1,11,2025
3,Brian Rolstad,43.0,Male,Unknown race,2025-01-11,900 block of W 23rd St,los angeles,CA,90007,Los Angeles,...,,,,,,,"(34.0333907, -118.2800794)",1,11,2025
4,Devin Shields,23.0,Male,Unknown race,2025-01-11,2300 block of Waverly Dr,gary,IN,46404,Lake,...,,,,,,,"(41.5961595, -87.366104)",1,11,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14016,Mark Chavez,49.0,Male,Hispanic,2013-01-01,912 Loma Linda Ave.,farmington,NM,87401,San Juan,...,0%,10%,1%,0%,1%,15%,"(36.739514, -108.201029)",1,1,2013
14017,Tyree Bell,31.0,Male,Black,2013-01-01,3727 N. 42nd St.,omaha,NE,68111,Douglas,...,59%,2%,4%,0%,7%,1%,"(41.29311, -95.975491)",1,1,2013
14018,Andrew L. Closson,21.0,Male,White,2013-01-01,U.S. Highway 53,gordon,WI,54838,Douglas,...,1%,2%,1%,0%,1%,2%,"(46.245268, -91.801031)",1,1,2013
14019,Abel Gurrola,26.0,Male,Hispanic,2013-01-01,720 Terrace Way,bakersfield,CA,93304,Kern,...,22%,1%,0%,0%,2%,68%,"(35.350508, -119.011261)",1,1,2013


In [19]:
# Dtype of every column after the preprocessing steps above
pol_df.dtypes

name                                 string[python]
age                                         float32
gender                                     category
race                                       category
date                                 datetime64[ns]
street_address                       string[python]
city                                       category
state                                      category
zip                                        category
county                                     category
agency_responsible                         category
ori                                          object
cause_of_death                       string[python]
circumstances                                object
disposition_official                         object
officer_charged                              object
news_urls                                    object
signs_of_mental_illness                      object
allegedly_armed                              object
wapo_armed  