In [1]:
import pandas as pd
# Display floats with 2 decimal places. The fully qualified option key is used
# because the short pattern 'precision' is ambiguous in pandas releases that
# also define 'styler.format.precision', where set_option raises OptionError.
pd.set_option('display.precision', 2)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the 'ggplot' matplotlib style to figures created in this notebook.
plt.style.use('ggplot')
# Ask the inline backend for high-resolution ('retina') figure output.
%config InlineBackend.figure_format = 'retina'
# Colour lists:
#   colors      - the colour cycle of the active style
#   colors_     - 8 colours sampled at evenly spaced positions of the 'Set2' colormap
#   colors_year - 4 colours from 'magma', sampled at 0.875, 0.75, 0.625 and 0.5
#                 (5 points in [0.5, 1], reversed, first one dropped)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
colors_ = plt.get_cmap('Set2')(np.linspace(0, 1, 8))
colors_year = plt.get_cmap('magma')(np.linspace(0.5, 1, 5)[::-1][1:])

from IPython.core.pylabtools import figsize
from IPython.display import display
# Default figure size, given as (width, height).
figsize(8, 5)

# watermark records the environment; autoreload mode 2 re-imports modules before
# each cell runs, so edits to the local modules imported below are picked up
# without restarting the kernel.
%load_ext watermark
%load_ext autoreload
%autoreload 2
%matplotlib inline

%watermark -d -t -u -v -g -r -b -iv -a "Hongsup Shin" 

Author: Hongsup Shin

Last updated: 2021-04-09 23:53:17

Python implementation: CPython
Python version       : 3.7.10
IPython version      : 7.20.0

Git hash: 54d4c1c8c0b377662add0b8bc25aeb66852598af

Git repo: https://github.com/texas-justice-initiative/officer_involved_shooting.git

Git branch: create_annual_report

matplotlib: 3.4.1
numpy     : 1.20.2
seaborn   : 0.11.1
pandas    : 1.2.3



In [2]:
# loading custom modules
import preprocess
import plot
from preprocess import Preprocess

In [3]:
# Load the interim census table (county / race, 2010, per the file name).
# NOTE(review): unpickling can execute arbitrary code -- only load pickles
# produced by this project.
df_census = pd.read_pickle('../Data/Interim/census_county_race_2010.pkl')
# The table's index is reused below as the reference list of valid county names.
correct_county_names = df_census.index

In [4]:
# Quick look at the reference county names.
print(correct_county_names)

Index(['ANDERSON', 'ANDREWS', 'ANGELINA', 'ARANSAS', 'ARCHER', 'ARMSTRONG',
       'ATASCOSA', 'AUSTIN', 'BAILEY', 'BANDERA',
       ...
       'WILLACY', 'WILLIAMSON', 'WILSON', 'WINKLER', 'WISE', 'WOOD', 'YOAKUM',
       'YOUNG', 'ZAPATA', 'ZAVALA'],
      dtype='object', name='County', length=254)


# Preprocessing OIS data with `preprocess.Preprocess`
## Civilian data

Loading the raw data

In [5]:
# Raw civilian records (April 2021 export, per the file name).
df_cd = pd.read_csv('../Data/Raw/Website/tji_civilians-shot_Apr2021.csv')

In [6]:
# Pair the raw civilian table with the reference county names it is checked against.
preprocessor = Preprocess(df_cd, correct_county_names)

In [7]:
# get_civilian_data() raises ValueError when the data contains county names that
# are not in the reference list. The error is the point of this cell, so catch
# and display it instead of letting it abort a Restart & Run All; the offending
# names are corrected in the following cells.
try:
    preprocessor.get_civilian_data()
except ValueError as err:
    print(f'{type(err).__name__}: {err}')
else:
    print('All county names are valid.')

ValueError: Incorrect county names exist: {'COLIN', 'QUAY (NM)'}

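`Preprocess` checks every county name in the data against the census county list. The `ValueError` above names two entries that are not in that list, so we need to fix them before preprocessing.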

In [8]:
# How many records carry the out-of-state 'QUAY (NM)' county label?
df_cd['incident_county'].eq('QUAY (NM)').sum()

1

Quay County is in New Mexico, not Texas, so we remove that record from the data (one incident).

In [9]:
# Drop the New Mexico record. The explicit .copy() makes df_cd an independent
# frame rather than a slice of the raw table, so later column assignments do
# not trigger SettingWithCopyWarning.
df_cd = df_cd.loc[df_cd['incident_county'] != 'QUAY (NM)', :].copy()

For "COLIN", the correct spelling is "COLLIN". Let's fix this.

In [10]:
# Correct the misspelt county name. Series.replace matches whole values only, so
# a name that merely contains 'COLIN' is left alone (str.replace would rewrite
# substrings). assign() returns a new frame instead of writing into a slice,
# which is what triggered SettingWithCopyWarning.
df_cd = df_cd.assign(
    incident_county=df_cd['incident_county'].replace('COLIN', 'COLLIN')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [11]:
# Rebuild the preprocessor on the corrected table (the earlier instance was
# created from the uncorrected data) and run the civilian preprocessing.
preprocessor = Preprocess(df_cd, correct_county_names)
df_civilian_preprocessed = preprocessor.get_civilian_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_datetime(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [12]:
# Preview the first rows of the preprocessed civilian table.
df_civilian_preprocessed.head()

Unnamed: 0,date_ag_received,date_incident,incident_address,incident_city,incident_county,incident_zip,incident_result_of,incident_call_other,civilian_name_first,civilian_name_last,...,month,died,Traffic Stop,Emergency/Request for Assistance,Execution of a Warrant,Hostage/Barricade/Other Emergency,Other,civilian_age_binned,delay_days,delay_bin_label
66,NaT,2016-01-02,2000 LAMAR STREET,SWEETWATER,NOLAN,79556.0,EMERGENCY CALL OR REQUEST FOR ASSISTANCE,"SUSPECT INVOLVED IN HOMICIDE IN NOLAN COUNTY, ...",LANCE,LIGHT,...,1,True,0,1,0,0,0,5,,-1
67,NaT,2016-01-03,FM 2932 & FM 741,UNINCORPORATED,KAUFMAN,75126.0,OTHER - SPECIFY TYPE OF CALL,OFF DUTY - ATTEMPTED TO ASSIST MOTORIST,CODY,WESBERRY,...,1,False,0,0,0,0,1,2,,-1
68,NaT,2016-01-04,1142 BURGER STREET,ABILENE,TAYLOR,79603.0,EMERGENCY CALL OR REQUEST FOR ASSISTANCE,,DEREK LEE,CASEY,...,1,False,0,1,0,0,0,3,,-1
69,NaT,2016-01-17,5639 OLD HWY 90 W,SAN ANTONIO,BEXAR,78227.0,EMERGENCY CALL OR REQUEST FOR ASSISTANCE,,ASHTON LANE,MORRIS,...,1,True,0,1,0,0,0,3,,-1
70,NaT,2016-01-19,1100 RIVERWOOD,DALLAS,DALLAS,75217.0,OTHER - SPECIFY TYPE OF CALL,SUSPICIOUS PERSON,GARY WAYNE,JONES,...,1,True,0,0,0,0,1,4,,-1


In [13]:
# (rows, columns) of the preprocessed civilian table.
df_civilian_preprocessed.shape

(873, 154)

In [14]:
# List every column name of the civilian table in sorted order.
print(np.sort(df_civilian_preprocessed.columns))

['Emergency/Request for Assistance' 'Execution of a Warrant'
 'Hostage/Barricade/Other Emergency' 'Other' 'Traffic Stop'
 'agency_city_1' 'agency_city_10' 'agency_city_11' 'agency_city_2'
 'agency_city_3' 'agency_city_4' 'agency_city_5' 'agency_city_6'
 'agency_city_7' 'agency_city_8' 'agency_city_9' 'agency_county_1'
 'agency_county_10' 'agency_county_2' 'agency_county_3' 'agency_county_4'
 'agency_county_5' 'agency_county_6' 'agency_county_7' 'agency_county_8'
 'agency_county_9' 'agency_email_person_filling_out_1'
 'agency_email_person_filling_out_10' 'agency_email_person_filling_out_11'
 'agency_email_person_filling_out_2' 'agency_email_person_filling_out_3'
 'agency_email_person_filling_out_4' 'agency_email_person_filling_out_5'
 'agency_email_person_filling_out_6' 'agency_email_person_filling_out_7'
 'agency_email_person_filling_out_8' 'agency_email_person_filling_out_9'
 'agency_name_1' 'agency_name_10' 'agency_name_11' 'agency_name_2'
 'agency_name_3' 'agency_name_4' 'agency_nam

## Officer data

Loading the raw data

In [16]:
# Raw officer records (April 2021 export, per the file name).
df_os = pd.read_csv('../Data/Raw/Website/tji_officers-Shot_Apr2021.csv')

In [17]:
# Re-bind `preprocessor` to the officer table; the civilian instance is no longer needed.
preprocessor = Preprocess(df_os, correct_county_names)

In [18]:
# Run the officer-specific preprocessing on the officer table.
df_officer_preprocessed = preprocessor.get_officer_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['died'] = self.df[death_injury_col_name]=='DEATH'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [19]:
# (rows, columns) of the preprocessed officer table.
df_officer_preprocessed.shape

(167, 52)

In [20]:
# List every column name of the officer table in sorted order.
print(np.sort(df_officer_preprocessed.columns))

['agency_city_1' 'agency_city_2' 'agency_county_1' 'agency_county_2'
 'agency_email_person_filling_out_1' 'agency_email_person_filling_out_2'
 'agency_name_1' 'agency_name_2' 'agency_name_person_filling_out_1'
 'agency_name_person_filling_out_2' 'agency_report_date_1'
 'agency_report_date_2' 'agency_zip_1' 'agency_zip_2' 'civilian_age_1'
 'civilian_age_2' 'civilian_age_3' 'civilian_gender_1' 'civilian_gender_2'
 'civilian_gender_3' 'civilian_harm' 'civilian_name_first_1'
 'civilian_name_first_2' 'civilian_name_first_3' 'civilian_name_last_1'
 'civilian_name_last_2' 'civilian_name_last_3' 'civilian_race_1'
 'civilian_race_2' 'civilian_race_3' 'civilian_suicide' 'date_ag_received'
 'date_incident' 'delay_bin_label' 'delay_days' 'died' 'incident_address'
 'incident_city' 'incident_county' 'incident_zip' 'media_link_1'
 'media_link_2' 'media_link_3' 'month' 'num_civilians_recorded'
 'officer_age' 'officer_gender' 'officer_harm' 'officer_name_first'
 'officer_name_last' 'officer_race' 'year

Save the preprocessed datasets

In [21]:
# Persist both preprocessed tables as pickles.
# NOTE(review): to_pickle does not create missing directories -- ../Data/Preprocessed/
# must already exist. Also confirm the "20162020" suffix still describes the
# date range of the April 2021 export.
df_civilian_preprocessed.to_pickle('../Data/Preprocessed/civilian_preprocessed_20162020.pkl')
df_officer_preprocessed.to_pickle('../Data/Preprocessed/officer_preprocessed_20162020.pkl')