# Clean the Texas OIS dataset for analysis -- officers shot data

* Inputs:
   * `OIS.xlsx`
   * `agencies_and_counties.csv` (used to add county information)
* Output: `shot_officers.csv`

##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
CLEANED_FILENAME = 'shot_officers.csv'
DTW_OIS_PROJECT = 'tji/officer-involved-shootings'

In [2]:
import os
import sys
import boto3
import datadotworld as dw
import numpy as np
import pandas as pd

from lib.cleaning_tools import *
from lib.standardize_police_agency_names import standardize_agency_name

from io import StringIO

sys.path.append(os.getcwd() + '/../data_cleaning')

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -r -g -w -p datadotworld,numpy,pandas

Everett Wetchler 2018-12-07 12:23:14 PST

datadotworld 1.6.0
numpy 1.14.5
pandas 0.23.3
Git hash: d828d03ea088fc7efe460b37be10039266097a99
Git repo: git@github.com:texas-justice-initiative/data-processing.git
watermark 1.6.1


In [3]:
dtw_datasets = dw.load_dataset('tji/auxiliary-datasets', force_update=True)
agencies = dtw_datasets.dataframes['agencies_and_counties']
agencies.head()

Unnamed: 0,agency,county
0,ANDERSON CO CONST PCT 1,ANDERSON
1,ANDERSON CO CONST PCT 2,ANDERSON
2,ANDERSON CO CONST PCT 3,ANDERSON
3,ANDERSON CO CONST PCT 4,ANDERSON
4,ANDERSON CO DIST ATTY OFFICE,ANDERSON


In [4]:
shootings = read_dtw_excel('tji/raw-and-processing', 'original/OIS.xlsx')['OfficersShot']
print("%d OIS officers-shot incidents from %s to %s" % (
    len(shootings),
    shootings['Date of Incident'].min().strftime('%Y-%m-%d'),
    shootings['Date of Incident'].max().strftime('%Y-%m-%d')))
shootings.head()

Writing excel file to temp file: /var/folders/dc/8cbxbsh515s908xl0zyprszm0000gn/T/tmpjo4cj9ay
89 OIS officers-shot incidents from 2015-10-06 to 2018-11-01


Unnamed: 0,No.,Number of Reports Filed,Date OAG Received,Date of Report 1,Name of Agency 1,City of Agency 1,Zip code of Agency 1,Date of Incident,Name of Person 1 Filling out Form,Email Address of Person 1 Filling out Form,Date of Report 2,Name of Agency 2,City of Agency 2,Zip code of Agency 2,Name of Person 2 Filling out form,Email Address of Person 2 Filling out Form,Officer First Name,Officer Last Name,Injured or Deceased's Gender,Injured or Deceased's Age,Injured or Deceased's Race/Ethnicity,Street Address of Incident,City of Incident,County of Incident,Zip Code of Incident,Incident Resulted In 1,Incident Resulted In 2,Non-Peace Officer's Gender 1,Non-Peace Officer's Age 1,Non-Peace Officer's Race/Ethnicity 1,Non-Officer First Name 1,Non-Officer Last Name 1,Non-Peace Officer's Gender 2,Non-Peace Officer's Age 2,Non-Peace Officer's Race/Ethnicity 2,Non-Officer First Name 2,Non-Officer Last Name 2,Non-Peace Officer's Gender 3,Non-Peace Officer's Age 3,Non-Peace Officer's Race/Ethnicity 3,Non-Officer First Name 3,Non-Officer Last Name 3,Media,Column1,Column2
0,1,1,NaT,10/26/2015,Seguin Police Department,Seguin,78155,2015-10-06,MC Meyers,mcmeyers@seguintexas.gov,,,,,,,,,Female,54.0,Anglo or White,815 Lamar Drive,Seguin,Guadalupe,78155,Injury of Peace Officer,,Female,54,Anglo or White,,,,,,,,,,,,,,,
1,2,1,NaT,11/3/2015,Bexar County Sheriff's Office,San Antonio,78207,2015-10-30,Det. Frank Stubbs #4048,sstubbs@bexar.org,,,,,,,Joseph,Canales,Male,42.0,Hispanic or Latino,8000 Midcrown,San Antonio,Bexar,78218,Injury of Peace Officer,,Male,36,Black or African American,Sherman,Robinson,,,,,,,,,,,MySA,News4SanAntonio,
2,3,1,2017-02-21,2/21/2017,Marlin Police Department,Marlin,76661,2015-11-01,Damien Eaglin,chiefeaglin@marlintx.net,,,,,,,Darrell,Allen,Male,42.0,Black or African American,301 W. Avenue B,Temple,Bell County,76501,Death of Peace Officer,,Male,Not Available,Not Available,,,,,,,,,,,,,Waco Tribune-Herald,,
3,4,1,NaT,12/29/2015,Odessa Police Department,Odessa,79761,2015-12-23,Sgt. David Lara,dlara@odessa-tx.gov,,,,,,,Pete,Gonzales,Male,42.0,Hispanic or Latino,1025 Custer Ave.,Odessa,Ector,79761,Injury of Peace Officer,,Male,27,Hispanic or Latino,Roy Daniel,Garza,,,,,,,,,,,CBS 7,Big Country,
4,5,1,NaT,12/29/2015,Odessa Police Department,Odessa,79761,2015-12-23,Sgt. David Lara,dlara@odessa-tx.gov,,,,,,,Cory,Wester,Male,38.0,Anglo or White,1025 Custer Ave.,Odessa,Ector,79761,Injury of Peace Officer,,Male,27,Hispanic or Latino,Roy Daniel,Garza,,,,,,,,,,,CBS 7,Big Country,


## 2. Begin cleaning

In [5]:
# Normalize column names
shootings.columns = [c.strip().lower() for c in shootings.columns]
# Normalize string cell values
upcase_strip_string_cells(shootings)
# Drop unnecessary columns
shootings.drop(['no.', 'number of reports filed'], axis=1, inplace=True)

### Replace 'Not Available' string values with true missing values

In [6]:
for c in shootings.columns:
    if shootings[c].dtype == 'object':
        shootings[c][shootings[c] == 'NOT AVAILABLE'] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### Make column names more machine-friendly

In [7]:
col_renames = {
    "date oag received": "date_ag_received",
    "date of incident": "date_incident",
    "officer first name": "officer_name_first",
    "officer last name": "officer_name_last",
    "injured or deceased's gender": "officer_gender",
    "injured or deceased's age": "officer_age",
    "injured or deceased's race/ethnicity": "officer_race",
    "street address of incident": "incident_address",
    "city of incident": "incident_city",
    "county of incident": "incident_county",
    "zip code of incident": "incident_zip",
    "incident resulted in 1": "incident_result_1",
    "incident resulted in 2": "incident_result_2",
    "media": "media_link_1",
    "column1": "media_link_2",
    "column2": "media_link_3",
}

agency_col_renames = {
    "date of report %d": "agency_report_date_%d",
    "name of agency %d": "agency_name_%d",
    "city of agency %d": "agency_city_%d",
    "zip code of agency %d": "agency_zip_%d",
    "name of person %d filling out form": "agency_name_person_filling_out_%d",
    "email address of person %d filling out form": "agency_email_person_filling_out_%d",
}

for i in range(1, 3):
    for k, v in agency_col_renames.items():
        k = k % i
        v = v % i
        col_renames[k] = v


civilian_col_renames = {
    "non-peace officer's gender %d": "civilian_gender_%d",
    "non-peace officer's age %d": "civilian_age_%d",
    "non-peace officer's race/ethnicity %d": "civilian_race_%d",
    "non-officer first name %d": "civilian_name_first_%d",
    "non-officer last name %d": "civilian_name_last_%d",
}

for i in range(1, 4):
    for k, v in civilian_col_renames.items():
        k = k % i
        v = v % i
        col_renames[k] = v

shootings.columns = [col_renames[c] for c in shootings.columns]

### Add a column for the total number of civilians involved.

In [8]:
individual_civ_cols = [c for c in shootings.columns if c.startswith('civilian_') and c[-1].isdigit()]
numbers = set([int(c.rsplit('_', 1)[1]) for c in individual_civ_cols])
print("Data for up to %d civilians is recorded" % len(numbers))

Data for up to 3 civilians is recorded


In [9]:
# Note: Sometimes not all columns for an involved civilian are filled out.
# E.g. there is a name but no race, etc.
# To know if "civilian 1" is involved, we need to check ALL civilian_X_1
# columns, and count them if ANY of these are non-null.
temp_cols = []
for i in sorted(numbers):
    col = 'temp_civ_data_exists_%d' % i
    temp_cols.append(col)
    civ_cols = [c for c in individual_civ_cols if int(c.rsplit('_', 1)[1]) == i]
    shootings[col] = shootings[civ_cols].notnull().any(axis=1)

shootings['num_civilians_recorded'] = shootings[temp_cols].sum(axis=1)
shootings.drop(temp_cols, axis=1, inplace=True)
shootings.head(15)

Unnamed: 0,date_ag_received,agency_report_date_1,agency_name_1,agency_city_1,agency_zip_1,date_incident,agency_name_person_filling_out_1,agency_email_person_filling_out_1,agency_report_date_2,agency_name_2,agency_city_2,agency_zip_2,agency_name_person_filling_out_2,agency_email_person_filling_out_2,officer_name_first,officer_name_last,officer_gender,officer_age,officer_race,incident_address,incident_city,incident_county,incident_zip,incident_result_1,incident_result_2,civilian_gender_1,civilian_age_1,civilian_race_1,civilian_name_first_1,civilian_name_last_1,civilian_gender_2,civilian_age_2,civilian_race_2,civilian_name_first_2,civilian_name_last_2,civilian_gender_3,civilian_age_3,civilian_race_3,civilian_name_first_3,civilian_name_last_3,media_link_1,media_link_2,media_link_3,num_civilians_recorded
0,NaT,10/26/2015,SEGUIN POLICE DEPARTMENT,SEGUIN,78155,2015-10-06,MC MEYERS,MCMEYERS@SEGUINTEXAS.GOV,,,,,,,,,FEMALE,54.0,ANGLO OR WHITE,815 LAMAR DRIVE,SEGUIN,GUADALUPE,78155,INJURY OF PEACE OFFICER,,FEMALE,54.0,ANGLO OR WHITE,,,,,,,,,,,,,,,,1
1,NaT,11/3/2015,BEXAR COUNTY SHERIFF'S OFFICE,SAN ANTONIO,78207,2015-10-30,DET. FRANK STUBBS #4048,SSTUBBS@BEXAR.ORG,,,,,,,JOSEPH,CANALES,MALE,42.0,HISPANIC OR LATINO,8000 MIDCROWN,SAN ANTONIO,BEXAR,78218,INJURY OF PEACE OFFICER,,MALE,36.0,BLACK OR AFRICAN AMERICAN,SHERMAN,ROBINSON,,,,,,,,,,,MYSA,NEWS4SANANTONIO,,1
2,2017-02-21,2/21/2017,MARLIN POLICE DEPARTMENT,MARLIN,76661,2015-11-01,DAMIEN EAGLIN,CHIEFEAGLIN@MARLINTX.NET,,,,,,,DARRELL,ALLEN,MALE,42.0,BLACK OR AFRICAN AMERICAN,301 W. AVENUE B,TEMPLE,BELL COUNTY,76501,DEATH OF PEACE OFFICER,,MALE,,,,,,,,,,,,,,,WACO TRIBUNE-HERALD,,,1
3,NaT,12/29/2015,ODESSA POLICE DEPARTMENT,ODESSA,79761,2015-12-23,SGT. DAVID LARA,DLARA@ODESSA-TX.GOV,,,,,,,PETE,GONZALES,MALE,42.0,HISPANIC OR LATINO,1025 CUSTER AVE.,ODESSA,ECTOR,79761,INJURY OF PEACE OFFICER,,MALE,27.0,HISPANIC OR LATINO,ROY DANIEL,GARZA,,,,,,,,,,,CBS 7,BIG COUNTRY,,1
4,NaT,12/29/2015,ODESSA POLICE DEPARTMENT,ODESSA,79761,2015-12-23,SGT. DAVID LARA,DLARA@ODESSA-TX.GOV,,,,,,,CORY,WESTER,MALE,38.0,ANGLO OR WHITE,1025 CUSTER AVE.,ODESSA,ECTOR,79761,INJURY OF PEACE OFFICER,,MALE,27.0,HISPANIC OR LATINO,ROY DANIEL,GARZA,,,,,,,,,,,CBS 7,BIG COUNTRY,,1
5,NaT,1/13/2016,HOUSTON POLICE DEPARTMENT,HOUSTON,77002,2016-01-13,SGT. O. BELMAREZ,ODON.BELMAREZ@HOUSTONPOLICE.ORG,,,,,,,KENNETH,FREGIA,MALE,46.0,ANGLO OR WHITE,3100 ANITA,HOUSTON,HARRIS,77004,INJURY OF PEACE OFFICER,,,,,,,,,,,,,,,,,CHRONICLE,ABC 13,,0
6,NaT,1/27/2016,HOUSTON POLICE DEPARTMENT,HOUSTON,77002,2016-01-19,SGT. O. BELMAREZ,ODON.BELMAREZ@HOUSTONPOLICE.ORG,,,,,,,JASON,RHODES,MALE,33.0,ANGLO OR WHITE,6011 VAN ZANDT,HOUSTON,HARRIS,77016,INJURY OF PEACE OFFICER,,MALE,27.0,BLACK OR AFRICAN AMERICAN,SHELTON,HERALD,,,,,,,,,,,CHRONICLE,CLICK2HOUSTON,,1
7,NaT,2/12/2016,"MONTGOMERY COUNTY CONSTABLE'S OFFICE, PRECINCT 3",THE WOODLANDS,77380,2016-02-07,CAPT. J. DRUMMOND,JIMMY.DRUMMOND@MCTX.ORG,,,,,,,,,MALE,36.0,ANGLO OR WHITE,1410 ASHLAND DRIVE,CONROE,MONTGOMERY,77385,INJURY OF PEACE OFFICER,,MALE,18.0,HISPANIC OR LATINO,,,,,,,,,,,,,,,,1
8,NaT,2/25/2016,HOUSTON POLICE DEPARTMENT,HOUSTON,77002,2016-02-22,SGT. O. BELMAREZ,ODON.BELMAREZ@HOUSTONPOLICE.ORG,,,,,,,,,MALE,27.0,BLACK OR AFRICAN AMERICAN,5100 CLOVER,HOUSTON,HARRIS,77021,INJURY OF PEACE OFFICER,,,,,,*PER CITY - NO ONE INJURED*,,,,,,,,,,,CW39,HOUSTONTX.GOV,,1
9,2017-02-14,2/10/2017,EULESS POLICE DEPARTMENT,EULESS,76040,2016-03-01,LT. BRETT MORGAN #282,BMORGAN@EULESSTX.GOV,,,,,,,DAVID STEFAN,HOFER,MALE,29.0,ANGLO OR WHITE,508 SIMMONS DR.,EULESS,TARRANT,76040,DEATH OF PEACE OFFICER,DEATH OF NON-PEACE OFFICER,MALE,22.0,HISPANIC OR LATINO,JORGE,GONZALEZ,,,,,,,,,,,DALLAS MORNING NEWS,NBC DFW,,1


### Let's glance at the incidents where no civilians are recorded

In [10]:
shootings[shootings['num_civilians_recorded'] == 0]

Unnamed: 0,date_ag_received,agency_report_date_1,agency_name_1,agency_city_1,agency_zip_1,date_incident,agency_name_person_filling_out_1,agency_email_person_filling_out_1,agency_report_date_2,agency_name_2,agency_city_2,agency_zip_2,agency_name_person_filling_out_2,agency_email_person_filling_out_2,officer_name_first,officer_name_last,officer_gender,officer_age,officer_race,incident_address,incident_city,incident_county,incident_zip,incident_result_1,incident_result_2,civilian_gender_1,civilian_age_1,civilian_race_1,civilian_name_first_1,civilian_name_last_1,civilian_gender_2,civilian_age_2,civilian_race_2,civilian_name_first_2,civilian_name_last_2,civilian_gender_3,civilian_age_3,civilian_race_3,civilian_name_first_3,civilian_name_last_3,media_link_1,media_link_2,media_link_3,num_civilians_recorded
5,NaT,1/13/2016,HOUSTON POLICE DEPARTMENT,HOUSTON,77002,2016-01-13,SGT. O. BELMAREZ,ODON.BELMAREZ@HOUSTONPOLICE.ORG,,,,,,,KENNETH,FREGIA,MALE,46.0,ANGLO OR WHITE,3100 ANITA,HOUSTON,HARRIS,77004,INJURY OF PEACE OFFICER,,,,,,,,,,,,,,,,,CHRONICLE,ABC 13,,0


### Standardize common column types

In [11]:
standardize_gender_cols(shootings)
standardize_race_cols(shootings)
numericalize_age_cols(shootings)

Numericalizing column officer_age
Numericalizing column civilian_age_1
Numericalizing column civilian_age_2
Numericalizing column civilian_age_3


### Translate columns about injury vs death to boolean

In [12]:
def death_injury_officer(s):
    if pd.isnull(s):
        return s
    elif 'death' in s.lower():
        return True
    elif 'injury' in s.lower():
        return False
    raise CleaningError('Invalid incident result: ' + s)

def death_injury_civilian(s):
    try:
        return death_injury_officer(s)
    except CleaningError as e:
        if 'suicide' in s.lower():
            return True
        raise

shootings['officer_died'] = shootings['incident_result_1'].apply(death_injury_officer)
shootings['civilian_died'] = shootings['incident_result_2'].apply(death_injury_civilian)
shootings['civilian_suicide'] = shootings['incident_result_2'].str.contains('suicide')
shootings.drop(['incident_result_1', 'incident_result_2'], axis=1, inplace=True)

### Convert "12345-6789" style zips into simply "12345"

In [13]:
for c in shootings.columns:
    if c.endswith('_zip'):
        shootings[c] = shootings[c].apply(lambda z: z if pd.isnull(z) else str(z).split('-')[0])

### Convert date columns to ensure validity

In [14]:
for c in shootings.columns:
    if c.startswith('date_') or c.endswith('_date'):
        shootings[c] = pd.to_datetime(shootings[c])

In [15]:
shootings.head()

Unnamed: 0,date_ag_received,agency_report_date_1,agency_name_1,agency_city_1,agency_zip_1,date_incident,agency_name_person_filling_out_1,agency_email_person_filling_out_1,agency_report_date_2,agency_name_2,agency_city_2,agency_zip_2,agency_name_person_filling_out_2,agency_email_person_filling_out_2,officer_name_first,officer_name_last,officer_gender,officer_age,officer_race,incident_address,incident_city,incident_county,incident_zip,civilian_gender_1,civilian_age_1,civilian_race_1,civilian_name_first_1,civilian_name_last_1,civilian_gender_2,civilian_age_2,civilian_race_2,civilian_name_first_2,civilian_name_last_2,civilian_gender_3,civilian_age_3,civilian_race_3,civilian_name_first_3,civilian_name_last_3,media_link_1,media_link_2,media_link_3,num_civilians_recorded,officer_died,civilian_died,civilian_suicide
0,NaT,10/26/2015,SEGUIN POLICE DEPARTMENT,SEGUIN,78155,2015-10-06,MC MEYERS,MCMEYERS@SEGUINTEXAS.GOV,,,,,,,,,FEMALE,54.0,WHITE,815 LAMAR DRIVE,SEGUIN,GUADALUPE,78155,FEMALE,54.0,WHITE,,,,,,,,,,,,,,,,1,False,,
1,NaT,11/3/2015,BEXAR COUNTY SHERIFF'S OFFICE,SAN ANTONIO,78207,2015-10-30,DET. FRANK STUBBS #4048,SSTUBBS@BEXAR.ORG,,,,,,,JOSEPH,CANALES,MALE,42.0,HISPANIC,8000 MIDCROWN,SAN ANTONIO,BEXAR,78218,MALE,36.0,BLACK,SHERMAN,ROBINSON,,,,,,,,,,,MYSA,NEWS4SANANTONIO,,1,False,,
2,2017-02-21,2/21/2017,MARLIN POLICE DEPARTMENT,MARLIN,76661,2015-11-01,DAMIEN EAGLIN,CHIEFEAGLIN@MARLINTX.NET,,,,,,,DARRELL,ALLEN,MALE,42.0,BLACK,301 W. AVENUE B,TEMPLE,BELL COUNTY,76501,MALE,,,,,,,,,,,,,,,WACO TRIBUNE-HERALD,,,1,True,,
3,NaT,12/29/2015,ODESSA POLICE DEPARTMENT,ODESSA,79761,2015-12-23,SGT. DAVID LARA,DLARA@ODESSA-TX.GOV,,,,,,,PETE,GONZALES,MALE,42.0,HISPANIC,1025 CUSTER AVE.,ODESSA,ECTOR,79761,MALE,27.0,HISPANIC,ROY DANIEL,GARZA,,,,,,,,,,,CBS 7,BIG COUNTRY,,1,False,,
4,NaT,12/29/2015,ODESSA POLICE DEPARTMENT,ODESSA,79761,2015-12-23,SGT. DAVID LARA,DLARA@ODESSA-TX.GOV,,,,,,,CORY,WESTER,MALE,38.0,WHITE,1025 CUSTER AVE.,ODESSA,ECTOR,79761,MALE,27.0,HISPANIC,ROY DANIEL,GARZA,,,,,,,,,,,CBS 7,BIG COUNTRY,,1,False,,


### Standardize agency names, and add agency county information

In [16]:
agency_to_county = dict(zip(agencies.agency, agencies.county))
for i in (1, 2):
    c = 'agency_name_%d' % i
    print("Standardizing", c)
    shootings[c] = shootings[c].apply(standardize_agency_name)
    shootings = insert_col_after(
        shootings,
        shootings[c].apply(lambda d: agency_to_county.get(d, np.nan)),
        'agency_county_%d' % i,
        'agency_city_%d' % i)

missing = shootings[shootings.agency_county_1.isnull()]
print('Missing county information for %d records' % len(missing))
missing.agency_name_1

Standardizing agency_name_1
Standardizing agency_name_2
Missing county information for 2 records


28    EL CENTRO COLLEGE POLICE DEPT
29    EL CENTRO COLLEGE POLICE DEPT
Name: agency_name_1, dtype: object

### Fix erroneous counties

In [17]:
counties = set(agencies.county)
shootings[~shootings.incident_county.isin(counties)]

Unnamed: 0,date_ag_received,agency_report_date_1,agency_name_1,agency_city_1,agency_county_1,agency_zip_1,date_incident,agency_name_person_filling_out_1,agency_email_person_filling_out_1,agency_report_date_2,agency_name_2,agency_city_2,agency_county_2,agency_zip_2,agency_name_person_filling_out_2,agency_email_person_filling_out_2,officer_name_first,officer_name_last,officer_gender,officer_age,officer_race,incident_address,incident_city,incident_county,incident_zip,civilian_gender_1,civilian_age_1,civilian_race_1,civilian_name_first_1,civilian_name_last_1,civilian_gender_2,civilian_age_2,civilian_race_2,civilian_name_first_2,civilian_name_last_2,civilian_gender_3,civilian_age_3,civilian_race_3,civilian_name_first_3,civilian_name_last_3,media_link_1,media_link_2,media_link_3,num_civilians_recorded,officer_died,civilian_died,civilian_suicide
2,2017-02-21,2/21/2017,MARLIN POLICE DEPT,MARLIN,FALLS,76661,2015-11-01,DAMIEN EAGLIN,CHIEFEAGLIN@MARLINTX.NET,,,,,,,,DARRELL,ALLEN,MALE,42.0,BLACK,301 W. AVENUE B,TEMPLE,BELL COUNTY,76501,MALE,,,,,,,,,,,,,,,WACO TRIBUNE-HERALD,,,1,True,,
50,2017-05-17,5/17/2017,KILLEEN POLICE DEPT,KILLEEN,BELL,76542,2017-04-24,CDR. ALEX GEARHART,AGEARHART@KILLEENTEXAS.GOV,,,,,,,,,,MALE,40.0,WHITE,3207 E. CENTRAL TEXAS EXPRESSWAY,KILLEEN,BELL COUNTY,76543,MALE,15.0,BLACK,DAEZION CHRISTIAN PAUL,TURNER,,,,,,,,,,,KDH NEWS,,,1,False,True,False
71,2018-02-23,2/22/2018,RICHARDSON POLICE DEPT,RICHARDSON,DALLAS,75081,2018-02-07,SGT. KEVIN W. PERLICH,KEVIN.PERLICH@COR.GOV,,,,,,,,DAVID STEFAN,SHERRARD,MALE,37.0,WHITE,"3500 NORTH STAR ROAD, BLDG. 12, APT. 1235",RICHARDSON,COLIN,75082,MALE,26.0,WHITE,BRANDON,MCCALL,,,,,,,,,,,DALLAS MORNING NEWS,NBC DFW,,1,True,,


In [18]:
fixes = {
    'COLIN': 'COLLIN',
    'BELL COUNTY': 'BELL'
}
for before, after in fixes.items():
    shootings.loc[shootings.incident_county == before, 'incident_county'] = after

shootings[~shootings.incident_county.isin(counties)]

Unnamed: 0,date_ag_received,agency_report_date_1,agency_name_1,agency_city_1,agency_county_1,agency_zip_1,date_incident,agency_name_person_filling_out_1,agency_email_person_filling_out_1,agency_report_date_2,agency_name_2,agency_city_2,agency_county_2,agency_zip_2,agency_name_person_filling_out_2,agency_email_person_filling_out_2,officer_name_first,officer_name_last,officer_gender,officer_age,officer_race,incident_address,incident_city,incident_county,incident_zip,civilian_gender_1,civilian_age_1,civilian_race_1,civilian_name_first_1,civilian_name_last_1,civilian_gender_2,civilian_age_2,civilian_race_2,civilian_name_first_2,civilian_name_last_2,civilian_gender_3,civilian_age_3,civilian_race_3,civilian_name_first_3,civilian_name_last_3,media_link_1,media_link_2,media_link_3,num_civilians_recorded,officer_died,civilian_died,civilian_suicide


### Re-order columns more sensibly

In [19]:
new_order = [
 'date_ag_received',
 'date_incident',
 'incident_address',
 'incident_city',
 'incident_county',
 'incident_zip',
 'officer_died',
 'officer_name_first',
 'officer_name_last',
 'officer_age',
 'officer_race',
 'officer_gender',
 'agency_name_1',
 'agency_city_1',
 'agency_county_1',
 'agency_zip_1',
 'agency_report_date_1',
 'agency_name_person_filling_out_1',
 'agency_email_person_filling_out_1',
 'agency_name_2',
 'agency_city_2',
 'agency_county_2',
 'agency_zip_2',
 'agency_report_date_2',
 'agency_name_person_filling_out_2',
 'agency_email_person_filling_out_2',
 'num_civilians_recorded',
 'civilian_died',
 'civilian_suicide',
 'civilian_name_first_1',
 'civilian_name_last_1',
 'civilian_age_1',
 'civilian_race_1',
 'civilian_gender_1',
 'civilian_name_first_2',
 'civilian_name_last_2',
 'civilian_age_2',
 'civilian_race_2',
 'civilian_gender_2',
 'civilian_name_first_3',
 'civilian_name_last_3',
 'civilian_age_3',
 'civilian_race_3',
 'civilian_gender_3',
 'media_link_1',
 'media_link_2',
 'media_link_3']

shootings = reorder_columns_and_check(shootings, new_order)

In [20]:
shootings.sort_values(['date_incident', 'incident_county', 'agency_county_1'], inplace=True)
shootings.head()

Unnamed: 0,date_ag_received,date_incident,incident_address,incident_city,incident_county,incident_zip,officer_died,officer_name_first,officer_name_last,officer_age,officer_race,officer_gender,agency_name_1,agency_city_1,agency_county_1,agency_zip_1,agency_report_date_1,agency_name_person_filling_out_1,agency_email_person_filling_out_1,agency_name_2,agency_city_2,agency_county_2,agency_zip_2,agency_report_date_2,agency_name_person_filling_out_2,agency_email_person_filling_out_2,num_civilians_recorded,civilian_died,civilian_suicide,civilian_name_first_1,civilian_name_last_1,civilian_age_1,civilian_race_1,civilian_gender_1,civilian_name_first_2,civilian_name_last_2,civilian_age_2,civilian_race_2,civilian_gender_2,civilian_name_first_3,civilian_name_last_3,civilian_age_3,civilian_race_3,civilian_gender_3,media_link_1,media_link_2,media_link_3
0,NaT,2015-10-06,815 LAMAR DRIVE,SEGUIN,GUADALUPE,78155,False,,,54.0,WHITE,FEMALE,SEGUIN POLICE DEPT,SEGUIN,GUADALUPE,78155,10/26/2015,MC MEYERS,MCMEYERS@SEGUINTEXAS.GOV,,,,,,,,1,,,,,54.0,WHITE,FEMALE,,,,,,,,,,,,,
1,NaT,2015-10-30,8000 MIDCROWN,SAN ANTONIO,BEXAR,78218,False,JOSEPH,CANALES,42.0,HISPANIC,MALE,BEXAR CO SHERIFFS OFFICE,SAN ANTONIO,BEXAR,78207,11/3/2015,DET. FRANK STUBBS #4048,SSTUBBS@BEXAR.ORG,,,,,,,,1,,,SHERMAN,ROBINSON,36.0,BLACK,MALE,,,,,,,,,,,MYSA,NEWS4SANANTONIO,
2,2017-02-21,2015-11-01,301 W. AVENUE B,TEMPLE,BELL,76501,True,DARRELL,ALLEN,42.0,BLACK,MALE,MARLIN POLICE DEPT,MARLIN,FALLS,76661,2/21/2017,DAMIEN EAGLIN,CHIEFEAGLIN@MARLINTX.NET,,,,,,,,1,,,,,,,MALE,,,,,,,,,,,WACO TRIBUNE-HERALD,,
3,NaT,2015-12-23,1025 CUSTER AVE.,ODESSA,ECTOR,79761,False,PETE,GONZALES,42.0,HISPANIC,MALE,ODESSA POLICE DEPT,ODESSA,ECTOR,79761,12/29/2015,SGT. DAVID LARA,DLARA@ODESSA-TX.GOV,,,,,,,,1,,,ROY DANIEL,GARZA,27.0,HISPANIC,MALE,,,,,,,,,,,CBS 7,BIG COUNTRY,
4,NaT,2015-12-23,1025 CUSTER AVE.,ODESSA,ECTOR,79761,False,CORY,WESTER,38.0,WHITE,MALE,ODESSA POLICE DEPT,ODESSA,ECTOR,79761,12/29/2015,SGT. DAVID LARA,DLARA@ODESSA-TX.GOV,,,,,,,,1,,,ROY DANIEL,GARZA,27.0,HISPANIC,MALE,,,,,,,,,,,CBS 7,BIG COUNTRY,


## 3. Write

In [18]:
if os.environ.get('CLEAN_OIS_DW') != 'TRUE':
    print("Not syncing to Data.world. To do so, set CLEAN_OIS_DW to 'TRUE'")
else:
    with dw.open_remote_file(DTW_OIS_PROJECT, CLEANED_FILENAME) as w:
        print("Writing to data.world:", CLEANED_FILENAME)
        shootings.to_csv(w, index=False)

Not syncing to Data.world. To do so, set CLEAN_OIS_DW to 'TRUE'


In [19]:
if os.environ.get('CLEAN_OIS_S3') != 'TRUE':
    print("Not writing to s3. To do so, set CLEAN_OIS_S3 to 'TRUE'")
else:
    csv_buffer = StringIO()
    shootings.to_csv(csv_buffer, index=False)
    s3_resource = boto3.resource('s3')
    s3_resource.Object('tji-public-cleaned-datasets', CLEANED_FILENAME).put(Body=csv_buffer.getvalue())
    

Not writing to s3. To do so, set CLEAN_OIS_S3 to 'TRUE'
