# Clean the Texas OIS dataset for analysis -- civilians shot only

### Latest run covers incidents from 2015-09-02 to 2018-04-16

* Inputs:
   * `OIS.xlsx` (currently local -- TODO add to data.world)
   * `texas_law_enforcement_agencies_and_counties.csv` (dtw - used to add county information)
* Output: `shot_civilians.csv`

##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
DTW_PROJECT_KEY = 'tji/auxiliary-datasets'
RAW_FILENAME = 'data/raw/OIS.xlsx'
CLEANED_FILENAME = 'data/clean/shot_civilians.csv'
AGENCY_COUNTY_DATAFRAME_NAME = 'texas_law_enforcement_agencies_and_counties'

In [2]:
import datadotworld as dw
import numpy as np
import pandas as pd

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -r -g -w -p datadotworld,numpy,pandas

Everett Wetchler 2018-05-07 11:36:03 CDT

datadotworld 1.6.0
numpy 1.14.3
pandas 0.22.0
Git hash: a5099cf65aa91c6e242c3c2d760560fd6152bfe0
Git repo: git@github.com:texas-justice-initiative/data-processing.git
watermark 1.6.0


In [3]:
from lib.standardize_police_agency_names import standardize_agency_name

In [4]:
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
agencies = datasets.dataframes[AGENCY_COUNTY_DATAFRAME_NAME]

In [5]:
shootings = pd.read_excel(RAW_FILENAME, sheetname='OISTable')
print("OIS civilians-shot incidents from %s to %s" % (
    shootings['Date of Incident'].min().strftime('%Y-%m-%d'),
    shootings['Date of Incident'].max().strftime('%Y-%m-%d')))
shootings.head()

  return func(*args, **kwargs)


OIS civilians-shot incidents from 2015-09-02 to 2018-04-16


Unnamed: 0,No.,Number of Reports Filed,Date of Report 1,Date AG Received,Name of Agency 1,City of Agency 1,Zip code of Agency 1,Date of Incident,Time of Incident,Name of Person 1 Filling out Form,...,NEWS 2,NEWS 3,NEWS 4,CDR?,CDR Narrative,Narrative Published by Law Enforcement,Column1,Column2,SHORTER,EXTRAS
0,1,1,9/16/2015,NaT,Freeport Police Department,Freeport,77541,2015-09-02,,Pamela Morris,...,Your Southest Texas,,,,,,,,,
1,2,1,10/1/2015,NaT,Plano Police Department,Plano,75074,2015-09-03,,Curtis Howard,...,,,,,,,,,,
2,3,1,10/6/2015,NaT,Parker County Sheriff's Office,Weatherford,76086,2015-09-04,,Meredith Gray,...,DFW CBS Local,Star Telegram,Fox 4 News,YES,Decedent shot a rifle at LE Officers whom retu...,,,,fired at officers,
3,4,1,9/11/2015,NaT,Houston Police Department,Houston,77002,2015-09-05,,Odon Belmarez,...,ABC 13,Click 2 Houston,,,,An officer was dispatched to a weapons disturb...,,,,
4,5,1,10/15/2015,NaT,Irving Police Department,Irving,75061,2015-09-08,,Michael Coleman,...,,,,,,,,,,


## 2. Begin cleaning

In [6]:
# Remove whitespace from column names
shootings.columns = [c.strip() for c in shootings.columns]

In [7]:
# Drop irrelevant columns
shootings.drop(['No.', 'Column1', 'Column2'], axis=1, inplace=True)

In [8]:
# Get rid of any stray formatting on string values - remove whitespace, and lowercase
for c in shootings.columns:
    shootings[c] = shootings[c].apply(lambda s: s.strip().lower() if isinstance(s, str) else s)

In [9]:
# Make the column names more machine-friendly
col_renames = {
    'Number of Reports Filed': 'num_reports_filed',
    'Date AG Received': 'date_ag_received',
    'Date of Incident': 'date_incident',
    'Time of Incident': 'time_incident',
    "Injured or Deceased's First Name": 'civilian_name_first',
    "Injured or Deceased's Last Name": 'civilian_name_last',
    "Injured or Deceased's Gender": "civilian_gender",
    "Injured or Deceased's Age": "civilian_age",
    "Injured or Deceased's Race/Ethnicity": "civilian_race",
    "Street Address of Incident": "incident_address",
    "City of Incident": "incident_city",
    "County of Incident": "incident_county",
    "Zip Code of Incident": "incident_zip",
    "Incident Resulted In": "incident_resulted_in",
    "Carried, Exhibited or Used Deadly Weapon": "deadly_weapon",
    "On Duty or Off Duty": "on_duty",
    "Peace Officer Responding With 1 or More Officers": "multiple_officers_involved",
    "Incident Occurred During or as a Result of": "incident_result_of",
    "If Other, Specify Type of Call": "incident_call_other",
    "Deadly Weapon Description": "deadly_weapon_description",
    "CDR?": "custodial_death_report",
    "CDR Narrative": "cdr_narrative",
    "Narrative Published by Law Enforcement": "lea_narrative_published",
    "SHORTER": "lea_narrative_shorter",
}
colnames = list(shootings.columns)
newnames = []
for c in shootings.columns:
    if c in col_renames:
        newnames.append(col_renames[c])
    else:
        newnames.append(c)

shootings.columns = newnames
shootings.head()

Unnamed: 0,num_reports_filed,Date of Report 1,date_ag_received,Name of Agency 1,City of Agency 1,Zip code of Agency 1,date_incident,time_incident,Name of Person 1 Filling out Form,Email Address of Person 1 Filling out Form,...,deadly_weapon_description,NEWS 1,NEWS 2,NEWS 3,NEWS 4,custodial_death_report,cdr_narrative,lea_narrative_published,lea_narrative_shorter,EXTRAS
0,1,9/16/2015,NaT,freeport police department,freeport,77541,2015-09-02,,pamela morris,pmorris@freeport.tx.us,...,,abc 13,your southest texas,,,,,,,
1,1,10/1/2015,NaT,plano police department,plano,75074,2015-09-03,,curtis howard,curtish@plano.gov,...,,,,,,,,,,
2,1,10/6/2015,NaT,parker county sheriff's office,weatherford,76086,2015-09-04,,meredith gray,meredith.gray@parkercountytx.com,...,firearm,wfaa,dfw cbs local,star telegram,fox 4 news,yes,decedent shot a rifle at le officers whom retu...,,fired at officers,
3,1,9/11/2015,NaT,houston police department,houston,77002,2015-09-05,,odon belmarez,odon.belmarez@houstonpolice.org,...,firearm,chron,abc 13,click 2 houston,,,,an officer was dispatched to a weapons disturb...,,
4,1,10/15/2015,NaT,irving police department,irving,75061,2015-09-08,,michael coleman,mcoleman@cityofirving.com,...,,,,,,,,,,


In [10]:
# Some columns have multiple copies for different individuals,
# e.g. agency_name_1 for the first officer's agency, then
# agency_name_2, for the second, etc. To avoid typing out
# all these numbers, we loop through such columns and rename
# them appropriately.
numerical_renames = {
    "Date of Report %d":"date_report_",
    "Name of Agency %d":"agency_name_",
    "City of Agency %d":"agency_city_",
    "Zip code of Agency %d":"agency_zip_",
    "Name of Person %d Filling out Form":"name_person_filling_out_",
    "Email Address of Person %d Filling out Form":"email_person_filling_out_",
    "Peace Officer %d's Gender":"officer_gender_",
    "Peace Officer %d's Age":"officer_age_",
    "Peace Officer %d's Race/Ethnicity":"officer_race_",
    "Incident Occurred During or as a Result of %d":"officer_caused_injury_",
    "NEWS %d": "news_coverage_",
}
renames = {}
for i in range(1, 11):
    for k, v in numerical_renames.items():
        k = (k % i).lower()
        v = v + str(i)
        renames[k] = v
shootings.columns = [c.lower().strip() for c in shootings.columns]
shootings.rename(columns=renames, inplace=True)
shootings.head()

Unnamed: 0,num_reports_filed,date_report_1,date_ag_received,agency_name_1,agency_city_1,agency_zip_1,date_incident,time_incident,name_person_filling_out_1,email_person_filling_out_1,...,deadly_weapon_description,news_coverage_1,news_coverage_2,news_coverage_3,news_coverage_4,custodial_death_report,cdr_narrative,lea_narrative_published,lea_narrative_shorter,extras
0,1,9/16/2015,NaT,freeport police department,freeport,77541,2015-09-02,,pamela morris,pmorris@freeport.tx.us,...,,abc 13,your southest texas,,,,,,,
1,1,10/1/2015,NaT,plano police department,plano,75074,2015-09-03,,curtis howard,curtish@plano.gov,...,,,,,,,,,,
2,1,10/6/2015,NaT,parker county sheriff's office,weatherford,76086,2015-09-04,,meredith gray,meredith.gray@parkercountytx.com,...,firearm,wfaa,dfw cbs local,star telegram,fox 4 news,yes,decedent shot a rifle at le officers whom retu...,,fired at officers,
3,1,9/11/2015,NaT,houston police department,houston,77002,2015-09-05,,odon belmarez,odon.belmarez@houstonpolice.org,...,firearm,chron,abc 13,click 2 houston,,,,an officer was dispatched to a weapons disturb...,,
4,1,10/15/2015,NaT,irving police department,irving,75061,2015-09-08,,michael coleman,mcoleman@cityofirving.com,...,,,,,,,,,,


In [11]:
# Check for errors in officer gender columns (values that are not 'male' nor 'female')
error_indices = set()
cols = ['civilian_gender'] + ['officer_gender_%d' % i for i in range(1, 11)]
for col in cols:
    tmp = shootings.dropna(subset=[col])
    errors = tmp[~tmp[col].isin(['male', 'female'])]
    for e in errors.index.values:
        error_indices.add(e)

# Inspect erroneous rows
shootings.loc[list(error_indices),:]

Unnamed: 0,num_reports_filed,date_report_1,date_ag_received,agency_name_1,agency_city_1,agency_zip_1,date_incident,time_incident,name_person_filling_out_1,email_person_filling_out_1,...,deadly_weapon_description,news_coverage_1,news_coverage_2,news_coverage_3,news_coverage_4,custodial_death_report,cdr_narrative,lea_narrative_published,lea_narrative_shorter,extras


In [12]:
if error_indices:
    raise Exception("STOP - read above and correct errors")

In [13]:
# OK, it seems this one instance simply wrote in this field by accident.
# Let's clear it.
shootings.loc[list(error_indices),'officer_gender_2'] = np.NaN

In [14]:
# Double check that all age columns are numerical
cols = ['civilian_age'] + ['officer_age_%d' % i for i in range(1, 11)]
for col in cols:
    tmp = shootings.dropna(subset=[col])
    print(shootings[col].dtype)

float64
int64
float64
float64
float64
float64
float64
float64
float64
float64
float64


### Simplify race names

In [15]:
WHITE, BLACK, HISPANIC, OTHER = 'WHITE,BLACK,HISPANIC,OTHER'.split(',')
RACES = [WHITE, BLACK, HISPANIC, OTHER]
RACE_RENAMES = {
    'hispanic or latino': HISPANIC,
    'black or african american': BLACK,
    'anglo or white': WHITE,
    'anglo': WHITE,
    'ao': WHITE,
    'asian or pacific islander': OTHER,
    'american indian or alaska native': OTHER,
}
def rename_race(r):
    if pd.isnull(r):
        return r
    return RACE_RENAMES.get(r.lower(), r)

In [16]:
cols = ['civilian_race'] + ['officer_race_%d' % i for i in range(1, 11)]
error_indices = []
for col in cols:
    shootings[col] = shootings[col].str.upper()
    shootings[col] = shootings[col].apply(rename_race)
    tmp = shootings.dropna(subset=[col])
    errors = tmp[~tmp[col].isin(RACES)]
    if len(errors):
        print(','.join(errors[col].values))
        for e in errors.index.values:
            error_indices.append(e)

if error_indices:
    # Inspect erroneous rows
    shootings.loc[list(error_indices),:][cols]

In [17]:
if error_indices:
    raise Exception("STOP - read above and correct errors")

### Create a column for the number of officers whose information was recorded here, `num_officers_recorded`

We use the presence of officer gender as an indicator. Not all incidents use a separate agency_name_X column per officer, so we can't use that.

In [18]:
officer_gender_cols = ['officer_gender_%d' % i for i in range(1, 11)]
shootings['num_officers_recorded'] = shootings[officer_gender_cols].notnull().sum(axis=1)
shootings[shootings['num_officers_recorded'] > 5].head()

Unnamed: 0,num_reports_filed,date_report_1,date_ag_received,agency_name_1,agency_city_1,agency_zip_1,date_incident,time_incident,name_person_filling_out_1,email_person_filling_out_1,...,news_coverage_1,news_coverage_2,news_coverage_3,news_coverage_4,custodial_death_report,cdr_narrative,lea_narrative_published,lea_narrative_shorter,extras,num_officers_recorded
62,8,1/5/2016,NaT,odessa police department,odessa,78761,2015-12-23,,david lara,dlara@odessa-tx.gov,...,my san antonio,news west 9,cbs 7,ksat,,,,,,8
157,10,2/23/2017,2017-02-23,dart police department,dallas,75203,2016-07-07,21:00:00,lt. sherri plunk #43,splunk@dart.org,...,dpd press release,,,,yes,"on july 7, 2016, at approximately 8:57 p.m., d...",,fired at officers,,10
271,8,3/15/2017,2017-03-15,clute police department,clute,77531,2017-02-24,14:20:00,chief randy bratton,chief randy bratton,...,the facts,kprc,,,yes,"in angleton, the suspect fled from officers in...",,pointed a gun at officers,,8
295,6,5/15/2017,2017-05-16,waco police department,waco,76708,2017-04-10,21:12:00,sgt. v.r. price jr.,jprice@wacotx.gov,...,,,,,,,,,,6
303,6,5/4/2017,2017-05-08,houston police department,houston,77002,2017-04-24,22:00:00,sgt. odon belmarez,odon.belmarez@houstonpolice.org,...,houston chronicle,khou,,,yes,the decedent and two accomplices armed with we...,,,,6


In [19]:
shootings['custodial_death_report'] = shootings['custodial_death_report'].fillna('n').apply(lambda c: c.strip().lower()[0] == 'y')
shootings['multiple_officers_involved'] = shootings['multiple_officers_involved'].fillna('n').apply(lambda c: c.strip().lower()[0] == 'y')

shootings['civilian_died'] = shootings['incident_resulted_in'].apply(lambda x: x.strip().lower()) == 'death'
shootings.drop('incident_resulted_in', axis=1, inplace=True)
shootings['incident_result_of'] = shootings['incident_result_of'].apply(lambda x: x.strip().lower())

shootings['on_duty'] = shootings['on_duty'].apply(lambda x: x if pd.isnull(x) else (x.strip().lower() == 'on duty'))
shootings['deadly_weapon_description'] = shootings['deadly_weapon_description'].apply(lambda w: w if pd.isnull(w) else w.strip().lower())

### NOTE: Data quirk

It's unclear how many officers were actually at the scene
* The `multiple_officer_involved` column is a yes/no column, but there are also columns to list the agency, gender, etc for each officer. These do not always agree. Sometimes `multiple_officer_involved` is yes, but only one officer's details are recorded. And sometimes we have details for many officers, but `multiple_officer_involved` is no. See below.
* The punchline is to interpret these columns with caution

In [20]:
pd.crosstab(shootings.multiple_officers_involved, shootings.num_officers_recorded)

num_officers_recorded,1,2,3,4,5,6,7,8,10
multiple_officers_involved,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
False,77,2,2,0,1,1,0,0,0
True,234,75,28,17,9,1,4,2,1


### Handle weapons-related questions

In [21]:
# Convert yes/no to boolean
shootings['deadly_weapon'] = shootings['deadly_weapon'].apply(lambda x: x if pd.isnull(x) else (x.strip().lower() == 'yes'))
shootings['deadly_weapon'].value_counts()

True     377
False     77
Name: deadly_weapon, dtype: int64

In [22]:
# Check for mistakes. Sometimes the "was there a deadly weapon?" question
# is answered with "No" while a description of a deadly weapon is given.
pd.crosstab(shootings['deadly_weapon'], shootings['deadly_weapon_description'].notnull())

deadly_weapon_description,False,True
deadly_weapon,Unnamed: 1_level_1,Unnamed: 2_level_1
False,62,15
True,58,319


In [23]:
# Let's look at these instances to be sure
s = shootings[~shootings['deadly_weapon'] & pd.notnull(shootings['deadly_weapon_description'])][['deadly_weapon', 'deadly_weapon_description']]
print(len(s))
s

15


Unnamed: 0,deadly_weapon,deadly_weapon_description
40,False,vehicle
56,False,firearm
162,False,bb gun
227,False,vehicle
252,False,vehicle
268,False,vehicle
286,False,vehicle
300,False,vehicle
321,False,took officer's taser
323,False,fell while getting out of car?


In [24]:
# It seems clear that we should correct the 'deadly_weapon' category
# in these cases.
#
# For cases where the user said there WAS a deadly weapon,
# but did not give a weapon description, we'll assume there really
# was a weapon but use a special category (see cells below).
shootings['deadly_weapon'] = (shootings['deadly_weapon'] | pd.notnull(shootings['deadly_weapon_description']))
shootings['deadly_weapon'].value_counts()

True     392
False     62
Name: deadly_weapon, dtype: int64

In [25]:
# What weapons to people use? Can we categorize them?
sorted(list(x for x in set(shootings.deadly_weapon_description) if pd.notnull(x)))

['"agent was assaulted"',
 'air soft gun',
 'armed',
 'arms (choking)',
 'assault rifle',
 'axe',
 'baseball bat',
 'baseball bat and fireplace poker',
 'bb gun',
 'body',
 'bomb',
 'box cutter',
 'butcher knife',
 'car',
 'club, bat',
 "deputy's gun",
 'fell while getting out of car?',
 'firearm',
 'glock 40',
 'gun',
 'handgun',
 'hatchet',
 'imitation weapon',
 'knife',
 'knife - not opened',
 'knife, gun',
 'knives',
 'long gun',
 'machete',
 'metal flashlight',
 'pellet gun',
 'pencil',
 'pickaxe',
 'pistol',
 'reports say unarmed',
 'revolver',
 'rifle',
 'rock',
 'sawed-off shotgun',
 'scissors, screwdriver',
 'semi-automatic rifle',
 'sharp metal object (piece of headphones)',
 'shotgun',
 'sword',
 "took officer's knife",
 "took officer's taser",
 'truck',
 'vehicle',
 'weapon',
 'weed-cutter']

In [26]:
# Manual categorization of weapons
weapon_types = {
    'FIREARM': ['handgun', 'sawed-off shotgun', 'revolver', 'rifle',
                'assault rifle', 'firearm', 'shotgun', 'long gun', 'gun',
                'glock 40', 'pistol', 'knife, gun', "deputy's gun"],
    'KNIFE/CUTTING': ['hatchet', 'butcher knife', 'knife', 'knives', 'box cutter',
                'knife - not opened', 'machete', 'sword', 'axe'],
    'VEHICLE': ['car', 'truck', 'vehicle'],
}
type_lookup = {}
for k, v in weapon_types.items():
    for w in v:
        if w in type_lookup:
            print("DUPLICATE:", k, w)
        type_lookup[w] = k

weapons = []
for has_weapon, desc in zip(shootings['deadly_weapon'], shootings['deadly_weapon_description']):
    if pd.isnull(desc) or not desc:
        if has_weapon:
            weapons.append('(DETAILS MISSING)')
        else:
            weapons.append(np.nan)
        continue
    weapons.append(type_lookup.get(desc, 'OTHER'))

shootings['deadly_weapon_category'] = weapons    
shootings['deadly_weapon_category'].value_counts()

FIREARM              221
(DETAILS MISSING)     58
KNIFE/CUTTING         46
OTHER                 34
VEHICLE               33
Name: deadly_weapon_category, dtype: int64

In [27]:
sorted(list(shootings['deadly_weapon_description'][shootings['deadly_weapon_category'] == 'OTHER']))

['"agent was assaulted"',
 'air soft gun',
 'armed',
 'arms (choking)',
 'baseball bat',
 'baseball bat and fireplace poker',
 'bb gun',
 'bb gun',
 'bb gun',
 'bb gun',
 'body',
 'body',
 'bomb',
 'club, bat',
 'fell while getting out of car?',
 'imitation weapon',
 'metal flashlight',
 'pellet gun',
 'pellet gun',
 'pencil',
 'pickaxe',
 'reports say unarmed',
 'rock',
 'scissors, screwdriver',
 'semi-automatic rifle',
 'semi-automatic rifle',
 'sharp metal object (piece of headphones)',
 "took officer's knife",
 "took officer's taser",
 "took officer's taser",
 'weapon',
 'weapon',
 'weapon',
 'weed-cutter']

### Uppercase the content of all columns

In [28]:
print('Uppercasing columns: ', end='')
for col, dt in shootings.dtypes.items():
    if dt == 'object':
        print(col, end=' ')
        shootings[col] = shootings[col].apply(lambda s: s if type(s) != str else s.upper())

Uppercasing columns: date_report_1 agency_name_1 agency_city_1 agency_zip_1 time_incident name_person_filling_out_1 email_person_filling_out_1 date_report_2 agency_name_2 agency_city_2 agency_zip_2 name_person_filling_out_2 email_person_filling_out_2 date_report_3 agency_name_3 agency_city_3 agency_zip_3 name_person_filling_out_3 email_person_filling_out_3 date_report_4 agency_name_4 agency_city_4 agency_zip_4 name_person_filling_out_4 email_person_filling_out_4 date_report_5 agency_name_5 agency_city_5 agency_zip_5 name_person_filling_out_5 email_person_filling_out_5 date_report_6 agency_name_6 agency_city_6 name_person_filling_out_6 email_person_filling_out_6 date_report_7 agency_name_7 agency_city_7 name_person_filling_out_7 email_person_filling_out_7 date_report_8 agency_name_8 agency_city_8 name_person_filling_out_8 email_person_filling_out_8 date_report_9 agency_name_9 agency_city_9 name_person_filling_out_9 email_person_filling_out_9 date_report_10 agency_name_10 agency_city_10 

### Standardize police agency names

In [29]:
for i in range(1, 11):
    shootings['agency_name_%d' % i] = shootings['agency_name_%d' % i].apply(standardize_agency_name)

shootings.head()

Unnamed: 0,num_reports_filed,date_report_1,date_ag_received,agency_name_1,agency_city_1,agency_zip_1,date_incident,time_incident,name_person_filling_out_1,email_person_filling_out_1,...,news_coverage_3,news_coverage_4,custodial_death_report,cdr_narrative,lea_narrative_published,lea_narrative_shorter,extras,num_officers_recorded,civilian_died,deadly_weapon_category
0,1,9/16/2015,NaT,FREEPORT POLICE DEPT,FREEPORT,77541,2015-09-02,,PAMELA MORRIS,PMORRIS@FREEPORT.TX.US,...,,,False,,,,,1,False,
1,1,10/1/2015,NaT,PLANO POLICE DEPT,PLANO,75074,2015-09-03,,CURTIS HOWARD,CURTISH@PLANO.GOV,...,,,False,,,,,1,False,(DETAILS MISSING)
2,1,10/6/2015,NaT,PARKER CO SHERIFFS OFFICE,WEATHERFORD,76086,2015-09-04,,MEREDITH GRAY,MEREDITH.GRAY@PARKERCOUNTYTX.COM,...,STAR TELEGRAM,FOX 4 NEWS,True,DECEDENT SHOT A RIFLE AT LE OFFICERS WHOM RETU...,,FIRED AT OFFICERS,,2,True,FIREARM
3,1,9/11/2015,NaT,HOUSTON POLICE DEPT,HOUSTON,77002,2015-09-05,,ODON BELMAREZ,ODON.BELMAREZ@HOUSTONPOLICE.ORG,...,CLICK 2 HOUSTON,,False,,AN OFFICER WAS DISPATCHED TO A WEAPONS DISTURB...,,,1,False,FIREARM
4,1,10/15/2015,NaT,IRVING POLICE DEPT,IRVING,75061,2015-09-08,,MICHAEL COLEMAN,MCOLEMAN@CITYOFIRVING.COM,...,,,False,,,,,1,False,


### Add county information

In [30]:
def insert_col_after(df, to_insert, name, after):
    cols = list(df.columns)
    i = cols.index(after)
    newcols = cols[:(i+1)] + [name] + cols[(i+1):]
    df[name] = to_insert
    return df[newcols]

In [31]:
dept_to_county = dict(zip(agencies.agency, agencies.county))
for i in range(1, 11):
    shootings = insert_col_after(
        shootings,
        shootings['agency_name_%d' % i].apply(lambda d: dept_to_county.get(d, np.nan)),
        'agency_county_%d' % i,
        'agency_city_%d' % i)

shootings.agency_county_1.isnull().value_counts()

False    450
True       4
Name: agency_county_1, dtype: int64

In [32]:
shootings[shootings.agency_county_1.isnull()].agency_name_1.tolist()

['JAL POLICE DEPT',
 'DART POLICE DEPT',
 'DRUG ENFORCEMENT ADMINISTRATION US DOJ',
 'TEXAS DEPT OF PUBLIC SAFETY CRIMINAL INVESTIGATIONS DIVISION']

### Other analysis revealed some typos with agency name. We'll demonstrate them here before correcting.

In [33]:
tmp = shootings.groupby(['incident_county', 'incident_city']).size().sort_values().unstack().T
tmax = tmp.max()
county_to_biggest_city = {}
for c in tmp.columns:
    x = tmp[c][tmp[c] == tmax[c]]
    county_to_biggest_city[c] = x.index[0]

In [34]:
TOP5 = list(shootings.incident_county.value_counts().head(5).index)
TOP5_CITIES = [county_to_biggest_city.get(c) for c in TOP5]
print(TOP5)
print(TOP5_CITIES)

['HARRIS', 'BEXAR', 'DALLAS', 'TARRANT', 'TRAVIS']
['HOUSTON', 'SAN ANTONIO', 'DALLAS', 'FORT WORTH', 'AUSTIN']


In [35]:
for county in TOP5:
    print("-- %s --" % county)
    print(shootings[shootings.incident_county == county]['agency_county_1'].value_counts())
    print()

-- HARRIS --
HARRIS        90
WALLER         1
BRAZORIA       1
MONTGOMERY     1
STATE          1
Name: agency_county_1, dtype: int64

-- BEXAR --
BEXAR     36
STATE      5
WILSON     1
Name: agency_county_1, dtype: int64

-- DALLAS --
DALLAS     37
STATE       1
KAUFMAN     1
Name: agency_county_1, dtype: int64

-- TARRANT --
TARRANT    34
PARKER      1
DALLAS      1
Name: agency_county_1, dtype: int64

-- TRAVIS --
TRAVIS    21
Name: agency_county_1, dtype: int64



### 'HARRIS' and 'HARRISON' counties are nowhere near each other, so it seems more likely that there is a typo or data error than that four incidents in HARRIS county involved officers from HARRISON county. Let's check them out.

In [36]:
shootings[(shootings.incident_county == 'HARRIS') & (shootings.agency_county_1 == 'HARRISON')]

Unnamed: 0,num_reports_filed,date_report_1,date_ag_received,agency_name_1,agency_city_1,agency_county_1,agency_zip_1,date_incident,time_incident,name_person_filling_out_1,...,news_coverage_3,news_coverage_4,custodial_death_report,cdr_narrative,lea_narrative_published,lea_narrative_shorter,extras,num_officers_recorded,civilian_died,deadly_weapon_category


### Yup. These are officers from Baytown and Spring, which border Houston (Harris, not Harrison county). Their emails are also from `hctx.net` which is Harris County. Their given agency names must be errors. 

In [37]:
city_county_corrections = [
    ['BAYTOWN', 'HARRISON', 'HARRIS'],
    ['SPRING', 'HARRISON', 'HARRIS'],
]
corrected = set()
for city, wrong_county, right_county in city_county_corrections:
    for i in range(1, 11):
        tmp = shootings[(shootings['agency_city_%d' % i] == city) & (shootings['agency_county_%d' % i] == wrong_county)]
        if len(tmp) == 0:
            break
        shootings.loc[tmp.index, 'agency_county_%d' % i] = right_county
        for idx, name in tmp['agency_name_%d' % i].items():
            shootings.loc[idx, 'agency_name_%d' % i] = name.replace(wrong_county, right_county)
            corrected.add(idx)

shootings.loc[corrected]

Unnamed: 0,num_reports_filed,date_report_1,date_ag_received,agency_name_1,agency_city_1,agency_county_1,agency_zip_1,date_incident,time_incident,name_person_filling_out_1,...,news_coverage_3,news_coverage_4,custodial_death_report,cdr_narrative,lea_narrative_published,lea_narrative_shorter,extras,num_officers_recorded,civilian_died,deadly_weapon_category


### While we're at it, are there any mistakes the other way? (Harris county officers showing up in Harrison county)
#### Answer: no.

In [38]:
shootings[(shootings.incident_county == 'HARRISON') & (shootings.agency_county_1 == 'HARRIS')]

Unnamed: 0,num_reports_filed,date_report_1,date_ag_received,agency_name_1,agency_city_1,agency_county_1,agency_zip_1,date_incident,time_incident,name_person_filling_out_1,...,news_coverage_3,news_coverage_4,custodial_death_report,cdr_narrative,lea_narrative_published,lea_narrative_shorter,extras,num_officers_recorded,civilian_died,deadly_weapon_category


### Neatly order columns and save the cleaned file.

In [39]:
order = []
numbered = []
for c in shootings.columns:
    if c[-1].isdigit():
        numbered.append(c)
    else:
        order.append(c)

order = order + sorted(numbered, key=lambda c: int(c.split('_')[-1]))
shootings = shootings[order]
shootings.head()

Unnamed: 0,num_reports_filed,date_ag_received,date_incident,time_incident,civilian_name_first,civilian_name_last,civilian_gender,civilian_age,civilian_race,incident_address,...,date_report_10,agency_name_10,agency_city_10,agency_county_10,agency_zip_10,name_person_filling_out_10,email_person_filling_out_10,officer_gender_10,officer_age_10,officer_race_10
0,1,NaT,2015-09-02,,RICKEY,MAYBERRY,MALE,30.0,BLACK,1010 MAGNOLIA STREET,...,,,,,,,,,,
1,1,NaT,2015-09-03,,,,MALE,55.0,WHITE,4840 E. PLANO PARKWAY,...,,,,,,,,,,
2,1,NaT,2015-09-04,,SULLY JOE,LANIER,MALE,36.0,WHITE,101 COUCH CT.,...,,,,,,,,,,
3,1,NaT,2015-09-05,,,,MALE,21.0,BLACK,4926 CHENNAULT ROAD,...,,,,,,,,,,
4,1,NaT,2015-09-08,,,,MALE,44.0,WHITE,1500 RANGE ROAD,...,,,,,,,,,,


### 3. Write

In [40]:
shootings.to_csv(CLEANED_FILENAME, index=False)
print('Done')

Done
