# Clean the Texas OIS dataset for analysis -- civilians shot data

* Inputs:
   * `OIS.xlsx`
   * `agencies_and_counties.csv` (used to add county information)
* Output: `shot_civilians.csv`

##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [None]:
# data.world project the raw workbook is read from and the cleaned CSV is written to.
DTW_OIS_PROJECT = 'tji/officer-involved-shootings'
# File name used for the cleaned output (both on data.world and on S3).
CLEANED_FILENAME = 'shot_civilians.csv'

In [None]:
import os
import sys
import boto3
import datadotworld as dw
import numpy as np
import pandas as pd

from lib.cleaning_tools import *
from lib.standardize_police_agency_names import standardize_agency_name

from io import StringIO

sys.path.append(os.getcwd() + '/../data_cleaning')

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -r -g -w -p datadotworld,numpy,pandas

In [None]:
# Load the 'tji/auxiliary-datasets' project from data.world (force_update=True
# is passed to the loader) and pick out the 'agencies_and_counties' table,
# which is used further down to look up a county for each agency name.
datasets = dw.load_dataset('tji/auxiliary-datasets', force_update=True)
agencies = datasets.dataframes['agencies_and_counties']

In [None]:
# Read the raw workbook, keep its 'OISTable' entry, and report how many rows
# it holds and which incident dates it spans.
workbook = read_dtw_excel(DTW_OIS_PROJECT, 'original/OIS.xlsx')
shootings = workbook['OISTable']

incident_dates = shootings['Date of Incident']
first_day = incident_dates.min().strftime('%Y-%m-%d')
last_day = incident_dates.max().strftime('%Y-%m-%d')
print("Found %d OIS civilians-shot incidents from %s to %s" % (len(shootings), first_day, last_day))
shootings.head()

## 2. Begin cleaning

In [None]:
# Column headers: trim surrounding whitespace and lowercase.
shootings.columns = [header.strip().lower() for header in shootings.columns]
# String cells: normalized in place by the project helper.
upcase_strip_string_cells(shootings)
# Columns that are not needed for the cleaned output.
unused_columns = ['no.', 'column1', 'column2', 'extras']
shootings.drop(unused_columns, axis=1, inplace=True)

In [None]:
# Every "date of report N" header ends in its report number; the largest N
# is the maximum number of reports any single row can carry.
report_numbers = [
    int(header.strip().split()[-1])
    for header in shootings.columns
    if header.startswith("date of report")
]
max_reports_per_incident = max(report_numbers)
print("Deaths may have as many as %d OIS reports filed" % max_reports_per_incident)

In [None]:
# Map the spreadsheet's (already lowercased) headers to machine-friendly names.
col_renames = {
    "number of reports filed": "num_reports_filed",
    "date ag received": "date_ag_received",
    "date of incident": "date_incident",
    "time of incident": "time_incident",
    "injured or deceased's first name": "civilian_name_first",
    "injured or deceased's last name": "civilian_name_last",
    "injured or deceased's gender": "civilian_gender",
    "injured or deceased's age": "civilian_age",
    "injured or deceased's race/ethnicity": "civilian_race",
    "street address of incident": "incident_address",
    "city of incident": "incident_city",
    "county of incident": "incident_county",
    "zip code of incident": "incident_zip",
    "incident resulted in": "incident_resulted_in",
    "carried, exhibited or used deadly weapon": "deadly_weapon",
    "on duty or off duty": "officer_on_duty",
    "peace officer responding with 1 or more officers": "multiple_officers_involved",
    "incident occurred during or as a result of": "incident_result_of",
    "if other, specify type of call": "incident_call_other",
    "deadly weapon description": "weapon_reported_by_media",
    "cdr?": "custodial_death_report",
    "cdr narrative": "cdr_narrative",
    "narrative published by law enforcement": "lea_narrative_published",
    "shorter": "lea_narrative_shorter",
}

# Several headers repeat once per numbered slot (e.g. "name of agency 1",
# "name of agency 2", ...). Rather than spelling out every number, each
# template below is expanded for every slot from 1 up to the maximum
# number of reports.
numerical_renames = {
    "date of report %d": "agency_report_date_%d",
    "name of agency %d": "agency_name_%d",
    "city of agency %d": "agency_city_%d",
    "zip code of agency %d": "agency_zip_%d",
    "name of person %d filling out form": "agency_name_person_filling_out_%d",
    "email address of person %d filling out form": "agency_email_person_filling_out_%d",
    "peace officer %d's gender": "officer_gender_%d",
    "peace officer %d's age": "officer_age_%d",
    "peace officer %d's race/ethnicity": "officer_race_%d",
    "incident occurred during or as a result of %d": "officer_caused_injury_%d",
    "news %d": "news_coverage_%d",
}
for slot in range(1, max_reports_per_incident + 1):
    col_renames.update({
        raw_template % slot: clean_template % slot
        for raw_template, clean_template in numerical_renames.items()
    })

# Every header must have a mapping; an unmapped header raises KeyError here.
shootings.columns = [col_renames[raw_header] for raw_header in shootings.columns]
shootings.head()

### Standardize common column types

In [None]:
# Apply the project's cleaning helpers to the frame. Their results are not
# assigned, so they presumably modify `shootings` in place -- TODO confirm
# against lib.cleaning_tools. The call order is kept as-is because the
# helpers' interactions are not visible from this notebook.
standardize_gender_cols(shootings)
standardize_race_cols(shootings)
numericalize_age_cols(shootings)
convert_date_cols(shootings)

### Create a column for the number of officers whose information was recorded here, `num_officers_recorded`

We use the presence of officer gender as an indicator. Not all incidents use a separate agency_name_X column per officer, so we can't use that.

In [None]:
# Count, per row, how many officer slots have a gender recorded.
# The officer_gender_N columns are discovered from the frame rather than
# assuming exactly ten of them, so a workbook with more or fewer officer
# slots neither raises KeyError nor silently ignores the extra slots.
gender_prefix = 'officer_gender_'
officer_gender_cols = [
    col for col in shootings.columns
    if col.startswith(gender_prefix) and col[len(gender_prefix):].isdigit()
]
shootings['num_officers_recorded'] = shootings[officer_gender_cols].notnull().sum(axis=1)
# Peek at rows with an unusually large number of recorded officers.
shootings[shootings['num_officers_recorded'] > 5].head()

In [None]:
# Yes/no flags -> booleans. Missing answers count as "no". startswith() is
# used instead of indexing [0] so a blank (whitespace-only) cell yields
# False rather than raising IndexError.
for flag_col in ('custodial_death_report', 'multiple_officers_involved'):
    shootings[flag_col] = shootings[flag_col].fillna('n').apply(
        lambda answer: answer.strip().lower().startswith('y'))

# Missing cells are passed through untouched below (the same guard the
# officer_on_duty / weapon columns already use) instead of crashing on
# NaN.strip(); a missing outcome therefore compares unequal to 'death'.
shootings['civilian_died'] = shootings['incident_resulted_in'].apply(
    lambda x: x if pd.isnull(x) else x.strip().lower()) == 'death'
shootings.drop('incident_resulted_in', axis=1, inplace=True)
shootings['incident_result_of'] = shootings['incident_result_of'].apply(
    lambda x: x if pd.isnull(x) else x.strip().lower())

# True when the answer starts with "on " (e.g. "on duty"); missing stays missing.
shootings['officer_on_duty'] = shootings['officer_on_duty'].apply(
    lambda x: x if pd.isnull(x) else x.strip().lower().startswith('on '))
# Lowercased here so descriptions match the lowercase keys of the manual
# weapon categorization further down.
shootings['weapon_reported_by_media'] = shootings['weapon_reported_by_media'].apply(
    lambda w: w if pd.isnull(w) else w.strip().lower())

### NOTE: Data quirk

It's unclear how many officers were actually at the scene
* The `multiple_officers_involved` column is a yes/no column, but there are also columns to list the agency, gender, etc. for each officer. These do not always agree. Sometimes `multiple_officers_involved` is yes, but only one officer's details are recorded. And sometimes we have details for many officers, but `multiple_officers_involved` is no. See below.
* The punchline is to interpret these columns with caution

In [None]:
# Cross-tabulate the yes/no flag against the number of officers whose
# details were recorded, to show where the two disagree.
pd.crosstab(shootings.multiple_officers_involved, shootings.num_officers_recorded)

### Handle weapons-related questions

In [None]:
# Inspect the distinct answers before converting the column to booleans.
shootings['deadly_weapon'].value_counts()

In [None]:
def _answered_yes(answer):
    """Return True when the cell reads 'yes', ignoring case and outer whitespace."""
    return answer.strip().lower() == 'yes'


# Convert yes/no to boolean
shootings['deadly_weapon'] = shootings['deadly_weapon'].apply(_answered_yes)
shootings['deadly_weapon'].value_counts()

In [None]:
# Sanity check: the "was there a deadly weapon?" answer is sometimes "No"
# even though a weapon description was supplied. Tabulate the flag against
# whether a description is present.
has_description = shootings['weapon_reported_by_media'].notnull()
pd.crosstab(shootings['deadly_weapon'], has_description)

In [None]:
# Pull out the rows flagged "no deadly weapon" that nonetheless carry a
# weapon description, so they can be eyeballed.
no_weapon_flag = ~shootings['deadly_weapon']
described = pd.notnull(shootings['weapon_reported_by_media'])
s = shootings.loc[no_weapon_flag & described, ['deadly_weapon', 'weapon_reported_by_media']]
print(len(s))
s

In [None]:
# List every distinct, non-missing weapon description to see whether they
# can be grouped into categories.
distinct_descriptions = {
    desc for desc in shootings.weapon_reported_by_media if pd.notnull(desc)
}
sorted(distinct_descriptions)

In [None]:
# Manual categorization of weapons. Keys are the output categories; values
# are the lowercased descriptions that belong to each.
# "officer's gun" previously sat in the KNIFE/CUTTING list, which
# misclassified it; it belongs with the other firearms (cf. "deputy's gun").
weapon_types = {
    'FIREARM': [
        'handgun', 'sawed-off shotgun', 'revolver', 'rifle',
        'assault rifle', 'firearm', 'shotgun', 'long gun', 'gun',
        'glock 40', 'pistol', 'knife, gun', "deputy's gun",
        "officer's gun", 'vehicle, gun', 'semi-automatic rifle'],
    'KNIFE/CUTTING': [
        'hatchet', 'butcher knife', 'knife', 'knives', 'box cutter',
        'knife - not opened', 'machete', 'sword', 'axe', 'knives (2)',
        'pickaxe', 'samurai sword', "took officer's knife",
        'sharp metal object (piece of headphones)', 'scissors, screwdriver'],
    'VEHICLE': ['car', 'truck', 'vehicle'],
}

# Invert to description -> category, reporting any description that was
# listed more than once (the last listing wins).
type_lookup = {}
for w_type, descriptions in weapon_types.items():
    for description in descriptions:
        if description in type_lookup:
            print("DUPLICATE:", w_type, description)
        type_lookup[description] = w_type


def _categorize_weapon(has_weapon, desc):
    """Return the category for one row.

    With no description: '(DETAILS MISSING)' when a deadly weapon was
    reported, otherwise NaN. With a description: its mapped category, or
    'OTHER' when the description is not in the manual lists.
    """
    if pd.isnull(desc) or not desc:
        return '(DETAILS MISSING)' if has_weapon else np.nan
    return type_lookup.get(desc, 'OTHER')


shootings['weapon_reported_by_media_category'] = [
    _categorize_weapon(has_weapon, desc)
    for has_weapon, desc in zip(shootings['deadly_weapon'],
                                shootings['weapon_reported_by_media'])
]
shootings['weapon_reported_by_media_category'].value_counts()

In [None]:
# Show the descriptions that fell through to the 'OTHER' category.
is_other = shootings['weapon_reported_by_media_category'] == 'OTHER'
sorted(set(shootings.loc[is_other, 'weapon_reported_by_media']))

### Uppercase the content of all columns

In [None]:
# Uppercase every string cell in object-dtype columns, printing the column
# names as they are processed. Non-string cells (NaN, booleans, dates, ...)
# are left untouched. isinstance() replaces the exact type() comparison so
# str subclasses are uppercased as well.
print('Uppercasing columns: ', end='')
for col, dt in shootings.dtypes.items():
    if dt == 'object':
        print(col, end=' ')
        shootings[col] = shootings[col].apply(
            lambda cell: cell.upper() if isinstance(cell, str) else cell)

### Standardize police agency names

In [None]:
# Standardize every agency_name_N column. The columns are discovered from
# the frame instead of assuming exactly ten slots, so a differently sized
# workbook neither raises KeyError nor leaves later slots unstandardized.
# The digit check keeps agency_name_person_filling_out_N out of the list.
name_prefix = 'agency_name_'
agency_name_cols = [
    col for col in shootings.columns
    if col.startswith(name_prefix) and col[len(name_prefix):].isdigit()
]
for col in agency_name_cols:
    shootings[col] = shootings[col].apply(standardize_agency_name)

shootings.head()

### Add county information

In [None]:
# Look up each agency's county by its (standardized) name and add it as
# agency_county_N; the new column is placed relative to agency_city_N by
# insert_col_after. Agencies missing from the lookup table get NaN.
dept_to_county = dict(zip(agencies.agency, agencies.county))

# Slot numbers come from the agency_name_N columns actually present rather
# than a hard-coded ten. They are collected up front because `shootings`
# is rebound on every iteration.
name_prefix = 'agency_name_'
agency_slots = sorted(
    int(col[len(name_prefix):]) for col in shootings.columns
    if col.startswith(name_prefix) and col[len(name_prefix):].isdigit())
for i in agency_slots:
    shootings = insert_col_after(
        shootings,
        shootings['agency_name_%d' % i].apply(lambda d: dept_to_county.get(d, np.nan)),
        'agency_county_%d' % i,
        'agency_city_%d' % i)

# How many rows have (True) / lack (False) a missing county for the first agency?
shootings.agency_county_1.isnull().value_counts()

In [None]:
# List the first-slot agency names for which no county was found.
shootings[shootings.agency_county_1.isnull()].agency_name_1.tolist()

### Other analysis revealed some typos with agency name. We'll demonstrate them here before correcting.

In [None]:
# Build a city-by-county table of row counts, then record for each county
# the city with the most rows (the first such city when several tie).
city_by_county = (
    shootings
    .groupby(['incident_county', 'incident_city'])
    .size()
    .sort_values()
    .unstack()
    .T
)
county_to_biggest_city = city_by_county.idxmax().to_dict()

In [None]:
# The five counties with the most rows, and each one's busiest city.
county_counts = shootings.incident_county.value_counts()
TOP5 = county_counts.head(5).index.tolist()
TOP5_CITIES = list(map(county_to_biggest_city.get, TOP5))
print(TOP5)
print(TOP5_CITIES)

In [None]:
# For each of the top counties, show which counties the first-slot agencies
# of its incidents map to.
for county in TOP5:
    in_county = shootings.incident_county == county
    print("-- %s --" % county)
    print(shootings.loc[in_county, 'agency_county_1'].value_counts())
    print()

### 'HARRIS' and 'HARRISON' counties are nowhere near each other, so it seems more likely that there is a typo or data error than that four incidents in HARRIS county involved officers from HARRISON county. Let's check them out.

In [None]:
# Rows whose incident is in HARRIS county while the first agency maps to HARRISON.
shootings[(shootings.incident_county == 'HARRIS') & (shootings.agency_county_1 == 'HARRISON')]

### Yup. These are officers from Baytown and Spring, which border Houston (Harris, not Harrison county). Their emails are also from `hctx.net` which is Harris County. Their given agency names must be errors. 

In [None]:
# Each entry: [agency city, wrongly assigned county, correct county].
city_county_corrections = [
    ['BAYTOWN', 'HARRISON', 'HARRIS'],
    ['SPRING', 'HARRISON', 'HARRIS'],
]

# Slot numbers are taken from the agency_name_N columns actually present.
name_prefix = 'agency_name_'
agency_slots = sorted(
    int(col[len(name_prefix):]) for col in shootings.columns
    if col.startswith(name_prefix) and col[len(name_prefix):].isdigit())

corrected = set()  # row labels that were touched, for display below
for city, wrong_county, right_county in city_county_corrections:
    for i in agency_slots:
        name_col = 'agency_name_%d' % i
        city_col = 'agency_city_%d' % i
        county_col = 'agency_county_%d' % i
        mismatched = shootings[(shootings[city_col] == city) &
                               (shootings[county_col] == wrong_county)]
        if mismatched.empty:
            # Keep checking the remaining slots: a slot without matches
            # says nothing about later ones. (This used to `break`, which
            # skipped every later slot.)
            continue
        shootings.loc[mismatched.index, county_col] = right_county
        for idx, name in mismatched[name_col].items():
            # The agency name embeds the county, so fix it there as well.
            shootings.loc[idx, name_col] = name.replace(wrong_county, right_county)
            corrected.add(idx)

# .loc needs a list-like indexer; a set is not accepted by current pandas.
shootings.loc[sorted(corrected)]

### While we're at it, are there any mistakes the other way? (Harris county officers showing up in Harrison county)
#### Answer: no.

In [None]:
# Rows whose incident is in HARRISON county while the first agency maps to HARRIS.
shootings[(shootings.incident_county == 'HARRISON') & (shootings.agency_county_1 == 'HARRIS')]

### Flag records that are part of the same incident

In [None]:
# Rows sharing both address and date are treated as the same incident.
# Build an (address, date) -> row-count lookup, then tag every row with the
# size of its group; rows missing either key count as their own incident.
incident_records = (
    shootings
    .groupby(['incident_address', 'date_incident'])
    .size()
    .sort_values(ascending=False)
    .to_dict()
)
num_records = [
    incident_records[(address, date)] if pd.notnull(address) and pd.notnull(date) else 1
    for address, date in zip(shootings.incident_address, shootings.date_incident)
]
# TODO(wetchler) -- add this step to the data cleaning
shootings['num_rows_about_this_incident'] = num_records

### Add a column for full name

In [None]:
# Join first and last name (missing parts become empty), collapse runs of
# whitespace to single spaces, and turn an empty result into NaN.
first_names = shootings.civilian_name_first.fillna('')
last_names = shootings.civilian_name_last.fillna('')
full_names = (first_names + ' ' + last_names).str.split().str.join(' ')
full_names = full_names.mask(full_names == '')
shootings = insert_col_after(shootings, full_names, 'civilian_name_full', 'civilian_name_last')

### Re-order columns more sensibly
* **Disclaimer**: This code is gross and probably an OCD attempt to make the final result look nice. Just destroy this cell if it proves too confusing or opaque.

In [None]:
# Column layout: sections in `section_order`; within the 'agency' section
# each numbered slot lists its officer fields followed by its agency
# fields; anything not claimed by a section is appended alphabetically.
section_order = ['date', 'incident', 'civilian', 'agency', 'news']
agency_fields = ['name', 'report_date', 'city', 'county', 'zip', 'name_person_filling_out', 'email_person_filling_out']
officer_fields = ['age', 'race', 'gender', 'on_duty', 'caused_injury']

# Highest numbered officer_/agency_ slot actually present (instead of a
# hard-coded ten).
slot_numbers = [
    int(col.rsplit('_', 1)[-1]) for col in shootings.columns
    if col.startswith(('agency_', 'officer_')) and col.rsplit('_', 1)[-1].isdigit()
]
highest_slot = max(slot_numbers, default=0)

new_col_order = []
remaining = set(shootings.columns)
for section in section_order:
    if section == 'agency':
        for i in range(1, highest_slot + 1):
            slot_cols = (['officer_%s_%d' % (field, i) for field in officer_fields] +
                         ['agency_%s_%d' % (field, i) for field in agency_fields])
            for c in slot_cols:
                # Both officer and agency columns are now skipped when
                # absent; previously a missing agency column raised
                # KeyError from remaining.remove().
                if c in remaining:
                    new_col_order.append(c)
                    remaining.remove(c)
    else:
        # Keep the frame's existing relative order within the section.
        for c in shootings.columns:
            if c in remaining and c.startswith(section + '_'):
                new_col_order.append(c)
                remaining.remove(c)

new_col_order.extend(sorted(remaining))
shape_before = shootings.shape
shootings = shootings[new_col_order]
# Sanity check: reordering must neither drop nor duplicate columns.
assert shootings.shape == shape_before

In [None]:
# Order rows by incident date, then incident county, then first agency's county.
row_sort_keys = ['date_incident', 'incident_county', 'agency_county_1']
shootings = shootings.sort_values(row_sort_keys)
shootings.head(100)

## 3. Write

In [None]:
# Each upload is opt-in via an environment variable; say so up front when
# a destination is going to be skipped.
dw_sync_enabled = os.environ.get('CLEAN_OIS_DW') == 'TRUE'
s3_sync_enabled = os.environ.get('CLEAN_OIS_S3') == 'TRUE'
if not dw_sync_enabled:
    print("Not syncing to Data.world. To do so, set CLEAN_OIS_DW to 'TRUE'")
if not s3_sync_enabled:
    print("Not writing to s3. To do so, set CLEAN_OIS_S3 to 'TRUE'")

In [None]:
# Upload the cleaned frame as CSV (without the index) to the data.world
# project, but only when CLEAN_OIS_DW is set to 'TRUE'.
if os.environ.get('CLEAN_OIS_DW') == 'TRUE':
    with dw.open_remote_file(DTW_OIS_PROJECT, CLEANED_FILENAME) as w:
        print("Writing to data.world: %s/%s" % (DTW_OIS_PROJECT, CLEANED_FILENAME))
        shootings.to_csv(w, index=False)

In [None]:
# Upload the cleaned frame as CSV (without the index) to the S3 bucket,
# but only when CLEAN_OIS_S3 is set to 'TRUE'. The CSV is rendered into an
# in-memory text buffer first and its contents are sent as the object body.
if os.environ.get('CLEAN_OIS_S3') == 'TRUE':
    csv_buffer = StringIO()
    shootings.to_csv(csv_buffer, index=False)
    cleaned_object = boto3.resource('s3').Object('tji-public-cleaned-datasets', CLEANED_FILENAME)
    cleaned_object.put(Body=csv_buffer.getvalue())
    