# Clean data from the Officer Down Memorial Page

* Input: `Texas Line of Duty Deaths (ODMP).xlsx`
* Output: `odmp_texas_line_of_duty_deaths.csv`
  
##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
# data.world project key; used below to load the lookup table, read the raw
# spreadsheet, and write the cleaned CSV.
DTW_PROJECT_KEY = 'tji/auxiliary-datasets'
# Name of the dataframe (within the project) that maps agencies to counties.
AGENCY_COUNTY_DATAFRAME = 'agencies_and_counties'
# Raw ODMP workbook; it is read from the project's 'original/' folder.
RAW_FILENAME = 'Texas Line of Duty Deaths (ODMP).xlsx'
# Name of the cleaned CSV written back to the project at the end.
CLEANED_FILENAME = 'odmp_texas_line_of_duty_deaths.csv'

In [2]:
####################################################
# Boilerplate import/setup code for general analysis
# everett.wetchler@gmail.com
####################################################

import datetime as dt
import os
import random

import datadotworld as dw
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Jupyter setup
%matplotlib inline

## Graphical setup
# Useful colors to reference
SNS_BLUE, SNS_GREEN, SNS_RED, SNS_PURPLE, SNS_YELLOW, SNS_CYAN = sns.color_palette()
SNS_COLORS = sns.color_palette()
# sns.set_palette(sns.color_palette("cubehelix", 8))
mpl.rcParams.update({
  'font.size': 14,
  'axes.titlesize': 'x-large',
  'axes.labelsize': 'large',
  'xtick.labelsize': 'medium',
  'ytick.labelsize': 'medium',
  'legend.fancybox': True,
  'legend.fontsize': 'medium',
  'legend.frameon': True,
  'legend.framealpha': 0.7,
  'figure.figsize': ['9', '6'],
})

# Watermark extension to print version/system information
# Flags:
# -a [author] -d (date) -t (time) -z (timezone) -r (repo)
# -g (git hash) -w (watermark version) -p [packages] (package info)
%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -w -p numpy,pandas,matplotlib,datadotworld

####################################################
# END Boilerplate
####################################################

Everett Wetchler 2018-05-28 12:06:01 CDT

numpy 1.14.3
pandas 0.22.0
matplotlib 2.2.0
datadotworld 1.6.0
watermark 1.5.0


In [3]:
from lib.cleaning_tools import *
from lib.standardize_police_agency_names import standardize_agency_name

In [4]:
# Load the data.world project and pull out the agency -> county lookup table.
# force_update=True is passed so a fresh copy is requested rather than a
# previously downloaded one (presumably; confirm against the datadotworld docs).
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
agency_county = datasets.dataframes[AGENCY_COUNTY_DATAFRAME]
agency_county.head()

Unnamed: 0,agency,county
0,ANDERSON CO CONST PCT 1,ANDERSON
1,ANDERSON CO CONST PCT 2,ANDERSON
2,ANDERSON CO CONST PCT 3,ANDERSON
3,ANDERSON CO CONST PCT 4,ANDERSON
4,ANDERSON CO DIST ATTY OFFICE,ANDERSON


In [5]:
# Read the raw ODMP workbook from the project's 'original/' folder using the
# project helper read_dtw_excel ('Sheet1' is presumably the worksheet name;
# verify against lib.cleaning_tools).
odmp = read_dtw_excel(DTW_PROJECT_KEY, 'original/' + RAW_FILENAME, 'Sheet1')
odmp.head()

Writing excel file to temp file: /var/folders/dc/8cbxbsh515s908xl0zyprszm0000gn/T/tmpaf4quxuw


Unnamed: 0,Date,Officer Name,Officer Agency,Cause of Death
0,12/19/1892,James B. Stevens,Wise County Constable's Office - Precinct 2,Gunfire
1,11/10/1837,Alfred H. Miles,Texas Rangers,Assault
2,11/10/1837,Jesse Blair,Texas Rangers,Assault
3,11/10/1837,James Christian,Texas Rangers,Assault
4,11/10/1837,James Joslen,Texas Rangers,Assault


## 2. Begin cleaning

### Make columns more machine friendly

In [6]:
# Columns are renamed by position, so this relies on the raw sheet's column
# order being: Date, Officer Name, Officer Agency, Cause of Death.
odmp.columns = ['date_of_death', 'officer_name', 'agency_name', 'cause_of_death']

### Date-ify date column

In [7]:
# Convert the date column to pandas datetimes. No explicit format is given,
# so pandas infers it from the values.
odmp['date_of_death'] = pd.to_datetime(odmp['date_of_death'])

### Uppercase string values

In [8]:
# The return value is ignored: this step relies on the helper modifying `odmp`
# in place (presumably upper-casing and stripping every string cell, per its
# name; confirm in lib.cleaning_tools).
upcase_strip_string_cells(odmp)

### Standardize agency names

In [9]:
# Rewrite each agency name with the project's standardize_agency_name helper,
# so the names can be looked up in the agency -> county table later on.
odmp['agency_name'] = odmp['agency_name'].apply(standardize_agency_name)
odmp.head()

Unnamed: 0,date_of_death,officer_name,agency_name,cause_of_death
0,1892-12-19,JAMES B. STEVENS,WISE CO CONST PCT 2,GUNFIRE
1,1837-11-10,ALFRED H. MILES,TEXAS RANGERS,ASSAULT
2,1837-11-10,JESSE BLAIR,TEXAS RANGERS,ASSAULT
3,1837-11-10,JAMES CHRISTIAN,TEXAS RANGERS,ASSAULT
4,1837-11-10,JAMES JOSLEN,TEXAS RANGERS,ASSAULT


### Add county names

In [10]:
# Lookup table: standardized agency name -> county (a later duplicate agency
# row overrides an earlier one).
agency_to_county = agency_county.set_index('agency')['county'].to_dict()

def get_county(agency):
    """Return the county for a standardized agency name, or None if unknown.

    Resolution order:
      1. A null/NaN agency yields None.
      2. An agency present in the ``agency_to_county`` lookup yields its
         mapped county.
      3. 'TEXAS DEPT OF CRIMINAL JUSTICE' yields the pseudo-county 'STATE'.
      4. A name containing ' CO ' yields the text before its first
         occurrence (e.g. 'BEXAR CO SHERIFFS OFFICE' -> 'BEXAR').
      5. Anything else yields None.
    """
    if pd.isnull(agency):
        return None

    # The lookup table takes precedence over every heuristic below.
    if agency in agency_to_county:
        return agency_to_county[agency]

    if agency == 'TEXAS DEPT OF CRIMINAL JUSTICE':
        return 'STATE'

    if ' CO ' not in agency:
        return None

    # Everything ahead of the first ' CO ' marker is taken as the county name.
    return agency.split(' CO ', 1)[0]

# Derive a county for every record from its standardized agency name; rows
# whose agency cannot be resolved get a null agency_county.
odmp['agency_county'] = odmp['agency_name'].apply(get_county)

In [11]:
# Report how many records have no county, then show the agencies that account
# for the most unresolved records (value_counts sorts by frequency).
print("Could not determine agency county for %d/%d officers" % (odmp['agency_county'].isnull().sum(), len(odmp)))
odmp[odmp['agency_county'].isnull()]['agency_name'].value_counts().head()

Could not determine agency county for 282/1870 officers


TEXAS RANGERS                                70
US CUSTOMS SERVICE                           30
US IMMIGRATION AND NATURALIZATION SERVICE    24
US BORDER PATROL                             22
US CUSTOMS AND BORDER PROTECTION             14
Name: agency_name, dtype: int64

### Drop records whose agency county is unknown (federal agencies, Texas Rangers, etc.)

In [12]:
# Keep only the records for which a county was determined.
odmp = odmp.dropna(subset=['agency_county'])

### Standardize officer names

In [13]:
# Normalize every officer name with the project's standardize_name helper.
odmp['officer_name'] = odmp['officer_name'].apply(standardize_name)
odmp.head()

Unnamed: 0,date_of_death,officer_name,agency_name,cause_of_death,agency_county
0,1892-12-19,JAMES B STEVENS,WISE CO CONST PCT 2,GUNFIRE,WISE
14,1840-03-19,JOSEPH L HOOD,BEXAR CO SHERIFFS OFFICE,STABBED,BEXAR
20,1858-01-09,SAMUEL LEE LOCKHART,LLANO CO SHERIFFS OFFICE,STABBED,LLANO
23,1860-02-19,THOMAS S MILLIGAN,MASON CO SHERIFFS OFFICE,ASSAULT,MASON
24,1861-08-24,JOHN B YORK,TARRANT CO SHERIFFS OFFICE,STABBED,TARRANT


## 3. Write

In [14]:
# Write the cleaned frame as CSV into the data.world project. index=False
# keeps the pandas row index out of the file.
with dw.open_remote_file(DTW_PROJECT_KEY, CLEANED_FILENAME) as w:
    print("Writing to data.world:", CLEANED_FILENAME)
    odmp.to_csv(w, index=False)

Writing to data.world: odmp_texas_line_of_duty_deaths.csv
