# Clean data from the Officer Down Memorial Page

* Input: `Texas Line of Duty Deaths by Gunfire (ODMP).xlsx`
* Output: `cleaned_odmp_tx.csv`
  
##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
# data.world project key; passed to every data.world read and write in this notebook.
DTW_PROJECT_KEY = 'tji/auxiliary-datasets'
# Key of the agency -> county lookup table within the loaded project's dataframes.
AGENCY_COUNTY_DATAFRAME = 'agencies_and_counties'
# Remote filename the cleaned ODMP data is written to in the final cell.
CLEANED_FILENAME = 'cleaned_odmp_tx.csv'

In [2]:
####################################################
# Boilerplate import/setup code for general analysis
# everett.wetchler@gmail.com
####################################################

import datetime as dt
import os
import random

import datadotworld as dw
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Let pandas display up to 100 rows / 100 columns before truncating output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Jupyter setup
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Graphical setup
# Useful colors to reference.
# A bare sns.color_palette() returns the currently active color cycle, and its
# length is not guaranteed to be six (seaborn's default palettes grew from six
# to ten colors in v0.9). Unpack only the first six entries so this cell does
# not fail with "too many values to unpack" on other seaborn/matplotlib versions.
# NOTE(review): with a longer cycle the first six hues may not match the names
# below -- confirm against the seaborn version actually in use.
SNS_COLORS = sns.color_palette()
SNS_BLUE, SNS_GREEN, SNS_RED, SNS_PURPLE, SNS_YELLOW, SNS_CYAN = SNS_COLORS[:6]
# sns.set_palette(sns.color_palette("cubehelix", 8))

# Notebook-wide matplotlib defaults: larger fonts, semi-transparent framed
# legends, and a 9x6 default figure size.
mpl.rcParams.update({
  'font.size': 14,
  'axes.titlesize': 'x-large',
  'axes.labelsize': 'large',
  'xtick.labelsize': 'medium',
  'ytick.labelsize': 'medium',
  'legend.fancybox': True,
  'legend.fontsize': 'medium',
  'legend.frameon': True,
  'legend.framealpha': 0.7,
  # Given as strings; presumably coerced to numbers by matplotlib's rcParams validation.
  'figure.figsize': ['9', '6'],
})

# Watermark extension to print version/system information
# Flags:
# -a [author] -d (date) -t (time) -z (timezone) -r (repo)
# -g (git hash) -w (watermark version) -p [packages] (package info)
# NOTE(review): seaborn is imported by this notebook but is not in the -p list
# below, so its version is not recorded in the output -- consider adding it.
%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -w -p numpy,pandas,matplotlib,datadotworld

####################################################
# END Boilerplate
####################################################

Everett Wetchler 2018-05-26 18:31:17 CDT

numpy 1.14.3
pandas 0.22.0
matplotlib 2.2.0
datadotworld 1.6.0
watermark 1.5.0


In [3]:
# Project-local helpers. The star import is presumably what supplies
# read_dtw_excel, upcase_strip_string_cells and standardize_name used in later
# cells (none of them is defined or imported by name in this notebook) -- TODO confirm.
from lib.cleaning_tools import *
from lib.standardize_police_agency_names import standardize_agency_name

In [4]:
# Load the data.world project. force_update=True presumably requests a fresh
# download instead of reusing a locally cached copy -- see the datadotworld docs.
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
# Agency -> county lookup table; used later to attach a county to each agency.
agency_county = datasets.dataframes[AGENCY_COUNTY_DATAFRAME]
agency_county.head()

Unnamed: 0,agency,county
0,ANDERSON CO CONST PCT 1,ANDERSON
1,ANDERSON CO CONST PCT 2,ANDERSON
2,ANDERSON CO CONST PCT 3,ANDERSON
3,ANDERSON CO CONST PCT 4,ANDERSON
4,ANDERSON CO DIST ATTY OFFICE,ANDERSON


In [5]:
# Read the raw ODMP spreadsheet from the data.world project through the project
# helper read_dtw_excel; 'Sheet1' is presumably the worksheet name -- TODO confirm.
odmp = read_dtw_excel(DTW_PROJECT_KEY, 'original/Texas Line of Duty Deaths by Gunfire (ODMP).xlsx', 'Sheet1')
odmp.head()

Writing excel file to temp file: /var/folders/dc/8cbxbsh515s908xl0zyprszm0000gn/T/tmpakm3sgz_


Unnamed: 0,Date,Officer Name,Officer Agency,Cause of Death
0,2018-04-25,Rogelio Santander Jr.,Dallas Police Department,Gunfire
1,2018-02-07,David Charles Sherrard,Richardson Police Department,Gunfire
2,2017-12-04,Kenneth Malcolm Copeland,San Marcos Police Department,Gunfire
3,2017-11-23,Damon Charles Allen,Texas Department of Public Safety,Gunfire
4,2017-10-09,Floyd East Jr.,Texas Tech University Police Department,Gunfire


## 2. Begin cleaning

### Make columns more machine friendly

In [6]:
# Positional rename: relies on the source columns arriving in the order
# Date, Officer Name, Officer Agency, Cause of Death (as in the preview of the raw data).
odmp.columns = ['date_of_death', 'officer_name', 'agency_name', 'cause_of_death']

### Date-ify date column

In [7]:
# Parse the date column into pandas datetimes. With the default errors='raise',
# an unparseable value stops the notebook here rather than producing NaT.
odmp['date_of_death'] = pd.to_datetime(odmp['date_of_death'])

### Uppercase string values

In [8]:
# Project helper; its return value is discarded, so it presumably modifies odmp
# in place (upper-casing and stripping string cells, per its name) -- TODO confirm.
upcase_strip_string_cells(odmp)

### Standardize agency names

In [9]:
# Rewrite each agency name into the project's standardized form (for example,
# "... POLICE DEPARTMENT" appears as "... POLICE DEPT" in the preview that follows)
# so names can be matched against the agency -> county lookup table.
odmp['agency_name'] = odmp['agency_name'].apply(standardize_agency_name)
odmp.head()

Unnamed: 0,date_of_death,officer_name,agency_name,cause_of_death
0,2018-04-25,ROGELIO SANTANDER JR.,DALLAS POLICE DEPT,GUNFIRE
1,2018-02-07,DAVID CHARLES SHERRARD,RICHARDSON POLICE DEPT,GUNFIRE
2,2017-12-04,KENNETH MALCOLM COPELAND,SAN MARCOS POLICE DEPT,GUNFIRE
3,2017-11-23,DAMON CHARLES ALLEN,TEXAS DEPT OF PUBLIC SAFETY,GUNFIRE
4,2017-10-09,FLOYD EAST JR.,TEXAS TECH UNIV POLICE DEPT,GUNFIRE


### Add county names

In [10]:
# Lookup table built from the reference dataset: agency name -> county.
agency_to_county = {
    agency: county
    for agency, county in zip(agency_county.agency, agency_county.county)
}


def get_county(agency):
    """Return the county for an agency name, or None if it cannot be determined.

    Resolution order:
      1. Null agency -> None.
      2. Exact match in ``agency_to_county`` -> the mapped county.
      3. Name containing ' CO ' -> the text before the first ' CO ' is taken
         as the county name; the (agency, inferred county) pair is printed so
         inferred values can be reviewed.
      4. Anything else -> None.
    """
    if pd.isnull(agency):
        return None
    if agency in agency_to_county:
        return agency_to_county[agency]
    if ' CO ' not in agency:
        return None
    # Fallback heuristic: everything before the first ' CO ' marker.
    name = agency.split(' CO ', 1)[0]
    print(agency, name)
    return name


odmp['agency_county'] = odmp['agency_name'].apply(get_county)

NUECES CO CONST PCT 6 NUECES
SAN SABA CO CONST PCT 1 SAN SABA
BOWIE CO CONST PCT 6 BOWIE
GONZALES CO CONST PCT 7 GONZALES
STONEWALL CO CONST PCT 1 STONEWALL
GRAY CO CONST PCT 1 GRAY
MARION CO CONST PCT 4 MARION
CASS CO CONST PCT 5 CASS
SHELBY CO CONST PCT 8 SHELBY
WILLACY CO CONST PCT 2 WILLACY
WILLACY CO CONST PCT 2 WILLACY
CAMERON CO CONST PCT 7 CAMERON
HARRIS CO COMMUNITY SUPERVISION AND CORRECTIONS DEPT HARRIS
TYLER CO CONST PCT 5 TYLER
HIDALGO CO CONST PCT 5 HIDALGO
MOTLEY CO CONST PCT 5 MOTLEY
VAN ZANDT CO CONST PCT 7 VAN ZANDT
REAGAN CO CONST PCT 5 REAGAN
CAMERON CO CONST PCT 7 CAMERON
TAYLOR CO CONST TAYLOR
MUSKOGEE CO SHERIFFS OFFICE MUSKOGEE
VAN ZANDT CO CONST PCT 7 VAN ZANDT
WICHITA CO CONST WICHITA
WICHITA CO CONST WICHITA


In [11]:
# Report how many officers have no resolvable agency county, then show the
# agencies that account for most of the gaps.
missing_county = odmp['agency_county'].isnull()
print("Could not determine agency county for %d/%d officers" % (missing_county.sum(), len(odmp)))
odmp.loc[missing_county, 'agency_name'].value_counts().head()

Could not determine agency county for 97/732 officers


US BORDER PATROL                  17
US CUSTOMS SERVICE                14
TEXAS DEPT OF CRIMINAL JUSTICE    11
TEXAS RANGERS                      7
US INTERNAL REVENUE SERVICE        6
Name: agency_name, dtype: int64

### Standardize officer names

In [12]:
# Normalize officer names with the project's standardize_name helper, then preview.
odmp['officer_name'] = odmp['officer_name'].apply(standardize_name)
odmp.head()

Unnamed: 0,date_of_death,officer_name,agency_name,cause_of_death,agency_county
0,2018-04-25,ROGELIO SANTANDER JR,DALLAS POLICE DEPT,GUNFIRE,DALLAS
1,2018-02-07,DAVID CHARLES SHERRARD,RICHARDSON POLICE DEPT,GUNFIRE,DALLAS
2,2017-12-04,KENNETH MALCOLM COPELAND,SAN MARCOS POLICE DEPT,GUNFIRE,HAYS
3,2017-11-23,DAMON CHARLES ALLEN,TEXAS DEPT OF PUBLIC SAFETY,GUNFIRE,TEXAS
4,2017-10-09,FLOYD EAST JR,TEXAS TECH UNIV POLICE DEPT,GUNFIRE,


## 3. Write

In [13]:
# Write the cleaned frame as CSV (index omitted) to the remote file
# CLEANED_FILENAME in the data.world project; the context manager closes the
# remote file handle when the block exits.
with dw.open_remote_file(DTW_PROJECT_KEY, CLEANED_FILENAME) as w:
    print("Writing to data.world:", CLEANED_FILENAME)
    odmp.to_csv(w, index=False)

Writing to data.world: cleaned_odmp_tx.csv
