# Clean data about individual police officers

* Input: `TCOLE - Texas Sworn Officers.xlsx`
* Output: `list_of_texas_officers.csv`
  
##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
# data.world project key that every read and write in this notebook goes through
DTW_PROJECT_KEY = 'tji/auxiliary-datasets'
# Name of the dataframe, within that project, holding the agency -> county lookup
AGENCY_COUNTY_DATAFRAME = 'agencies_and_counties'
# Filename the cleaned officer list is written to in the final cell
CLEANED_FILENAME = 'list_of_texas_officers.csv'

In [2]:
####################################################
# Boilerplate import/setup code for general analysis
# everett.wetchler@gmail.com
####################################################

import datetime as dt
import os
import random

import datadotworld as dw
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Jupyter setup
%matplotlib inline

## Graphical setup
# Useful colors to reference
SNS_BLUE, SNS_GREEN, SNS_RED, SNS_PURPLE, SNS_YELLOW, SNS_CYAN = sns.color_palette()
SNS_COLORS = sns.color_palette()
# sns.set_palette(sns.color_palette("cubehelix", 8))
mpl.rcParams.update({
  'font.size': 14,
  'axes.titlesize': 'x-large',
  'axes.labelsize': 'large',
  'xtick.labelsize': 'medium',
  'ytick.labelsize': 'medium',
  'legend.fancybox': True,
  'legend.fontsize': 'medium',
  'legend.frameon': True,
  'legend.framealpha': 0.7,
  'figure.figsize': ['9', '6'],
})

# Watermark extension to print version/system information
# Flags:
# -a [author] -d (date) -t (time) -z (timezone) -r (repo)
# -g (git hash) -w (watermark version) -p [packages] (package info)
%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -w -p numpy,pandas,matplotlib,datadotworld

####################################################
# END Boilerplate
####################################################

Everett Wetchler 2018-08-19 12:48:29 PDT

numpy 1.14.5
pandas 0.23.3
matplotlib 2.2.2
datadotworld 1.6.0
watermark 1.6.1


In [3]:
from lib.cleaning_tools import *
from lib.standardize_police_agency_names import standardize_agency_name

In [4]:
# Load the data.world project and pull out the agency/county lookup table.
# NOTE(review): force_update=True presumably re-downloads the dataset rather
# than reusing a previously downloaded copy -- confirm against the
# datadotworld docs.
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
agency_county = datasets.dataframes[AGENCY_COUNTY_DATAFRAME]
agency_county.head()

Unnamed: 0,agency,county
0,ANDERSON CO CONST PCT 1,ANDERSON
1,ANDERSON CO CONST PCT 2,ANDERSON
2,ANDERSON CO CONST PCT 3,ANDERSON
3,ANDERSON CO CONST PCT 4,ANDERSON
4,ANDERSON CO DIST ATTY OFFICE,ANDERSON


In [5]:
# read_dtw_excel is not defined in this notebook (presumably it arrives via
# the star import from lib.cleaning_tools). Its result is indexed by sheet
# name here; the officer roster is taken from 'Sheet1'.
officer_info = read_dtw_excel(DTW_PROJECT_KEY, 'original/TCOLE - Texas Sworn Officers.xlsx')['Sheet1']
officer_info.head()

Writing excel file to temp file: /var/folders/dc/8cbxbsh515s908xl0zyprszm0000gn/T/tmppjsupdq4


Unnamed: 0,First Name,Middle,Last Name,Suffix,Current Department,Highest Cert,Service Time,Gender,Race,Age Range
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT.,Master Peace Officer,23.92,M,White,41 to 50
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,Master Peace Officer,25.75,M,White,51 or over
2,CHARLES,T,KELLEY,,WILLIAMSON CO. SHERIFF'S OFFICE,Master Peace Officer,23.41,M,White,41 to 50
3,NAYA,C,POPE,,IRVING POLICE DEPT.,Master Peace Officer,36.91,F,Hispanic,51 or over
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT.,Master Peace Officer,32.41,F,White,51 or over


## 2. Begin cleaning

### Make columns more machine friendly

In [6]:
# snake_case the headers: lowercase, trim, then collapse each run of
# whitespace into a single underscore.
officer_info.columns = (
    officer_info.columns
    .str.lower()
    .str.strip()
    .str.split()
    .str.join('_')
)
list(officer_info.columns)

['first_name',
 'middle',
 'last_name',
 'suffix',
 'current_department',
 'highest_cert',
 'service_time',
 'gender',
 'race',
 'age_range']

### Clean gender and race columns

In [7]:
# Raw gender codes, before standardization
officer_info['gender'].value_counts()

M    68078
F     9538
        33
U       12
Name: gender, dtype: int64

In [8]:
# Standardize the gender column, then report how many officers end up
# without a gender value.
standardize_gender_cols(officer_info)
n_missing_gender = officer_info['gender'].isnull().sum()
print("Missing gender for %d officers" % n_missing_gender)
officer_info['gender'].value_counts()

Missing gender for 45 officers


MALE      68078
FEMALE     9538
Name: gender, dtype: int64

In [9]:
# Raw race labels, before standardization
officer_info['race'].value_counts()

White            45883
Hispanic         21987
Black             7908
Asian             1120
MultiCultural      466
Am Indian          285
Unknown             12
Name: race, dtype: int64

In [10]:
# Standardize the race column, then report how many officers end up
# without a race value.
standardize_race_cols(officer_info)
n_missing_race = officer_info['race'].isnull().sum()
print("Missing race for %d officers" % n_missing_race)
officer_info.head()

Missing race for 0 officers


Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender,race,age_range
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT.,Master Peace Officer,23.92,MALE,WHITE,41 to 50
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,Master Peace Officer,25.75,MALE,WHITE,51 or over
2,CHARLES,T,KELLEY,,WILLIAMSON CO. SHERIFF'S OFFICE,Master Peace Officer,23.41,MALE,WHITE,41 to 50
3,NAYA,C,POPE,,IRVING POLICE DEPT.,Master Peace Officer,36.91,FEMALE,HISPANIC,51 or over
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT.,Master Peace Officer,32.41,FEMALE,WHITE,51 or over


### Standardize agency names

In [11]:
# Run every department name through the shared agency-name standardizer
# (imported from lib.standardize_police_agency_names).
raw_departments = officer_info['current_department']
officer_info['current_department'] = raw_departments.map(standardize_agency_name)
officer_info.head()

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender,race,age_range
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT,Master Peace Officer,23.92,MALE,WHITE,41 to 50
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,Master Peace Officer,25.75,MALE,WHITE,51 or over
2,CHARLES,T,KELLEY,,WILLIAMSON CO SHERIFFS OFFICE,Master Peace Officer,23.41,MALE,WHITE,41 to 50
3,NAYA,C,POPE,,IRVING POLICE DEPT,Master Peace Officer,36.91,FEMALE,HISPANIC,51 or over
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT,Master Peace Officer,32.41,FEMALE,WHITE,51 or over


### Add county names

In [12]:
# Build an agency -> county lookup from the reference table.
agency_to_county = dict(zip(agency_county['agency'], agency_county['county']))
# dict.get returns None for agencies missing from the lookup, so those
# officers get a null county.
departments = officer_info['current_department']
officer_info['current_department_county'] = departments.apply(agency_to_county.get)

In [13]:
# Rows whose department was not found in the agency -> county lookup
missing_county = officer_info['current_department_county'].isnull()
print("Could not determine agency county for %d officers" % missing_county.sum())
# TODO: These agencies we did not have county information for, though
# it's easy to manually infer. At some point, fix these.
officer_info.loc[missing_county, 'current_department'].value_counts()

Could not determine agency county for 160 officers


CYPRESS-FAIRBANKS ISD POLICE DEPT                88
ALABAMA-COUSHATTA TRIBE OF TEXAS POLICE DEPT     18
ECTOR CO HOSPITAL DIST POLICE DEPT               13
TEXAS AM UNIVERSITY-CENTRAL TEXAS POLICE DEPT    10
STAFFORD FIRE MARSHALS OFFICE                     6
HIDALGO CO CONST PCT 5                            5
TENAHA POLICE DEPT                                4
FARMERSVILLE ISD POLICE DEPT                      3
UNIV OF DALLAS POLICE DEPT                        2
RICHLAND POLICE DEPT                              2
CRISWELL COLLEGE POLICE DEPT                      2
DUMAS FIRE DEPT                                   1
COMO-PICKTON CISD POLICE DEPT                     1
50TH JUDICIAL DIST ATTY OFFICE                    1
FRANKLIN CO FIRE MARSHALS OFFICE                  1
COKE CO CONST PCT                                 1
HAMILTON CO CONST PCT 3                           1
AUBREY ISD POLICE DEPT                            1
Name: current_department, dtype: int64

### Uppercase string values

In [14]:
# Helper not defined in this notebook (presumably from lib.cleaning_tools).
# Its return value is discarded, so it appears to modify officer_info in
# place; later cells compare against uppercase values such as '20 OR LESS'.
upcase_strip_string_cells(officer_info)

### Fix first/last/etc name columns

In [15]:
# Apply the shared name standardizer to each of the four name-part columns.
name_part_columns = ['first_name', 'middle', 'last_name', 'suffix']
for name_col in name_part_columns:
    officer_info[name_col] = officer_info[name_col].map(standardize_name)

### Add 'full_name' column

In [16]:
# Join the four name parts with single spaces. Missing parts become '' and
# the split/join pass squeezes out the gaps they would otherwise leave.
name_parts = officer_info[['first_name', 'middle', 'last_name', 'suffix']].fillna('')
officer_info['full_name'] = [
    ' '.join(' '.join(parts).split())
    for parts in name_parts.itertuples(index=False, name=None)
]
officer_info.sample(5)

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender,race,age_range,current_department_county,full_name
64452,ALBERT,L,DAUGHERTY,,EL PASO POLICE DEPT,INTERMEDIATE PEACE OFFICER,4.58,MALE,WHITE,21 TO 30,EL PASO,ALBERT L DAUGHERTY
43154,SHANNON,T,DAVIS,,HUTTO POLICE DEPT,ADVANCED PEACE OFFICER,12.08,MALE,WHITE,41 TO 50,WILLIAMSON,SHANNON T DAVIS
68992,RONALD,R,WEDDLE,,IRVING POLICE DEPT,MASTER PEACE OFFICER,18.25,MALE,WHITE,41 TO 50,DALLAS,RONALD R WEDDLE
5035,GERARDO,,HERNANDEZ,,VAL VERDE CO CONST PCT 4,MASTER PEACE OFFICER,25.33,MALE,HISPANIC,51 OR OVER,VAL VERDE,GERARDO HERNANDEZ
18872,KENNETH,P,GUMM,,BEXAR CO CONST PCT 3,MASTER PEACE OFFICER,27.51,MALE,WHITE,51 OR OVER,BEXAR,KENNETH P GUMM


### Add an integer column for certification level

In [17]:
# A missing certification is recorded as the explicit label 'NONE'.
officer_info['highest_cert'] = officer_info['highest_cert'].fillna('NONE')
# Certification labels in ascending level order; list position == integer level.
cert_levels = {
    cert: level
    for level, cert in enumerate([
        'NONE',
        'BASIC PEACE OFFICER',
        'INTERMEDIATE PEACE OFFICER',
        'ADVANCED PEACE OFFICER',
        'MASTER PEACE OFFICER',
    ])
}
# Direct indexing (not .get) so an unrecognized certification raises KeyError
# instead of silently producing a missing level.
officer_info['highest_cert_int'] = officer_info['highest_cert'].apply(cert_levels.__getitem__)

### Adjust and check age_range and service_time columns

In [18]:
# Officers per raw age bracket, ordered by bracket label
age_bracket_counts = officer_info['age_range'].value_counts()
age_bracket_counts.sort_index()

20 OR LESS        2
21 TO 30      11443
31 TO 40      21793
41 TO 50      23145
51 OR OVER    21278
Name: age_range, dtype: int64

In [19]:
def convert_range(r):
    """Convert an uppercase age-bracket label into a compact range string.

    '20 OR LESS' -> '18-20'
    '51 OR OVER' -> '51+'
    'A TO B'     -> 'A-B'

    Non-string values (e.g. None or NaN for a missing bracket) are returned
    unchanged rather than raising AttributeError on `.replace`.
    """
    if not isinstance(r, str):
        # Missing bracket: nothing to convert, keep it missing.
        return r
    if r == '20 OR LESS':
        return '18-20'
    if r == '51 OR OVER':
        return '51+'
    return r.replace(' TO ', '-')
# Rewrite every bracket label into its compact form, then re-check the counts.
raw_age_brackets = officer_info['age_range']
officer_info['age_range'] = raw_age_brackets.map(convert_range)
officer_info['age_range'].value_counts().sort_index()

18-20        2
21-30    11443
31-40    21793
41-50    23145
51+      21278
Name: age_range, dtype: int64

#### Let's look at these two sub-21 officers....

In [20]:
# Show the officers recorded in the youngest bracket
in_youngest_bracket = officer_info['age_range'] == '18-20'
officer_info.loc[in_youngest_bracket]

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender,race,age_range,current_department_county,full_name,highest_cert_int
1886,JERRY,W,HAGAN,,TEXAS DEPT OF PUBLIC SAFETY,MASTER PEACE OFFICER,51.34,MALE,WHITE,18-20,TEXAS,JERRY W HAGAN,4
3560,JACKIE,W,GUNNELS,,TEXAS DEPT OF PUBLIC SAFETY,MASTER PEACE OFFICER,45.92,MALE,WHITE,18-20,TEXAS,JACKIE W GUNNELS,4


#### Well, those are clearly typos: nobody aged 20 or under can have 45+ years of service. Let's fix their age ranges to be 51+.

In [21]:
# Move the mis-bracketed '18-20' officers into the '51+' bracket.
mislabelled_youngest = officer_info['age_range'] == '18-20'
officer_info.loc[mislabelled_youngest, 'age_range'] = '51+'
officer_info['age_range'].value_counts().sort_index()

21-30    11443
31-40    21793
41-50    23145
51+      21280
Name: age_range, dtype: int64

#### While we're at it, does anyone else have an impossible age? The minimum age to enroll is usually 21, but occasionally 18.

In [22]:
def _bracket_upper_age(bracket):
    """Upper age bound of an 'A-B' bracket; 9999 stands in for the open-ended '51+'."""
    if bracket == '51+':
        return 9999
    return int(bracket[-2:])

age_range_max = officer_info['age_range'].apply(_bracket_upper_age)
# Flag anyone whose service time plus the youngest enrollment age (18)
# exceeds the top of their age bracket.
started_impossibly_young = (officer_info['service_time'] + 18) > age_range_max
officer_info[started_impossibly_young]

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender,race,age_range,current_department_county,full_name,highest_cert_int
1211,SCOTT,D,SMITH,,HOUSTON POLICE DEPT,MASTER PEACE OFFICER,37.0,MALE,WHITE,31-40,HARRIS,SCOTT D SMITH,4
62021,ROLANDO,R,RODRIGUEZ,,SAN JUAN POLICE DEPT,BASIC PEACE OFFICER,13.33,MALE,HISPANIC,21-30,HIDALGO,ROLANDO R RODRIGUEZ,1


#### Okay, these are clearly impossible, but it's unclear whether `age_range` or `service_time` is the problem. Since it's only 2 records, we'll leave them be.

### Reorder and rename columns, sort values

In [23]:
# Shape before renaming/re-ordering, checked again further down
before = officer_info.shape

# Rename columns.
# Every output column is listed, including those whose name is unchanged,
# so this dict doubles as the full output schema.

col_renames = {
  "full_name": "name_full",
  "first_name": "name_first",
  "middle": "name_middle",
  "last_name": "name_last",
  "suffix": "name_suffix",
  "gender": "gender",
  "race": "race",
  "age_range": "age_range",
  "current_department": "current_agency",
  "current_department_county": "current_agency_county",
  "highest_cert": "highest_cert",
  "highest_cert_int": "highest_cert_int",
  "service_time": "service_time"
}
officer_info = officer_info.rename(columns=col_renames)

# Re-order columns

officer_info = officer_info[[
    'name_full', 'name_first', 'name_middle', 'name_last', 'name_suffix',
    'gender', 'race', 'age_range', 'current_agency', 'current_agency_county',
    'highest_cert', 'highest_cert_int', 'service_time']]

# Make sure we didn't drop any columns by accident

after = officer_info.shape
assert before == after, "Shape changed while re-ordering columns: %s -> %s" % (before, after)

# Sort sensibly.
# Reassign rather than sorting in place: the frame above comes from a column
# selection, and in-place mutation of such a frame can trigger pandas'
# SettingWithCopyWarning.

officer_info = officer_info.sort_values(['current_agency_county', 'current_agency', 'name_full'])
officer_info.head(10)

Unnamed: 0,name_full,name_first,name_middle,name_last,name_suffix,gender,race,age_range,current_agency,current_agency_county,highest_cert,highest_cert_int,service_time
1160,DALE E SCHNELLE,DALE,E,SCHNELLE,,MALE,WHITE,51+,ANDERSON CO CONST PCT 1,ANDERSON,MASTER PEACE OFFICER,4,23.0
13843,GARY D THOMAS,GARY,D,THOMAS,,MALE,WHITE,51+,ANDERSON CO CONST PCT 1,ANDERSON,MASTER PEACE OFFICER,4,43.08
7444,CHARLES D LIGHTFOOT,CHARLES,D,LIGHTFOOT,,MALE,WHITE,51+,ANDERSON CO CONST PCT 2,ANDERSON,MASTER PEACE OFFICER,4,24.67
13844,WILLIAM R WATTS,WILLIAM,R,WATTS,,MALE,WHITE,51+,ANDERSON CO CONST PCT 2,ANDERSON,MASTER PEACE OFFICER,4,23.33
32116,KIMBERLY S HOLLIDAY,KIMBERLY,S,HOLLIDAY,,FEMALE,WHITE,51+,ANDERSON CO CONST PCT 3,ANDERSON,MASTER PEACE OFFICER,4,17.33
16266,JAMES O MUNIZ,JAMES,O,MUNIZ,,MALE,BLACK,51+,ANDERSON CO CONST PCT 4,ANDERSON,MASTER PEACE OFFICER,4,25.82
35911,ANESHIA D THOMPSON,ANESHIA,D,THOMPSON,,FEMALE,WHITE,41-50,ANDERSON CO DIST ATTY OFFICE,ANDERSON,ADVANCED PEACE OFFICER,3,16.25
50946,CATHY J STONER,CATHY,J,STONER,,FEMALE,WHITE,31-40,ANDERSON CO DIST ATTY OFFICE,ANDERSON,MASTER PEACE OFFICER,4,9.75
32639,RYAN N TOLLIVER,RYAN,N,TOLLIVER,,MALE,WHITE,41-50,ANDERSON CO DIST ATTY OFFICE,ANDERSON,ADVANCED PEACE OFFICER,3,14.75
56032,ANDREA N KARRIKER,ANDREA,N,KARRIKER,,FEMALE,WHITE,21-30,ANDERSON CO SHERIFFS OFFICE,ANDERSON,INTERMEDIATE PEACE OFFICER,2,5.58


## 3. Write

In [24]:
# Open CLEANED_FILENAME in the data.world project for writing and stream the
# cleaned frame into it as CSV, leaving out the (now unordered) integer index.
# NOTE(review): the upload presumably completes when the `with` block
# exits -- confirm against the datadotworld docs.
with dw.open_remote_file(DTW_PROJECT_KEY, CLEANED_FILENAME) as w:
    print("Writing to data.world:", CLEANED_FILENAME)
    officer_info.to_csv(w, index=False)

Writing to data.world: list_of_texas_officers.csv
