# Clean data about individual police officers

* Input: `raw_list_of_texas_officers.csv`
* Output: `list_of_texas_officers.csv`
  
##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
DTW_PROJECT_KEY = 'tji/auxiliary-datasets'
RAW_DATAFRAME_NAME = 'raw_list_of_texas_officers'
AGENCY_COUNTY_DATAFRAME = 'agencies_and_counties'
CLEANED_FILENAME = 'list_of_texas_officers.csv'

In [2]:
####################################################
# Boilerplate import/setup code for general analysis
# everett.wetchler@gmail.com
####################################################

import datetime as dt
import os
import random

import datadotworld as dw
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

## Jupyter setup
%matplotlib inline

## Graphical setup
# Useful colors to reference
SNS_BLUE, SNS_GREEN, SNS_RED, SNS_PURPLE, SNS_YELLOW, SNS_CYAN = sns.color_palette()
SNS_COLORS = sns.color_palette()
# sns.set_palette(sns.color_palette("cubehelix", 8))
mpl.rcParams.update({
  'font.size': 14,
  'axes.titlesize': 'x-large',
  'axes.labelsize': 'large',
  'xtick.labelsize': 'medium',
  'ytick.labelsize': 'medium',
  'legend.fancybox': True,
  'legend.fontsize': 'medium',
  'legend.frameon': True,
  'legend.framealpha': 0.7,
  'figure.figsize': ['9', '6'],
})

# Watermark extension to print version/system information
# Flags:
# -a [author] -d (date) -t (time) -z (timezone) -r (repo)
# -g (git hash) -w (watermark version) -p [packages] (package info)
%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -w -p numpy,pandas,matplotlib,datadotworld

####################################################
# END Boilerplate
####################################################

Everett Wetchler 2018-05-09 23:11:36 CDT

numpy 1.14.3
pandas 0.20.1
matplotlib 2.2.0
datadotworld 1.6.0
watermark 1.5.0


In [3]:
from lib.standardize_police_agency_names import standardize_agency_name

In [4]:
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
agency_county = datasets.dataframes[AGENCY_COUNTY_DATAFRAME]
agency_county.head()

Unnamed: 0,agency,county
0,ANDERSON CO CONST PCT 1,ANDERSON
1,ANDERSON CO CONST PCT 2,ANDERSON
2,ANDERSON CO CONST PCT 3,ANDERSON
3,ANDERSON CO CONST PCT 4,ANDERSON
4,ANDERSON CO DIST ATTY OFFICE,ANDERSON


In [5]:
officer_info = datasets.dataframes[RAW_DATAFRAME_NAME]
officer_info.head()

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT.,Master Peace Officer,23.92,M
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,Master Peace Officer,25.75,M
2,CHARLES,T,KELLEY,,WILLIAMSON CO. SHERIFF'S OFFICE,Master Peace Officer,23.41,M
3,NAYA,C,POPE,,IRVING POLICE DEPT.,Master Peace Officer,36.91,F
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT.,Master Peace Officer,32.41,F


## 2. Begin cleaning

### Validate gender column

In [6]:
print("Missing gender for %d officers" % officer_info.gender.isnull().sum())
officer_info.gender.value_counts()

Missing gender for 33 officers


M    68131
F     9541
U       12
Name: gender, dtype: int64

In [7]:
officer_info.gender[officer_info.gender == 'U'] = np.nan
officer_info.gender.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


M    68131
F     9541
Name: gender, dtype: int64

### Standardize agency names

In [8]:
officer_info['current_department'] = officer_info['current_department'].apply(standardize_agency_name)
officer_info.head()

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT,Master Peace Officer,23.92,M
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,Master Peace Officer,25.75,M
2,CHARLES,T,KELLEY,,WILLIAMSON CO SHERIFFS OFFICE,Master Peace Officer,23.41,M
3,NAYA,C,POPE,,IRVING POLICE DEPT,Master Peace Officer,36.91,F
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT,Master Peace Officer,32.41,F


### Add county names

In [9]:
agency_to_county = dict(zip(agency_county.agency, agency_county.county))
officer_info['current_department_county'] = officer_info['current_department'].apply(lambda d: agency_to_county.get(d, None))
officer_info['current_department_county'].isnull().value_counts()

False    77675
True        42
Name: current_department_county, dtype: int64

In [10]:
officer_info['current_department'][officer_info['current_department_county'].isnull()].value_counts()

ALABAMA-COUSHATTA TRIBE OF TEXAS POLICE DEPT    18
STAFFORD FIRE MARSHALS OFFICE                    6
HIDALGO CO CONST PCT 5                           5
TENAHA POLICE DEPT                               4
UNIV OF DALLAS POLICE DEPT                       2
RICHLAND POLICE DEPT                             2
50TH JUDICIAL DIST ATTY OFFICE                   1
FRANKLIN CO FIRE MARSHALS OFFICE                 1
COKE CO CONST PCT                                1
DUMAS FIRE DEPT                                  1
HAMILTON CO CONST PCT 3                          1
Name: current_department, dtype: int64

### Uppercase string values

In [11]:
for c in officer_info.columns:
    officer_info[c] = officer_info[c].apply(lambda val: val if not isinstance(val, str) else val.upper())
officer_info.head()

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender,current_department_county
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT,MASTER PEACE OFFICER,23.92,M,FORT BEND
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,MASTER PEACE OFFICER,25.75,M,WALKER
2,CHARLES,T,KELLEY,,WILLIAMSON CO SHERIFFS OFFICE,MASTER PEACE OFFICER,23.41,M,WILLIAMSON
3,NAYA,C,POPE,,IRVING POLICE DEPT,MASTER PEACE OFFICER,36.91,F,DALLAS
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT,MASTER PEACE OFFICER,32.41,F,EL PASO


### Add 'full_name' column

In [12]:
officer_info['full_name'] = ''
for col in ['first_name', 'middle', 'last_name', 'suffix']:
    officer_info['full_name'] = officer_info['full_name'] + ' ' + officer_info[col].fillna('')
officer_info['full_name'] = officer_info['full_name'].apply(lambda s: ' '.join(s.strip().split()))
officer_info.sample(5)

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,gender,current_department_county,full_name
33385,RONALD,B,PRATT,,EL PASO CO SHERIFFS OFFICE,MASTER PEACE OFFICER,13.33,M,EL PASO,RONALD B PRATT
72677,ASHLEY,D,STRELEC,,VICTORIA CO SHERIFFS OFFICE,BASIC PEACE OFFICER,1.83,F,VICTORIA,ASHLEY D STRELEC
65928,MATTHEW,J,FAZ,,SAN ANGELO POLICE DEPT,BASIC PEACE OFFICER,4.08,M,TOM GREEN,MATTHEW J FAZ
42466,JASON,L,HAMILTON,,FORT STOCKTON POLICE DEPT,MASTER PEACE OFFICER,12.25,M,PECOS,JASON L HAMILTON
7231,VIRGIL,C,JONES,,SCHERTZ POLICE DEPT,MASTER PEACE OFFICER,23.5,M,GUADALUPE,VIRGIL C JONES


### Add an integer column for certification level

In [13]:
officer_info['highest_cert'] = officer_info['highest_cert'].fillna('NONE')
cert_levels = {
    'NONE': 0,
    'BASIC PEACE OFFICER': 1,
    'INTERMEDIATE PEACE OFFICER': 2,
    'ADVANCED PEACE OFFICER': 3,
    'MASTER PEACE OFFICER': 4,
}
officer_info['highest_cert_int'] = officer_info['highest_cert'].apply(lambda c: cert_levels[c])

### Reorder and rename columns, sort values

In [14]:
before = officer_info.shape

# Re-order columns
officer_info = officer_info[[
    'first_name', 'middle', 'last_name', 'suffix', 'full_name',
    'gender', 'current_department', 'current_department_county',
    'highest_cert', 'highest_cert_int', 'service_time']]

# Rename columns
officer_info.columns = [
    'name_first', 'name_middle', 'name_last', 'name_suffix', 'name_full',
    'gender','current_agency', 'current_agency_county',
    'highest_cert', 'highest_cert_int', 'service_time']

# Make sure we didn't drop any columns by accident
after = officer_info.shape
assert(before == after)

officer_info.sort_values(['current_agency_county', 'current_agency', 'name_full'], inplace=True)
officer_info.sample(10)

Unnamed: 0,name_first,name_middle,name_last,name_suffix,name_full,gender,current_agency,current_agency_county,highest_cert,highest_cert_int,service_time
71903,CHRISTOPHER,S,FATIGATI,,CHRISTOPHER S FATIGATI,M,LUBBOCK POLICE DEPT,LUBBOCK,BASIC PEACE OFFICER,1,2.17
23003,PEDRO,M,TAMBUNGA,JR,PEDRO M TAMBUNGA JR,M,BALLINGER POLICE DEPT,RUNNELS,NONE,0,2.0
50752,TIMOTHY,A,CLICK,,TIMOTHY A CLICK,M,TERRY CO SHERIFFS OFFICE,TERRY,INTERMEDIATE PEACE OFFICER,2,8.08
56011,DARRYL,L,MAPP,,DARRYL L MAPP,M,DALLAS POLICE DEPT,DALLAS,MASTER PEACE OFFICER,4,8.33
23960,R.,A,BESHIRS,JR,R. A BESHIRS JR,M,HALTOM CITY POLICE DEPT,TARRANT,MASTER PEACE OFFICER,4,21.17
74841,NICHOLE,M,DELACRUZ-SANCHEZ,,NICHOLE M DELACRUZ-SANCHEZ,F,TEXAS DEPT OF PUBLIC SAFETY,STATE,NONE,0,1.17
24669,ROGELIO,T,TIGHE,,ROGELIO T TIGHE,M,CISCO COLLEGE POLICE DEPT,EASTLAND,MASTER PEACE OFFICER,4,19.33
53,DAVID,S,WERNER,,DAVID S WERNER,M,JEFFERSON CO CONST PCT 2,JEFFERSON,MASTER PEACE OFFICER,4,25.41
29117,SHAWN,J,KEESLER,,SHAWN J KEESLER,M,HARRIS CO FIRE MARSHALS OFFICE,HARRIS,ADVANCED PEACE OFFICER,3,11.42
36523,JEFFERY,R,STRAIN,,JEFFERY R STRAIN,M,TEXAS DEPT OF PUBLIC SAFETY,STATE,MASTER PEACE OFFICER,4,15.42


## 3. Write

In [15]:
with dw.open_remote_file(DTW_PROJECT_KEY, CLEANED_FILENAME) as w:
    print("Writing to data.world:", CLEANED_FILENAME)
    officer_info.to_csv(w, index=False)

Writing to data.world: list_of_texas_officers.csv
