# Clean data about individual police officers

* Input: `raw_list_of_texas_officers.csv`
* Output: `list_of_texas_officers.csv`
  
##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
# data.world dataset key: used both to load the input tables and to upload the cleaned file.
DTW_PROJECT_KEY = 'tji/auxiliary-datasets'
# Name of the raw officers table inside the loaded dataset's `dataframes` mapping.
RAW_DATAFRAME_NAME = 'raw_list_of_texas_officers'
# Name of the agency -> county lookup table inside the loaded dataset's `dataframes` mapping.
AGENCY_COUNTY_DATAFRAME = 'agencies_and_counties'
# Remote file name that the cleaned officers table is written to.
CLEANED_FILENAME = 'list_of_texas_officers.csv'

In [2]:
import datadotworld as dw
import numpy as np
import pandas as pd

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -r -g -w -p datadotworld,numpy,pandas

Everett Wetchler 2018-05-08 17:13:41 CDT

datadotworld 1.6.0
numpy 1.14.3
pandas 0.22.0
Git hash: 1dac51453f53e9273617aeef9ca4267dee5a09cb
Git repo: git@github.com:texas-justice-initiative/data-processing.git
watermark 1.6.0


In [3]:
from lib.standardize_police_agency_names import standardize_agency_name

In [4]:
# Load the whole data.world project once; later cells pull individual tables from `datasets`.
# NOTE(review): force_update=True presumably bypasses any locally cached copy — confirm
# against the datadotworld documentation.
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
# Agency -> county lookup table (columns `agency` and `county`), used to attach a county to each officer.
agency_county = datasets.dataframes[AGENCY_COUNTY_DATAFRAME]
agency_county.head()

Unnamed: 0,agency,county
0,ANDERSON CO CONST PCT 1,ANDERSON
1,ANDERSON CO CONST PCT 2,ANDERSON
2,ANDERSON CO CONST PCT 3,ANDERSON
3,ANDERSON CO CONST PCT 4,ANDERSON
4,ANDERSON CO DIST ATTY OFFICE,ANDERSON


In [5]:
# Raw officers table; `df` is the working frame that every cleaning step below modifies.
df = datasets.dataframes[RAW_DATAFRAME_NAME]
df.head()

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT.,Master Peace Officer,23.92
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,Master Peace Officer,25.67
2,CHARLES,T,KELLEY,,WILLIAMSON CO. SHERIFF'S OFFICE,Master Peace Officer,23.41
3,NAYA,C,POPE,,IRVING POLICE DEPT.,Master Peace Officer,36.91
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT.,Master Peace Officer,32.41


## 2. Begin cleaning

### Standardize agency names

In [6]:
# Run every department name through the project's shared normalizer so the names
# can be matched against the agency -> county lookup table.
standardized_departments = df['current_department'].apply(standardize_agency_name)
df['current_department'] = standardized_departments
df.head()

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT,Master Peace Officer,23.92
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,Master Peace Officer,25.67
2,CHARLES,T,KELLEY,,WILLIAMSON CO SHERIFFS OFFICE,Master Peace Officer,23.41
3,NAYA,C,POPE,,IRVING POLICE DEPT,Master Peace Officer,36.91
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT,Master Peace Officer,32.41


### Add county names

In [7]:
# Lookup table: agency name -> county, built from the agency/county dataframe.
agency_to_county = {
    agency: county
    for agency, county in zip(agency_county.agency, agency_county.county)
}
# dict.get returns None for any department that is absent from the lookup.
df['current_department_county'] = df['current_department'].apply(agency_to_county.get)
# Count how many officers did / did not get a county assigned.
df['current_department_county'].isnull().value_counts()

False    77759
True        41
Name: current_department_county, dtype: int64

In [8]:
# Which departments failed the county lookup, and how many officers does each affect?
county_is_missing = df['current_department_county'].isnull()
df.loc[county_is_missing, 'current_department'].value_counts()

ALABAMA-COUSHATTA TRIBE OF TEXAS POLICE DEPT    17
STAFFORD FIRE MARSHALS OFFICE                    6
HIDALGO CO CONST PCT 5                           5
TENAHA POLICE DEPT                               4
RICHLAND POLICE DEPT                             2
UNIV OF DALLAS POLICE DEPT                       2
DUMAS FIRE DEPT                                  1
FRANKLIN CO FIRE MARSHALS OFFICE                 1
COKE CO CONST PCT                                1
HAMILTON CO CONST PCT 3                          1
50TH JUDICIAL DIST ATTY OFFICE                   1
Name: current_department, dtype: int64

### Uppercase string values

In [9]:
def _upper_if_str(val):
    """Return ``val`` upper-cased when it is a ``str``; return anything else unchanged."""
    return val.upper() if isinstance(val, str) else val


# Applied to every column: non-string cells (numbers, missing values) pass through untouched.
for column in df.columns:
    df[column] = df[column].apply(_upper_if_str)
df.head()

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,current_department_county
0,BRADFORD,E,TIPPIT,,MISSOURI CITY POLICE DEPT,MASTER PEACE OFFICER,23.92,FORT BEND
1,ROGER,W,KENDALL,,TEXAS BOARD OF CRIMINAL JUSTICE,MASTER PEACE OFFICER,25.67,WALKER
2,CHARLES,T,KELLEY,,WILLIAMSON CO SHERIFFS OFFICE,MASTER PEACE OFFICER,23.41,WILLIAMSON
3,NAYA,C,POPE,,IRVING POLICE DEPT,MASTER PEACE OFFICER,36.91,DALLAS
4,LINDA,F,HARTT-GOGGIN,,ANTHONY POLICE DEPT,MASTER PEACE OFFICER,32.41,EL PASO


### Add 'full_name' column

In [10]:
# Concatenate the name parts with single spaces, treating missing parts as empty strings.
name_parts = ['first_name', 'middle', 'last_name', 'suffix']
joined_name = df[name_parts[0]].fillna('')
for part in name_parts[1:]:
    joined_name = joined_name + ' ' + df[part].fillna('')
# str.split() with no arguments discards leading/trailing whitespace and collapses the
# runs of spaces left behind by empty parts, so re-joining yields a clean full name.
df['full_name'] = joined_name.apply(lambda raw: ' '.join(raw.split()))
df.sample(5)

Unnamed: 0,first_name,middle,last_name,suffix,current_department,highest_cert,service_time,current_department_county,full_name
24410,MICHAEL,,DE LA ROSA,,PFLUGERVILLE POLICE DEPT,MASTER PEACE OFFICER,21.75,TRAVIS,MICHAEL DE LA ROSA
62116,KENDALL,A,CASEY,,HENDERSON POLICE DEPT,,0.17,RUSK,KENDALL A CASEY
30455,RAFAEL,,MONTES,,HOUSTON ISD POLICE DEPT,BASIC PEACE OFFICER,2.58,HARRIS,RAFAEL MONTES
60315,ANNA,M,RODRIGUEZ,,BLINN JUNIOR COLLEGE POLICE DEPT,INTERMEDIATE PEACE OFFICER,5.91,WASHINGTON,ANNA M RODRIGUEZ
39052,CRAIG,S,HOLLEMAN,,LEWISVILLE POLICE DEPT,ADVANCED PEACE OFFICER,14.92,DENTON,CRAIG S HOLLEMAN


### Reorder and rename columns, sort values

In [11]:
# (old column name, published column name) pairs, listed in the output column order.
# Renaming through an explicit mapping (rather than assigning a positional list to
# `df.columns`) means a column can never be silently mislabeled if the two lists drift apart.
column_renames = [
    ('first_name', 'name_first'),
    ('middle', 'name_middle'),
    ('last_name', 'name_last'),
    ('suffix', 'name_suffix'),
    ('full_name', 'name_full'),
    ('current_department', 'current_agency'),
    ('current_department_county', 'current_agency_county'),
    ('highest_cert', 'highest_cert'),
    ('service_time', 'service_time'),
]

# Select, rename and sort in a single chain that returns a new frame. The previous
# `sort_values(..., inplace=True)` mutated a frame obtained by column selection, which is
# the pattern that triggers pandas' SettingWithCopyWarning.
df = (
    df[[old_name for old_name, _ in column_renames]]
    .rename(columns=dict(column_renames))
    .sort_values(['current_agency_county', 'current_agency', 'name_full'])
)
df.head(10)

Unnamed: 0,name_first,name_middle,name_last,name_suffix,name_full,current_agency,current_agency_county,highest_cert,service_time
1170,DALE,E,SCHNELLE,,DALE E SCHNELLE,ANDERSON CO CONST PCT 1,ANDERSON,MASTER PEACE OFFICER,22.92
13899,GARY,D,THOMAS,,GARY D THOMAS,ANDERSON CO CONST PCT 1,ANDERSON,MASTER PEACE OFFICER,43.0
7477,CHARLES,D,LIGHTFOOT,,CHARLES D LIGHTFOOT,ANDERSON CO CONST PCT 2,ANDERSON,MASTER PEACE OFFICER,24.67
13900,WILLIAM,R,WATTS,,WILLIAM R WATTS,ANDERSON CO CONST PCT 2,ANDERSON,MASTER PEACE OFFICER,23.25
32218,KIMBERLY,S,HOLLIDAY,,KIMBERLY S HOLLIDAY,ANDERSON CO CONST PCT 3,ANDERSON,MASTER PEACE OFFICER,17.33
16327,JAMES,O,MUNIZ,,JAMES O MUNIZ,ANDERSON CO CONST PCT 4,ANDERSON,MASTER PEACE OFFICER,25.82
36024,ANESHIA,D,THOMPSON,,ANESHIA D THOMPSON,ANDERSON CO DIST ATTY OFFICE,ANDERSON,ADVANCED PEACE OFFICER,16.25
51088,CATHY,J,STONER,,CATHY J STONER,ANDERSON CO DIST ATTY OFFICE,ANDERSON,MASTER PEACE OFFICER,9.75
32743,RYAN,N,TOLLIVER,,RYAN N TOLLIVER,ANDERSON CO DIST ATTY OFFICE,ANDERSON,ADVANCED PEACE OFFICER,14.75
56178,ANDREA,N,KARRIKER,,ANDREA N KARRIKER,ANDERSON CO SHERIFFS OFFICE,ANDERSON,INTERMEDIATE PEACE OFFICER,5.58


## 3. Write

In [12]:
# Write the cleaned table as CSV (without the index column) directly into the
# data.world project via a remote file handle.
with dw.open_remote_file(DTW_PROJECT_KEY, CLEANED_FILENAME) as remote_csv:
    print("Writing to data.world:", CLEANED_FILENAME)
    df.to_csv(remote_csv, index=False)

Writing to data.world: list_of_texas_officers.csv
