# Clean and reformat the raw data describing all Texas law enforcement agencies and the number of officers in each.


* Input: `tji/raw-and-processing/original/TCOLE - Num Officers By Agency.xlsx`
* Output: `tji/auxiliary-datasets/`
  * `num_officers_by_agency.csv`
  * `agencies_and_counties.csv`
  
##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
# data.world project identifiers used by this notebook and its siblings.
# Source data (raw spreadsheets) lives here:
DW_PROJECT_RAW_AND_PROCESSING = 'tji/raw-and-processing'
# Cleaned helper tables (this notebook's output) live here:
DW_PROJECT_AUXILIARY_DATASETS = 'tji/auxiliary-datasets'
# Other TJI projects; not read or written in the cells below.
DW_PROJECT_CDR = 'tji/deaths-in-custody'
DW_PROJECT_OIS = 'tji/officer-involved-shootings'

# Everything written in the "Write" section goes to this project.
OUTPUT_DW_PROJECT = DW_PROJECT_AUXILIARY_DATASETS

In [2]:
import datadotworld as dw
import numpy as np
import pandas as pd

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -r -g -w -p datadotworld,numpy,pandas

Everett Wetchler 2018-12-17 14:05:27 PST

datadotworld 1.6.0
numpy 1.14.5
pandas 0.23.3
Git hash: 66a2c5da8c3ddd0ff19e6f7ac947b952d14323e9
Git repo: git@github.com:texas-justice-initiative/data-processing.git
watermark 1.6.1


In [3]:
from lib.cleaning_tools import read_dtw_excel
from lib.standardize_police_agency_names import standardize_agency_name

In [4]:
# Load the raw TCOLE spreadsheet (sheet "Sheet1") from the raw-and-processing
# data.world project via the project's read_dtw_excel helper.
df = read_dtw_excel(
    DW_PROJECT_RAW_AND_PROCESSING,
    'original/TCOLE - Num Officers By Agency.xlsx',
    select_sheet='Sheet1',
)
# Preview the first few rows to confirm the columns came through as expected.
df.head()

Writing excel file to temp file: /var/folders/dc/8cbxbsh515s908xl0zyprszm0000gn/T/tmpha0ik43l


Unnamed: 0,Department Name,County Name,July 1st 2016,Dec 31st 2016,July 1st 2017,Dec 31st 2017,State?
0,PALESTINE CITY MARSHAL'S OFFICE,ANDERSON,0,0,0,1,
1,ANDERSON CO. CONST. PCT. 3,ANDERSON,1,1,1,1,
2,ANDERSON CO. CONST. PCT. 4,ANDERSON,2,2,1,1,
3,ANDERSON CO. CONST. PCT. 2,ANDERSON,2,2,2,2,
4,PALESTINE FIRE DEPT.,ANDERSON,2,2,1,2,


## 2. Begin cleaning

### Rename columns

In [5]:
# Positional rename: the list must stay in the same order as the spreadsheet's
# columns (department, county, four officer-count snapshots, state flag).
df.columns = [
    'agency',
    'county',
    'num_officers_2016_07_01',
    'num_officers_2016_12_31',
    'num_officers_2017_07_01',
    'num_officers_2017_12_31',
    'is_state_agency',
]

### Standardize agency and county names

In [6]:
# Normalize agency names with the project helper standardize_agency_name.
df['agency'] = df['agency'].apply(standardize_agency_name)

# Vectorized equivalent of strip().upper() on each value. Unlike the per-value
# lambda, a missing county stays NaN instead of raising AttributeError.
df['county'] = df['county'].str.strip().str.upper()

# The raw "State?" column holds 'y' for state agencies and is blank otherwise.
# Compare case- and whitespace-insensitively so an entry like 'Y' or 'y ' is
# not silently treated as a non-state agency; blank cells (NaN) become False.
df['is_state_agency'] = df['is_state_agency'].astype(str).str.strip().str.lower() == 'y'
df.head()

Unnamed: 0,agency,county,num_officers_2016_07_01,num_officers_2016_12_31,num_officers_2017_07_01,num_officers_2017_12_31,is_state_agency
0,PALESTINE CITY MARSHALS OFFICE,ANDERSON,0,0,0,1,False
1,ANDERSON CO CONST PCT 3,ANDERSON,1,1,1,1,False
2,ANDERSON CO CONST PCT 4,ANDERSON,2,2,1,1,False
3,ANDERSON CO CONST PCT 2,ANDERSON,2,2,2,2,False
4,PALESTINE FIRE DEPT,ANDERSON,2,2,1,2,False


### Create special county ("TEXAS") for state agencies

#### Instead of leaving them in Travis County, where they are listed by default

In [7]:
# Move every flagged state agency into the pseudo-county 'TEXAS' rather than
# the county it is listed under in the raw data.
df.loc[df.is_state_agency, 'county'] = 'TEXAS'
# The message names the value actually assigned above ('TEXAS'); it previously
# said "STATE", which did not match the data.
print("Identified %d state agencies -- changed their county to TEXAS" % df.is_state_agency.sum())
# Display the affected rows for a visual check.
df[df.is_state_agency]

Identified 15 state agencies -- changed their county to STATE


Unnamed: 0,agency,county,num_officers_2016_07_01,num_officers_2016_12_31,num_officers_2017_07_01,num_officers_2017_12_31,is_state_agency
2388,TEXAS RACING COMMISSION,TEXAS,4,5,5,5,True
2389,TEXAS JUVENILE JUSTICE DEPT,TEXAS,4,4,4,4,True
2391,TEXAS LOTTERY COMMISSION,TEXAS,7,7,7,7,True
2392,TEXAS ST BOARD OF PHARMACY,TEXAS,8,7,7,6,True
2395,TEXAS ST BOARD OF DENTAL EXAMINERS,TEXAS,10,12,4,2,True
2407,TEXAS COMMISSION ON LAW ENFORCEMENT,TEXAS,19,20,21,19,True
2408,TEXAS COMPTROLLER OF PUBLIC ACCOUNTS,TEXAS,19,22,21,24,True
2410,TEXAS JUVENILE JUSTICE DEPT OIG,TEXAS,22,20,24,23,True
2413,TEXAS DEPT OF INSURANCE FRAUD UNIT,TEXAS,29,30,26,25,True
2418,TEXAS HEALTH AND HUMAN SERVICES COMM- OFFICE O...,TEXAS,34,37,36,33,True


#### Note the last line - Texas DPS - which includes Texas Highway Patrol. It's by far the bulk of the state department force.

In [8]:
# Headline counts: total number of agencies, and how many report zero versus
# at least one officer in the 2017-12-31 snapshot column.
print("Texas has %d police agencies\n - %d of them had zero officers at the end of 2017\n - %d had 1+ officers at the end of 2017" % (
    df.shape[0],
    df['num_officers_2017_12_31'].eq(0).sum(),
    df['num_officers_2017_12_31'].gt(0).sum(),
))

Texas has 2703 police agencies
 - 123 of them had zero officers at the end of 2017
 - 2580 had 1+ officers at the end of 2017


## 3. Write

In [9]:
# Order rows by county, then agency, before the frame is written out below.
df = df.sort_values(by=['county', 'agency'])

In [10]:
# Upload the full cleaned table (all columns, no index column) as a CSV in the
# output data.world project. The filename is named once so the log line and
# the actual upload target cannot drift apart.
output_filename = 'num_officers_by_agency.csv'
with dw.open_remote_file(OUTPUT_DW_PROJECT, output_filename) as remote_file:
    print("Writing to data.world: %s/%s" % (OUTPUT_DW_PROJECT, output_filename))
    df.to_csv(remote_file, index=False)

Writing to data.world: tji/auxiliary-datasets/num_officers_by_agency.csv


In [11]:
# Upload a slim agency -> county lookup (just those two columns, no index
# column) as a second CSV in the output data.world project.
output_filename = 'agencies_and_counties.csv'
with dw.open_remote_file(OUTPUT_DW_PROJECT, output_filename) as remote_file:
    print("Writing to data.world: %s/%s" % (OUTPUT_DW_PROJECT, output_filename))
    agency_county_pairs = df[['agency', 'county']]
    agency_county_pairs.to_csv(remote_file, index=False)

Writing to data.world: tji/auxiliary-datasets/agencies_and_counties.csv
