# Clean and reformat the raw data describing all Texas law enforcement agencies the number of officers in each.

* Input: `raw_TCOLE_LicensedOfficers.xlsx`
* Outputs:
  * `num_officers_by_agency.csv`
  * `agencies_and_counties.csv`
  
##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## 1. Setup and read data

In [1]:
DTW_PROJECT_KEY = 'tji/auxiliary-datasets'
RAW_DATAFRAME_NAME = 'raw_num_officers_by_agency'
CLEANED_FILENAME = 'num_officers_by_agency.csv'
AGENCY_COUNTY_ONLY_FILENAME = 'agencies_and_counties.csv'

In [2]:
import datadotworld as dw
import numpy as np
import pandas as pd

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -r -g -w -p datadotworld,numpy,pandas

Everett Wetchler 2018-05-08 16:52:15 CDT

datadotworld 1.6.0
numpy 1.14.3
pandas 0.22.0
Git hash: 1dac51453f53e9273617aeef9ca4267dee5a09cb
Git repo: git@github.com:texas-justice-initiative/data-processing.git
watermark 1.6.0


In [3]:
from lib.standardize_police_agency_names import standardize_agency_name

In [4]:
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
df = datasets.dataframes[RAW_DATAFRAME_NAME]
df.head()

Unnamed: 0,department_name,county_name,july_1st_2016,dec_31st_2016,july_1st_2017,dec_31st_2017
0,PALESTINE CITY MARSHAL'S OFFICE,ANDERSON,0,0,0,1
1,ANGELINA CO. DIST. ATTY'S OFFICE,ANGELINA,0,0,1,1
2,ARCHER CO. CONST. PCT. 3,ARCHER,0,0,0,0
3,287th District Attorney's Office,BAILEY,0,0,0,0
4,BELTON FIRE MARSHAL'S OFFICE,BELL,0,0,0,0


## 2. Begin cleaning

### Rename columns

In [5]:
df.columns = ['agency', 'county',
              'num_officers_2016_07_01', 'num_officers_2016_12_31',
              'num_officers_2017_07_01', 'num_officers_2017_12_31']

### Standardize agency and county names

In [6]:
df['agency'] = df['agency'].apply(standardize_agency_name)
df['county'] = df['county'].apply(lambda d: d.strip().upper())
df.head()

Unnamed: 0,agency,county,num_officers_2016_07_01,num_officers_2016_12_31,num_officers_2017_07_01,num_officers_2017_12_31
0,PALESTINE CITY MARSHALS OFFICE,ANDERSON,0,0,0,1
1,ANGELINA CO DIST ATTY OFFICE,ANGELINA,0,0,1,1
2,ARCHER CO CONST PCT 3,ARCHER,0,0,0,0
3,287TH DIST ATTY OFFICE,BAILEY,0,0,0,0
4,BELTON FIRE MARSHALS OFFICE,BELL,0,0,0,0


### Create special county ("TEXAS") for state agencies

#### Instead of leaving them in Travis county, where they default to

In [7]:
# Any agency in Travis county with 'TEXAS' in the name
# is a state agency -- with a few exceptions.
exceptions = [
    'CENTRAL TEXAS POLICE DEPT.',
    'CONCORDIA UNIVERSITY TEXAS POLICE DEPARTMENT',
    'UNIV. OF TEXAS SYSTEM POLICE',
]
is_state = (
    df.agency.str.contains('TEXAS')
    & (df.county == 'TRAVIS')
    & ~df.agency.isin(exceptions))
df.loc[is_state, 'county'] = 'STATE'

# Have a look at the altered entries to be sure they look right:
df[df['county'] == 'STATE']

Unnamed: 0,agency,county,num_officers_2016_07_01,num_officers_2016_12_31,num_officers_2017_07_01,num_officers_2017_12_31
81,TEXAS COMM ON JAIL STANDARDS,STATE,0,0,0,0
736,CENTRAL TEXAS POLICE DEPT,STATE,2,2,0,1
894,CONCORDIA UNIV TEXAS POLICE DEPT,STATE,2,2,0,3
1189,TEXAS RACING COMMISSION,STATE,4,5,5,5
1204,TEXAS JUVENILE JUSTICE DEPT,STATE,4,4,4,4
1432,TEXAS LOTTERY COMMISSION,STATE,7,7,7,7
1573,TEXAS ST BOARD OF PHARMACY,STATE,8,7,7,6
1738,TEXAS ST BOARD OF DENTAL EXAMINERS,STATE,10,12,4,2
2076,TEXAS COMMISSION ON LAW ENFORCEMENT,STATE,19,20,21,19
2077,TEXAS COMPTROLLER OF PUBLIC ACCOUNTS,STATE,19,22,21,24


#### Note the last line - Texas DPS - which includes Texas Highway Patrol. It's by far the bulk of the state department force.

In [8]:
print("Texas has %d police agencies\n - %d of them had zero officers at the end of 2017\n - %d had 1+ officers at the end of 2017" % (
    len(df), (df['num_officers_2017_12_31'] == 0).sum(), (df['num_officers_2017_12_31'] > 0).sum()))

Texas has 2703 police agencies
 - 123 of them had zero officers at the end of 2017
 - 2580 had 1+ officers at the end of 2017


## 3. Write

In [9]:
df.sort_values(['county', 'agency'], inplace=True)

In [10]:
with dw.open_remote_file(DTW_PROJECT_KEY, CLEANED_FILENAME) as w:
    print("Writing to data.world:", CLEANED_FILENAME)
    df.to_csv(w, index=False)

Writing to data.world: num_officers_by_agency.csv


In [11]:
with dw.open_remote_file(DTW_PROJECT_KEY, AGENCY_COUNTY_ONLY_FILENAME) as w:
    print("Writing to data.world:", AGENCY_COUNTY_ONLY_FILENAME)
    df[['agency', 'county']].to_csv(w, index=False)

Writing to data.world: agencies_and_counties.csv
