# Generate a minimalist, compressed version of CDR/OIS data for our website's explore-the-data app

### NOTE 1: This is a temporary file, only existing until this code lives in a cron job somewhere
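### NOTE 2: You'll need to run this TWICE to generate all the data -- once with the `COMPRESS_DATASET` environment variable set to `OIS` and once with it set to `CDR`. The config cell below reads that variable, and the value must match a key of `CONFIGS` exactly (upper case).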

### Purpose of this notebook

This notebook generates the data files that our explore-the-data page uses. There are two for each dataset:
1. A compressed file with just enough data to show the charts, e.g. `cdr_compressed.json`
2. The full CSV file, with all columns, in the same order as the compressed file.

Before running this notebook, set the `COMPRESS_DATASET` environment variable to indicate which dataset you want to process, and edit `OUTFOLDER` in the first code cell to choose where the resulting files end up. If you want to process multiple datasets, you'll have to rerun this notebook several times with different settings.

### About the compressed file

Say we have a set of records like this:
```
   [
      {"sex": "MALE", "race": "WHITE", "record_id": "PA111"},
      {"sex": "MALE", "race": "HISPANIC", "record_id": "PA222"},
      {"sex": "FEMALE", "race": "BLACK", "record_id": "PA333"},
      {"sex": "FEMALE", "race": null, "record_id": "PA444"},
   ]
```

We will compress them to look like this:
```
    {
      meta: {
        lookups: {
          "sex": ["FEMALE", "MALE"],
          "race": ["BLACK", "HISPANIC", "WHITE"]
        },
        'record_ids': {
            'field_name': 'record_id',
            'values': ["PA111", "PA222", "PA333", "PA444"]
        },
        num_records: 4,
        num_columns: 3   // counted before the id column is split out, so it includes record_id
      },
      records: {
        "sex": [1, 1, 0, 0],
        "race": [2, 1, 0, -1]
      },
    }
```
Note that each array in the 'records' object above contains indices into the (sorted) lookup array for that column. The value is -1 for missing values.

In practice, this cuts our data size down dramatically by avoiding repeated keys or repeating long string values.

##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## \** CONFIGURE THIS CELL **

In [1]:
import os 
# Folder that receives every generated file. The write cells build their paths
# by plain string concatenation, so keep the trailing slash.
OUTFOLDER = './'
# Name of the CONFIGS entry to process, taken from the COMPRESS_DATASET
# environment variable (None when the variable is unset).
DATASET = os.getenv('COMPRESS_DATASET')

## Edit this if you want to tweak what data ends up in the compressed file

In [7]:
# Per-dataset settings, keyed by the value expected in DATASET.
#   DTW_PROJECT_KEY  data.world project passed to dw.load_dataset
#   DTW_FILENAME     name of the table to pull out of the loaded project
#   OUTFILE_PREFIX   prefix of every output file name
#   DATE_COL         date column the 'year' column is derived from
#   ID_COL           record-id column handed to compress(), or None if absent
#   KEEP_COLS        columns kept in the compressed file (original names)
#   RENAMES          optional {original name: name used in the output}
CONFIGS = {
    'CDR': {
        'DTW_PROJECT_KEY': 'tji/deaths-in-custody',
        'DTW_FILENAME': 'cleaned_custodial_death_reports',
        'OUTFILE_PREFIX': 'cdr',
        'DATE_COL': 'death_date',
        'ID_COL': 'record_id',
        'KEEP_COLS': [
            'record_id',
            'year',
            'race',
            'sex',
            'manner_of_death',
            'age_at_time_of_death',
            'type_of_custody',
            'death_location_type',
            'means_of_death',
            'death_location_county',
            'agency_name',
        ],
    },
    'OIS': {
        'DTW_PROJECT_KEY': 'tji/officer-involved-shootings',
        'DTW_FILENAME': 'shot_civilians',
        'OUTFILE_PREFIX': 'ois',
        'DATE_COL': 'date_incident',
        'ID_COL': None,
        'KEEP_COLS': [
            'year',
            'civilian_race',
            'civilian_gender',
            'civilian_age',
            'civilian_died',
            'officer_age_1',
            'officer_race_1',
            'officer_gender_1',
            'incident_result_of',
            'incident_county',
            'agency_name_1',
            'deadly_weapon',
            'multiple_officers_involved',
        ],
        'RENAMES': {
            'officer_gender_1': 'officer_gender',
            'officer_age_1': 'officer_age',
            'officer_race_1': 'officer_race',
            'agency_name_1': 'agency_name',
        },
    },
}

In [8]:
# Fail early with a readable message when COMPRESS_DATASET is unset or does not
# name a CONFIGS entry. The lookup is case-sensitive.
if DATASET not in CONFIGS:
    raise ValueError(
        "Unknown DATASET %r; set the COMPRESS_DATASET environment variable "
        "to one of: %s" % (DATASET, ', '.join(sorted(CONFIGS))))
config = CONFIGS[DATASET]

## Import and go

In [9]:
import datadotworld as dw
import numpy as np
import pandas as pd
import simplejson as json

# Raise pandas' display limits to 100 rows and 100 columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -w -p numpy,pandas,datadotworld

Everett Wetchler 2018-10-22 21:53:11 CDT

numpy 1.13.1
pandas 0.20.3
datadotworld 1.6.0
watermark 1.7.0


In [10]:
# Load the data.world project named in the active config.
# NOTE(review): force_update=True presumably bypasses any locally cached copy
# so the latest published data is used -- confirm against the datadotworld docs.
datasets = dw.load_dataset(config['DTW_PROJECT_KEY'], force_update=True)
# Pick out the one table this notebook works from.
df = datasets.dataframes[config['DTW_FILENAME']]
df.head()

Unnamed: 0,record_id,num_revisions,form_version,report_date,date_time_of_custody_or_incident,name_first,name_last,name_middle,name_suffix,name_full,date_of_birth,age_at_time_of_death,sex,race,death_date,death_date_and_time,death_location_county,death_location_city,death_location_street_address,death_location_type,death_location_type_other,death_from_pre_existing_medical_condition,manner_of_death,manner_of_death_description,means_of_death,means_of_death_other,medical_cause_of_death,medical_examinor_coroner_evalution,medical_treatment,days_from_custody_to_death,who_caused_death_in_homicide_or_accident,who_caused_death_in_homicide_or_accident_other,offense_1,offense_2,offense_3,were_the_charges,facility_entry_date_time,type_of_custody,specific_type_of_custody_facility,agency_address,agency_city,agency_county,agency_name,agency_zip,entry_behavior,other_behavior,exhibit_any_medical_problems,exhibit_any_mental_health_problems,make_suicidal_statements
0,16-89-C,0.0,V_2005,2017-03-13 16:47:00,2016-04-26 23:00:00,DARREL,MACHEMEHL,S.,,DARREL S. MACHEMEHL,1972-01-20,44.0,MALE,WHITE,2016-04-26,2016-04-26 23:00:00,GUADALUPE,KINGSBURY,US 90 AT FM 1104,CRIME/ARREST SCENE,,NOT APPLICABLE,"OTHER, SPECIFY",DREW HIS WEAPON ON OFFICERS AND WAS SHOT BY OF...,FIREARM,,MULTIPLE CENTER FIRE RIFLE WOUNDS,"YES, RESULTS ARE AVAILABLE",NOT APPLICABLE,0.0,LAW ENFORCEMENT/CORRECTIONAL PERSONNEL,,AGGRAVATED ROBBERY,EVADING ARREST WITH VEHICLE,,NOT FILED AT TIME OF DEATH,NaT,POLICE CUSTODY (PRE-BOOKING),CUSTODY OF LAW ENFORCEMENT PERSONNEL DURING/FL...,350 N. GUADALUPE STREET,SEGUIN,GUADALUPE,SEGUIN POLICE DEPT,78155,,WAS SEEN PUTTING A SHOTGUN IN HIS BACKPACK THE...,,,
1,PA10081C,0.0,V_2005,2010-05-10 13:35:00,2010-05-04 12:10:00,WARREN,WILLIAMSON,ALBERT,,WARREN ALBERT WILLIAMSON,1983-05-26,26.0,MALE,WHITE,2010-05-04,2010-05-04 12:27:00,BEXAR,SAN ANTONIO,9100 PERRIN BEITEL,CRIME/ARREST SCENE,,NOT APPLICABLE,"OTHER, SPECIFY",HIGH SPEED MOTORCYCLE CRASH WHILE ATTEMPTING T...,"OTHER, SPECIFY",HIGH SPEED MOTORCYCLE ACCIDENT INTO VEHICLE CR...,MULTIPLE BLUNT FORCE INJURIES,"YES, RESULTS ARE AVAILABLE",NOT APPLICABLE,0.0,DECEDENT,,PAROLE VIOLATION - DANGEROUS DRUGS,EVADING,RECKLESS ENDANGERMENT,PROBATION/PAROLE VIOLATION,NaT,POLICE CUSTODY (PRE-BOOKING),CUSTODY OF LAW ENFORCEMENT PERSONNEL DURING/FL...,8918 TESORO DR. NO. 301,SAN ANTONIO,BEXAR,BEXAR CO CONST PCT 3,78217,,,,,
2,PA10082P,0.0,V_2005,2010-05-11 10:22:00,2006-10-10 00:00:00,JOHN,DEJEAN,,,JOHN DEJEAN,1955-12-03,54.0,MALE,BLACK,2010-05-10,2010-05-10 06:15:00,GRIMES,NAVASOTA,2400 WALLACE PACK,LAW ENFORCEMENT FACILITY,,UNKNOWN,NATURAL CAUSES/ILLNESS,VENOUS INFARCT OF THE SMALL BOWEL,NOT APPLICABLE,,VENOUS INFARCT OF THE SMALL BOWEL,"YES, RESULTS ARE AVAILABLE",NOT APPLICABLE,1308.0,NOT APPLICABLE,,THEFT,FALSE REPRESENTATION OF A LAWYER,,CONVICTED,2006-10-10 00:00:00,PRISON,TDCJ,"2503 LAKE ROAD, SUITE 5",HUNTSVILLE,STATE,TEXAS DEPT OF CRIMINAL JUSTICE,77340,,,,,
3,PA10083P,0.0,V_2005,2010-05-11 10:40:00,2008-03-12 00:00:00,JOEL,LEWIS,THOMAS,,JOEL THOMAS LEWIS,1967-04-17,43.0,MALE,WHITE,2010-04-27,2010-04-27 18:02:00,JEFFERSON,BEAUMONT,3060 FM 3514,LAW ENFORCEMENT FACILITY,,UNKNOWN,ACCIDENTAL,,NOT APPLICABLE,,HEROIN TOXICITY,"YES, RESULTS ARE AVAILABLE",NOT APPLICABLE,776.0,NOT APPLICABLE,,POSSESSION OF CONTROLLED SUBSTANCE,,,CONVICTED,2009-08-25 00:00:00,PRISON,TDCJ,"2503 LAKE ROAD, SUITE 5",HUNTSVILLE,STATE,TEXAS DEPT OF CRIMINAL JUSTICE,77340,,,,,
4,PA10084CJ,0.0,V_2005,2010-05-12 11:09:00,2009-10-13 06:45:00,DENNIS,BRADFORD,EARL,,DENNIS EARL BRADFORD,1969-09-28,40.0,MALE,WHITE,2010-05-10,2010-05-10 02:11:00,GALVESTON,GALVESTON,5700 AVE H,LAW ENFORCEMENT FACILITY,,NOT APPLICABLE,SUICIDE,,"HANGING, STRANGULATION",,ASPHYXIATION,"YES, RESULTS ARE AVAILABLE",NOT APPLICABLE,208.0,NOT APPLICABLE,,ATTEMPTED CAPITAL MURDER OF A CHILD,,,FILED,2009-10-14 23:47:00,JAIL,JAIL - SINGLE CELL,601 54TH STREET,GALVESTON,GALVESTON,GALVESTON CO SHERIFFS OFFICE,77550,,,,,


In [11]:
# Build the working frame as a copy of `df` (which is later written out in full
# as the CSV) with a 'year' column derived from the dataset's date column.
slim = df.assign(year=pd.to_datetime(df[config['DATE_COL']]).dt.year)

In [12]:
# Keep only the columns the app needs, then apply the dataset's renames.
# Datasets without a 'RENAMES' entry keep their column names unchanged.
column_renames = config.get('RENAMES', {})
slim = slim[config['KEEP_COLS']].rename(columns=column_renames)
slim.head()

Unnamed: 0,record_id,year,race,sex,manner_of_death,age_at_time_of_death,type_of_custody,death_location_type,means_of_death,death_location_county,agency_name
0,16-89-C,2016,WHITE,MALE,"OTHER, SPECIFY",44.0,POLICE CUSTODY (PRE-BOOKING),CRIME/ARREST SCENE,FIREARM,GUADALUPE,SEGUIN POLICE DEPT
1,PA10081C,2010,WHITE,MALE,"OTHER, SPECIFY",26.0,POLICE CUSTODY (PRE-BOOKING),CRIME/ARREST SCENE,"OTHER, SPECIFY",BEXAR,BEXAR CO CONST PCT 3
2,PA10082P,2010,BLACK,MALE,NATURAL CAUSES/ILLNESS,54.0,PRISON,LAW ENFORCEMENT FACILITY,NOT APPLICABLE,GRIMES,TEXAS DEPT OF CRIMINAL JUSTICE
3,PA10083P,2010,WHITE,MALE,ACCIDENTAL,43.0,PRISON,LAW ENFORCEMENT FACILITY,NOT APPLICABLE,JEFFERSON,TEXAS DEPT OF CRIMINAL JUSTICE
4,PA10084CJ,2010,WHITE,MALE,SUICIDE,40.0,JAIL,LAW ENFORCEMENT FACILITY,"HANGING, STRANGULATION",GALVESTON,GALVESTON CO SHERIFFS OFFICE


In [13]:
def compress(df, id_col=None):
    """Encode a DataFrame as per-column lookup tables plus integer codes.

    Args:
        df: DataFrame to encode.
        id_col: Optional name of a column holding record identifiers. When
            given, its values are stored verbatim under meta['record_ids']
            and the column is left out of the lookups and records.

    Returns:
        A dict with two keys:
          'meta': holds 'num_columns' and 'num_records' (both measured on the
              frame as passed in, so the id column is counted), 'lookups'
              (each remaining column mapped to the sorted list of its
              distinct non-null values) and, when id_col is given,
              'record_ids'.
          'records': maps each remaining column to one integer per row: the
              position of that row's value in the column's lookup list, or
              -1 where the value is missing.
    """
    meta = {
        'num_columns': len(df.columns),
        'num_records': len(df),
        'lookups': {},
    }
    records = {}

    if id_col:
        meta['record_ids'] = {
            'field_name': id_col,
            'values': list(df[id_col]),
        }
        # Ids are unique per record, so a lookup table would not shrink them.
        df = df.drop(id_col, axis=1)

    for col in df.columns:
        series = df[col]
        distinct = sorted(set(series.dropna()))
        position = {value: idx for idx, value in enumerate(distinct)}
        meta['lookups'][col] = distinct
        records[col] = [
            -1 if pd.isnull(cell) else position[cell] for cell in series
        ]

    return {'meta': meta, 'records': records}

## For convenience, we'll inspect a sample of data to see that our code is compressing correctly

In [14]:
# Draw a small, reproducible sample. The fixed seed keeps the sample file
# stable between runs, and the size is capped so a frame with fewer than
# 5 rows does not raise.
samp = slim.sample(min(5, len(slim)), random_state=0)
samp

Unnamed: 0,record_id,year,race,sex,manner_of_death,age_at_time_of_death,type_of_custody,death_location_type,means_of_death,death_location_county,agency_name
8760,PRISON20092061,2009,WHITE,MALE,NATURAL CAUSES/ILLNESS,60.0,PRISON,,,,TEXAS DEPT OF CRIMINAL JUSTICE
6499,PRISON201147,2011,WHITE,MALE,NATURAL CAUSES/ILLNESS,56.0,PRISON,,,,TEXAS DEPT OF CRIMINAL JUSTICE
6507,PRISON2011460,2011,WHITE,MALE,NATURAL CAUSES/ILLNESS,59.0,PRISON,,,,TEXAS DEPT OF CRIMINAL JUSTICE
7093,TEXASTECH3,2008,BLACK,MALE,HOMICIDE,25.0,POLICE CUSTODY (PRE-BOOKING),,,,HOUSTON POLICE DEPT
3984,PA13384P,2013,BLACK,MALE,SUICIDE,34.0,PRISON,LAW ENFORCEMENT FACILITY,"HANGING, STRANGULATION",CHEROKEE,TEXAS DEPT OF CRIMINAL JUSTICE


In [15]:
# Compress only the sample so the resulting structure is small enough to
# inspect by eye. ID_COL is None for datasets without a record-id column.
samp_compressed = compress(samp, id_col=config['ID_COL'])
samp_compressed

{'meta': {'lookups': {'age_at_time_of_death': [25.0, 34.0, 56.0, 59.0, 60.0],
   'agency_name': ['HOUSTON POLICE DEPT', 'TEXAS DEPT OF CRIMINAL JUSTICE'],
   'death_location_county': ['CHEROKEE'],
   'death_location_type': ['LAW ENFORCEMENT FACILITY'],
   'manner_of_death': ['HOMICIDE', 'NATURAL CAUSES/ILLNESS', 'SUICIDE'],
   'means_of_death': ['HANGING, STRANGULATION'],
   'race': ['BLACK', 'WHITE'],
   'sex': ['MALE'],
   'type_of_custody': ['POLICE CUSTODY (PRE-BOOKING)', 'PRISON'],
   'year': [2008, 2009, 2011, 2013]},
  'num_columns': 11,
  'num_records': 5,
  'record_ids': {'field_name': 'record_id',
   'values': ['PRISON20092061',
    'PRISON201147',
    'PRISON2011460',
    'TEXASTECH3',
    'PA13384P']}},
 'records': {'age_at_time_of_death': [4, 2, 3, 0, 1],
  'agency_name': [1, 1, 1, 0, 1],
  'death_location_county': [-1, -1, -1, -1, 0],
  'death_location_type': [-1, -1, -1, -1, 0],
  'manner_of_death': [1, 1, 1, 0, 2],
  'means_of_death': [-1, -1, -1, -1, 0],
  'race': [1, 

---
# Write
---

In [19]:
import os
import boto3

def write_to_s3(filename):
    """Upload one generated file to the 'tji-compressed-data' S3 bucket.

    The upload is opt-in: it only happens when the environment variable
    COMPRESS_<DATASET>_S3 is exactly 'TRUE'; otherwise this does nothing.

    Args:
        filename: Bare file name (no folder) of a file already written under
            OUTFOLDER. The same name is used as the object key in the bucket.
    """
    if os.environ.get('COMPRESS_%s_S3' % DATASET) != 'TRUE':
        return
    # The write cells save to OUTFOLDER + name, so read from that same path
    # rather than assuming the file sits in the current working directory.
    local_path = OUTFOLDER + filename
    s3 = boto3.resource('s3')
    s3.Bucket('tji-compressed-data').upload_file(local_path, filename)

## For testing, generate a sample compressed file with just a few records (json file)

In [17]:
# The sample file is pretty-printed (indent=2) so it can be read by hand.
sample_basename = config['OUTFILE_PREFIX'] + '_compressed_sample.json'
filename = OUTFOLDER + sample_basename
print("Writing sample compressed file to", filename)
with open(filename, 'w') as out:
    sample_json = json.dumps(samp_compressed, indent=2)
    out.write(sample_json)
# The upload is keyed by the bare file name, without the output folder.
write_to_s3(sample_basename)

('Writing sample compressed file to', './cdr_compressed_sample.json')


## Generate the full compressed dataset (json file)

In [18]:
# The full file is serialized without indentation, unlike the sample file.
compressed_basename = config['OUTFILE_PREFIX'] + '_compressed.json'
filename = OUTFOLDER + compressed_basename
print("Writing full compressed file to", filename)
with open(filename, 'w') as out:
    full_json = json.dumps(compress(slim, id_col=config['ID_COL']))
    out.write(full_json)
# The upload is keyed by the bare file name, without the output folder.
write_to_s3(compressed_basename)

('Writing full compressed file to', './cdr_compressed.json')


## Also write the full, uncompressed CSV file

In [13]:
# The CSV is written from `df`, not `slim`, so it keeps every original column.
fullfile = ''.join((OUTFOLDER, config['OUTFILE_PREFIX'], '_full.csv'))
print("Writing full file to " + fullfile)
df.to_csv(path_or_buf=fullfile, index=False)

Writing full file to ./ois_full.csv
