# Generate a minimalist, compressed version of the custodial death report (CDR) data for our website's explore-the-data app

##### Author: Everett Wetchler (everett.wetchler@gmail.com)

In [1]:
# Directory the output JSON files are written to. Filenames are appended to
# this value by plain string concatenation, so it must end with a path separator.
OUTFOLDER = './'

In [2]:
import datadotworld as dw
import numpy as np
import pandas as pd
import simplejson as json

# Let pandas render up to 100 rows and 100 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

# Provenance stamp: prints the author, run date/time/timezone, and the versions
# of the listed packages (plus watermark itself) below this cell.
%load_ext watermark
%watermark -a 'Everett Wetchler' -d -t -z -w -p numpy,pandas,datadotworld

Everett Wetchler 2018-06-27 16:18:28 CDT

numpy 1.14.5
pandas 0.23.1
datadotworld 1.6.0
watermark 1.6.1


In [3]:
# data.world dataset key, and the name of the table within it to load.
DTW_PROJECT_KEY = 'tji/deaths-in-custody'
FILENAME = 'cleaned_custodial_death_reports'

# NOTE(review): force_update=True presumably refreshes the local copy from
# data.world on every run (requires network access) -- confirm against the
# datadotworld documentation.
datasets = dw.load_dataset(DTW_PROJECT_KEY, force_update=True)
cdr = datasets.dataframes[FILENAME]
# Preview the first rows.
# NOTE(review): the rendered preview includes individual names and street
# addresses; consider clearing this output before sharing the notebook.
cdr.head()

Unnamed: 0,record_id,num_revisions,form_version,report_date,date_time_of_custody_or_incident,name_first,name_last,name_middle,name_suffix,name_full,date_of_birth,age_at_time_of_death,sex,race,death_date,death_date_and_time,death_location_county,death_location_city,death_location_street_address,death_location_latitude,death_location_longitude,death_location_census_tract,death_location_type,death_location_type_other,death_from_pre_existing_medical_condition,manner_of_death,manner_of_death_description,means_of_death,means_of_death_other,medical_cause_of_death,medical_examinor_coroner_evalution,medical_treatment,days_from_custody_to_death,who_caused_death_in_homicide_or_accident,who_caused_death_in_homicide_or_accident_other,offense_1,offense_2,offense_3,were_the_charges,facility_entry_date_time,type_of_custody,specific_type_of_custody_facility,agency_address,agency_city,agency_county,agency_name,agency_zip,entry_behavior,other_behavior,exhibit_any_medical_problems,exhibit_any_mental_health_problems,make_suicidal_statements
0,PA05001C,0.0,V_2005,2005-03-02 14:48:00,2005-01-01 15:04:00,DINO,GOMEZ,,,DINO GOMEZ,1964-04-03,40.0,M,HISPANIC,2005-01-01,2005-01-01 15:04:00,TARRANT,FORT WORTH,1509 W. HAMMOND,,,,CRIME/ARREST SCENE,,NOT APPLICABLE,HOMICIDE,,FIREARM,,GUNSHOT WOUND TO THE CHEST,"YES, RESULTS ARE AVAILABLE",NOT APPLICABLE,0.0,LAW ENFORCEMENT/CORRECTIONAL PERSONNEL,,AGGRAVATED ASSAULT,,,NOT FILED AT TIME OF DEATH,,POLICE CUSTODY (PRE-BOOKING),CUSTODY OF LAW ENFORCEMENT PERSONNEL DURING/FL...,350 W. BELKNAP,FORT WORTH,TARRANT,FORT WORTH POLICE DEPT,76102,,,,,
1,PA14199CJ,0.0,V_2005,2014-04-28 09:27:00,2014-04-17 09:49:00,ALISHA,SKEATS,,,ALISHA SKEATS,1981-03-23,33.0,F,WHITE,2014-04-20,2014-04-20 08:14:00,DALLAS,DALLAS,PARKLAND HOSPITAL,,,,MEDICAL FACILITY,,UNKNOWN,NATURAL CAUSES/ILLNESS,COMPLICATIONS OF CHRONIC OPIATE USE,NOT APPLICABLE,,"COMPLICATIONS OF CHRONIC OPIATE, BENZODIAZEPIN...","YES, RESULTS ARE AVAILABLE",YES,2.0,NOT APPLICABLE,,THEFT OF SERVICE,,,FILED,2014-04-17 09:49:00,JAIL,HOSPITAL/INFIRMARY,133 N. RIVERFRONT BLVD.,DALLAS,DALLAS,DALLAS CO SHERIFFS OFFICE,75207,,,,,
2,PA14198P,0.0,V_2005,2014-04-25 15:15:00,1998-10-06 00:00:00,ORVILLE,MCCOY,LEE,,ORVILLE LEE MCCOY,1949-11-02,64.0,M,WHITE,2014-04-23,2014-04-23 09:07:00,ANDERSON,TENNESSEE COLONY,"MICHAEL UNIT, 2664 FM 2054",,,,LAW ENFORCEMENT FACILITY,,UNKNOWN,NATURAL CAUSES/ILLNESS,MALIGNANT PLEURAL EFFUSION GT BLEESING/ANEMIA,NOT APPLICABLE,,ESLD-LIVER DISEASE,"NO, EVALUATION NOT PLANNED",YES,5678.0,NOT APPLICABLE,,AGGRAVATED SEXUAL ASSAULT OF A CHILD,,,CONVICTED,1998-10-06 00:00:00,PRISON,TDCJ,"2503 LAKE ROAD, SUITE 5",HUNTSVILLE,STATE,TEXAS DEPT OF CRIMINAL JUSTICE,77340,,,,,
3,PA14197P,0.0,V_2005,2014-04-25 13:25:00,2012-03-19 00:00:00,PEDRO,JUAREZ,,JR,PEDRO JUAREZ JR,1960-05-11,53.0,M,HISPANIC,2014-04-16,2014-04-16 06:44:00,BOWIE,NEW BOSTON,3899 STATE HWY 98,,,,LAW ENFORCEMENT FACILITY,,UNKNOWN,NATURAL CAUSES/ILLNESS,CARDIAC ARRHYTHMIA,NOT APPLICABLE,,SUDDEN UNEXPECTED DEATH IN SCHIZOPHRENIA DUE T...,"YES, RESULTS ARE AVAILABLE",NOT APPLICABLE,758.0,NOT APPLICABLE,,AGGRAVATED ASSAULT,,,CONVICTED,2012-04-25 00:00:00,PRISON,TDCJ,"2503 LAKE ROAD, SUITE 5",HUNTSVILLE,STATE,TEXAS DEPT OF CRIMINAL JUSTICE,77340,,,,,
4,PA14196P,0.0,V_2005,2014-04-25 11:14:00,2014-02-03 00:00:00,JAMES,BENJAMIN,NEAL,,JAMES NEAL BENJAMIN,1969-10-03,44.0,M,BLACK,2014-04-23,2014-04-23 13:35:00,FORT BEND,RICHMON,JESTER 4 UNIT-4 JESTER RD.,,,,MEDICAL FACILITY,,UNKNOWN,NATURAL CAUSES/ILLNESS,THROMBOEMBOLISM,NOT APPLICABLE,,ACUTE PULMONARY THROMBOEMBOLISM,"YES, RESULTS ARE AVAILABLE",NOT APPLICABLE,79.0,NOT APPLICABLE,,FELONY THEFT ( 1YR PROBATION) SAFPF,,,PROBATION/PAROLE VIOLATION,2014-02-03 00:00:00,PRISON,TDCJ,"2503 LAKE ROAD, SUITE 5",HUNTSVILLE,STATE,TEXAS DEPT OF CRIMINAL JUSTICE,77340,,,,,


In [4]:
# Add a `year` column holding the calendar year of each death_date.
cdr['year'] = cdr['death_date'].dt.year

In [5]:
# Columns kept in the slimmed-down frame that gets compressed and exported.
slim_columns = [
  'year', 'race', 'sex', 'manner_of_death', 'age_at_time_of_death',
  'type_of_custody', 'death_location_type', 'means_of_death',
]
slim = cdr[slim_columns]
slim.head()

Unnamed: 0,year,race,sex,manner_of_death,age_at_time_of_death,type_of_custody,death_location_type,means_of_death
0,2005,HISPANIC,M,HOMICIDE,40.0,POLICE CUSTODY (PRE-BOOKING),CRIME/ARREST SCENE,FIREARM
1,2014,WHITE,F,NATURAL CAUSES/ILLNESS,33.0,JAIL,MEDICAL FACILITY,NOT APPLICABLE
2,2014,WHITE,M,NATURAL CAUSES/ILLNESS,64.0,PRISON,LAW ENFORCEMENT FACILITY,NOT APPLICABLE
3,2014,HISPANIC,M,NATURAL CAUSES/ILLNESS,53.0,PRISON,LAW ENFORCEMENT FACILITY,NOT APPLICABLE
4,2014,BLACK,M,NATURAL CAUSES/ILLNESS,44.0,PRISON,MEDICAL FACILITY,NOT APPLICABLE


In [6]:
# Distinct values of the race column, including the missing-value marker.
set(slim['race'])

{'BLACK', 'HISPANIC', 'OTHER', 'WHITE', nan}

In [7]:
def compress(df):
    """Encode every column of ``df`` as integer codes plus a lookup table.

    For each column, the distinct non-null values are sorted to form a lookup
    list, and each cell is replaced by the position of its value in that list.
    Null cells are encoded as ``-1``.

    Args:
        df: pandas DataFrame whose column values are hashable and mutually
            sortable within each column.

    Returns:
        A dict with two keys:
          * ``'meta'``: ``num_columns``, ``num_records`` and ``lookups``
            (column name -> sorted list of distinct non-null values).
          * ``'records'``: column name -> list of integer codes, one per row,
            in the frame's row order.
    """
    lookups = {}
    records = {}
    for col in df.columns:
        column = df[col]

        # A value's position in the sorted distinct list is its integer code.
        distinct = list(set(column.dropna()))
        distinct.sort()
        code_of = {value: code for code, value in enumerate(distinct)}

        def encode(cell, code_of=code_of):
            # Nulls have no lookup entry; -1 marks them in the output.
            if pd.isnull(cell):
                return -1
            return code_of[cell]

        lookups[col] = distinct
        records[col] = column.apply(encode).tolist()

    return {
        'meta': {
            'num_columns': len(df.columns),
            'num_records': len(df),
            'lookups': lookups,
        },
        'records': records,
    }

In [8]:
# Draw a small sample for a human-readable example file. The seed is fixed so
# the sample (and the sample JSON written from it) is identical on every
# Restart & Run All instead of changing with each execution.
samp = slim.sample(5, random_state=0)
samp

Unnamed: 0,year,race,sex,manner_of_death,age_at_time_of_death,type_of_custody,death_location_type,means_of_death
4501,2009,BLACK,M,NATURAL CAUSES/ILLNESS,39.0,PRISON,MEDICAL FACILITY,NOT APPLICABLE
6168,2012,HISPANIC,M,NATURAL CAUSES/ILLNESS,48.0,PRISON,,
4113,2011,WHITE,M,SUICIDE,30.0,JAIL,MEDICAL FACILITY,"HANGING, STRANGULATION"
3295,2005,OTHER,M,HOMICIDE,49.0,POLICE CUSTODY (PRE-BOOKING),CRIME/ARREST SCENE,FIREARM
32,2014,WHITE,M,HOMICIDE,42.0,POLICE CUSTODY (PRE-BOOKING),CRIME/ARREST SCENE,FIREARM


In [9]:
# Compress the sampled rows and display the resulting structure.
# NOTE: this rebinds `samp` from the sampled DataFrame to the compressed dict,
# so the cell cannot be re-run on its own -- re-run the sampling cell first
# (compress() reads `df.columns`, which a dict does not have).
samp = compress(samp)
samp

{'meta': {'num_columns': 8,
  'num_records': 5,
  'lookups': {'year': [2005, 2009, 2011, 2012, 2014],
   'race': ['BLACK', 'HISPANIC', 'OTHER', 'WHITE'],
   'sex': ['M'],
   'manner_of_death': ['HOMICIDE', 'NATURAL CAUSES/ILLNESS', 'SUICIDE'],
   'age_at_time_of_death': [30.0, 39.0, 42.0, 48.0, 49.0],
   'type_of_custody': ['JAIL', 'POLICE CUSTODY (PRE-BOOKING)', 'PRISON'],
   'death_location_type': ['CRIME/ARREST SCENE', 'MEDICAL FACILITY'],
   'means_of_death': ['FIREARM', 'HANGING, STRANGULATION', 'NOT APPLICABLE']}},
 'records': {'year': [1, 3, 2, 0, 4],
  'race': [0, 1, 3, 2, 3],
  'sex': [0, 0, 0, 0, 0],
  'manner_of_death': [1, 1, 2, 0, 0],
  'age_at_time_of_death': [1, 3, 0, 4, 2],
  'type_of_custody': [2, 2, 0, 1, 1],
  'death_location_type': [1, -1, 1, 0, 0],
  'means_of_death': [2, -1, 1, 0, 0]}}

In [10]:
# Write the compressed sample as indented (human-readable) JSON.
with open(OUTFOLDER + 'cdr_compressed_sample.json', 'w') as out_file:
    pretty_json = json.dumps(samp, indent=2)
    out_file.write(pretty_json)

In [11]:
# Compress the full slimmed dataset and write it as compact (unindented) JSON.
js = compress(slim)
with open(OUTFOLDER + 'cdr_compressed.json', 'w') as out_file:
    compact_json = json.dumps(js)
    out_file.write(compact_json)