# Generate a minimalist, compressed version of CDR/OIS data for our website's explore-the-data app

## NOTE: This is a temporary file, only existing until this code lives in a cron job somewhere

### Purpose of this notebook

This notebook generates the data files that our explore-the-data page uses. There are two for each dataset:
1. A compressed file with just enough data to show the charts, e.g. `cdr_compressed.json`
2. The full CSV file, with all columns, in the same order as the compressed file, e.g. `cdr_full.csv`

### Instructions

Simply run this notebook top to bottom to generate a bunch of new datafiles.

### About the compressed file

Say we have a set of records like this:
```
   [
      {"sex": "MALE", "race": "WHITE", "record_id": "PA111"},
      {"sex": "MALE", "race": "HISPANIC", "record_id": "PA222"},
      {"sex": "FEMALE", "race": "BLACK", "record_id": "PA333"},
      {"sex": "FEMALE", "race": null, "record_id": "PA444"},
   ]
```

We will compress them to look like this:
```
    {
      meta: {
        lookups: {
          "sex": ["FEMALE", "MALE"],
          "race": ["BLACK", "HISPANIC", "WHITE"]
        },
        'record_ids': {
            'field_name': 'record_id',
            'values': ["PA111", "PA222", "PA333", "PA444"]
        },
        num_records: 4,
        num_columns: 3
      },
      records: {
        "sex": [1, 1, 0, 0],
        "race": [2, 1, 0, -1]
      },
    }
```
Note that each array in the 'records' object above contains indices into the (sorted) lookup array for that column.  The value is -1 for missing values.  `num_columns` is counted before the ID column is split out, so it includes `record_id` here.

In practice, this cuts our data size down dramatically by avoiding repeated keys or repeating long string values.

##### Author: Everett Wetchler (everett.wetchler@gmail.com)

## Edit this if you want to tweak what data ends up in the compressed file, or where it's written

In [1]:
# Groups the CONFIGS entries by dataset name. The cron-job cell walks this
# mapping: the key is used to build the COMPRESS_<KEY>_S3 environment variable
# name, and each listed value is looked up in CONFIGS.
CONFIG_MAPPING = {
    'cdr': [
        'cdr',
    ],
    'ois': [
        'ois-civilians',
        'ois-officers',
    ],
}

In [2]:
# Directory prefix prepended to every output filename (keep the trailing slash).
OUTFOLDER = './'

# One entry per generated dataset. Keys read by create_one():
#   DTW_PROJECT_KEY - data.world project passed to dw.load_dataset()
#   DTW_FILENAME    - name of the dataframe to pull out of that project
#   OUTFILE_PREFIX  - stem of the output files (<prefix>_compressed.json / <prefix>_full.csv)
#   DATE_COL        - column parsed as a date to derive the 'year' column
#   ID_COL          - column stored as record ids instead of being compressed (None = no id column)
#   KEEP_COLS       - columns kept in the compressed file, named as they are BEFORE renaming
#   RENAMES         - optional {old_name: new_name} mapping applied after column selection
CONFIGS = {
    'cdr': {
        'DTW_PROJECT_KEY': 'tji/deaths-in-custody',
        'DTW_FILENAME': 'cleaned_custodial_death_reports',
        'OUTFILE_PREFIX': 'cdr',
        'DATE_COL': 'death_date',
        'ID_COL': 'record_id',
        'KEEP_COLS': [
            'record_id',
            'year',
            'race',
            'sex',
            'manner_of_death',
            'age_at_time_of_death',
            'type_of_custody',
            'death_location_type',
            'means_of_death',
            'death_location_county',
            'agency_name',
        ],
    },
    'ois-civilians': {
        'DTW_PROJECT_KEY': 'tji/officer-involved-shootings',
        'DTW_FILENAME': 'shot_civilians',
        'OUTFILE_PREFIX': 'ois',
        'DATE_COL': 'date_incident',
        'ID_COL': None,
        'KEEP_COLS': [
            'year',
            'civilian_race',
            'civilian_gender',
            'civilian_age',
            'civilian_died',
            'officer_age_1',
            'officer_race_1',
            'officer_gender_1',
            'incident_result_of',
            'incident_county',
            'agency_name_1',
            'deadly_weapon',
            'multiple_officers_involved',
        ],
        'RENAMES': {
            'officer_gender_1': 'officer_gender',
            'officer_age_1': 'officer_age',
            'officer_race_1': 'officer_race',
            'agency_name_1': 'agency_name',
        },
    },
    'ois-officers': {
        'DTW_PROJECT_KEY': 'tji/officer-involved-shootings',
        'DTW_FILENAME': 'shot_officers',
        'OUTFILE_PREFIX': 'ois_officers',
        'DATE_COL': 'date_incident',
        'ID_COL': None,
        'KEEP_COLS': [
            'year',
            'civilian_race_1',
            'civilian_gender_1',
            'civilian_age_1',
            'civilian_died',
            'officer_age',
            'officer_race',
            'officer_gender',
            'incident_county',
            'agency_name_1',
        ],
        'RENAMES': {
            'agency_name_1': 'agency_name',
        },
    },
}

## Import and go

In [3]:
import os
import boto3
import datadotworld as dw
import numpy as np
import pandas as pd
import simplejson as json

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

%load_ext watermark
%watermark -a "Everett Wetchler" -d -t -z -w -p numpy,pandas,datadotworld

Everett Wetchler 2018-12-02 01:06:11 CST

numpy 1.15.3
pandas 0.23.4
datadotworld 1.6.0
watermark 1.7.0


In [4]:
def compress(df, id_col=None):
    """Encode a DataFrame as per-column lookup tables plus integer index arrays.

    Args:
        df: DataFrame to encode. It is not modified.
        id_col: Optional name of a column holding record identifiers. When
            given, its values are stored verbatim under meta['record_ids']
            and the column is left out of the lookups/records encoding.

    Returns:
        A dict with two keys:
          'meta'    - 'num_columns' and 'num_records' (both measured on the
                      frame as passed in, i.e. including the id column),
                      'lookups' mapping each column to its sorted distinct
                      non-null values, and 'record_ids' when id_col is set.
          'records' - maps each column to a list with one entry per row: the
                      position of that row's value in the column's lookup
                      list, or -1 where the value is null.
    """
    meta = {
        'num_columns': len(df.columns),
        'num_records': len(df),
        'lookups': {},
    }
    records = {}

    if id_col:
        meta['record_ids'] = {'field_name': id_col, 'values': list(df[id_col])}
        df = df.drop(columns=[id_col])

    for col in df.columns:
        series = df[col]
        distinct = sorted(set(series.dropna()))
        position = {value: idx for idx, value in enumerate(distinct)}
        meta['lookups'][col] = distinct
        # -1 marks a missing value; everything else is an index into `distinct`.
        records[col] = [-1 if pd.isnull(item) else position[item] for item in series]

    return {'meta': meta, 'records': records}

In [5]:
def write_to_s3(filename, key=None):
    """Upload a local file to the 'tji-compressed-data' S3 bucket.

    Args:
        filename: Path of the local file to upload.
        key: Object key to store the file under. Defaults to the file's base
            name, so that local directory parts (e.g. the leading './' that
            OUTFOLDER adds) do not become part of the S3 key.
    """
    if key is None:
        key = os.path.basename(filename)
    s3 = boto3.resource('s3')
    s3.Bucket('tji-compressed-data').upload_file(filename, key)

In [6]:
def create_one(config, sample=False, s3_upload=False):
    """Download one dataset and write its compressed JSON and full CSV files.

    Args:
        config: One entry of CONFIGS. Reads DTW_PROJECT_KEY, DTW_FILENAME,
            DATE_COL, KEEP_COLS, ID_COL, OUTFILE_PREFIX and, if present,
            RENAMES.
        sample: When true, only a small random sample of rows (at most 5) is
            written, and both output filenames get a "SAMPLE_" prefix.
        s3_upload: When true, the compressed JSON file is also uploaded via
            write_to_s3().

    Side effects:
        Writes <OUTFOLDER><prefix><OUTFILE_PREFIX>_compressed.json and
        <OUTFOLDER><prefix><OUTFILE_PREFIX>_full.csv. Both files describe the
        same rows in the same order.
    """
    datasets = dw.load_dataset(config['DTW_PROJECT_KEY'], force_update=True)
    df = datasets.dataframes[config['DTW_FILENAME']]

    prefix = ""
    if sample:
        # Sample the source rows *before* slimming, so the full CSV and the
        # compressed file contain exactly the same records in the same order.
        # min() keeps this from raising on datasets with fewer than 5 rows.
        df = df.sample(min(5, len(df)))
        prefix = "SAMPLE_"

    slim = df.copy()
    slim['year'] = pd.to_datetime(slim[config['DATE_COL']]).dt.year
    slim = slim[config['KEEP_COLS']]
    renames = config.get('RENAMES', {})
    slim.columns = [renames.get(c, c) for c in slim.columns]
    compressed = compress(slim, id_col=config['ID_COL'])

    # Write
    filename = OUTFOLDER + prefix + config['OUTFILE_PREFIX'] + '_compressed.json'
    print("Writing full compressed file to", filename)
    with open(filename, 'w') as f:
        # Reuse the result computed above instead of compressing a second time.
        f.write(json.dumps(compressed))
    if s3_upload:
        write_to_s3(filename)
    fullfile = OUTFOLDER + prefix + config['OUTFILE_PREFIX'] + '_full.csv'
    print("Writing full file to " + fullfile)
    df.to_csv(fullfile, index=False)

## For local development

In [7]:
# Regenerate every configured dataset locally: first the small SAMPLE_ variant,
# then the complete files. Nothing is uploaded to S3 here.
for k, config in CONFIGS.items():
    print(k)
    for sample in (True, False):
        create_one(config, sample=sample)

cdr
Writing full compressed file to ./SAMPLE_cdr_compressed.json
Writing full file to ./SAMPLE_cdr_full.csv
Writing full compressed file to ./cdr_compressed.json
Writing full file to ./cdr_full.csv
ois-civilians
Writing full compressed file to ./SAMPLE_ois_compressed.json
Writing full file to ./SAMPLE_ois_full.csv
Writing full compressed file to ./ois_compressed.json
Writing full file to ./ois_full.csv
ois-officers
Writing full compressed file to ./SAMPLE_ois_officers_compressed.json
Writing full file to ./SAMPLE_ois_officers_full.csv
Writing full compressed file to ./ois_officers_compressed.json
Writing full file to ./ois_officers_full.csv


## For cronjob

In [8]:
# Regenerate and upload a dataset only when its COMPRESS_<DATASET>_S3
# environment variable is set to exactly 'TRUE'; otherwise just say how to
# enable it.
for dataset, configs in CONFIG_MAPPING.items():
    if os.environ.get('COMPRESS_%s_S3' % dataset.upper()) == 'TRUE':
        for config in configs:
            create_one(CONFIGS[config], s3_upload=True)
        continue
    print("Not writing to s3. To do so, set COMPRESS_%s_S3 to 'TRUE'" % dataset.upper())

Not writing to s3. To do so, set COMPRESS_CDR_S3 to 'TRUE'
Not writing to s3. To do so, set COMPRESS_OIS_S3 to 'TRUE'
