# Generate a minimalist, compressed version of CDR/OIS data for our website's explore-the-data app

## NOTE: This is a temporary file, only existing until this code lives in a cron job somewhere

### Purpose of this notebook

This notebook generates the data files to that our explore the data page uses. There are two for each dataset:
1. A compressed file with just enough data to show the charts, e.g. `cdr_compressed.json`
2. The full CSV file, with all columns, in the same order as the compressed file, e.g. `cdr_full.csv`

### Instructions

Simply run this notebook top to bottom to generate a bunch of new datafiles.

### About the compressed file

Say we have a set or records like this:
```
   [
      {"sex": "MALE", "race": "WHITE", "record_id": "PA111"},
      {"sex": "MALE", "race": "HISPANIC", "record_id": "PA222"},
      {"sex": "FEMALE", "race": "BLACK", "record_id": "PA333"},
      {"sex": "FEMALE", "race": null, "record_id": "PA444"},
   ]
```

We will compress them to look like this:
```
    {
      meta: {
        lookups: {
          "sex": ["FEMALE", "MALE"],
          "race": ["HISPANIC", "WHITE", "BLACK"]
        },
        'record_ids': {
            'field_name': 'record_id',
            'values': ["PA111", "PA222", "PA333", "PA444"]
        }
        num_records: 3,
        num_columns: 2
      },
      records: {
        "sex": [1, 1, 0, 0],
        "race": [1, 0, 2, -1]
      },
    }
```
Note that the 'records' object above contains indices in the lookup array for that column.  The value is -1 for missing values.

In practice, this cuts our data size down dramatically by avoiding repeated keys or repeating long string values.

In order to write compressed and slider files to s3, set the environment variables COMPRESS_CDR_S3 and/or COMPRESS_OIS_S3 to 'TRUE' before running this notebook.

##### Author: Everett Wetchler (everett.wetchler@gmail.com),  Aiden Yang (alyang250@gmail.com), and Dashiel Lopez Mendez (hi@dashiel.dev)

## Edit this if you want to tweak what data ends up in the compressed file, or where it's written

In [1]:
CONFIG_MAPPING = {
    'cdr': ['cdr'],
    'ois': ['ois-civilians', 'ois-officers']
}

In [2]:
OUTFOLDER = './'  # Where to write the resulting files

S3_BUCKET_NAME = 'tji-compressed-data'

CONFIGS = {
    'cdr': {
        'DW_PROJECT_KEY': 'tji/deaths-in-custody',
        'DW_FILENAME': 'cleaned_custodial_death_reports',
        'OUTFILE_PREFIX': 'cdr',
        'DATE_COL': 'death_date',
        'ID_COL': 'record_id',
        'KEEP_COLS': [
            'record_id', 'year', 'race', 'sex', 'manner_of_death', 'age_at_time_of_death',
            'type_of_custody', 'death_location_type', 'means_of_death', 'death_location_county', 'agency_name'
        ]
    },
    'ois-civilians': {
        'DW_PROJECT_KEY': 'tji/officer-involved-shootings',
        'DW_FILENAME': 'shot_civilians',
        'OUTFILE_PREFIX': 'ois',
        'DATE_COL': 'date_incident',
        'ID_COL': None,
        'KEEP_COLS': [
            'year', 'civilian_race', 'civilian_gender', 'civilian_age', 'civilian_died',
            'officer_age_1', 'officer_race_1', 'officer_gender_1', 'incident_result_of',
            'incident_county', 'agency_name_1', 'deadly_weapon',
            'multiple_officers_involved'
        ],
        'RENAMES': {
            'officer_gender_1': 'officer_gender',
            'officer_age_1': 'officer_age',
            'officer_race_1': 'officer_race',
            'agency_name_1': 'agency_name',
        }
    },
    'ois-officers': {
        'DW_PROJECT_KEY': 'tji/officer-involved-shootings',
        'DW_FILENAME': 'shot_officers',
        'OUTFILE_PREFIX': 'ois_officers',
        'DATE_COL': 'date_incident',
        'ID_COL': None,
        'KEEP_COLS': [
            'year', 'civilian_race_1', 'civilian_gender_1', 'civilian_age_1', 'civilian_harm',
            'officer_age', 'officer_race', 'officer_gender', 'officer_harm',
            'incident_county', 'agency_name_1'
        ],
        'RENAMES': {
            'agency_name_1': 'agency_name',
        }
    }
}

## Import and go

In [40]:
import gzip
import os
import shutil
import boto3
import datadotworld as dw
import numpy as np
import pandas as pd
import json

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# %load_ext watermark
# %watermark -a "Everett Wetchler Aiden Yang, and Dashiel Lopez Mendez" -d -t -z -w -p numpy,pandas,datadotworld

In [20]:
def compress_original(df, id_col=None):
    js = {
        'meta': {
            'num_columns': len(df.columns),
            'num_records': len(df),
            'lookups': {},
        },
        'records': {},
    }
    if id_col:
        js['meta']['record_ids'] = {
            'field_name': id_col,
            'values': list(df[id_col])
        }
        df = df.drop(id_col, axis=1)
    for col in df.columns:
        values = sorted(list(set(df[col].dropna())))
        mapping = dict((v, i) for i, v in enumerate(values))
        js['meta']['lookups'][col] = values
        js['records'][col] = df[col].apply(lambda x: -1 if pd.isnull(x) else mapping[x]).tolist()

    return js

In [37]:
def compress_new(df, id_col=None):
    def get_age_group(age):
        if age == 'not given':
            return age
        elif age < 18:
            return 'under 18'
        elif age < 30:
            return '18 to 29'
        elif age < 40:
            return '30 to 39'
        elif age < 50:
            return '40 to 49'
        elif age < 60:
            return '50 to 59'
        else:
            return '60 and up'

    js = {'records': {}}
    if id_col:
        df = df.drop(id_col, axis=1)
    for col in df.columns:
        js['records'][col] = df[col].fillna('not given').tolist()
        if col in ('age_at_time_of_death', 'civilian_age', 'officer_age'):
            js['records']['age_group'] = df[col].apply(lambda x: get_age_group(x)).tolist()

    return js

In [8]:
def write_slider_data_to_s3(slider_data):
    print("Uploading all slider data to s3")
    s3 = boto3.resource('s3')
    slider_file = s3.Object('tji-compressed-data', 'all_slider_data.json')
    slider_file.put(Body=json.dumps(slider_data))

In [41]:
def create_one(config, sample=False, s3_upload=False, accum_slider_data=None):
    datasets = dw.load_dataset(config['DW_PROJECT_KEY'], force_update=True)
    df = datasets.dataframes[config['DW_FILENAME']]

    slim = df.copy()
    slim['year'] = pd.to_datetime(slim[config['DATE_COL']]).dt.year
    slim = slim[config['KEEP_COLS']]
    slim.columns = [config.get('RENAMES', {}).get(c, c) for c in slim.columns]
    prefix = ""
    if sample:
        slim = slim.sample(5)
        prefix = "SAMPLE_"
    
    compressed = compress_original(slim, id_col=config['ID_COL'])
    compressed_new = compress_new(slim, id_col=config['ID_COL'])
    # Write
    if not s3_upload:
        filename = OUTFOLDER + prefix + config['OUTFILE_PREFIX'] + '_compressed.json'
        print("Writing file to", filename)
        with open(filename, 'w') as f:
            f.write(json.dumps(compressed))

        filename = OUTFOLDER + prefix + config['OUTFILE_PREFIX'] + '_compressed_new.json'
        print("Writing file to", filename)
        with open(filename, 'w') as f:
            f.write(json.dumps(compressed_new))

        filename = OUTFOLDER + prefix + config['OUTFILE_PREFIX'] + '_compressed_new.json'
        print("Writing file to", filename + '.gz')
        with open(filename, 'rb') as f_in, gzip.open(filename + '.gz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        fullfile = OUTFOLDER + prefix + config['OUTFILE_PREFIX'] + '_full.csv'
        print("Writing file to " + fullfile)
        df.to_csv(fullfile, index=False)
    else:
        s3 = boto3.resource('s3')
        slider_data = {
            'startingYear': compressed['meta']['lookups']['year'][0],
            'totalRecords': compressed['meta']['num_records']
        }
        accum_slider_data[config['OUTFILE_PREFIX']] = slider_data
        s3_filename = prefix + config['OUTFILE_PREFIX'] + '_compressed.json'
        print("Uploading file " + s3_filename + " to s3")
        s3.Bucket(S3_BUCKET_NAME).upload_file(s3_filename, s3_filename)

        s3_filename = prefix + config['OUTFILE_PREFIX'] + '_compressed_new.json'
        print("Uploading file " + s3_filename + " to s3")
        s3.Bucket(S3_BUCKET_NAME).upload_file(s3_filename, s3_filename)
                     
        s3_filename = prefix + config['OUTFILE_PREFIX'] + '_compressed_new.json.gz'
        print("Uploading file " + s3_filename + " to s3")
        s3.Bucket(S3_BUCKET_NAME).upload_file(s3_filename, s3_filename)

## Write the files

In [42]:
for k, config in CONFIGS.items():
    print(k)
    create_one(config, sample=True)
    create_one(config, sample=False)

cdr
Writing file to ./SAMPLE_cdr_compressed.json
Writing file to ./SAMPLE_cdr_compressed_new.json
Writing file to ./SAMPLE_cdr_compressed_new.json.gz
Writing file to ./SAMPLE_cdr_full.csv
Writing file to ./cdr_compressed.json
Writing file to ./cdr_compressed_new.json
Writing file to ./cdr_compressed_new.json.gz
Writing file to ./cdr_full.csv
ois-civilians
Writing file to ./SAMPLE_ois_compressed.json
Writing file to ./SAMPLE_ois_compressed_new.json
Writing file to ./SAMPLE_ois_compressed_new.json.gz
Writing file to ./SAMPLE_ois_full.csv
Writing file to ./ois_compressed.json
Writing file to ./ois_compressed_new.json
Writing file to ./ois_compressed_new.json.gz
Writing file to ./ois_full.csv
ois-officers
Writing file to ./SAMPLE_ois_officers_compressed.json
Writing file to ./SAMPLE_ois_officers_compressed_new.json
Writing file to ./SAMPLE_ois_officers_compressed_new.json.gz
Writing file to ./SAMPLE_ois_officers_full.csv
Writing file to ./ois_officers_compressed.json
Writing file to ./ois_

## For cronjob

In [14]:
all_slider_data = {}
for dataset, configs in CONFIG_MAPPING.items():
    if os.environ.get('COMPRESS_%s_S3' % dataset.upper()) != 'TRUE':
        print("Not writing to s3. To do so, set COMPRESS_%s_S3 to 'TRUE'" % dataset.upper())
    else:
        for config in configs:
            create_one(CONFIGS[config], s3_upload=True, accum_slider_data=all_slider_data)
if all_slider_data:
    write_slider_data_to_s3(all_slider_data)

Uploading file cdr_compressed.json to s3
Uploading file ois_compressed.json to s3
Uploading file ois_officers_compressed.json to s3
Uploading all slider data to s3
