<h1>What If Tool: Stop and Frisk Data</h1>

This notebook downloads a subset of the Stop and Frisk data, builds a model that predicts how likely a stopped person is to be frisked (every person in this dataset has already been stopped), and loads the What-If Tool to visualize the results.

In [None]:
#@title Install the What-If Tool widget if running in colab {display-mode: "form"}

try:
  import google.colab
  !pip install --upgrade witwidget
except:
  pass

In [None]:
#@title Define helper functions {display-mode: "form"}

import pandas as pd
import numpy as np
import tensorflow as tf
import functools

# Remove the column limit so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

# Creates a tf feature spec from the dataframe and columns specified.
def create_feature_spec(df, columns=None):
    """Build a tf.Example parsing spec from the dtypes of dataframe columns.

    Args:
        df: pandas DataFrame whose column dtypes decide each feature's type.
        columns: names of the columns to include; defaults to every column of df.

    Returns:
        Dict mapping each column name to a scalar `tf.FixedLenFeature`:
        int64 columns become tf.int64, float64 columns become tf.float32,
        and every other dtype becomes tf.string.
    """
    if columns is None:
        columns = df.columns.values.tolist()

    int_dtype = np.dtype(np.int64)
    float_dtype = np.dtype(np.float64)

    feature_spec = {}
    for f in columns:
        # Compare dtypes with ==, not `is`: two dtype objects can describe the
        # same type without being the same Python object.
        col_dtype = df[f].dtype
        if col_dtype == int_dtype:
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.int64)
        elif col_dtype == float_dtype:
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.float32)
        else:
            feature_spec[f] = tf.FixedLenFeature(shape=(), dtype=tf.string)
    return feature_spec

# Creates simple numeric and categorical feature columns from a feature spec and a
# list of columns from that spec to use.
#
# NOTE: Models might perform better with some feature engineering such as bucketed
# numeric columns and hash-bucket/embedding columns for categorical features.
def create_feature_columns(df, columns, feature_spec):
    """Turn the named columns into TensorFlow feature columns.

    Args:
        df: pandas DataFrame; supplies the vocabulary of each categorical column.
        columns: names of the columns to build feature columns for.
        feature_spec: dict of column name -> parsing spec whose `dtype` decides
            whether the column is numeric or categorical.

    Returns:
        List with one feature column per name, in the order given: a numeric
        column for tf.int64 / tf.float32 specs, otherwise an indicator column
        over the distinct values found in df.
    """
    feature_columns = []
    for name in columns:
        spec_dtype = feature_spec[name].dtype
        if spec_dtype is tf.int64 or spec_dtype is tf.float32:
            feature_columns.append(tf.feature_column.numeric_column(name))
            continue
        # Categorical: the vocabulary is every distinct value present in the data.
        vocabulary = list(df[name].unique())
        categorical = tf.feature_column.categorical_column_with_vocabulary_list(
            name, vocabulary)
        feature_columns.append(tf.feature_column.indicator_column(categorical))
    return feature_columns

# An input function for providing input to a model from tf.Examples
def tfexamples_input_fn(examples, feature_spec, label, mode=tf.estimator.ModeKeys.EVAL,
                       num_epochs=None,
                       batch_size=64):
    """Build a tf.data pipeline of (features, label) batches from tf.Example protos.

    Args:
        examples: sequence of tf.Example protos.
        feature_spec: parsing spec handed to parse_tf_example.
        label: name of the feature that is split off as the target.
        mode: estimator mode; the data is shuffled only in TRAIN mode.
        num_epochs: passed to `Dataset.repeat` after batching and parsing.
        batch_size: number of serialized examples per batch.

    Returns:
        A `tf.data.Dataset` yielding (parsed_features, target) pairs.
    """
    def serialized_examples():
        # Serialization is lazy: each proto is encoded as the dataset pulls it.
        for example in examples:
            yield example.SerializeToString()

    def split_features_and_label(serialized_batch):
        return parse_tf_example(serialized_batch, label, feature_spec)

    dataset = tf.data.Dataset.from_generator(
        serialized_examples, tf.dtypes.string, tf.TensorShape([]))
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    # Parsing runs on whole batches, so batching must come before the map.
    return dataset.batch(batch_size).map(split_features_and_label).repeat(num_epochs)

# Parses Tf.Example protos into features for the input function.
def parse_tf_example(example_proto, label, feature_spec):
    """Parse serialized tf.Examples and separate the label from the features.

    Args:
        example_proto: serialized tf.Example data passed to `tf.parse_example`.
        label: key of the parsed feature to use as the target.
        feature_spec: dict of feature name -> parsing spec.

    Returns:
        Tuple (features, target): the parsed feature dict with the label entry
        removed, and the label entry itself.
    """
    features = tf.parse_example(serialized=example_proto, features=feature_spec)
    label_values = features.pop(label)
    return features, label_values

# Converts a dataframe into a list of tf.Example protos.
def df_to_examples(df, columns=None):
    """Convert each dataframe row into a tf.Example proto.

    Args:
        df: pandas DataFrame to convert.
        columns: names of the columns to write into each example; defaults to
            every column of df.

    Returns:
        List of `tf.train.Example`, one per row in row order. int64 columns are
        written as int64 features, float64 columns as float features, and all
        other columns as UTF-8 encoded bytes features. A missing value in one
        of those other columns is left out of that row's example.
    """
    if columns is None:
        columns = df.columns.values.tolist()

    # A column's dtype is the same for every row, so classify each column once
    # instead of once per row. Dtypes are compared with ==, not `is`: equal
    # dtypes are not guaranteed to be the same Python object.
    int_dtype = np.dtype(np.int64)
    float_dtype = np.dtype(np.float64)
    column_kinds = {}
    for col in columns:
        col_dtype = df[col].dtype
        if col_dtype == int_dtype:
            column_kinds[col] = 'int'
        elif col_dtype == float_dtype:
            column_kinds[col] = 'float'
        else:
            column_kinds[col] = 'bytes'

    examples = []
    for _, row in df.iterrows():
        example = tf.train.Example()
        for col in columns:
            value = row[col]
            kind = column_kinds[col]
            if kind == 'int':
                # iterrows may upcast ints in mixed-dtype rows, hence the int() cast.
                example.features.feature[col].int64_list.value.append(int(value))
            elif kind == 'float':
                example.features.feature[col].float_list.value.append(value)
            elif pd.notna(value):
                # The feature is only touched when there is a value to store, so
                # rows with a missing value get no entry for this column.
                example.features.feature[col].bytes_list.value.append(value.encode('utf-8'))
        examples.append(example)
    return examples

# Converts a dataframe column into a column of 0's and 1's based on the provided test.
# Used to force label columns to be numeric for binary classification using a TF estimator.
def make_label_column_numeric(df, label_column, test):
    """Overwrite `df[label_column]` in place with 1 where `test` holds, else 0.

    Args:
        df: pandas DataFrame to modify in place.
        label_column: name of the column to replace.
        test: callable applied to the whole column; returns a boolean mask.

    Returns:
        None. The dataframe is mutated.
    """
    is_positive = test(df[label_column])
    df[label_column] = np.where(is_positive, 1, 0)


# Visible confirmation that this cell ran to the end.
print('Done')

In [None]:
#@title Read training dataset from CSV {display-mode: "form"}

import pandas as pd


# Load the stop records from a CSV hosted on GitHub (needs network access).
sqf = pd.read_csv('https://raw.githubusercontent.com/wboag/6805_sf/master/nyc2003.csv')


import numpy as np
import re
import random

# https://www.quora.com/What-is-the-best-number-mathematically
# 142857 received the most upvotes, and is therefore the best number
# Seeds Python's `random` module, which timestr_to_time uses for unparseable times.
random.seed(142857)


# Label to predict: whether the stopped person was frisked.
# The commented-out alternative 'guilty' means a summons or an arrest (see is_guilty).

label_column = 'frisked'
#label_column = 'guilty'
# Binarize the label in place: 'N' becomes 0, every other value becomes 1.
sqf[label_column] = np.where(sqf[label_column] == 'N', 0, 1)


# Translate features into human-readable
def is_guilty(row):
    """Return 1 if the stop ended in a summons or an arrest, otherwise 0.

    Args:
        row: mapping or pandas row with 'sumissue' and 'arstmade' entries,
            where 'Y' marks that the event happened.
    """
    summons_or_arrest = row['sumissue'] == 'Y' or row['arstmade'] == 'Y'
    return 1 if summons_or_arrest else 0
    
def acproxm_readable(acproxm):
    """Return the value unchanged (currently an identity pass-through)."""
    return acproxm
    # Disabled alternative that would encode 'Y' as 1 and anything else as 0:
    #return int(acproxm=='Y')

def acevasv_readable(acevasv):
    """Return the value unchanged (currently an identity pass-through)."""
    return acevasv
    # Disabled alternative that would encode 'Y' as 1 and anything else as 0:
    #return int(acevasv=='Y')

def acincid_readable(acincid):
    """Return the value unchanged (currently an identity pass-through)."""
    return acincid
    # Disabled alternative that would encode 'Y' as 1 and anything else as 0:
    #return int(acincid=='Y')

def race_readable(race):
    """Translate a one-character race code into its readable label.

    Raises:
        KeyError: if the code is not in the table below.
    """
    label_by_code = {
        'A': 'Asian / Pacific Islander',
        'B': 'Black',
        'I': 'American Indian / Alaskan Native',
        'P': 'Black Hispanic',
        'Q': 'White Hispanic',
        'W': 'White',
        'X': 'Unknown',
        'Z': 'Other',
        ' ': '(not listed)',
    }
    return label_by_code[race]

def build_readable(build):
    """Translate a one-character build code into its readable label.

    Raises:
        KeyError: if the code is not in the table below.
    """
    label_by_code = {
        'H': 'Heavy',
        'M': 'Medium',
        'T': 'Thin',
        'U': 'Muscular',
        'Z': 'Unknown',
        ' ': '(not listed)',
    }
    return label_by_code[build]

def crime_subset(crime):
    """Collapse a raw suspected-crime string into a small set of readable categories.

    Args:
        crime: raw crime description; matched exactly (case- and
            whitespace-sensitive) against the known spellings below.

    Returns:
        The readable category for a known spelling, otherwise 'Other Crime'.
    """
    # Several raw spellings map onto the same category.
    category_by_raw = {
        'ROBBERY': 'Robbery',
        'CPW': 'Possession of a Weapon',
        'BURGLARY': 'Burglary',
        'BURG': 'Burglary',
        # Label spelling corrected from 'Grand Larcency Auto'.
        'GLA': 'Grand Larceny Auto',
        'CRIMINAL TRESPASS': 'Trespass',
        'CRIM TRES': 'Trespass',
        'CRIM TRESPAS': 'Trespass',
        'ASSAULT': 'Assault',
        'CPCS': 'Possession of a Controlled Substance',
    }
    return category_by_raw.get(crime, 'Other Crime')
    
def premname_subset(premname):
    """Keep a premises name if it is on the allow-list, else return 'Other Location'.

    NOTE(review): most allow-list entries look like crime codes rather than
    premises names -- confirm this list against the data.
    """
    kept_names = (
        ' ', 'SIDEWALK', 'CRIM TRESPASS', 'CPCS', 'CRIM TRES',
        'CRIMINAL TRESPASS', 'ROBBERY', 'BURGLARY', 'GLA', 'CPW',
    )
    return premname if premname in kept_names else 'Other Location'

def age_to_decade(age_s):
    """Convert a raw age value into its decade index.

    The whole age is parsed. Reading only its first character would put
    one-digit and three-digit ages in the wrong decade (for example '7' in
    decade 7 and '105' in decade 1).

    Args:
        age_s: age as read from the data, normally a string such as '25';
            a blank string means the age is missing.

    Returns:
        int: -1 when the age is missing, zero, negative or not a whole number;
        otherwise the age divided by ten and rounded down, capped at 9. So
        '25' -> 2, '7' -> 0, and any age of 90 or more -> 9.
    """
    try:
        age = int(str(age_s).strip())
    except ValueError:
        # Blank or unparseable ages are treated as missing.
        return -1
    if age <= 0:
        return -1
    # Cap at 9 so every age of 90 and above shares the top bucket.
    return min(age // 10, 9)

def timestr_to_time(timestr):
    """Convert a time string containing 'H:MM' into fractional hours.

    Args:
        timestr: string searched for the first '<digits>:<digits>' pattern.

    Returns:
        float: hours plus minutes / 60 when the pattern is found (for example
        '13:30' -> 13.5). Otherwise a value drawn from Python's `random`
        module, uniform over [0, 25).
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r'(\d+):(\d+)', timestr)
    if match is None:
        # NOTE(review): the upper bound is 25, so a substituted time can fall
        # past 24 hours -- confirm whether 24 was intended.
        return random.random() * 25
    hours, minutes = match.groups()
    return int(hours) + float(minutes) / 60
    

# Replace coded columns with readable or numeric values.
sqf['race'] = sqf['race'].apply(race_readable)
sqf['build'] = sqf['build'].apply(build_readable)
sqf['location'] = sqf['premname'].apply(premname_subset)
sqf['age'] = sqf['age'].apply(age_to_decade)
sqf['timestop'] = sqf['timestop'].apply(timestr_to_time)

# 'guilty' reads two columns of the same row, so it is computed row by row.
sqf['guilty'] = sqf.apply(is_guilty, axis=1)

# Derived columns with descriptive names for the raw coded fields.
sqf['suspected_crime'] = sqf['crimsusp'].apply(crime_subset)
sqf['near_scene_of_offense'] = sqf['ac_proxm'].apply(acproxm_readable)
sqf['evasive_to_questions'] = sqf['ac_evasv'].apply(acevasv_readable)
sqf['high-crime_area'] = sqf['ac_incid'].apply(acincid_readable)



# Keep only the first 2,000 rows; these are the rows used for training and loaded into the WIT.
sqf = sqf.iloc[:2000]



# Get list of all columns from the dataset we will use for model input or output.
# Columns the model reads as inputs.
input_features = [
    'age',
    'sex',
    'build',
    'race',
    'suspected_crime',
    'location',
    'near_scene_of_offense',
    'evasive_to_questions',
    'high-crime_area',
    'timestop',
]
# Smaller alternative feature set:
#input_features = ['sex', 'age', 'race']

# Inputs plus the label the classifier is trained on.
features_and_labels = input_features + [label_column]

# Columns written into each tf.Example: inputs plus both outcome columns.
features_for_file = input_features + ['guilty', 'frisked']


# Preview the first rows of the model inputs.
sqf[input_features].head()

In [None]:
#@title Create and train the classifier {display-mode: "form"}

# Convert every row into a tf.Example holding the inputs plus the 'guilty' and 'frisked' columns.
examples = df_to_examples(sqf, features_for_file)

num_steps = 2000  #@param {type: "number"}
tf.logging.set_verbosity(tf.logging.DEBUG)

# Create a feature spec for the classifier
# The spec covers the inputs and the chosen label only.
feature_spec = create_feature_spec(sqf, features_and_labels)

# Define and train the classifier
# NOTE(review): the input function is bound without `mode`, so it runs with its EVAL
# default and the training data is not shuffled -- confirm this is intended.
train_inpf = functools.partial(tfexamples_input_fn, examples, feature_spec, label_column)
classifier = tf.estimator.LinearClassifier(
    feature_columns=create_feature_columns(sqf, input_features, feature_spec))
classifier.train(train_inpf, steps=num_steps)

print('Done')

In [None]:
#@title Invoke What-If Tool for test data and the trained models {display-mode: "form"}


# Upper bound on how many examples are handed to the tool. The slice below
# simply returns every example when the list is shorter than this.
num_datapoints = 10000  #@param {type: "number"}
tool_height_in_px = 1000  #@param {type: "number"}

from witwidget.notebook.visualization import WitConfigBuilder
from witwidget.notebook.visualization import WitWidget

# Setup the tool with the test examples and the trained classifier
# The same feature spec used for training is passed along with the estimator.
config_builder = WitConfigBuilder(examples[0:num_datapoints]).set_estimator_and_feature_spec(
    classifier, feature_spec)
WitWidget(config_builder, height=tool_height_in_px)