### Setting up graphviz, which is used to convert Decision Trees into graphs that are then rendered.

#### (Switch from markdown to code if you would like to run.)

import sys

!{sys.executable} -m pip install graphviz

#### Note: the pip package is only a Python binding; Graphviz itself must also be installed separately, for instance through ``brew`` or ``apt-get``.

### Interactively fitting a Decision Tree.

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor, export_graphviz

from graphviz import Source

from IPython.display import SVG
from IPython.display import display                               
from ipywidgets import interactive

#### Learning opportunities: discuss the original data, discuss feature selection, reduce feature space down to ~10

In [44]:
# Load the raw data set from a CSV file in the working directory and preview its first rows.
df_crimes = pd.read_csv('communities.data')
df_crimes.head()

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,?,?,Lakewoodcity,1,0.19,0.33,0.02,0.9,0.12,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,?,?,Tukwilacity,1,0.0,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,?,?,?,?,0.0,?,0.67
2,24,?,?,Aberdeentown,1,0.0,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,?,?,?,?,0.0,?,0.43
3,34,5,81440,Willingborotownship,1,0.04,0.77,1.0,0.08,0.12,...,0.02,0.39,0.28,?,?,?,?,0.0,?,0.12
4,42,95,6096,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,?,?,?,?,0.0,?,0.03


In [46]:
# Number of rows in the freshly loaded frame.
len(df_crimes)

1994

In [50]:
# Columns in which at least one row holds the '?' missing-value marker.
# Every row is checked (not just a single sample row), so a column is flagged
# no matter where its missing values occur.
cols_w_na = [col for col in df_crimes.columns if (df_crimes[col] == '?').any()]

In [51]:
# Remove every column listed in cols_w_na from the frame with one in-place call.
df_crimes.drop(columns=cols_w_na, inplace=True)

In [56]:
# Row count after the column drop; only columns were removed, so it should be unchanged.
len(df_crimes)

1994

#### Preprocessing so it takes in numerical features to predict a numerical target

#### Remove/add features in order to discuss bias vs. accuracy tradeoff
#### TO-DOs:
#### 1. Measure accuracy in the first place
#### 2. Create a widget to turn on/off features

#### "If you send police to a neighborhood based on race/ethnicity then you're injecting bias in crime measurement data, which would cause the ML model to be (explicitly) racially discriminatory"

In [64]:
target = 'ViolentCrimesPerPop'  # column the regression tree predicts
protected = 'racepctblack'  # column compared against its median in apply_bias
filename = 'communities.data'  # CSV re-read by plot_tree on every call

In [101]:
# Module-level state shared between plot_tree (which writes it) and apply_bias (which reads it).
ratio = 1.0  # multiplicative bias factor; 1.0 leaves the target unchanged
pct_black_median = None  # assigned by plot_tree before it calls apply_bias


# If a given row x (a training example) has 'racepctblack' greater than the median value, then
# multiply the outcome 'ViolentCrimesPerPop' by the user-defined ratio; otherwise, multiply by
# the inverse of the user-defined ratio.
# Interpretation: the greater the ratio, the worse the bias against communities with
# above-median Black populations; the smaller the ratio, the more favorable the bias; setting
# it to 1.0 keeps the data in its original form.

def apply_bias(x):
    """Scale one row's target value by the module-level bias ratio.

    ``x`` is a single row supporting lookup by column name. When its
    ``protected`` value is strictly greater than ``pct_black_median`` the
    target is multiplied by ``ratio``; otherwise it is multiplied by
    ``1/ratio``. ``ratio`` and ``pct_black_median`` are read from module
    globals.
    """
    if x[protected] > pct_black_median:
        factor = ratio
    else:
        factor = 1/ratio
    return x[target]*factor


def plot_tree(overpolicing=False):
    """Fit a depth-5 regression tree on (optionally biased) crime data and render it.

    On every call the CSV named by the module-level ``filename`` is re-read,
    the ``communityname`` column is dropped, every row containing a missing
    value (``'?'`` in the raw file) is discarded, the ``target`` column is
    rescaled row by row through ``apply_bias``, and a
    ``DecisionTreeRegressor`` is fitted and displayed as an SVG.

    Side effect: the module-level ``ratio`` and ``pct_black_median`` are
    overwritten, because ``apply_bias`` reads them.

    Args:
        overpolicing: when true the bias ratio is 2.0; otherwise it is 1.0,
            which leaves the target values unchanged.

    Returns:
        The fitted ``DecisionTreeRegressor``.

    Raises:
        ValueError: if no row is left after discarding rows with missing values.
    """
    global ratio, pct_black_median  # shared with apply_bias, which reads both
    ratio = 2.0 if overpolicing else 1.0  # bias is toggled by the checkbox

    # Reload from disk on every call so the bias is never applied twice to the
    # same frame. Parsing '?' as NaN at read time keeps the affected columns
    # numeric instead of leaving them as strings.
    df_crimes = pd.read_csv(filename, na_values='?')
    df_cleaned = df_crimes.drop(columns='communityname').dropna()
    if df_cleaned.empty:
        raise ValueError(
            'no complete rows left in %r after dropping missing values' % filename
        )

    # Median also used in function apply_bias
    pct_black_median = df_cleaned[protected].median()

    # assign() returns a new frame, so the cleaned data is not modified in place.
    df_biased = df_cleaned.assign(**{target: df_cleaned.apply(apply_bias, axis=1)})

    # Vanilla ML: every remaining column except the target is a feature.
    # NOTE(review): this still includes identifier-like columns such as the
    # CSV's unnamed index column and 'fold' when present -- consider excluding them.
    X = df_biased.drop(columns=[target])
    Y = df_biased[target]
    # The criterion is left at its default (squared error; spelled 'mse' in
    # older scikit-learn releases) so the call works across library versions.
    estimator = DecisionTreeRegressor(random_state=0, splitter='best', max_depth=5)
    estimator.fit(X, Y)

    # Render Decision Tree
    dot_source = export_graphviz(
        estimator,
        out_file=None,
        feature_names=list(X.columns),
        filled=True,
    )
    display(SVG(Source(dot_source).pipe(format='svg')))
    return estimator


# Wrap plot_tree in an ipywidgets control (a checkbox for its boolean argument) and show it.
inter=interactive(plot_tree, overpolicing=False)
display(inter)

interactive(children=(Checkbox(value=False, description='overpolicing'), Output()), _dom_classes=('widget-inte…

In [72]:
import ipywidgets as widgets

#### https://gist.github.com/pbugnion/5bb7878ff212a0116f0f1fbc9f431a5c

In [90]:
def multi_checkbox_widget(descriptions):
    """Return a VBox wrapping a scrollable column of pre-ticked checkboxes.

    One ``Checkbox`` (initially checked) is created per entry of
    ``descriptions``, which is iterated twice: once to create the checkboxes
    and once to order them.
    """
    checkbox_by_label = {}
    for label in descriptions:
        checkbox_by_label[label] = widgets.Checkbox(description=label, value=True)

    ordered_checkboxes = [checkbox_by_label[label] for label in descriptions]
    scroll_column = widgets.VBox(ordered_checkboxes, layout={'overflow': 'scroll'})
    return widgets.VBox([scroll_column])

In [91]:
# Build one checkbox per column of the current df_crimes frame.
multi_checkbox_widget(df_crimes.columns)

VBox(children=(VBox(children=(Checkbox(value=True, description='state'), Checkbox(value=True, description='cou…