### Setting up graphviz used to convert Decision Trees into graphs, which are then rendered.

#### (Switch from markdown to code if you would like to run.)

import sys

!{sys.executable} -m pip install graphviz

#### Note: graphviz needs to be installed through ``brew`` or ``apt-get``, for instance, as this is only a python binding.

### Interactively fitting a Decision Tree.

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor, export_graphviz

from graphviz import Source

from IPython.display import SVG
from IPython.display import display                               
from ipywidgets import interactive

In [3]:
ratio = 1.0
pct_black_median = None


# If a given row x (a training example) has 'racepctblack' greater than the median value, then
# multiply the outcome 'ViolentCrimesPerPop' by the user-defined ratio; otherwise, multiply by
# the inverse of the user-defined ratio.
# Interpretation: the greater the ratio, the worst is the bias against communities with above
# average Black populations; the smaller the ratio, the more favorable is the bias; setting to
# 1.0 keeps the data in its original form.

def apply_bias(x):
    global ratio, pct_black_median
    return x['ViolentCrimesPerPop']*ratio if x['racepctblack'] > pct_black_median else x['ViolentCrimesPerPop']*(1/ratio)


def plot_tree(data_bias=1.0):
    global ratio, pct_black_median # Made global to interface with function apply_bias
    ratio = data_bias # Set the amount of bias to whatever the user defines through the slider
    
    # Load the data from scratch every time
    df_crimes = pd.read_csv('communities.data')
    df_crimes.drop('communityname', inplace=True, axis=1) # Drop categorical data
    df_cleaned = df_crimes.replace({'?': np.nan}).dropna() # Drop missing values
    
    # Median also used in function apply_bias
    pct_black_median = df_cleaned['racepctblack'].median()
    
    df_biased = df_cleaned
    df_biased['ViolentCrimesPerPop'] = df_biased.apply(apply_bias, axis=1)
    
    # Vanilla ML
    X = df_biased[df_biased.columns[:-1]]
    Y = df_biased['ViolentCrimesPerPop']
    estimator = DecisionTreeRegressor(random_state=0, criterion='mse', splitter= 'best', max_depth=5)
    estimator.fit(X, Y)
    
    # Render Decision Tree
    labels = X.columns
    graph = Source(export_graphviz(estimator, out_file=None, feature_names=labels, filled = True))
    display(SVG(graph.pipe(format='svg')))
    return estimator


inter=interactive(plot_tree, bias=1.0)
display(inter)

interactive(children=(FloatSlider(value=1.0, description='data_bias', max=3.0, min=-1.0), Output()), _dom_clas…