In [20]:
import math
from re import match

import ipywidgets as widgets
import numpy as np
from bokeh.io import push_notebook, output_notebook, show
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, HoverTool, PanTool, WheelZoomTool
from bokeh.models.glyphs import Circle
from bokeh.models.widgets import Select
from bokeh.plotting import figure
from ipywidgets import interact
from sklearn import cluster, datasets
from sklearn.tree import DecisionTreeClassifier

#self-defined module
import data_parser

# Route Bokeh output into the notebook.
output_notebook()

# Load the anonymized nutrition survey from its CSV file through the
# project-local data_parser helpers. NOTE(review): the helpers' return types
# are not visible here; below, nutrition_data_dic is indexed by column name
# and nutrition_feature_names is searched with .index().
nutrition_data_df = data_parser.parse_csv_data(
    './data/nutrition_raw_anonymized_data.csv'
)
nutrition_feature_names = data_parser.extract_column_names(nutrition_data_df)
nutrition_data_dic = data_parser.build_dic(
    nutrition_data_df, nutrition_feature_names
)

# Binary classification target: 1 where the 'diabetes' column reads exactly
# 'Yes', 0 for every other value.
diabete_outputs = [
    1 if answer == 'Yes' else 0
    for answer in nutrition_data_dic['diabetes']
]

# Choose the candidate features deterministically: starting at the
# 'BREAKFASTSANDWICHFREQ' column, keep every 20th column name, taking only as
# many names as there are complete strides of 20 in the remaining columns.
start_feature_i = nutrition_feature_names.index('BREAKFASTSANDWICHFREQ')
features_num = (len(nutrition_feature_names) - start_feature_i) // 20
features_list = list(
    nutrition_feature_names[
        start_feature_i:start_feature_i + features_num * 20:20
    ]
)

# Per-sample presentation data for the scatter plot: blue / "Diabetes" for
# positive samples (output 1), red / "Not Diabetes" for everything else.
diabete_outputs_color = [
    'blue' if output == 1 else 'red'
    for output in diabete_outputs
]
diabete_results = [
    "Diabetes" if output == 1 else "Not Diabetes"
    for output in diabete_outputs
]




# Scatter data: the first two selected features as x/y, together with the
# per-sample colour and legend label derived from the diabetes outcome.
source = ColumnDataSource(
    data={
        'x': nutrition_data_dic[features_list[0]],
        'y': nutrition_data_dic[features_list[1]],
        'color': diabete_outputs_color,
        'label': diabete_results,
    }
)

# NOTE(review): plot_width/plot_height and legend= are the spellings used by
# older Bokeh releases; confirm against the installed version before upgrading.
dt_plot = figure(plot_width=500, plot_height=500)
dt_plot.circle(
    'x', 'y',
    source=source,
    color='color',
    legend='label',
    size=5,
    alpha=0.5,
)

def update_decision_tree(max_depth_v, min_samples_leaf_v, attr0, attr1, square_attr_list):
    """Fit a two-feature decision tree and colour a list of query points.

    A new DecisionTreeClassifier is trained on the attr0/attr1 columns of the
    module-level nutrition_data_dic against the module-level diabete_outputs
    labels, then used to classify every point in square_attr_list.

    Args:
        max_depth_v: forwarded as the classifier's max_depth.
        min_samples_leaf_v: forwarded as the classifier's min_samples_leaf.
        attr0: key of the first training column in nutrition_data_dic.
        attr1: key of the second training column in nutrition_data_dic.
        square_attr_list: points to classify, each an [attr0, attr1] pair.

    Returns:
        A list with one colour name per query point: 'blue' where the
        predicted class is 1, 'red' otherwise.
    """
    feature_columns = np.asarray(
        [nutrition_data_dic[attr0], nutrition_data_dic[attr1]]
    )
    # Transpose so that each training row is one sample's [attr0, attr1] pair.
    training_rows = feature_columns.T.tolist()

    tree = DecisionTreeClassifier(
        max_depth=max_depth_v,
        min_samples_leaf=min_samples_leaf_v,
    )
    tree.fit(training_rows, diabete_outputs)

    return [
        'blue' if predicted == 1 else 'red'
        for predicted in tree.predict(square_attr_list)
    ]

def generate_color_board(attr0, attr1):
    """Build a 100x100 grid of sample points spanning two attribute ranges.

    The ranges are read from the module-level nutrition_data_dic. Along each
    axis the grid starts at the column minimum and advances in steps of
    (max - min) / 100, so the last grid line sits one step below the maximum.

    Args:
        attr0: key of the column in nutrition_data_dic used for the first
            coordinate.
        attr1: key of the column in nutrition_data_dic used for the second
            coordinate.

    Returns:
        A list of 10,000 [attr0_value, attr1_value] pairs, ordered with the
        first coordinate varying slowest.
    """
    column0 = nutrition_data_dic[attr0]
    column1 = nutrition_data_dic[attr1]

    # Scan each column for its extremes once, instead of recomputing min()
    # for every one of the 10,000 grid points.
    min0 = min(column0)
    min1 = min(column1)
    width_interval = (max(column0) - min0) / 100
    height_interval = (max(column1) - min1) / 100

    xs = [min0 + i * width_interval for i in range(100)]
    ys = [min1 + j * height_interval for j in range(100)]

    square_attr_list = []
    for x in xs:
        for y in ys:
            square_attr_list.append([x, y])
    return square_attr_list

def transpose_matrix(matrix):
    """Return the transpose of a nested-list matrix as nested Python lists.

    The input is converted to a NumPy array, transposed, and converted back
    with tolist().
    """
    return np.transpose(np.asarray(matrix)).tolist()

# Initial decision-surface overlay: classify a grid of points over the first
# two selected features with the starting tree settings (max_depth=5,
# min_samples_leaf=2) and draw each grid point as a faint coloured square.
square_attr_list = generate_color_board(features_list[0], features_list[1])
square_outputs_color = update_decision_tree(
    5, 2, features_list[0], features_list[1], square_attr_list
)
t_square_attr_list = transpose_matrix(square_attr_list)

square_source = ColumnDataSource(
    data={
        'x': t_square_attr_list[0],
        'y': t_square_attr_list[1],
        'color': square_outputs_color,
    }
)
dt_plot.square(
    'x', 'y',
    source=square_source,
    line_color='color',
    fill_color='color',
    size=9.5,
    alpha=0.05,
)

# Axis titles name the two features currently plotted.
dt_plot.xaxis.axis_label = features_list[0]
dt_plot.yaxis.axis_label = features_list[1]

# Data source holding every parsed column (not referenced again in this cell).
attr_source = ColumnDataSource(data=nutrition_data_dic)



# Dropdowns selecting which of the sampled features is shown on each axis.
attr0_dropdown = widgets.Dropdown(
    options=features_list,
    value=features_list[0],
    description='Attribute 0',
    disabled=False,
)
attr1_dropdown = widgets.Dropdown(
    options=features_list,
    value=features_list[1],
    description='Attribute 1',
    disabled=False,
)

# Both tree-parameter sliders share the same range and display settings;
# continuous_update=False means a value is reported only when dragging stops.
_slider_options = dict(
    min=2,
    max=50,
    step=1,
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)

max_depth_slider = widgets.IntSlider(
    value=5,
    description='Max Depth:',
    **_slider_options
)
min_samples_leaf_slider = widgets.IntSlider(
    value=2,
    description='Min Samples leaf:',
    **_slider_options
)

def update_plot(attr0, attr1, max_depth, min_samples_leaf):
    """Redraw the scatter and decision-surface overlay for new widget values.

    Relabels both axes, swaps the scatter's x/y columns to the selected
    attributes, refits the decision tree on those two attributes, recolours
    the grid overlay with its predictions, and pushes the changes to the
    notebook.

    Args:
        attr0: key in nutrition_data_dic for the x-axis attribute.
        attr1: key in nutrition_data_dic for the y-axis attribute.
        max_depth: max_depth passed to the decision tree.
        min_samples_leaf: min_samples_leaf passed to the decision tree.
    """
    for axis, title in ((dt_plot.xaxis, attr0), (dt_plot.yaxis, attr1)):
        axis.axis_label = title

    source.data['x'] = nutrition_data_dic[attr0]
    source.data['y'] = nutrition_data_dic[attr1]

    # Rebuild the grid for the new attribute ranges and classify every point.
    grid_points = generate_color_board(attr0, attr1)
    grid_colors = update_decision_tree(
        max_depth, min_samples_leaf, attr0, attr1, grid_points
    )
    grid_x, grid_y = transpose_matrix(grid_points)

    square_source.data['x'] = grid_x
    square_source.data['y'] = grid_y
    square_source.data['color'] = grid_colors

    push_notebook()

# Show the figure first, requesting a notebook handle, so that the
# push_notebook() call inside update_plot has a rendered plot to update.
# NOTE(review): push_notebook() is called without an explicit handle, which
# per the Bokeh docs targets the most recently shown one — confirm if more
# plots are added to this notebook.

layout = column(dt_plot)
show(layout, notebook_handle = True)

# Bind each widget to the update_plot parameter of the same keyword name.
interact(update_plot, attr0 = attr0_dropdown, attr1 = attr1_dropdown, max_depth = max_depth_slider, min_samples_leaf = min_samples_leaf_slider)



A Jupyter Widget

<function __main__.update_plot>

I explored possible causes of diabetes using a decision tree, and found that the amount of soda a person drinks each day may be a contributing factor. If you set Attribute 0 to SODAFREQ and Attribute 1 to any other attribute, the BLUE AREA tends to lie in the right-hand part of the plot, which suggests that a person who drinks soda frequently each day may have a higher likelihood of having diabetes.