# SliceFinder Demo

In [31]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from slice_finder import SliceFinder

from ipywidgets import interact, interactive
from IPython.display import display

from bokeh.layouts import widgetbox, row
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models.widgets import DataTable, TableColumn  
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [3]:
# Load the comma-separated data file. Column names are supplied explicitly,
# so no header row is expected. The regex separator swallows the whitespace
# around each comma (regex separators require the python engine), and "?"
# entries are read as missing values.
# NOTE(review): "Martial Status" looks like a typo for "Marital Status", but it
# is a runtime column key (it is also the key used when slice filters are
# decoded later), so it is deliberately left unchanged here.
adult_data = pd.read_csv(
    "data/adult.data",
    names=[
        "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"],
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")

# Drop every row that has at least one missing value.
adult_data = adult_data.dropna()

# Integer-encode the object-typed (string) columns. Each fitted encoder is
# kept, keyed by column name, so the codes can be mapped back to labels later.
encoders = {}
for column in adult_data.columns:
    # The `np.object` alias was removed in NumPy 1.24; the builtin `object`
    # compares equal to the same dtype on every NumPy version.
    if adult_data[column].dtype == object:
        le = LabelEncoder()
        adult_data[column] = le.fit_transform(adult_data[column])
        encoders[column] = le

# Features are every column except the label. `Index.difference` returns the
# remaining column names in sorted order.
X, y = adult_data[adult_data.columns.difference(["Target"])], adult_data["Target"]

# Train a model
lr = LogisticRegression()
lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
# Run the slice search against the trained model and the training data.
# NOTE(review): presumably k is the number of slices to return, epsilon the
# minimum effect size and degree the maximum number of combined filters --
# confirm against the slice_finder module.
sf = SliceFinder(lr, (X, y))
recommendations = sf.find_slice(k=5, epsilon=0.3, degree=3)

for s in recommendations:
    print('\n=====================\nSlice description:')
    for k, v in s.filters.items():
        if k in encoders:
            # Encoded categorical feature: map each code back to its label.
            le = encoders[k]
            values = ''.join(
                '%s ' % (le.inverse_transform(v_)[0]) for v_ in v)
        else:
            # Other features: entries with more than one element are printed
            # as "low ~ high" ranges, the rest as a single value. Every entry
            # ends with a space so that consecutive ranges stay separated.
            values = ''.join(
                '%s ~ %s ' % (v_[0], v_[1]) if len(v_) > 1 else '%s ' % (v_[0])
                for v_ in sorted(v, key=lambda x: x[0]))
        print('%s:%s' % (k, values))
    print('---------------------\neffect_size: %s' % (s.effect_size))
    print('size: %s' % (s.size))


Slice description:
Sex:Male 
---------------------
effect_size: 0.354745554179
size: 20380

Slice description:
Martial Status:Married-civ-spouse Married-AF-spouse 
---------------------
effect_size: 0.745419909171
size: 14086

Slice description:
Relationship:Husband Wife 
---------------------
effect_size: 0.740475784256
size: 13869

Slice description:
Occupation:Exec-managerial Prof-specialty 
---------------------
effect_size: 0.406641464195
size: 8030

Slice description:
Education:Doctorate Masters Prof-school Bachelors 
---------------------
effect_size: 0.431872186573
size: 7588


In [47]:
# Restore the two pickled slice collections from the working directory.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
slices = []
uninteresting = []
with open('slices.p', 'rb') as handle:
    slices = pickle.load(handle)
with open('uninteresting.p', 'rb') as handle:
    uninteresting = pickle.load(handle)

def get_top_k_slices(candidates, min_effect_size, label_encoders=None):
    """Describe every candidate slice whose effect size meets a threshold.

    Despite the name, no top-k cut is applied here: every candidate with
    ``effect_size >= min_effect_size`` is returned, in input order.

    Args:
        candidates: iterable of slice objects exposing ``filters`` (a mapping
            of feature name to a list of indexable value entries), ``size``
            and ``effect_size``.
        min_effect_size: candidates whose ``effect_size`` is below this value
            are skipped.
        label_encoders: optional mapping of feature name to a fitted encoder
            providing ``inverse_transform``. Defaults to the module-level
            ``encoders`` dict.

    Returns:
        A tuple of three parallel lists:
        ``(descriptions, sizes, effect_sizes)``.
    """
    if label_encoders is None:
        label_encoders = encoders

    descriptions, sizes, effect_sizes = [], [], []
    for s in candidates:
        if s.effect_size < min_effect_size:
            continue

        description = ''
        for name, entries in s.filters.items():
            if name in label_encoders:
                # Encoded categorical feature: map each code back to its label.
                le = label_encoders[name]
                values = ''.join(
                    '%s ' % (le.inverse_transform(v_)[0]) for v_ in entries)
            else:
                # Entries with more than one element are "low ~ high" ranges.
                # Every entry ends with a space so that consecutive ranges do
                # not run into each other.
                values = ''.join(
                    '%s ~ %s ' % (v_[0], v_[1]) if len(v_) > 1
                    else '%s ' % (v_[0])
                    for v_ in sorted(entries, key=lambda x: x[0]))
            description += '%s:%s ' % (name, values)

        descriptions.append(description)
        sizes.append(s.size)
        effect_sizes.append(s.effect_size)
    return descriptions, sizes, effect_sizes

# Describe the saved "interesting" slices; a threshold of 0 applies no
# effect-size floor.
description_, size_, effect_size_ = get_top_k_slices(slices, 0)

# Do the same for the "uninteresting" candidates. These three lists are not
# referenced again in the cells visible here.
description_c, size_c, effect_size_c = get_top_k_slices(uninteresting, 0)

# One shared column source feeds both the scatter plot and the table.
data = {
    'description': description_,
    'size': size_,
    'effect_size': effect_size_,
}
source = ColumnDataSource(data)

# Scatter plot of slice size against effect size, with per-point tooltips.
hover = HoverTool(
    tooltips=[
        ("desc", "@description"),
        ("size", "@size"),
        ("effect_size", "@effect_size"),
    ]
)
TOOLS = [hover]
p = figure(
    tools=TOOLS,
    plot_width=400,
    plot_height=400,
    y_axis_label='Effect Size',
    x_axis_label='Size',
    title=None,
)
r = p.circle('size', 'effect_size', source=source)

# Data table showing the same three columns.
columns = [
    TableColumn(field=field, title=title)
    for field, title in (
        ("description", "Description"),
        ("size", "Size"),
        ("effect_size", "Effect Size"),
    )
]
t = DataTable(source=r.data_source, columns=columns, width=600, height=400)

# Render plot and table side by side. notebook_handle=True lets a later
# push_notebook() call update them in place.
show(row(p, t), notebook_handle=True)

def update(k=10, min_eff_size=0.3):
    """Refresh the plot and table with the k largest qualifying slices.

    Both saved slice collections are filtered to ``effect_size >=
    min_eff_size`` and merged. The ``k`` largest by size are then pushed into
    the shared Bokeh data source.

    Args:
        k: maximum number of slices to display.
        min_eff_size: minimum effect size a slice needs in order to be shown.
    """
    desc_a, size_a, effect_size_a = get_top_k_slices(slices, min_eff_size)
    desc_b, size_b, effect_size_b = get_top_k_slices(uninteresting, min_eff_size)
    desc_ = desc_a + desc_b
    size_ = size_a + size_b
    effect_size_ = effect_size_a + effect_size_b

    # Positions of the k largest slices, biggest first.
    top = sorted(range(len(size_)), key=lambda i: size_[i], reverse=True)[:k]

    # Replace all columns in a single assignment. Updating them one at a time
    # leaves the source with columns of different lengths in between, whenever
    # the number of rows changes.
    r.data_source.data = {
        'description': np.array(desc_)[top],
        'size': np.array(size_)[top],
        'effect_size': np.array(effect_size_)[top],
    }
    push_notebook()
    
# Build slider controls for update(): k ranges over 1..30 and min_eff_size
# over 0..1 in steps of 0.05. Changing either slider calls update() again.
w = interactive(
    update,
    k=(1, 30),
    min_eff_size=(0, 1, 0.05),
)
display(w)