In [None]:
import pandas as pd
import numpy as np
import pickle
import time
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from slice_finder import SliceFinder

from ipywidgets import interact, interactive
from IPython.display import display

from bokeh.layouts import widgetbox, row
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models.widgets import DataTable, TableColumn
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
# Direct Bokeh output to the notebook so plots render inline.
output_notebook()

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and SettingWithCopy warnings
# raised by the cells below.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import matplotlib
# Use a larger default font size for all matplotlib figures.
matplotlib.rcParams.update({'font.size': 17})


## Example: COMPAS dataset

### Slice Finder

In [None]:

# Alternative input (raw ProPublica export), kept for reference:
# data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed_7214rows_with_labels.csv"
# The original assignment ended in a stray "\" line continuation that only
# parsed because the following line happened to be blank; it is removed here.
data_file = r"../CompareDivExplorer/divexplorer-main/notebooks/datasets/compas_discretized.csv"

compas_data = pd.read_csv(data_file)

all_attributes = ['age', 'charge', 'race', 'sex', '#prior', 'stay', 'class', 'predicted']
# Keep the feature columns plus the 'class' label. Take an explicit copy so the
# in-place column encoding done in the next cell writes to an independent frame
# rather than to a slice of the frame returned by read_csv.
compas_data = compas_data[['age', 'charge', 'race', 'sex', '#prior', 'stay', 'class']].copy()

compas_data[:5]

In [None]:



# # drop nan values
# compas_data = compas_data.dropna()

# Encode categorical features: every object-dtype column is replaced by integer
# labels, and the fitted encoder is kept so values can be decoded later.
encoders = {}
for column in compas_data.columns:
    # `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # `object` is the dtype it aliased, so this comparison is equivalent.
    if compas_data.dtypes[column] == object:
        le = LabelEncoder()
        compas_data[column] = le.fit_transform(compas_data[column])
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

# Features are all columns except the label column "class".
X, y = compas_data[compas_data.columns.difference(["class"])], compas_data["class"]

# Persist the encoders; the context manager guarantees the file is flushed and
# closed (the previous bare open() left the handle to the garbage collector).
with open("compas.pkl", "wb") as encoders_file:
    pickle.dump(encoders, encoders_file, protocol=2)

# Train a model
#lr = LogisticRegression()
#lr.fit(X, y)
# NOTE(review): no random_state is set, so the fitted forest (and therefore the
# slices found below) can differ between runs.
lr = RandomForestClassifier(max_depth=5, n_estimators=10)
lr.fit(X, y)


print(X)
print(y)


In [None]:
# Display the column labels of compas_data.
compas_data.columns


In [None]:

# Hand the fitted model and the (features, labels) pair to SliceFinder.
sf = SliceFinder(lr, (X, y))
# Evaluate the model on the full dataset.
# NOTE(review): presumably returns one metric value per example — confirm
# against slice_finder.SliceFinder.evaluate_model.
metrics_all = sf.evaluate_model((X,y))
# Baseline summary of those values: (mean, standard deviation, count).
reference = (np.mean(metrics_all), np.std(metrics_all), len(metrics_all))



In [None]:

# degree: number of attributes in a pattern
time1 = time.time()
recommendations = sf.find_slice(k=10, epsilon=0.4, degree=6, max_workers=4)
time2 = time.time()

print("time = {}s".format(time2 - time1))


def _format_filter_values(column, conditions):
    """Return the printable value string for one attribute of a slice.

    Columns that have a fitted encoder are decoded back through it; all other
    columns are printed as-is, sorted by their first element, with
    two-element conditions rendered as ``low ~ high``.
    """
    if column in encoders:
        decoder = encoders[column]
        return ''.join(
            '%s ' % (decoder.inverse_transform(condition)[0])
            for condition in conditions
        )

    pieces = []
    for condition in sorted(conditions, key=lambda x: x[0]):
        if len(condition) > 1:
            pieces.append('%s ~ %s' % (condition[0], condition[1]))
        else:
            pieces.append('%s ' % (condition[0]))
    return ''.join(pieces)


for s in recommendations:
    print('\n=====================\nSlice description:')
    for column, conditions in s.filters.items():
        print('%s:%s' % (column, _format_filter_values(column, conditions)))
    print('---------------------\neffect_size: %s' % (s.effect_size))
    print('---------------------\nmetric: %s' % (s.metric))
    print('size: %s' % (s.size))



## Analyze the results

In [34]:
from Algorithms import NewAlgGeneral_SizeFairnessValue_2_20210528 as newalg
from Algorithms import pattern_count


data_file = r"../../../../InputData/COMPAS_ProPublica/compas-analysis-master/cox-parsed/cox-parsed_7214rows_with_labels.csv"

df = pd.read_csv(data_file)

# Boolean masks over the label and prediction columns.
actual_positive = df['ground_truth'].eq(1)
actual_negative = df['ground_truth'].eq(0)
predicted_positive = df['predicted'].eq(1)
predicted_negative = df['predicted'].eq(0)

# Split the rows into the four confusion-matrix cells.
TP = df[actual_positive & predicted_positive]
FP = df[actual_negative & predicted_positive]
TN = df[actual_negative & predicted_negative]
FN = df[actual_positive & predicted_negative]

# Total number of rows that fall into one of the four cells.
print(sum(len(part) for part in (TP, FP, TN, FN)))

# Project every frame onto the attributes used for pattern counting.
selected_attributes = ['sex', 'age_cat', 'race']
df, TP, TN, FP, FN = (
    frame[selected_attributes] for frame in (df, TP, TN, FP, FN)
)

df[:4]



7214


Unnamed: 0,sex,age_cat,race
0,Male,Greater than 45,Other
1,Male,25 - 45,African-American
2,Male,Less than 25,African-American
3,Male,Less than 25,African-American


In [36]:


def _parsed_counter(frame):
    """Build a PatternCounter over ``frame`` (unencoded) and parse it."""
    counter = pattern_count.PatternCounter(frame, encoded=False)
    counter.parse_data()
    return counter


# One counter for the whole dataset and one per confusion-matrix cell.
pc_whole_data = _parsed_counter(df)
pc_FP = _parsed_counter(FP)
pc_TN = _parsed_counter(TN)
pc_FN = _parsed_counter(FN)
pc_TP = _parsed_counter(TP)



def analyze(P):
    """Print confusion-matrix counts and derived rates for pattern ``P``.

    ``P`` is converted with ``newalg.num2string`` and counted in the whole
    dataset and in each of the FP / TN / FN / TP subsets. Two lines are
    printed: the raw counts (whole, fp, tn, fn, tp), then accuracy, false
    positive rate and false negative rate.

    Any rate whose denominator is zero is reported as ``None``. This now
    includes accuracy: a pattern that matches no rows previously raised
    ``ZeroDivisionError``.

    Parameters
    ----------
    P : list
        Pattern with one entry per selected attribute.
        NOTE(review): -1 appears to act as a wildcard — confirm against
        ``newalg.num2string``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    st = newalg.num2string(P)
    whole_cardinality = pc_whole_data.pattern_count(st)
    fp = pc_FP.pattern_count(st)
    tn = pc_TN.pattern_count(st)
    fn = pc_FN.pattern_count(st)
    tp = pc_TP.pattern_count(st)

    # Guard every denominator the same way so an empty group yields None
    # instead of an exception.
    FPR = fp / (fp + tn) if fp + tn != 0 else None
    FNR = fn / (fn + tp) if fn + tp != 0 else None
    acc = (tp + tn) / whole_cardinality if whole_cardinality != 0 else None

    print(whole_cardinality, fp, tn, fn, tp)
    print("acc = {}, FPR = {}, FNR = {}".format(acc, FPR, FNR))

# Example patterns over (sex, age_cat, race).
# NOTE(review): -1 in the first position presumably leaves `sex` unconstrained — confirm.
analyze([-1, 'Greater than 45', 'Native American'])
analyze(['Male', 'Greater than 45', 'Native American'])


3 0 1 0 2
acc = 1.0, FPR = 0.0, FNR = 0.0
1 0 0 0 1
acc = 1.0, FPR = None, FNR = 0.0
