In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import csv
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics
from scipy import stats
import shap
import dice_ml
from dice_ml import Dice
from dice_ml.utils import helpers

In [2]:
def alert(c):
    if 0 < c['alg_risk_score_decile'] < 5:
        return 'low'
    elif 4 < c['alg_risk_score_decile'] < 8:
        return 'medium'
    elif 7 < c['alg_risk_score_decile'] < 11:
        return 'high'
    else:
        return 'Undefined'

# ======================================================================================================================
# GET PERFORMANCE METRICS
# ======================================================================================================================
def perfMetrics(pred, Y):
    """
        Evaluates performance of classifier given probs and true labels.

    :param pred: predicted probabilities
    :param Y: true labels
    
    :return: dict with results
    """
    auc = metrics.roc_auc_score(Y, pred)

    # calculating the j point
    fpr, tpr, thresholds = metrics.roc_curve(Y, pred)
    jpoint = np.argsort([(tpr[i] + (1- fpr[i]) - 1) for i in range(len(thresholds))])[::-1]
    for j, jdx in enumerate(jpoint):
        if (tpr[jdx] < 1) and (1 - fpr[jdx] < 1):
            # print(j, tpr[jdx], (1 - fpr[jdx]), tpr[jdx] - (1 - fpr[jdx]))
            break

    sens_jpoint = tpr[jdx]
    spec_jpoint = 1 - fpr[jdx]

    fpr, tpr, thresholds = metrics.roc_curve(Y, pred, pos_label=1)
    tpr_fpr0 = np.max(tpr[np.where(fpr <= 0.00)])
    tpr_fpr1 = np.max(tpr[np.where(fpr <= 0.01)])
    tpr_fpr5 = np.max(tpr[np.where(fpr <= 0.05)])

    f1_bin = metrics.f1_score(Y, np.round(pred), pos_label=1)
    f1_micro = metrics.f1_score(Y, np.round(pred), average='micro', pos_label=1)
    f1_macro = metrics.f1_score(Y, np.round(pred), average='macro', pos_label=1)

    [p, r, th] = metrics.precision_recall_curve(Y, pred)
    auc_prc = metrics.auc(r, p)
    prec = metrics.precision_score(Y, np.round(pred))
    rec = metrics.recall_score(Y, np.round(pred))

    acc = metrics.accuracy_score(Y, np.round(pred), normalize=False)
    acc_norm = metrics.accuracy_score(Y, np.round(pred), normalize=True)

    brier = metrics.brier_score_loss(Y, pred)

    hl = hltest(Y, pred)

    m = dict()
    m['auc'] = auc
    m['sens-jpoint'], m['spec-jpoint'] = sens_jpoint, spec_jpoint
    m['f1_bin'], m['f1_micro'], m['f1_macro'] = f1_bin, f1_micro, f1_macro
    m['tpr_fpr0'], m['tpr_fpr1'], m['tpr_fpr5'] = tpr_fpr0, tpr_fpr1, tpr_fpr5
    m['auc_prc'], m['prec'], m['rec'] = auc_prc, prec, rec
    m['acc'], m['acc_norm'] = acc, acc_norm
    m['brier'], m['hltest'] = brier, hl

    return m

# ======================================================================================================================
# HOSMER-LEMESHOW TEST
# ======================================================================================================================
def hltest(Y_true, Y_pred, Nbins=10):
    """
        Calculating the Hosmer-Lemeshow Test for statistical callibration.
        Useful example: https://en.wikipedia.org/wiki/Hosmer%E2%80%93Lemeshow_test#Calculation_of_the_statistic

    :param Y_true: 1/0 binary class of true outcomes
    :param Y_pred: predicted probability of outcome of 1

    :return: pval chisquare statistical significance of model callibration (pval > 0.05 means model is well-calibrated)
    """

    # convert to numpy array
    Y_true = np.asarray(Y_true)
    Y_pred = np.asarray(Y_pred)

    # split probs into 10 bins
    pred_min = np.min(Y_pred)
    pred_max = np.max(Y_pred)

    # Nbins = 10
    prob_int = (pred_max - pred_min) / Nbins

    # calucalte HL-stat for 10 bins
    H = []
    for i in range(Nbins):

        # N observations that were correct in this interval
        obsA = np.where((Y_pred >= (pred_min + i*prob_int))
                        & (Y_pred < (pred_min + (i+1)*prob_int))
                        & (Y_true == 1))[0].shape[0]

        # N observations that were NOT correct in this interval
        obsNotA = np.where((Y_pred >= (pred_min + i * prob_int))
                        & (Y_pred < (pred_min + (i + 1) * prob_int))
                        & (Y_true != 1))[0].shape[0]

        # index of probs we are considering in this interval
        idx_A = np.where((Y_pred >= (pred_min + i * prob_int))
                        & (Y_pred < (pred_min + (i + 1) * prob_int)))[0]

        # calculate the sum of the probabilities
        expA = np.sum(Y_pred[idx_A])

        # calculate the total sum minus the sum of the predicted probabilities
        expNotA = idx_A.shape[0] - expA

        # calculate HL-stat for this bin/interval
        H.append((obsA - expA) ** 2 / expA + (obsNotA - expNotA) ** 2 / expNotA)


    # sum and calculate the statistical significance that the model follows a chi-distributions (i.e. good-fit, well-callibrated)
    H = np.nansum(H)
    pval = 1 - stats.chi2.cdf(x=H, df=Nbins-2)

    # print result
    if pval > 0.05:
        print('   Model is WELL calibrated: ' + str(pval))
    else:
        print('   Model is POORLY calibrated: ' + str(pval))

    return pval

def check_value(exp, profile):
    if (exp == profile):
        diff = 0
    else: 
        diff = exp
    return diff

## Training Risk Assessment Model

In [3]:
# get compas data and crime categorizations
compas = pd.read_csv('datasets/compas-scores-two-years.csv')
crimes = pd.read_csv('datasets/crime-categories.csv')

In [4]:
# get desired columns from compas dataset
cols = ['id', 'age', 'sex','race', 'priors_count.1', 'juv_fel_count', 'juv_misd_count', 'c_charge_degree', 'c_charge_desc', 'is_recid', 'days_b_screening_arrest']
dataset = compas[cols]

In [5]:
# Following ProPublica's lead: "to match COMPAS scores with accompanying cases, 
# we considered cases with arrest dates or charge dates within 30 days of a COMPAS assessment 
# being conducted."
dataset = dataset[(dataset['days_b_screening_arrest'] <= 30) & (dataset['days_b_screening_arrest'] >= -30)]

# remove entries with no charge description 
dataset = dataset[dataset['c_charge_desc'].notna()]

In [6]:
# match each crime description to a crime category
crimes_dict = pd.Series(crimes.category.values, index=crimes.crime).to_dict()
dataset["offense_type"] = dataset["c_charge_desc"].map(crimes_dict)

# remove entries with uncategorizable crime descriptions
dataset = dataset.loc[dataset["offense_type"]!="Inchoate"]

# index entries
dataset['id']=dataset.index

In [7]:
# create data
features = ['age','priors_count.1','juv_fel_count','juv_misd_count','c_charge_degree','offense_type', 'is_recid'] 
data = dataset[features]

# create x and y data
x_features = ['age','priors_count.1','juv_fel_count','juv_misd_count','c_charge_degree','offense_type'] 
data_x = dataset[x_features]
data_y = dataset['is_recid']

In [8]:
# 80-20 train test split
X_train, X_test, y_train, y_test = train_test_split(data_x, 
                                                    data_y, 
                                                    test_size = 0.2,
                                                    random_state = 2,
                                                    stratify = data_y)

In [9]:
numerical=["age", "priors_count.1",'juv_fel_count','juv_misd_count']
categorical = X_train.columns.difference(numerical) 

# create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])

# append classifier to preprocessing pipeline
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', GradientBoostingClassifier())])

# fit model
model = clf.fit(X_train, y_train)

In [10]:
# get prediction probabilities
y_pred_prob = model.predict_proba(X_test)

# get probabilities of predictions being 1
y_pred_prob_1 = y_pred_prob[:,1]

# get predictions
y_pred = model.predict(X_test)

In [11]:
# get area under the roc curve
auroc = roc_auc_score(y_test, y_pred)  
print(auroc)

0.6768765085268864


In [12]:
# get performance metrics and check calibration for fairness
perfMetrics(y_pred_prob_1, y_test)

   Model is WELL calibrated: 0.24990550336283368


{'auc': 0.7460966248565664,
 'sens-jpoint': 0.7638190954773869,
 'spec-jpoint': 0.6094488188976378,
 'f1_bin': 0.6609735269000855,
 'f1_micro': 0.6777597402597403,
 'f1_macro': 0.6769678152675214,
 'tpr_fpr0': 0.01507537688442211,
 'tpr_fpr1': 0.11725293132328309,
 'tpr_fpr5': 0.2529313232830821,
 'auc_prc': 0.7357788055129126,
 'prec': 0.6742160278745645,
 'rec': 0.6482412060301508,
 'acc': 835,
 'acc_norm': 0.6777597402597403,
 'brier': 0.20364706350595482,
 'hltest': 0.24990550336283368}

In [13]:
# create test set with defendant and risk assessment prediction information
narratives = X_test.copy()
narratives['real_outcome'] = y_test
narratives['alg_outcome'] = y_pred
narratives['alg_risk_score'] = y_pred_prob_1
narratives['alg_risk_score_decile'] = (narratives['alg_risk_score']*10).round(decimals=0)
narratives['alg_risk_desc'] = narratives.apply(alert, axis=1)
narratives['id'] = X_test.index
narratives = pd.merge(narratives, dataset, on='id')

cols = cols=['id', 'age_x', 'sex', 'race','priors_count.1_x','juv_fel_count_x','juv_misd_count_x','c_charge_degree_x','offense_type_x', 'real_outcome', 'alg_outcome','alg_risk_score', 'alg_risk_score_decile', 'alg_risk_desc'] 
narratives = narratives[cols]
narratives['sex'] = narratives['sex'].str.lower()

# rename columns
narratives.columns = ['id', 'age', 'sex', 'race','priors_count.1','juv_fel_count','juv_misd_count','c_charge_degree','offense_type', 'real_outcome', 'alg_outcome','alg_risk_score', 'alg_risk_score_decile', 'alg_risk_desc'] 

In [14]:
# export dataframe as csv
narratives.to_csv('datasets/narratives.csv', index=False) 

# SHAP Explanations

In [15]:
# SHAP explanations cannot take pipelined models, so model training is needed again
# prepare SHAP data
data_shap = data_x.copy()
numerical=["age", "priors_count.1",'juv_fel_count','juv_misd_count']
features_num = data_shap[numerical]
scaler_num = StandardScaler().fit(features_num.values)
features_num = scaler_num.transform(features_num.values)
data_shap[numerical] = features_num
data_shap = pd.get_dummies(data_shap, columns=[c for c in data_shap.columns if c == 'c_charge_degree' or c == 'offense_type'])

In [16]:
# 80-20 train test split
X_train_shap, X_test_shap, y_train_shap, y_test_shap = train_test_split(data_shap, 
                                                    data_y, 
                                                    test_size = 0.2,
                                                    random_state = 2,
                                                    stratify = data_y)

In [17]:
# model is a gradient boosted classifier
clf_shap = GradientBoostingClassifier()

In [18]:
# train model
model_shap = clf_shap.fit(X_train_shap, y_train_shap)

In [19]:
# get prediction probabilities
y_pred_prob_shap = model_shap.predict_proba(X_test_shap)

# get probabilities of predictions being 1
y_pred_prob_1_shap = y_pred_prob_shap[:,1]

# get predictions
y_pred_shap = model_shap.predict(X_test_shap)

In [20]:
# get performance metrics and check calibration for fairness
perfMetrics(y_pred_prob_1_shap, y_test_shap)

   Model is WELL calibrated: 0.18609533411835066


{'auc': 0.74617576069323,
 'sens-jpoint': 0.7638190954773869,
 'spec-jpoint': 0.6094488188976378,
 'f1_bin': 0.6609735269000855,
 'f1_micro': 0.6777597402597403,
 'f1_macro': 0.6769678152675214,
 'tpr_fpr0': 0.01507537688442211,
 'tpr_fpr1': 0.11557788944723618,
 'tpr_fpr5': 0.2529313232830821,
 'auc_prc': 0.7363516363610401,
 'prec': 0.6742160278745645,
 'rec': 0.6482412060301508,
 'acc': 835,
 'acc_norm': 0.6777597402597403,
 'brier': 0.20360883352906184,
 'hltest': 0.18609533411835066}

In [21]:
# get SHAP values
explainer = shap.Explainer(model_shap)
shap_values = explainer(X_test_shap)

In [22]:
# build dataframe with SHAP values
feature_names = ['age_zscore', 'priors_count_zscore', 'juv_fel_count_zscore',
       'juv_misd_count_zscore', 'c_charge_degree_F', 'c_charge_degree_M', 'offense_type_drug',
       'offense_type_fraud', 'offense_type_no_charge',
       'offense_type_obstruction_of_justice', 'offense_type_property',
       'offense_type_public_order', 'offense_type_violent',
       'offense_type_weapons']

shap_values = pd.DataFrame(shap_values.values, columns = feature_names, index=X_test_shap.index)

In [23]:
# export dataframe as csv 
shap_values.to_csv('datasets/shap_exp.csv') 

# DiCE Explanations

In [24]:
# defining data and model
d = dice_ml.Data(dataframe=data,
                 continuous_features=['age', 'priors_count.1','juv_fel_count','juv_misd_count'],
                 outcome_name='is_recid')
m = dice_ml.Model(model=model, backend="sklearn")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [25]:
# initiate DiCE using random generation
exp_random = dice_ml.Dice(d, m, method="random")

## Diverse Counterfactuals

In [26]:
counterfact_div_exp = pd.DataFrame()
i = 0 

In [27]:
# generate 3 counterfactuals for each sample in the dataset
for index, row in X_test.iterrows():
    query_instances = X_test[i:i+1]
    dice_exp_random = exp_random.generate_counterfactuals(query_instances, total_CFs=3, desired_class="opposite", 
                                                      permitted_range={'age':[18,96]},random_seed=2,
                                                      verbose=False)
    current_df = dice_exp_random.cf_examples_list[0].final_cfs_df
    current_df['id'] = index
    counterfact_div_exp = counterfact_div_exp.append(current_df)
    i = i + 1

100%|██████████| 1/1 [00:00<00:00,  8.12it/s]
100%|██████████| 1/1 [00:00<00:00,  9.09it/s]
100%|██████████| 1/1 [00:00<00:00,  9.03it/s]
100%|██████████| 1/1 [00:00<00:00,  9.11it/s]
100%|██████████| 1/1 [00:00<00:00,  9.17it/s]
100%|██████████| 1/1 [00:00<00:00,  9.26it/s]
100%|██████████| 1/1 [00:00<00:00,  9.43it/s]
100%|██████████| 1/1 [00:00<00:00,  9.09it/s]
100%|██████████| 1/1 [00:00<00:00,  9.09it/s]
100%|██████████| 1/1 [00:00<00:00,  8.67it/s]
100%|██████████| 1/1 [00:00<00:00,  9.41it/s]
100%|██████████| 1/1 [00:00<00:00,  8.97it/s]
100%|██████████| 1/1 [00:00<00:00,  8.65it/s]
100%|██████████| 1/1 [00:00<00:00,  9.18it/s]
100%|██████████| 1/1 [00:00<00:00,  9.17it/s]
100%|██████████| 1/1 [00:00<00:00,  9.24it/s]
100%|██████████| 1/1 [00:00<00:00,  8.99it/s]
100%|██████████| 1/1 [00:00<00:00,  9.85it/s]
100%|██████████| 1/1 [00:00<00:00, 11.18it/s]
100%|██████████| 1/1 [00:00<00:00, 11.51it/s]
100%|██████████| 1/1 [00:00<00:00, 11.37it/s]
100%|██████████| 1/1 [00:00<00:00,

100%|██████████| 1/1 [00:00<00:00, 11.60it/s]
100%|██████████| 1/1 [00:00<00:00, 10.80it/s]
100%|██████████| 1/1 [00:00<00:00,  9.56it/s]
100%|██████████| 1/1 [00:00<00:00, 12.43it/s]
100%|██████████| 1/1 [00:00<00:00, 11.64it/s]
100%|██████████| 1/1 [00:00<00:00, 12.82it/s]
100%|██████████| 1/1 [00:00<00:00, 14.82it/s]
100%|██████████| 1/1 [00:00<00:00, 14.71it/s]
100%|██████████| 1/1 [00:00<00:00, 13.36it/s]
100%|██████████| 1/1 [00:00<00:00, 14.38it/s]
100%|██████████| 1/1 [00:00<00:00, 13.70it/s]
100%|██████████| 1/1 [00:00<00:00, 13.64it/s]
100%|██████████| 1/1 [00:00<00:00, 13.33it/s]
100%|██████████| 1/1 [00:00<00:00, 14.54it/s]
100%|██████████| 1/1 [00:00<00:00, 13.28it/s]
100%|██████████| 1/1 [00:00<00:00, 13.83it/s]
100%|██████████| 1/1 [00:00<00:00, 14.35it/s]
100%|██████████| 1/1 [00:00<00:00, 14.36it/s]
100%|██████████| 1/1 [00:00<00:00, 13.76it/s]
100%|██████████| 1/1 [00:00<00:00, 12.57it/s]
100%|██████████| 1/1 [00:00<00:00, 13.23it/s]
100%|██████████| 1/1 [00:00<00:00,

100%|██████████| 1/1 [00:00<00:00, 12.72it/s]
100%|██████████| 1/1 [00:00<00:00, 12.95it/s]
100%|██████████| 1/1 [00:00<00:00, 12.74it/s]
100%|██████████| 1/1 [00:00<00:00, 12.24it/s]
100%|██████████| 1/1 [00:00<00:00, 12.69it/s]
100%|██████████| 1/1 [00:00<00:00, 12.47it/s]
100%|██████████| 1/1 [00:00<00:00, 12.60it/s]
100%|██████████| 1/1 [00:00<00:00, 12.50it/s]
100%|██████████| 1/1 [00:00<00:00, 11.33it/s]
100%|██████████| 1/1 [00:00<00:00, 12.72it/s]
100%|██████████| 1/1 [00:00<00:00, 12.59it/s]
100%|██████████| 1/1 [00:00<00:00, 11.95it/s]
100%|██████████| 1/1 [00:00<00:00, 11.78it/s]
100%|██████████| 1/1 [00:00<00:00, 12.91it/s]
100%|██████████| 1/1 [00:00<00:00, 12.80it/s]
100%|██████████| 1/1 [00:00<00:00, 12.60it/s]
100%|██████████| 1/1 [00:00<00:00, 12.04it/s]
100%|██████████| 1/1 [00:00<00:00, 12.37it/s]
100%|██████████| 1/1 [00:00<00:00, 12.61it/s]
100%|██████████| 1/1 [00:00<00:00, 12.49it/s]
100%|██████████| 1/1 [00:00<00:00, 12.89it/s]
100%|██████████| 1/1 [00:00<00:00,

100%|██████████| 1/1 [00:00<00:00, 13.53it/s]
100%|██████████| 1/1 [00:00<00:00, 13.08it/s]
100%|██████████| 1/1 [00:00<00:00, 12.57it/s]
100%|██████████| 1/1 [00:00<00:00, 12.75it/s]
100%|██████████| 1/1 [00:00<00:00, 10.69it/s]
100%|██████████| 1/1 [00:00<00:00, 11.70it/s]
100%|██████████| 1/1 [00:00<00:00, 13.03it/s]
100%|██████████| 1/1 [00:00<00:00, 12.27it/s]
100%|██████████| 1/1 [00:00<00:00, 12.64it/s]
100%|██████████| 1/1 [00:00<00:00, 12.66it/s]
100%|██████████| 1/1 [00:00<00:00, 12.72it/s]
100%|██████████| 1/1 [00:00<00:00, 12.56it/s]
100%|██████████| 1/1 [00:00<00:00, 12.81it/s]
100%|██████████| 1/1 [00:00<00:00, 12.82it/s]
100%|██████████| 1/1 [00:00<00:00, 12.81it/s]
100%|██████████| 1/1 [00:00<00:00, 12.67it/s]
100%|██████████| 1/1 [00:00<00:00, 12.86it/s]
100%|██████████| 1/1 [00:00<00:00, 12.80it/s]
100%|██████████| 1/1 [00:00<00:00, 12.62it/s]
100%|██████████| 1/1 [00:00<00:00, 12.54it/s]
100%|██████████| 1/1 [00:00<00:00, 12.70it/s]
100%|██████████| 1/1 [00:00<00:00,

In [28]:
# export dataframe as csv 
counterfact_div_exp.to_csv('datasets/counterfact_diverse_exp.csv', index=False) 

In [29]:
# reset index
counterfact_div_exp = counterfact_div_exp.reset_index(drop=True)

In [30]:
# create new dataframe that captures variable changes between the datapoint to be explained 
# and its counterfactual examples
diff_div = pd.DataFrame()

for index, row in counterfact_div_exp.iterrows(): 
    exp_row = counterfact_div_exp.loc[index]
    exp_id = exp_row['id']
    profile_row = narratives.loc[narratives['id']==exp_id]
    profile_index = profile_row.index[0]
    profile_row = narratives.loc[profile_index]

    id_diff = exp_row['id']

    age_exp = int(exp_row['age'])
    age = int(profile_row['age'])

    priors_exp = int(exp_row['priors_count.1'])
    priors = int(profile_row['priors_count.1'])

    juv_fel_exp = int(exp_row['juv_fel_count'])
    juv_fel = int(profile_row['juv_fel_count'])

    juv_misd_exp = int(exp_row['juv_misd_count'])
    juv_misd = int(profile_row['juv_misd_count'])

    offense_type_exp = str(exp_row['offense_type'])
    offense_type = str(profile_row['offense_type'])

    c_charge_degree_exp = str(exp_row['c_charge_degree'])
    c_charge_degree = str(profile_row['c_charge_degree'])

    diff_age = check_value(age_exp, age)
    diff_priors = check_value(priors_exp, priors)
    diff_juv_fel = check_value(juv_fel_exp, juv_fel)
    diff_juv_misd = check_value(juv_misd_exp, juv_misd)
    diff_offense_type = check_value(offense_type_exp, offense_type)
    diff_charge_deg = check_value(c_charge_degree_exp, c_charge_degree)

    df_temp = pd.DataFrame([(id_diff, diff_age, diff_priors, diff_juv_fel, diff_juv_misd, diff_charge_deg, diff_offense_type)], 
                           columns=["id", "age", "priors_count.1", "juv_fel_count", "juv_misd_count", "c_charge_degree", "offense_type"])
    diff_div = diff_div.append(df_temp)

In [31]:
# export dataframe as csv
diff_div.to_csv('datasets/diff_div.csv', index=False) 

## Selective Counterfactual - By Relevance
* 'Relevance' is achieved  by following the Public Safety Assessment's risk formulas: https://craftmediabucket.s3.amazonaws.com/uploads/PDFs/PSA-Risk-Factors-and-Formula.pdf
* Age, number of priors, and charge degree (a proxy for the severity of a crime) were identified as the most relevant variables 

In [32]:
counterfact_sel_rel_exp = pd.DataFrame()
i = 0 

In [33]:
# generate 3 counterfactuals for each sample in the dataset
for index, row in X_test.iterrows():
    query_instances = X_test[i:i+1]
    dice_exp_random = exp_random.generate_counterfactuals(query_instances, total_CFs=3, desired_class="opposite",
                                                      verbose=False, features_to_vary=['c_charge_degree', 'priors_count.1','age'], random_seed=2)
    current_df = dice_exp_random.cf_examples_list[0].final_cfs_df
#     if (str(type(current_df)) == "<class 'NoneType'>"):
#         no_CF.append(index)
#     else:
    current_df['id'] = index
    counterfact_sel_rel_exp = counterfact_sel_rel_exp.append(current_df)
    i = i + 1

100%|██████████| 1/1 [00:00<00:00,  9.19it/s]
100%|██████████| 1/1 [00:00<00:00, 10.36it/s]
100%|██████████| 1/1 [00:00<00:00, 11.53it/s]
100%|██████████| 1/1 [00:00<00:00, 11.81it/s]
100%|██████████| 1/1 [00:00<00:00, 11.30it/s]
100%|██████████| 1/1 [00:00<00:00, 11.67it/s]
100%|██████████| 1/1 [00:00<00:00, 11.92it/s]
100%|██████████| 1/1 [00:00<00:00, 11.30it/s]
100%|██████████| 1/1 [00:00<00:00, 11.73it/s]
100%|██████████| 1/1 [00:00<00:00, 11.18it/s]
100%|██████████| 1/1 [00:00<00:00, 11.85it/s]
100%|██████████| 1/1 [00:00<00:00, 10.50it/s]
100%|██████████| 1/1 [00:00<00:00, 11.90it/s]
100%|██████████| 1/1 [00:00<00:00, 11.24it/s]
100%|██████████| 1/1 [00:00<00:00, 11.84it/s]
100%|██████████| 1/1 [00:00<00:00, 11.96it/s]
100%|██████████| 1/1 [00:00<00:00, 11.59it/s]
100%|██████████| 1/1 [00:00<00:00, 10.58it/s]
100%|██████████| 1/1 [00:00<00:00, 11.31it/s]
100%|██████████| 1/1 [00:00<00:00, 11.96it/s]
100%|██████████| 1/1 [00:00<00:00, 10.78it/s]
100%|██████████| 1/1 [00:00<00:00,

100%|██████████| 1/1 [00:00<00:00, 12.88it/s]
100%|██████████| 1/1 [00:00<00:00, 13.60it/s]
100%|██████████| 1/1 [00:00<00:00, 12.73it/s]
100%|██████████| 1/1 [00:00<00:00, 12.75it/s]
100%|██████████| 1/1 [00:00<00:00, 12.53it/s]
100%|██████████| 1/1 [00:00<00:00, 11.95it/s]
100%|██████████| 1/1 [00:00<00:00, 12.68it/s]
100%|██████████| 1/1 [00:00<00:00, 12.72it/s]
100%|██████████| 1/1 [00:00<00:00, 12.78it/s]
100%|██████████| 1/1 [00:00<00:00, 12.87it/s]
100%|██████████| 1/1 [00:00<00:00, 12.70it/s]
100%|██████████| 1/1 [00:00<00:00, 12.85it/s]
100%|██████████| 1/1 [00:00<00:00, 12.43it/s]
100%|██████████| 1/1 [00:00<00:00, 13.01it/s]
100%|██████████| 1/1 [00:00<00:00, 12.63it/s]
100%|██████████| 1/1 [00:00<00:00, 12.44it/s]
100%|██████████| 1/1 [00:00<00:00, 11.84it/s]
100%|██████████| 1/1 [00:00<00:00, 12.55it/s]
100%|██████████| 1/1 [00:00<00:00, 12.28it/s]
100%|██████████| 1/1 [00:00<00:00, 12.77it/s]
100%|██████████| 1/1 [00:00<00:00, 13.17it/s]
100%|██████████| 1/1 [00:00<00:00,

100%|██████████| 1/1 [00:00<00:00, 13.56it/s]
100%|██████████| 1/1 [00:00<00:00, 12.37it/s]
100%|██████████| 1/1 [00:00<00:00, 12.29it/s]
100%|██████████| 1/1 [00:00<00:00, 12.63it/s]
100%|██████████| 1/1 [00:00<00:00, 12.68it/s]
100%|██████████| 1/1 [00:00<00:00, 12.35it/s]
100%|██████████| 1/1 [00:00<00:00, 12.51it/s]
100%|██████████| 1/1 [00:00<00:00, 12.43it/s]
100%|██████████| 1/1 [00:00<00:00, 12.77it/s]
100%|██████████| 1/1 [00:00<00:00, 12.75it/s]
100%|██████████| 1/1 [00:00<00:00, 12.23it/s]
100%|██████████| 1/1 [00:00<00:00, 12.79it/s]
100%|██████████| 1/1 [00:00<00:00, 12.68it/s]
100%|██████████| 1/1 [00:00<00:00, 12.60it/s]
100%|██████████| 1/1 [00:00<00:00, 12.73it/s]
100%|██████████| 1/1 [00:00<00:00, 12.63it/s]
100%|██████████| 1/1 [00:00<00:00, 13.00it/s]
100%|██████████| 1/1 [00:00<00:00, 12.83it/s]
100%|██████████| 1/1 [00:00<00:00, 12.72it/s]
100%|██████████| 1/1 [00:00<00:00, 12.64it/s]
100%|██████████| 1/1 [00:00<00:00, 12.82it/s]
100%|██████████| 1/1 [00:00<00:00,

100%|██████████| 1/1 [00:00<00:00, 12.75it/s]
100%|██████████| 1/1 [00:00<00:00, 12.33it/s]
100%|██████████| 1/1 [00:00<00:00, 12.74it/s]
100%|██████████| 1/1 [00:00<00:00, 12.86it/s]
100%|██████████| 1/1 [00:00<00:00, 12.07it/s]
100%|██████████| 1/1 [00:00<00:00, 12.59it/s]
100%|██████████| 1/1 [00:00<00:00, 12.40it/s]
100%|██████████| 1/1 [00:00<00:00, 12.98it/s]
100%|██████████| 1/1 [00:00<00:00, 12.76it/s]
100%|██████████| 1/1 [00:00<00:00, 12.73it/s]
100%|██████████| 1/1 [00:00<00:00, 12.56it/s]
100%|██████████| 1/1 [00:00<00:00, 12.91it/s]
100%|██████████| 1/1 [00:00<00:00, 12.71it/s]
100%|██████████| 1/1 [00:00<00:00, 12.78it/s]
100%|██████████| 1/1 [00:00<00:00, 12.82it/s]
100%|██████████| 1/1 [00:00<00:00, 12.72it/s]
100%|██████████| 1/1 [00:00<00:00, 12.60it/s]
100%|██████████| 1/1 [00:00<00:00, 12.62it/s]
100%|██████████| 1/1 [00:00<00:00, 12.79it/s]
100%|██████████| 1/1 [00:00<00:00, 12.77it/s]
100%|██████████| 1/1 [00:00<00:00, 12.83it/s]
100%|██████████| 1/1 [00:00<00:00,

In [34]:
# export dataframe as csv
counterfact_sel_rel_exp.to_csv('datasets/counterfact_sel_rel_exp.csv', index=False) 

In [35]:
# reset index
counterfact_sel_rel_exp = counterfact_sel_rel_exp.reset_index(drop=True)

In [36]:
# create new dataframe that captures variable changes between the datapoint to be explained 
# and its counterfactual examples
diff_sel = pd.DataFrame()

for index, row in counterfact_sel_rel_exp.iterrows(): 
    exp_row = counterfact_sel_rel_exp.loc[index]
    exp_id = exp_row['id']
    profile_row = narratives.loc[narratives['id']==exp_id]
    profile_index = profile_row.index[0]
    profile_row = narratives.loc[profile_index]

    id_diff = exp_row['id']

    age_exp = int(exp_row['age'])
    age = int(profile_row['age'])

    priors_exp = int(exp_row['priors_count.1'])
    priors = int(profile_row['priors_count.1'])

    juv_fel_exp = int(exp_row['juv_fel_count'])
    juv_fel = int(profile_row['juv_fel_count'])

    juv_misd_exp = int(exp_row['juv_misd_count'])
    juv_misd = int(profile_row['juv_misd_count'])

    offense_type_exp = str(exp_row['offense_type'])
    offense_type = str(profile_row['offense_type'])

    c_charge_degree_exp = str(exp_row['c_charge_degree'])
    c_charge_degree = str(profile_row['c_charge_degree'])

    diff_age = check_value(age_exp, age)
    diff_priors = check_value(priors_exp, priors)
    diff_juv_fel = check_value(juv_fel_exp, juv_fel)
    diff_juv_misd = check_value(juv_misd_exp, juv_misd)
    diff_offense_type = check_value(offense_type_exp, offense_type)
    diff_charge_deg = check_value(c_charge_degree_exp, c_charge_degree)

    df_temp = pd.DataFrame([(id_diff, diff_age, diff_priors, diff_juv_fel, diff_juv_misd, diff_charge_deg, diff_offense_type)], 
                           columns=["id", "age", "priors_count.1", "juv_fel_count", "juv_misd_count", "c_charge_degree", "offense_type"])
    diff_sel = diff_sel.append(df_temp)

In [37]:
# export dataframe as csv
diff_sel.to_csv('datasets/diff_sel.csv', index=False) 

# Random Sampling
Sample 300 defendants from the test set

## Complete Dataset (All, n = 6195)

In [38]:
female = ((dataset['sex'] =='Female').sum() * 100) / len(dataset)
male = ((dataset['sex'] =='Male').sum() * 100) / len(dataset)

white = ((dataset['race'] =='Caucasian').sum() * 100) / len(dataset)
black = ((dataset['race'] =='African-American').sum() * 100) / len(dataset)
other = 100 - (white + black)

age_mean = dataset['age'].mean()

priors_mean = dataset['priors_count.1'].mean()
no_priors = ((dataset['priors_count.1'] ==0).sum() * 100) / len(dataset)
priors = ((dataset['priors_count.1'] !=0).sum() * 100) / len(dataset)

juv_misd_mean = dataset['juv_misd_count'].mean()
no_juv_misd = ((dataset['juv_misd_count'] ==0).sum() * 100) / len(dataset)
juv_misd = ((dataset['juv_misd_count'] !=0).sum() * 100) / len(dataset)

juv_fel_mean = dataset['juv_fel_count'].mean()
no_juv_fel = ((dataset['juv_fel_count'] ==0).sum() * 100) / len(dataset)
juv_fel = ((dataset['juv_fel_count'] !=0).sum() * 100) / len(dataset)


fel = ((dataset['c_charge_degree'] == 'F').sum() * 100) / len(dataset)
misd = ((dataset['c_charge_degree'] == 'M').sum() * 100) / len(dataset)


violent = ((dataset['offense_type'] == 'violent crime').sum() * 100) / len(dataset)
drug = ((dataset['offense_type'] == 'drug crime').sum() * 100) / len(dataset)
prop = ((dataset['offense_type'] == 'property crime').sum() * 100) / len(dataset)
pa = ((dataset['offense_type'] == 'public order crime').sum() * 100) / len(dataset)
no_charge = ((dataset['offense_type'] == 'arrested but not charged').sum() * 100) / len(dataset)
ooj = ((dataset['offense_type'] == 'obstruction of justice').sum() * 100) / len(dataset)
fraud = ((dataset['offense_type'] == 'fraud and financial crime').sum() * 100) / len(dataset)
weapons = ((dataset['offense_type'] == 'weapons-related crime').sum() * 100) / len(dataset)


recid = ((dataset['is_recid'] == 1).sum() * 100) / len(dataset)
no_recid = ((dataset['is_recid'] == 0).sum() * 100) / len(dataset)

print ('---- SEX ----')
print('% female: ', female)
print('% male: ', male)

print ('\n---- RACE ----')
print('% white: ', white)
print('% black: ', black)
print('% other: ', other)

print ('\n---- PRIORS ----')
print ('priors mean: ', priors_mean)
print ('juv misd mean: ', juv_misd_mean)
print ('juv fel mean: ', juv_fel_mean)
print('\n% w/ no priors: ', no_priors)
print('% w/ priors: ', priors)
print('% w/ no juv misd: ', no_juv_misd)
print('% w/ juv misd: ', juv_misd)
print('% w/ no juv fel: ', no_juv_fel)
print('% w/ juv fel: ', juv_fel)

print ('\n---- CRIME ----')
print('% violent: ', violent)
print('% drug: ', drug)
print('% property: ', prop)
print('% public order: ', pa)
print('% no charge: ', no_charge)
print('% obstruction of justice: ', ooj)
print('% fraud: ', fraud)
print('% weapons: ', weapons)

print('\n% felony: ', fel)
print('% misd: ', misd)

print ('\n---- OUTCOME ----')
print('% recid: ', recid)
print('% not recid: ', no_recid)

---- SEX ----
% female:  19.045299561617146
% male:  80.95470043838286

---- RACE ----
% white:  34.047735021919145
% black:  51.485630784218216
% other:  14.466634193862632

---- PRIORS ----
priors mean:  3.2495534989446337
juv misd mean:  0.09141094333495697
juv fel mean:  0.05926286734859555

% w/ no priors:  33.77171618769281
% w/ priors:  66.2282838123072
% w/ no juv misd:  94.28478649131353
% w/ juv misd:  5.7152135086864755
% w/ no juv fel:  96.63906478324404
% w/ juv fel:  3.360935216755967

---- CRIME ----
% violent:  28.576067543432377
% drug:  17.860042214645233
% property:  16.723494073713265
% public order:  14.628998213995779
% no charge:  12.729339178438059
% obstruction of justice:  4.935866212047411
% fraud:  3.03620717648969
% weapons:  1.509985387238188

% felony:  64.29615197272284
% misd:  35.703848027277154

---- OUTCOME ----
% recid:  48.49813281376847
% not recid:  51.50186718623153


### White (n = 2097)

In [39]:
dataset_white = dataset.copy()
dataset_white = dataset_white.loc[(dataset_white["race"]=="Caucasian")]

In [40]:
female = ((dataset_white['sex'] =='Female').sum() * 100) / len(dataset_white)
male = ((dataset_white['sex'] =='Male').sum() * 100) / len(dataset_white)

age_mean = dataset_white['age'].mean()

priors_mean = dataset_white['priors_count.1'].mean()
no_priors = ((dataset_white['priors_count.1'] ==0).sum() * 100) / len(dataset_white)
priors = ((dataset_white['priors_count.1'] !=0).sum() * 100) / len(dataset_white)

juv_misd_mean = dataset_white['juv_misd_count'].mean()
no_juv_misd = ((dataset_white['juv_misd_count'] ==0).sum() * 100) / len(dataset_white)
juv_misd = ((dataset_white['juv_misd_count'] !=0).sum() * 100) / len(dataset_white)

juv_fel_mean = dataset_white['juv_fel_count'].mean()
no_juv_fel = ((dataset_white['juv_fel_count'] ==0).sum() * 100) / len(dataset_white)
juv_fel = ((dataset_white['juv_fel_count'] !=0).sum() * 100) / len(dataset_white)


fel = ((dataset_white['c_charge_degree'] == 'F').sum() * 100) / len(dataset_white)
misd = ((dataset_white['c_charge_degree'] == 'M').sum() * 100) / len(dataset_white)


violent = ((dataset_white['offense_type'] == 'violent crime').sum() * 100) / len(dataset_white)
drug = ((dataset_white['offense_type'] == 'drug crime').sum() * 100) / len(dataset_white)
prop = ((dataset_white['offense_type'] == 'property crime').sum() * 100) / len(dataset_white)
pa = ((dataset_white['offense_type'] == 'public order crime').sum() * 100) / len(dataset_white)
no_charge = ((dataset_white['offense_type'] == 'arrested but not charged').sum() * 100) / len(dataset_white)
ooj = ((dataset_white['offense_type'] == 'obstruction of justice').sum() * 100) / len(dataset_white)
fraud = ((dataset_white['offense_type'] == 'fraud and financial crime').sum() * 100) / len(dataset_white)
weapons = ((dataset_white['offense_type'] == 'weapons-related crime').sum() * 100) / len(dataset_white)


recid = ((dataset_white['is_recid'] == 1).sum() * 100) / len(dataset_white)
no_recid = ((dataset_white['is_recid'] == 0).sum() * 100) / len(dataset_white)

print ('---- SEX ----')
print('% female: ', female)
print('% male: ', male)

print ('\n---- PRIORS ----')
print ('priors mean: ', priors_mean)
print ('juv misd mean: ', juv_misd_mean)
print ('juv fel mean: ', juv_fel_mean)
print('\n% w/ no priors: ', no_priors)
print('% w/ priors: ', priors)
print('% w/ no juv misd: ', no_juv_misd)
print('% w/ juv misd: ', juv_misd)
print('% w/ no juv fel: ', no_juv_fel)
print('% w/ juv fel: ', juv_fel)

print ('\n---- CRIME ----')
print('% violent: ', violent)
print('% drug: ', drug)
print('% property: ', prop)
print('% public order: ', pa)
print('% no charge: ', no_charge)
print('% obstruction of justice: ', ooj)
print('% fraud: ', fraud)
print('% weapons: ', weapons)

print('\n% felony: ', fel)
print('% misd: ', misd)

print ('\n---- OUTCOME ----')
print('% recid: ', recid)
print('% not recid: ', no_recid)

---- SEX ----
% female:  22.889842632331902
% male:  77.1101573676681

---- PRIORS ----
priors mean:  2.2899380066762043
juv misd mean:  0.04148783977110158
juv fel mean:  0.02479732951835956

% w/ no priors:  39.103481163567004
% w/ priors:  60.896518836432996
% w/ no juv misd:  97.13876967095851
% w/ juv misd:  2.8612303290414878
% w/ no juv fel:  98.76013352408202
% w/ juv fel:  1.239866475917978

---- CRIME ----
% violent:  29.0414878397711
% drug:  19.360991893180735
% property:  15.116833571769194
% public order:  17.167381974248926
% no charge:  11.015736766809729
% obstruction of justice:  4.959465903671912
% fraud:  2.5274201239866474
% weapons:  0.8106819265617549

% felony:  59.084406294706724
% misd:  40.915593705293276

---- OUTCOME ----
% recid:  41.63090128755365
% not recid:  58.36909871244635


### Black (n = 3171)

In [41]:
dataset_black = dataset.copy()
dataset_black = dataset_black.loc[(dataset_black["race"]=="African-American")]

In [42]:
female = ((dataset_black['sex'] =='Female').sum() * 100) / len(dataset_black)
male = ((dataset_black['sex'] =='Male').sum() * 100) / len(dataset_black)

age_mean = dataset_black['age'].mean()

priors_mean = dataset_black['priors_count.1'].mean()
no_priors = ((dataset_black['priors_count.1'] ==0).sum() * 100) / len(dataset_black)
priors = ((dataset_black['priors_count.1'] !=0).sum() * 100) / len(dataset_black)

juv_misd_mean = dataset_black['juv_misd_count'].mean()
no_juv_misd = ((dataset_black['juv_misd_count'] ==0).sum() * 100) / len(dataset_black)
juv_misd = ((dataset_black['juv_misd_count'] !=0).sum() * 100) / len(dataset_black)

juv_fel_mean = dataset_black['juv_fel_count'].mean()
no_juv_fel = ((dataset_black['juv_fel_count'] ==0).sum() * 100) / len(dataset_black)
juv_fel = ((dataset_black['juv_fel_count'] !=0).sum() * 100) / len(dataset_black)


fel = ((dataset_black['c_charge_degree'] == 'F').sum() * 100) / len(dataset_black)
misd = ((dataset_black['c_charge_degree'] == 'M').sum() * 100) / len(dataset_black)


violent = ((dataset_black['offense_type'] == 'violent crime').sum() * 100) / len(dataset_black)
drug = ((dataset_black['offense_type'] == 'drug crime').sum() * 100) / len(dataset_black)
prop = ((dataset_black['offense_type'] == 'property crime').sum() * 100) / len(dataset_black)
pa = ((dataset_black['offense_type'] == 'public order crime').sum() * 100) / len(dataset_black)
no_charge = ((dataset_black['offense_type'] == 'arrested but not charged').sum() * 100) / len(dataset_black)
ooj = ((dataset_black['offense_type'] == 'obstruction of justice').sum() * 100) / len(dataset_black)
fraud = ((dataset_black['offense_type'] == 'fraud and financial crime').sum() * 100) / len(dataset_black)
weapons = ((dataset_black['offense_type'] == 'weapons-related crime').sum() * 100) / len(dataset_black)


recid = ((dataset_black['is_recid'] == 1).sum() * 100) / len(dataset_black)
no_recid = ((dataset_black['is_recid'] == 0).sum() * 100) / len(dataset_black)

print ('---- SEX ----')
print('% female: ', female)
print('% male: ', male)

print ('\n---- PRIORS ----')
print ('priors mean: ', priors_mean)
print ('juv misd mean: ', juv_misd_mean)
print ('juv fel mean: ', juv_fel_mean)
print('\n% w/ no priors: ', no_priors)
print('% w/ priors: ', priors)
print('% w/ no juv misd: ', no_juv_misd)
print('% w/ juv misd: ', juv_misd)
print('% w/ no juv fel: ', no_juv_fel)
print('% w/ juv fel: ', juv_fel)

print ('\n---- CRIME ----')
print('% violent: ', violent)
print('% drug: ', drug)
print('% property: ', prop)
print('% public order: ', pa)
print('% no charge: ', no_charge)
print('% obstruction of justice: ', ooj)
print('% fraud: ', fraud)
print('% weapons: ', weapons)

print('\n% felony: ', fel)
print('% misd: ', misd)

print ('\n---- OUTCOME ----')
print('% recid: ', recid)
print('% not recid: ', no_recid)

---- SEX ----
% female:  17.31315042573321
% male:  82.68684957426679

---- PRIORS ----
priors mean:  4.23998738568275
juv misd mean:  0.13654998423210343
juv fel mean:  0.08546199936928414

% w/ no priors:  26.64774519079155
% w/ priors:  73.35225480920845
% w/ no juv misd:  91.70608640807316
% w/ juv misd:  8.293913591926836
% w/ no juv fel:  94.82812992746767
% w/ juv fel:  5.171870072532324

---- CRIME ----
% violent:  26.111636707663198
% drug:  18.290760012614317
% property:  17.660044150110377
% public order:  13.024282560706402
% no charge:  14.443393251340272
% obstruction of justice:  4.951119520655944
% fraud:  3.500473036896878
% weapons:  2.0182907600126145

% felony:  69.12645853043205
% misd:  30.87354146956796

---- OUTCOME ----
% recid:  55.88142541784926
% not recid:  44.11857458215074


## Test Set Sample ( All, n = 300)

In [43]:
narratives_black_white = narratives.copy()
narratives_black_white = narratives_black_white.loc[(narratives_black_white["race"]=="Caucasian") | (narratives_black_white["race"]=="African-American") ]

In [44]:
exp_sample = narratives_black_white.sample(n=300, random_state=10)

In [45]:
exp_sample.to_csv('datasets/sample.csv', index=False)

In [46]:
female = ((exp_sample['sex'] =='female').sum() * 100) / len(exp_sample)
male = ((exp_sample['sex'] =='male').sum() * 100) / len(exp_sample)

white = ((exp_sample['race'] =='Caucasian').sum() * 100) / len(exp_sample)
black = ((exp_sample['race'] =='African-American').sum() * 100) / len(exp_sample)
other = 100 - (white + black)

age_mean = exp_sample['age'].mean()

priors_mean = exp_sample['priors_count.1'].mean()
no_priors = ((exp_sample['priors_count.1'] ==0).sum() * 100) / len(exp_sample)
priors = ((exp_sample['priors_count.1'] !=0).sum() * 100) / len(exp_sample)

juv_misd_mean = exp_sample['juv_misd_count'].mean()
no_juv_misd = ((exp_sample['juv_misd_count'] ==0).sum() * 100) / len(exp_sample)
juv_misd = ((exp_sample['juv_misd_count'] !=0).sum() * 100) / len(exp_sample)

juv_fel_mean = exp_sample['juv_fel_count'].mean()
no_juv_fel = ((exp_sample['juv_fel_count'] ==0).sum() * 100) / len(exp_sample)
juv_fel = ((exp_sample['juv_fel_count'] !=0).sum() * 100) / len(exp_sample)


fel = ((exp_sample['c_charge_degree'] == 'F').sum() * 100) / len(exp_sample)
misd = ((exp_sample['c_charge_degree'] == 'M').sum() * 100) / len(exp_sample)


violent = ((exp_sample['offense_type'] == 'violent crime').sum() * 100) / len(exp_sample)
drug = ((exp_sample['offense_type'] == 'drug crime').sum() * 100) / len(exp_sample)
prop = ((exp_sample['offense_type'] == 'property crime').sum() * 100) / len(exp_sample)
pa = ((exp_sample['offense_type'] == 'public order crime').sum() * 100) / len(exp_sample)
no_charge = ((exp_sample['offense_type'] == 'arrested but not charged').sum() * 100) / len(exp_sample)
ooj = ((exp_sample['offense_type'] == 'obstruction of justice').sum() * 100) / len(exp_sample)
fraud = ((exp_sample['offense_type'] == 'fraud and financial crime').sum() * 100) / len(exp_sample)
weapons = ((exp_sample['offense_type'] == 'weapons-related crime').sum() * 100) / len(exp_sample)


recid = ((exp_sample['real_outcome'] == 1).sum() * 100) / len(exp_sample)
no_recid = ((exp_sample['real_outcome'] == 0).sum() * 100) / len(exp_sample)

print ('---- SEX ----')
print('% female: ', female)
print('% male: ', male)

print ('\n---- RACE ----')
print('% white: ', white)
print('% black: ', black)
print('% other: ', other)

print ('\n---- PRIORS ----')
print ('priors mean: ', priors_mean)
print ('juv misd mean: ', juv_misd_mean)
print ('juv fel mean: ', juv_fel_mean)
print('\n% w/ no priors: ', no_priors)
print('% w/ priors: ', priors)
print('% w/ no juv misd: ', no_juv_misd)
print('% w/ juv misd: ', juv_misd)
print('% w/ no juv fel: ', no_juv_fel)
print('% w/ juv fel: ', juv_fel)

print ('\n---- CRIME ----')
print('% violent: ', violent)
print('% drug: ', drug)
print('% property: ', prop)
print('% public order: ', pa)
print('% no charge: ', no_charge)
print('% obstruction of justice: ', ooj)
print('% fraud: ', fraud)
print('% weapons: ', weapons)

print('\n% felony: ', fel)
print('% misd: ', misd)

print ('\n---- OUTCOME ----')
print('% recid: ', recid)
print('% not recid: ', no_recid)

---- SEX ----
% female:  19.0
% male:  81.0

---- RACE ----
% white:  39.333333333333336
% black:  60.666666666666664
% other:  0.0

---- PRIORS ----
priors mean:  3.6233333333333335
juv misd mean:  0.1
juv fel mean:  0.05333333333333334

% w/ no priors:  27.333333333333332
% w/ priors:  72.66666666666667
% w/ no juv misd:  93.66666666666667
% w/ juv misd:  6.333333333333333
% w/ no juv fel:  96.66666666666667
% w/ juv fel:  3.3333333333333335

---- CRIME ----
% violent:  30.333333333333332
% drug:  19.666666666666668
% property:  14.666666666666666
% public order:  15.0
% no charge:  10.0
% obstruction of justice:  5.333333333333333
% fraud:  3.0
% weapons:  2.0

% felony:  66.0
% misd:  34.0

---- OUTCOME ----
% recid:  53.0
% not recid:  47.0


### White (n = 132)

In [47]:
dataset_white_2 = exp_sample.copy()
dataset_white_2 = dataset_white_2.loc[(dataset_white_2["race"]=="Caucasian")]

In [48]:
female = ((dataset_white_2['sex'] =='female').sum() * 100) / len(dataset_white_2)
male = ((dataset_white_2['sex'] =='male').sum() * 100) / len(dataset_white_2)

age_mean = dataset_white_2['age'].mean()

priors_mean = dataset_white_2['priors_count.1'].mean()
no_priors = ((dataset_white_2['priors_count.1'] ==0).sum() * 100) / len(dataset_white_2)
priors = ((dataset_white_2['priors_count.1'] !=0).sum() * 100) / len(dataset_white_2)

juv_misd_mean = dataset_white_2['juv_misd_count'].mean()
no_juv_misd = ((dataset_white_2['juv_misd_count'] ==0).sum() * 100) / len(dataset_white_2)
juv_misd = ((dataset_white_2['juv_misd_count'] !=0).sum() * 100) / len(dataset_white_2)

juv_fel_mean = dataset_white_2['juv_fel_count'].mean()
no_juv_fel = ((dataset_white_2['juv_fel_count'] ==0).sum() * 100) / len(dataset_white_2)
juv_fel = ((dataset_white_2['juv_fel_count'] !=0).sum() * 100) / len(dataset_white_2)


fel = ((dataset_white_2['c_charge_degree'] == 'F').sum() * 100) / len(dataset_white_2)
misd = ((dataset_white_2['c_charge_degree'] == 'M').sum() * 100) / len(dataset_white_2)


violent = ((dataset_white_2['offense_type'] == 'violent crime').sum() * 100) / len(dataset_white_2)
drug = ((dataset_white_2['offense_type'] == 'drug crime').sum() * 100) / len(dataset_white_2)
prop = ((dataset_white_2['offense_type'] == 'property crime').sum() * 100) / len(dataset_white_2)
pa = ((dataset_white_2['offense_type'] == 'public order crime').sum() * 100) / len(dataset_white_2)
no_charge = ((dataset_white_2['offense_type'] == 'arrested but not charged').sum() * 100) / len(dataset_white_2)
ooj = ((dataset_white_2['offense_type'] == 'obstruction of justice').sum() * 100) / len(dataset_white_2)
fraud = ((dataset_white_2['offense_type'] == 'fraud and financial crime').sum() * 100) / len(dataset_white_2)
weapons = ((dataset_white_2['offense_type'] == 'weapons-related crime').sum() * 100) / len(dataset_white_2)


recid = ((dataset_white_2['real_outcome'] == 1).sum() * 100) / len(dataset_white_2)
no_recid = ((dataset_white_2['real_outcome'] == 0).sum() * 100) / len(dataset_white_2)

print ('---- SEX ----')
print('% female: ', female)
print('% male: ', male)

print ('\n---- PRIORS ----')
print ('priors mean: ', priors_mean)
print ('juv misd mean: ', juv_misd_mean)
print ('juv fel mean: ', juv_fel_mean)
print('\n% w/ no priors: ', no_priors)
print('% w/ priors: ', priors)
print('% w/ no juv misd: ', no_juv_misd)
print('% w/ juv misd: ', juv_misd)
print('% w/ no juv fel: ', no_juv_fel)
print('% w/ juv fel: ', juv_fel)

print ('\n---- CRIME ----')
print('% violent: ', violent)
print('% drug: ', drug)
print('% property: ', prop)
print('% public order: ', pa)
print('% no charge: ', no_charge)
print('% obstruction of justice: ', ooj)
print('% fraud: ', fraud)
print('% weapons: ', weapons)

print('\n% felony: ', fel)
print('% misd: ', misd)

print ('\n---- OUTCOME ----')
print('% recid: ', recid)
print('% not recid: ', no_recid)

---- SEX ----
% female:  20.338983050847457
% male:  79.66101694915254

---- PRIORS ----
priors mean:  1.9661016949152543
juv misd mean:  0.03389830508474576
juv fel mean:  0.00847457627118644

% w/ no priors:  33.898305084745765
% w/ priors:  66.10169491525424
% w/ no juv misd:  97.45762711864407
% w/ juv misd:  2.542372881355932
% w/ no juv fel:  99.15254237288136
% w/ juv fel:  0.847457627118644

---- CRIME ----
% violent:  27.11864406779661
% drug:  18.64406779661017
% property:  16.10169491525424
% public order:  18.64406779661017
% no charge:  9.322033898305085
% obstruction of justice:  6.779661016949152
% fraud:  2.542372881355932
% weapons:  0.847457627118644

% felony:  61.86440677966102
% misd:  38.13559322033898

---- OUTCOME ----
% recid:  44.91525423728814
% not recid:  55.08474576271186


### Black (n = 168)

In [49]:
dataset_black_2 = exp_sample.copy()
dataset_black_2 = dataset_black_2.loc[(dataset_black_2["race"]=="African-American")]

In [50]:
female = ((dataset_black_2['sex'] =='female').sum() * 100) / len(dataset_black_2)
male = ((dataset_black_2['sex'] =='male').sum() * 100) / len(dataset_black_2)

age_mean = dataset_black_2['age'].mean()

priors_mean = dataset_black_2['priors_count.1'].mean()
no_priors = ((dataset_black_2['priors_count.1'] ==0).sum() * 100) / len(dataset_black_2)
priors = ((dataset_black_2['priors_count.1'] !=0).sum() * 100) / len(dataset_black_2)

juv_misd_mean = dataset_black_2['juv_misd_count'].mean()
no_juv_misd = ((dataset_black_2['juv_misd_count'] ==0).sum() * 100) / len(dataset_black_2)
juv_misd = ((dataset_black_2['juv_misd_count'] !=0).sum() * 100) / len(dataset_black_2)

juv_fel_mean = dataset_black_2['juv_fel_count'].mean()
no_juv_fel = ((dataset_black_2['juv_fel_count'] ==0).sum() * 100) / len(dataset_black_2)
juv_fel = ((dataset_black_2['juv_fel_count'] !=0).sum() * 100) / len(dataset_black_2)


fel = ((dataset_black_2['c_charge_degree'] == 'F').sum() * 100) / len(dataset_black_2)
misd = ((dataset_black_2['c_charge_degree'] == 'M').sum() * 100) / len(dataset_black_2)


violent = ((dataset_black_2['offense_type'] == 'violent crime').sum() * 100) / len(dataset_black_2)
drug = ((dataset_black_2['offense_type'] == 'drug crime').sum() * 100) / len(dataset_black_2)
prop = ((dataset_black_2['offense_type'] == 'property crime').sum() * 100) / len(dataset_black_2)
pa = ((dataset_black_2['offense_type'] == 'public order crime').sum() * 100) / len(dataset_black_2)
no_charge = ((dataset_black_2['offense_type'] == 'arrested but not charged').sum() * 100) / len(dataset_black_2)
ooj = ((dataset_black_2['offense_type'] == 'obstruction of justice').sum() * 100) / len(dataset_black_2)
fraud = ((dataset_black_2['offense_type'] == 'fraud and financial crime').sum() * 100) / len(dataset_black_2)
weapons = ((dataset_black_2['offense_type'] == 'weapons-related crime').sum() * 100) / len(dataset_black_2)


recid = ((dataset_black_2['real_outcome'] == 1).sum() * 100) / len(dataset_black_2)
no_recid = ((dataset_black_2['real_outcome'] == 0).sum() * 100) / len(dataset_black_2)

print ('---- SEX ----')
print('% female: ', female)
print('% male: ', male)

print ('\n---- PRIORS ----')
print ('priors mean: ', priors_mean)
print ('juv misd mean: ', juv_misd_mean)
print ('juv fel mean: ', juv_fel_mean)
print('\n% w/ no priors: ', no_priors)
print('% w/ priors: ', priors)
print('% w/ no juv misd: ', no_juv_misd)
print('% w/ juv misd: ', juv_misd)
print('% w/ no juv fel: ', no_juv_fel)
print('% w/ juv fel: ', juv_fel)

print ('\n---- CRIME ----')
print('% violent: ', violent)
print('% drug: ', drug)
print('% property: ', prop)
print('% public order: ', pa)
print('% no charge: ', no_charge)
print('% obstruction of justice: ', ooj)
print('% fraud: ', fraud)
print('% weapons: ', weapons)

print('\n% felony: ', fel)
print('% misd: ', misd)

print ('\n---- OUTCOME ----')
print('% recid: ', recid)
print('% not recid: ', no_recid)

---- SEX ----
% female:  18.13186813186813
% male:  81.86813186813187

---- PRIORS ----
priors mean:  4.697802197802198
juv misd mean:  0.14285714285714285
juv fel mean:  0.08241758241758242

% w/ no priors:  23.076923076923077
% w/ priors:  76.92307692307692
% w/ no juv misd:  91.20879120879121
% w/ juv misd:  8.791208791208792
% w/ no juv fel:  95.05494505494505
% w/ juv fel:  4.945054945054945

---- CRIME ----
% violent:  32.417582417582416
% drug:  20.32967032967033
% property:  13.736263736263735
% public order:  12.637362637362637
% no charge:  10.43956043956044
% obstruction of justice:  4.395604395604396
% fraud:  3.2967032967032965
% weapons:  2.7472527472527473

% felony:  68.68131868131869
% misd:  31.318681318681318

---- OUTCOME ----
% recid:  58.24175824175824
% not recid:  41.75824175824176
