# Notebook for Finding Process Outcome Explanations

## Main Configuration

In [27]:
import pandas as pd
import numpy as np
import re

from rulelearn.algorithms.ripper import RipperExplainer
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score
from dtree_helper import get_rules

In [28]:
# --- Imputation placeholders -------------------------------------------------
# Value written into missing cells of string (object) columns.
STRING_IMPUTE_VAL = 'missing'
# Value written into missing cells of float columns (most negative float32).
FLOAT_IMPUTE_VAL = np.finfo(np.float32).min

# --- Train/test split --------------------------------------------------------
# Fraction of rows held out for testing.
# CART:4 not to be used with 0.0 (current limitation).
TRAIN_TEST_SPLIT = 0.0

# --- Binarizer / encoding and rule induction algorithm -----------------------
# Alternative combination: BINA = 'LENC', ALGO = 'CART:4'
# For other options, see: https://github.com/hvoelzer/rulebenchmarking
# CART:4 has usability drawbacks, e.g., those created by float imputation and
# label encoding; label encoding, however, can also help for categoricals with
# a larger number of values.
BINA = 'NATIVE'
ALGO = 'RIPPER'

In [29]:
INPUT_FILE_NAME = 'cases_rtfm_full.csv'

# Rule induction typically requires string or float features, hence the
# columns listed here are read as object (string) instead of int/bool.
DATA_TYPES = {
    'paid_full': object,
    'dismissed': object,
    'credit_collection': object,
    'unresolved': object,
    'appeal': object,
    'overturned_judge': object,
    'overturned_prefecture': object,
    'overturned': object,
    'upheld': object,
    'Create Fine.article': object,
    'Create Fine.org:resource': object
    }

# low_memory=False lets pandas infer each remaining column's dtype from the
# whole file instead of chunk by chunk. Chunked inference can leave a column
# with mixed value types (and emits a DtypeWarning), which is harmful for the
# string/float assumption of the rule learners used later.
df = pd.read_csv(INPUT_FILE_NAME, dtype=DATA_TYPES, low_memory=False)
print('Read', len(df), 'rows from', INPUT_FILE_NAME)

df = df.dropna(axis=1, how='all')  # drop all completely empty columns
# we could also get rid of columns that have a constant value in all cells
df.info(verbose=True)

  df = pd.read_csv(INPUT_FILE_NAME, dtype=DATA_TYPES)


Read 150370 rows from cases_rtfm_full.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150370 entries, 0 to 150369
Data columns (total 383 columns):
 #    Column                                                                        Dtype  
---   ------                                                                        -----  
 0    case_id                                                                       object 
 1    Create Fine.count                                                             int64  
 2    event_count                                                                   int64  
 3    start_time                                                                    object 
 4    start_time_rel                                                                int64  
 5    duration                                                                      int64  
 6    Create Fine.start                                                             int64  
 7    Create Fine.

In [30]:
# Prep data: cast all (remaining) Boolean columns as object and all Integer columns as float.
# Each entry is (dtype to select, dtype to cast to, label printed per column);
# the Boolean pass runs before the Integer pass.
dtype_casts = (
    ('bool', 'object', 'bool->object'),
    ('int64', 'float64', 'int->float'),
)
for source_dtype, target_dtype, label in dtype_casts:
    for name in df.select_dtypes(include=[source_dtype]).columns:
        print(name, label)
        df[name] = df[name].astype(target_dtype)
df.info(verbose=True)

Create Fine.count int->float
event_count int->float
start_time_rel int->float
duration int->float
Create Fine.start int->float
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150370 entries, 0 to 150369
Data columns (total 383 columns):
 #    Column                                                                        Dtype  
---   ------                                                                        -----  
 0    case_id                                                                       object 
 1    Create Fine.count                                                             float64
 2    event_count                                                                   float64
 3    start_time                                                                    object 
 4    start_time_rel                                                                float64
 5    duration                                                                      float64
 6    Create Fine.start  

In [31]:
# Prep data: Imputation whenever needed
def _fill_missing(frame, dtype_name, fill_value):
    """Replace missing cells with fill_value in every column of frame that has dtype dtype_name.

    The frame is modified in place, column by column.
    """
    for name in frame.select_dtypes(include=[dtype_name]).columns:
        frame[name] = frame[name].fillna(fill_value)


# String columns are always normalized to an explicit placeholder value.
_fill_missing(df, 'object', STRING_IMPUTE_VAL)

# Float columns are only imputed for the configurations listed here.
needs_float_imputation = ALGO == 'CART:4' or BINA == 'TREES'
if needs_float_imputation:
    _fill_missing(df, 'float64', FLOAT_IMPUTE_VAL)

## Configuration of the Explanation Problem

In [32]:
# Set baseline and outcome to be explained.
# Both are row predicates evaluated on every case (row) of df; rows for which
# `baseline` holds are filtered out below, `outcome` becomes the label column.
baseline = lambda row: row['paid_full']=='True' or row['dismissed']=='True'
outcome = lambda row: row['Send for Credit Collection.count'] <= 0.0
# outcome = lambda row: row['Insert Fine Notification.count'] <= 0.0

POS_CLASS = True

# The baseline should have high precision for explaining the outcome with respect to the entire log
df['outcome'] = df.apply(outcome, axis=1)
df['baseline'] = df.apply(baseline, axis=1)
y_true = df['outcome']
y_pred = df['baseline']
precision = precision_score(y_true, y_pred, pos_label=True, zero_division=1)
print('Precision of baseline on case log:', round(precision, 3), precision)


# Keep only the rows NOT covered by the baseline. The explicit copy makes the
# result an independent frame, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning on a filtered slice.
df_baseline = df[~df['baseline']].copy()
print('Baseline filter: {} out of {} rows remain.'.format(len(df_baseline),len(df)))
df = df_baseline
print('Outcome distribution in baseline:')
print(df['outcome'].value_counts())
df['baseline'] = df['baseline'].astype('object')

# Add explanations to the baseline filter that have already been found.
# Every active rule is a row predicate; rows matching it are removed from df.
rules = [
    # lambda row: row['Insert Fine Notification.count'] <= 0.0,
    # lambda row: row['Final.outstanding_balance_without_penalty'] <= 0.01 and row['Add penalty:Payment.delay'] <= 3.0,
    # lambda row: row['Insert Date Appeal to Prefecture.count'] >= 1.0 and row['Notify Result Appeal to Offender.count'] <= 0.0,
    # lambda row: row['Final.outstanding_balance'] <= 10.00,
    # lambda row: row['start_time_rel'] >= 4481.0,
    # lambda row: row['appeal'] == 'True' and row['Final.outstanding_balance_without_penalty'] <= 0.01,
    # lambda row: row['start_time_rel'] >= 4401.0 and row['Payment.count'] >= 1.0,
]

for (i, rule) in enumerate(rules):
    before = len(df)
    df = df[~df.apply(rule, axis=1)]
    print('Rule {} removed {} cases.'.format(i, before-len(df)))


Precision of baseline on case log: 1.0 0.9999518845228549
Baseline filter: 88020 out of 150370 rows remain.
Outcome distribution in baseline:
outcome
False    59010
True     29010
Name: count, dtype: int64


### Attribute Hiding

In [33]:
# Use attribute names or regular expressions to hide.
# Every entry is matched against the complete column name with re.fullmatch,
# so plain names are interpreted as regular expressions as well.
HIDE = [
    r'.*Send for Credit Collection.*',
    # r'.*Insert Fine Notification.*',
    # r'.*Add penalty.*',
    'credit_collection',
    # 'dismissed',
    'unresolved',
    # 'paid_full',
    'duration',
    # 'case_id',
    # r'.*outstanding_balance_without_penalty.*',
    # r'.*outstanding_balance_without_expense.*',
    # r'.*Final.*', 
    # 'start_time_rel',
    # 'start_time',
    'event_count',
    ]

# Collect all matching columns first and drop them in a single call, instead
# of rebuilding the whole (wide) frame once per hidden column.
hidden_columns = [
    col for col in df.columns
    if any(re.fullmatch(pattern, col) for pattern in HIDE)
]
# print('Dropping columns:', hidden_columns)
df = df.drop(columns=hidden_columns)


In [34]:
# Prep data: normalizing label for specific algorithms

def convert(char):
    """Return 1 if char equals the current POS_CLASS, otherwise 0."""
    return 1 if char == POS_CLASS else 0
    
# Only CART needs numeric labels. Note: the previous test `ALGO in ('CART:4')`
# was a substring check against the string 'CART:4' (the parentheses do not
# create a tuple), so values such as '' or 'CART' matched as well.
if ALGO == 'CART:4':
    df['outcome'] = df['outcome'].map(convert)  # POS_CLASS -> 1, everything else -> 0
    POS_CLASS = 1
    NEG_CLASS = 0

In [35]:
# Prep data: Split into training and test set
SPLIT_SEED = 42  # fixed seed so that the random split is reproducible across runs

if TRAIN_TEST_SPLIT > 0.0:
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop(columns=['outcome']),
        df['outcome'],
        test_size=TRAIN_TEST_SPLIT,
        random_state=SPLIT_SEED)
else:
    # No hold-out set: evaluate on the same rows that are used for training.
    x_train = df.drop(columns=['outcome'])
    y_train = df['outcome']
    x_test = x_train
    y_test = y_train

print(len(y_train))
print(y_train.value_counts())
print(len(y_test))
print(y_test.value_counts())

88020
outcome
False    59010
True     29010
Name: count, dtype: int64
88020
outcome
False    59010
True     29010
Name: count, dtype: int64


## Run Rule Induction

In [36]:
# Run Binarizer / Encoding
if BINA == 'LENC':
    # Work on copies: x_train and x_test may be the very same object (when no
    # hold-out split is configured). Encoding in place would then encode the
    # test columns a second time and would also overwrite the raw features.
    x_train_bin = x_train.copy()
    x_test_bin = x_test.copy()
    categorical_features = x_train_bin.select_dtypes(include=['object']).columns
    for col in categorical_features:
        # Fit on the full column of df so that every value has a code.
        label_encoder = LabelEncoder()
        label_encoder = label_encoder.fit(df[col])
        x_train_bin[col] = label_encoder.transform(x_train_bin[col])
        x_test_bin[col] = label_encoder.transform(x_test_bin[col])
        print(col, label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
elif BINA == 'NATIVE':
    x_train_bin = x_train.copy() # RIPPER implementation messes with the training set
    x_test_bin = x_test
else:
    # Fail here instead of leaving x_train_bin / x_test_bin undefined (or
    # stale from an earlier run) for the following cells.
    raise ValueError('Unsupported BINA setting: {!r}'.format(BINA))

In [37]:
# Run Rule Induction
if ALGO == 'CART:4':
    estimator = DecisionTreeClassifier(max_depth=4)
    estimator.fit(x_train_bin, y_train)
elif ALGO == 'RIPPER':
    estimator = RipperExplainer()
    estimator.fit(x_train_bin, y_train, target_label=POS_CLASS)
else:
    # Fail here: merely printing a message would leave `estimator` undefined
    # (or stale from an earlier run) for the evaluation and export cells.
    raise ValueError('Not supported: ALGO={!r}'.format(ALGO))

In [38]:
# Evaluation
def _print_scores(heading, y_actual, y_hat):
    """Print heading, then precision and recall of y_hat against y_actual for POS_CLASS."""
    print(heading)
    print('Precision:', round(precision_score(y_actual, y_hat, pos_label=POS_CLASS, zero_division=1), 6))
    print('Recall:', round(recall_score(y_actual, y_hat, pos_label=POS_CLASS), 6))


# Scores on the test set.
y_predicted = estimator.predict(x_test_bin)
# print('Accuracy:', round(accuracy_score(y_test, y_predicted), 2))
_print_scores('Predictive:', y_test, y_predicted)

# Scores on the training set.
y_predicted_train = estimator.predict(x_train_bin)
_print_scores('Descriptive:', y_train, y_predicted_train)

Predictive:
Precision: 0.998514
Recall: 0.996312
Descriptive:
Precision: 0.998718
Recall: 0.993209


In [39]:
# Model export
# Note: the previous test `ALGO in ('RIPPER')` was a substring check against
# the string 'RIPPER' (the parentheses do not create a tuple); plain equality
# is what is meant here.
if ALGO == 'CART:4':
    feature_names = x_train_bin.columns.tolist()
    print(export_text(estimator, feature_names=feature_names))
    # Only show the rules that predict the positive class (encoded as 1).
    for rule in get_rules(estimator, feature_names, [0, 1]):
        if rule.find('then class: 1') >= 0:
            print(rule)
elif ALGO == 'RIPPER':
    rule_set = estimator.explain()
    print(rule_set)

if
([Insert Fine Notification.count <= 0.0]) v
([Add penalty:Payment.delay <= 3.0] ^ [Final.outstanding_balance_without_penalty <= 0.01] ^ [start_time_rel <= 4172.0]) v
([Insert Date Appeal to Prefecture.start >= 2.0] ^ [Notify Result Appeal to Offender.count <= 0.0]) v
([Final.outstanding_balance_without_penalty <= -11.0] ^ [Final.outstanding_balance <= 7.5]) v
([Add penalty:Payment.delay >= -19.0] ^ [start_time_rel >= 4389.0]) v
([appeal == True] ^ [Final.paymentAmount::sum >= 27.64]) v
([Add penalty:Payment.delay >= -16.0] ^ [Payment.totalPaymentAmount >= 50.26] ^ [Final.paymentAmount::sum <= 49.0]) v
([Add penalty:Payment.delay >= -38.0] ^ [Add penalty:Payment.delay <= 2.0] ^ [start_time_rel >= 4063.0]) v
([Send Fine:Payment.delay >= 71.0] ^ [start_time_rel >= 3369.0] ^ [Add penalty:Payment.delay <= 7.0] ^ [Send Fine.start >= 51.0]) v
([Receive Result Appeal from Prefecture.outstanding_balance_without_penalty >= 50.2] ^ [Final.expense::sum >= 28.7]) v
([Payment.outstanding_balance_