In [1]:
import pm4py
import pandas as pd
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.metrics import accuracy_score

# Data pre-processing

Event logs are already filtered and projected in ProM.
- A_events_new.xes contains only A events with traces ending with A_Pending, A_Cancelled, A_Denied. This log is used for label extraction.
- OA_events.xes contains A and O events. This log is used for prefix and feature extraction.

After processing, both logs are joined together to form the final set.

In [2]:
# Open the event logs
event_log1 = pm4py.read_xes('data/A_events_new.xes')
event_log2 = pm4py.read_xes('data/OA_events.xes')



parsing log, completed traces ::   0%|          | 0/31411 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

In [3]:
# Convert the event logs to dataframes
df1 = pm4py.convert_to_dataframe(event_log1)
df2 = pm4py.convert_to_dataframe(event_log2)

In [4]:
# Nomalize the timestamps
df1['time:timestamp'] = pd.to_datetime(df1['time:timestamp'], utc=True)
df2['time:timestamp'] = pd.to_datetime(df2['time:timestamp'], utc=True)
df1['Year'] = df1['time:timestamp'].dt.year
df1['Month'] = df1['time:timestamp'].dt.month

# Add label

Based on the end activies
- If the trace ends with A_Pending "successful"
- If the trace ends with A_Cancelled or A_Denied, the label is "unsuccessful"

In [5]:
success_log = pm4py.filter_end_activities(event_log1, ["A_Pending"])
unsuccess_log = pm4py.filter_end_activities(event_log1, ["A_Cancelled", "A_Denied"])
df_suc = pm4py.convert_to_dataframe(success_log)
df_unsuc = pm4py.convert_to_dataframe(unsuccess_log)

df_suc['label'] = 'successful'
df_unsuc['label'] = 'unsuccessful'

# Concatenate the DataFrames
df1 = pd.concat([df_suc, df_unsuc], ignore_index=True)
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_suc['label'] = 'successful'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unsuc['label'] = 'unsuccessful'


Unnamed: 0,Action,EventID,EventOrigin,org:resource,concept:name,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:RequestedAmount,case:concept:name,Year,Month,label
0,Created,Application_652823628,Application,User_1,A_Create Application,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,20000.0,Application_652823628,2016,1,successful
1,statechange,ApplState_1582051990,Application,User_1,A_Submitted,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,20000.0,Application_652823628,2016,1,successful
2,statechange,ApplState_642383566,Application,User_1,A_Concept,complete,2016-01-01 09:52:36.413000+00:00,Existing loan takeover,New credit,20000.0,Application_652823628,2016,1,successful
3,statechange,ApplState_99568828,Application,User_52,A_Accepted,complete,2016-01-02 11:23:04.299000+00:00,Existing loan takeover,New credit,20000.0,Application_652823628,2016,1,successful
4,statechange,ApplState_946455804,Application,User_52,A_Complete,complete,2016-01-02 11:30:28.633000+00:00,Existing loan takeover,New credit,20000.0,Application_652823628,2016,1,successful


# Prefix Encoding
We aim to predict the outcome of the application when the offer is sent. Therefore, all events before offer sent (included) are encoded as prefix.

In [6]:
# Show all unique events
unique_events = sorted(df2['concept:name'].unique())
unique_events

['A_Accepted',
 'A_Cancelled',
 'A_Complete',
 'A_Concept',
 'A_Create Application',
 'A_Denied',
 'A_Incomplete',
 'A_Pending',
 'A_Submitted',
 'A_Validating',
 'O_Accepted',
 'O_Cancelled',
 'O_Create Offer',
 'O_Created',
 'O_Refused',
 'O_Returned',
 'O_Sent (mail and online)',
 'O_Sent (online only)']

### Filter the event log to extract prefix

In [7]:
# Filter the events so that only the events before O_Sent (mail and online) or O_Sent (online only) of each trace are kept
df2 = df2.sort_values(by=['case:concept:name', 'time:timestamp'])

# Function to filter rows
def filter_rows(group):
    # Find the first occurrence of the specified 'concept:name'
    idx = group[group['concept:name'].isin(['O_Sent (mail and online)', 'O_Sent (online only)'])].index.min()
    # If found, keep rows up to and including the first occurrence
    if pd.notna(idx):
        return group.loc[:idx]
    # If not found, return the group as is
    return group

# Apply the function to each group and combine the results
filtered_df2 = df2.groupby('case:concept:name', group_keys=False).apply(filter_rows)

### Encode the prefix by Aggreagation

In [8]:
result = pd.crosstab(filtered_df2['case:concept:name'], filtered_df2['concept:name']).reindex(unique_events, axis=1, fill_value=0)
result.head()

concept:name,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,O_Accepted,O_Cancelled,O_Create Offer,O_Created,O_Refused,O_Returned,O_Sent (mail and online),O_Sent (online only)
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Application_1000086665,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,0
Application_1000158214,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,0
Application_1000311556,1,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0
Application_1000334415,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,0
Application_1000339879,1,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,1,0


# Getting trace attribute and merge

In [9]:
# Extract the case attributes
case_attr_df1 = df1.groupby('case:concept:name').agg({
    'case:LoanGoal': 'first',
    'case:RequestedAmount': 'first',
    'case:ApplicationType': 'first',
    'time:timestamp': 'first',
    'Month': 'first',
    'Year': 'first',
    'label': 'first'
}).reset_index()
case_attr_df1.head()

Unnamed: 0,case:concept:name,case:LoanGoal,case:RequestedAmount,case:ApplicationType,time:timestamp,Month,Year,label
0,Application_1000086665,"Other, see explanation",5000.0,New credit,2016-08-03 15:57:21.673000+00:00,8,2016,unsuccessful
1,Application_1000158214,Home improvement,12500.0,New credit,2016-06-02 10:14:26.844000+00:00,6,2016,successful
2,Application_1000311556,Car,45000.0,New credit,2016-04-04 15:56:37.675000+00:00,4,2016,unsuccessful
3,Application_1000334415,"Other, see explanation",5000.0,New credit,2016-09-15 16:39:17.758000+00:00,9,2016,unsuccessful
4,Application_1000339879,Existing loan takeover,37500.0,New credit,2016-03-17 12:57:10.159000+00:00,3,2016,successful


In [10]:
# Merge the trace attributes with the prefix encoding
merged_df = pd.merge(result, case_attr_df1, on='case:concept:name', how='inner')
merged_df = merged_df.set_index('case:concept:name')
merged_df.head()

Unnamed: 0_level_0,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,...,O_Returned,O_Sent (mail and online),O_Sent (online only),case:LoanGoal,case:RequestedAmount,case:ApplicationType,time:timestamp,Month,Year,label
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_1000086665,1,0,0,1,1,0,0,0,1,0,...,0,1,0,"Other, see explanation",5000.0,New credit,2016-08-03 15:57:21.673000+00:00,8,2016,unsuccessful
Application_1000158214,1,0,0,1,1,0,0,0,1,0,...,0,1,0,Home improvement,12500.0,New credit,2016-06-02 10:14:26.844000+00:00,6,2016,successful
Application_1000311556,1,0,0,1,1,0,0,0,0,0,...,0,1,0,Car,45000.0,New credit,2016-04-04 15:56:37.675000+00:00,4,2016,unsuccessful
Application_1000334415,1,0,0,1,1,0,0,0,1,0,...,0,1,0,"Other, see explanation",5000.0,New credit,2016-09-15 16:39:17.758000+00:00,9,2016,unsuccessful
Application_1000339879,1,0,0,1,1,0,0,0,1,0,...,0,1,0,Existing loan takeover,37500.0,New credit,2016-03-17 12:57:10.159000+00:00,3,2016,successful


# Encode categorical features using one-hot encoding

In [11]:
one_hot_encoded_data = pd.get_dummies(merged_df['case:LoanGoal'], prefix='LoanGoal')
merged_df = merged_df.join(one_hot_encoded_data)
merged_df = merged_df.drop(columns=['case:LoanGoal'])

one_hot_encoded_data = pd.get_dummies(merged_df['case:ApplicationType'], prefix='ApplicationType')
merged_df = merged_df.join(one_hot_encoded_data)
merged_df = merged_df.drop(columns=['case:ApplicationType'])

merged_df.head()

Unnamed: 0_level_0,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,...,LoanGoal_Extra spending limit,LoanGoal_Home improvement,LoanGoal_Motorcycle,LoanGoal_Not speficied,"LoanGoal_Other, see explanation",LoanGoal_Remaining debt home,LoanGoal_Tax payments,LoanGoal_Unknown,ApplicationType_Limit raise,ApplicationType_New credit
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_1000086665,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
Application_1000158214,1,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
Application_1000311556,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Application_1000334415,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
Application_1000339879,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


# Train test split

In [12]:
merged_df['time:timestamp'] = pd.to_datetime(merged_df['time:timestamp'])
merged_df.sort_values(by=['time:timestamp'], inplace=True)
merged_df.head()

Unnamed: 0_level_0,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,...,LoanGoal_Extra spending limit,LoanGoal_Home improvement,LoanGoal_Motorcycle,LoanGoal_Not speficied,"LoanGoal_Other, see explanation",LoanGoal_Remaining debt home,LoanGoal_Tax payments,LoanGoal_Unknown,ApplicationType_Limit raise,ApplicationType_New credit
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_652823628,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Application_1691306052,1,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
Application_428409768,1,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
Application_1746793196,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Application_828200680,1,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


### Split the data into train and test set. Following time order, the first 70% of the traces are used for training, the next 10% for vallidation, and the last 20% for testing.

In [13]:
total_rows = len(merged_df)
train_size = int(0.7 * total_rows)
validation_size = int(0.1 * total_rows)
test_size = total_rows - train_size - validation_size

train = merged_df[:train_size]
validation = merged_df[train_size:train_size + validation_size]
test = merged_df[train_size + validation_size:]

In [14]:
# Splitting the training set
X_train = train.drop('label', axis=1)
X_train = X_train.drop('time:timestamp', axis=1)
X_train = X_train.drop('Month', axis=1)
X_train = X_train.drop('Year', axis=1)
y_train = train['label']

# Splitting the validation set
X_validation = validation.drop('label', axis=1)
X_validation = X_validation.drop('time:timestamp', axis=1)
X_validation = X_validation.drop('Month', axis=1)
X_validation = X_validation.drop('Year', axis=1)
y_validation = validation['label']

# Splitting the testing set
X_test = test.drop('label', axis=1)
X_test = X_test.drop('time:timestamp', axis=1)
X_test = X_test.drop('Month', axis=1)
X_test = X_test.drop('Year', axis=1)
y_test = test['label']

In [15]:
X_train.head()

Unnamed: 0_level_0,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,...,LoanGoal_Extra spending limit,LoanGoal_Home improvement,LoanGoal_Motorcycle,LoanGoal_Not speficied,"LoanGoal_Other, see explanation",LoanGoal_Remaining debt home,LoanGoal_Tax payments,LoanGoal_Unknown,ApplicationType_Limit raise,ApplicationType_New credit
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_652823628,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Application_1691306052,1,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
Application_428409768,1,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
Application_1746793196,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Application_828200680,1,0,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1


In [16]:
y_train.head()

case:concept:name
Application_652823628       successful
Application_1691306052    unsuccessful
Application_428409768       successful
Application_1746793196      successful
Application_828200680     unsuccessful
Name: label, dtype: object

In [17]:
y_test.head()

case:concept:name
Application_1046222107      successful
Application_1235896381    unsuccessful
Application_1985757094    unsuccessful
Application_396092392     unsuccessful
Application_585529210       successful
Name: label, dtype: object

# Create naive predictors
For the test set, randomly assign the two lables.

In [18]:
np.random.seed(0)

# Randomly assign 'successful' or 'unsuccessful'
statuses = ['successful', 'unsuccessful']
y_test_naive = pd.Series(np.random.choice(statuses, size=len(y_test)), index=y_test.index)

# Display the new DataFrame
print(y_test_naive[:10])

case:concept:name
Application_1046222107      successful
Application_1235896381    unsuccessful
Application_1985757094    unsuccessful
Application_396092392       successful
Application_585529210     unsuccessful
Application_1751900898    unsuccessful
Application_867435788     unsuccessful
Application_543098684     unsuccessful
Application_1090974006    unsuccessful
Application_2087713022    unsuccessful
dtype: object


# Prediction

## Naive Predictor Performance

In [19]:
accuracy_score(y_test_naive, y_test)

0.5091516791341716

## Decision Tree

In [20]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [21]:
accuracy = clf.score(X_test, y_test)
accuracy

0.5779086423682954

## SVM

In [22]:
clf = svm.SVC()
clf.fit(X_train, y_train)

In [23]:
accuracy = clf.score(X_test, y_test)
accuracy

0.5505331847843387

## Gradient Boosting

In [24]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

0.593187967531434

## Logistic Regression

In [25]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

0.5489415884131784

## Newral Network (MLP)

In [26]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5489415884131784