In [200]:
import pm4py
import pandas as pd
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Data pre-processing

In [169]:
event_log = pm4py.read_xes('data/end_A_event_log.xes')

parsing log, completed traces ::   0%|          | 0/17549 [00:00<?, ?it/s]

In [170]:
df = pm4py.convert_to_dataframe(event_log)

In [171]:
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'], utc=True)
df['Year'] = df['time:timestamp'].dt.year
df['Month'] = df['time:timestamp'].dt.month

# Add label

In [172]:
success_log = pm4py.filter_end_activities(event_log, ["A_Pending"])
unsuccess_log = pm4py.filter_end_activities(event_log, ["A_Cancelled", "A_Denied"])
df_suc = pm4py.convert_to_dataframe(success_log)
df_unsuc = pm4py.convert_to_dataframe(unsuccess_log)

df_suc['label'] = 'successful'
df_unsuc['label'] = 'unscucessful'

# Concatenate the DataFrames
df = pd.concat([df_suc, df_unsuc], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_suc['label'] = 'successful'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unsuc['label'] = 'unscucessful'


In [173]:
df

Unnamed: 0,Action,EventID,EventOrigin,org:resource,concept:name,lifecycle:transition,time:timestamp,case:LoanGoal,case:concept:name,case:RequestedAmount,case:ApplicationType,Year,Month,label
0,Created,Application_652823628,Application,User_1,A_Create Application,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,Application_652823628,20000.0,New credit,2016,1,successful
1,statechange,ApplState_1582051990,Application,User_1,A_Submitted,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,Application_652823628,20000.0,New credit,2016,1,successful
2,statechange,ApplState_642383566,Application,User_1,A_Concept,complete,2016-01-01 09:52:36.413000+00:00,Existing loan takeover,Application_652823628,20000.0,New credit,2016,1,successful
3,statechange,ApplState_99568828,Application,User_52,A_Accepted,complete,2016-01-02 11:23:04.299000+00:00,Existing loan takeover,Application_652823628,20000.0,New credit,2016,1,successful
4,statechange,ApplState_946455804,Application,User_52,A_Complete,complete,2016-01-02 11:30:28.633000+00:00,Existing loan takeover,Application_652823628,20000.0,New credit,2016,1,successful
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130192,statechange,ApplState_660057066,Application,User_12,A_Complete,complete,2017-01-03 19:30:00.773000+00:00,Car,Application_117342811,5000.0,New credit,2017,1,unscucessful
130193,statechange,ApplState_1952793614,Application,User_53,A_Validating,complete,2017-01-11 14:23:09.159000+00:00,Car,Application_117342811,5000.0,New credit,2017,1,unscucessful
130194,statechange,ApplState_1839877280,Application,User_53,A_Incomplete,complete,2017-01-11 14:26:39.541000+00:00,Car,Application_117342811,5000.0,New credit,2017,1,unscucessful
130195,statechange,ApplState_1927619154,Application,User_123,A_Validating,complete,2017-01-13 10:30:50.195000+00:00,Car,Application_117342811,5000.0,New credit,2017,1,unscucessful


# Prefix with full trace before the end event

In [174]:
unique_events = sorted(df['concept:name'].unique())
unique_events

['A_Accepted',
 'A_Cancelled',
 'A_Complete',
 'A_Concept',
 'A_Create Application',
 'A_Denied',
 'A_Incomplete',
 'A_Pending',
 'A_Submitted',
 'A_Validating']

In [175]:
result = pd.crosstab(df['case:concept:name'], df['concept:name'])
result

concept:name,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Application_1000158214,1,0,1,1,1,0,0,1,1,1
Application_1000334415,1,0,1,1,1,1,1,0,1,2
Application_100034150,1,0,1,1,1,0,1,1,1,2
Application_1000386745,1,0,1,1,1,0,2,1,1,2
Application_1000474975,1,0,1,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
Application_998666405,1,1,1,1,1,0,0,0,1,0
Application_999090530,1,0,1,1,1,0,0,1,0,1
Application_999487618,1,0,1,1,1,0,0,1,1,1
Application_999507989,1,0,1,1,1,0,1,1,1,1


In [176]:
result = result.reindex(unique_events, axis=1, fill_value=0)
result

concept:name,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Application_1000158214,1,0,1,1,1,0,0,1,1,1
Application_1000334415,1,0,1,1,1,1,1,0,1,2
Application_100034150,1,0,1,1,1,0,1,1,1,2
Application_1000386745,1,0,1,1,1,0,2,1,1,2
Application_1000474975,1,0,1,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
Application_998666405,1,1,1,1,1,0,0,0,1,0
Application_999090530,1,0,1,1,1,0,0,1,0,1
Application_999487618,1,0,1,1,1,0,0,1,1,1
Application_999507989,1,0,1,1,1,0,1,1,1,1


# Getting trace attribute and merge

In [177]:
case_attr_df = df.groupby('case:concept:name').agg({
    'case:LoanGoal': 'first',
    'case:RequestedAmount': 'first',
    'case:ApplicationType': 'first',
    'time:timestamp': 'first',
    'Month': 'first',
    'Year': 'first',
    'label': 'first'
}).reset_index()
case_attr_df

Unnamed: 0,case:concept:name,case:LoanGoal,case:RequestedAmount,case:ApplicationType,time:timestamp,Month,Year,label
0,Application_1000158214,Home improvement,12500.0,New credit,2016-06-02 10:14:26.844000+00:00,6,2016,successful
1,Application_1000334415,"Other, see explanation",5000.0,New credit,2016-09-15 16:39:17.758000+00:00,9,2016,unscucessful
2,Application_100034150,Existing loan takeover,5000.0,New credit,2016-02-26 08:17:08.702000+00:00,2,2016,successful
3,Application_1000386745,Car,5000.0,New credit,2016-11-25 14:31:09.852000+00:00,11,2016,successful
4,Application_1000474975,"Other, see explanation",50000.0,New credit,2016-06-16 15:37:24.412000+00:00,6,2016,successful
...,...,...,...,...,...,...,...,...
17544,Application_998666405,Not speficied,45000.0,New credit,2016-10-21 12:41:01.854000+00:00,10,2016,unscucessful
17545,Application_999090530,Car,7500.0,New credit,2016-08-11 11:06:31.793000+00:00,8,2016,successful
17546,Application_999487618,Existing loan takeover,20000.0,New credit,2016-08-17 18:17:43.311000+00:00,8,2016,successful
17547,Application_999507989,Not speficied,24000.0,New credit,2016-03-17 06:53:36.792000+00:00,3,2016,successful


In [178]:
merged_df = pd.merge(result, case_attr_df, on='case:concept:name', how='inner')
merged_df = merged_df.set_index('case:concept:name')
merged_df

Unnamed: 0_level_0,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,case:LoanGoal,case:RequestedAmount,case:ApplicationType,time:timestamp,Month,Year,label
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Application_1000158214,1,0,1,1,1,0,0,1,1,1,Home improvement,12500.0,New credit,2016-06-02 10:14:26.844000+00:00,6,2016,successful
Application_1000334415,1,0,1,1,1,1,1,0,1,2,"Other, see explanation",5000.0,New credit,2016-09-15 16:39:17.758000+00:00,9,2016,unscucessful
Application_100034150,1,0,1,1,1,0,1,1,1,2,Existing loan takeover,5000.0,New credit,2016-02-26 08:17:08.702000+00:00,2,2016,successful
Application_1000386745,1,0,1,1,1,0,2,1,1,2,Car,5000.0,New credit,2016-11-25 14:31:09.852000+00:00,11,2016,successful
Application_1000474975,1,0,1,1,1,0,0,1,0,1,"Other, see explanation",50000.0,New credit,2016-06-16 15:37:24.412000+00:00,6,2016,successful
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Application_998666405,1,1,1,1,1,0,0,0,1,0,Not speficied,45000.0,New credit,2016-10-21 12:41:01.854000+00:00,10,2016,unscucessful
Application_999090530,1,0,1,1,1,0,0,1,0,1,Car,7500.0,New credit,2016-08-11 11:06:31.793000+00:00,8,2016,successful
Application_999487618,1,0,1,1,1,0,0,1,1,1,Existing loan takeover,20000.0,New credit,2016-08-17 18:17:43.311000+00:00,8,2016,successful
Application_999507989,1,0,1,1,1,0,1,1,1,1,Not speficied,24000.0,New credit,2016-03-17 06:53:36.792000+00:00,3,2016,successful


In [179]:
one_hot_encoded_data = pd.get_dummies(merged_df['case:LoanGoal'], prefix='LoanGoal')
merged_df = merged_df.join(one_hot_encoded_data)
merged_df = merged_df.drop(columns=['case:LoanGoal'])

one_hot_encoded_data = pd.get_dummies(merged_df['case:ApplicationType'], prefix='ApplicationType')
merged_df = merged_df.join(one_hot_encoded_data)
merged_df = merged_df.drop(columns=['case:ApplicationType'])

merged_df

Unnamed: 0_level_0,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,...,LoanGoal_Extra spending limit,LoanGoal_Home improvement,LoanGoal_Motorcycle,LoanGoal_Not speficied,"LoanGoal_Other, see explanation",LoanGoal_Remaining debt home,LoanGoal_Tax payments,LoanGoal_Unknown,ApplicationType_Limit raise,ApplicationType_New credit
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_1000158214,1,0,1,1,1,0,0,1,1,1,...,0,1,0,0,0,0,0,0,0,1
Application_1000334415,1,0,1,1,1,1,1,0,1,2,...,0,0,0,0,1,0,0,0,0,1
Application_100034150,1,0,1,1,1,0,1,1,1,2,...,0,0,0,0,0,0,0,0,0,1
Application_1000386745,1,0,1,1,1,0,2,1,1,2,...,0,0,0,0,0,0,0,0,0,1
Application_1000474975,1,0,1,1,1,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Application_998666405,1,1,1,1,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
Application_999090530,1,0,1,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
Application_999487618,1,0,1,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
Application_999507989,1,0,1,1,1,0,1,1,1,1,...,0,0,0,1,0,0,0,0,0,1


# Train test split

In [180]:
merged_df['time:timestamp'] = pd.to_datetime(merged_df['time:timestamp'])

In [181]:
merged_df.sort_values(by=['time:timestamp'], inplace=True)
merged_df

Unnamed: 0_level_0,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,...,LoanGoal_Extra spending limit,LoanGoal_Home improvement,LoanGoal_Motorcycle,LoanGoal_Not speficied,"LoanGoal_Other, see explanation",LoanGoal_Remaining debt home,LoanGoal_Tax payments,LoanGoal_Unknown,ApplicationType_Limit raise,ApplicationType_New credit
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_652823628,1,0,1,1,1,0,1,1,1,2,...,0,0,0,0,0,0,0,0,0,1
Application_1085880569,1,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Application_1266995739,1,1,1,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,1
Application_1878239836,1,0,1,1,1,1,0,0,1,1,...,0,1,0,0,0,0,0,0,0,1
Application_619403287,1,0,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Application_965278193,1,0,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
Application_1331346987,1,0,1,1,1,0,1,1,1,1,...,0,1,0,0,0,0,0,0,0,1
Application_1107993134,1,0,1,1,1,0,2,1,1,2,...,0,0,0,0,0,0,0,0,0,1
Application_59622033,1,0,1,1,1,0,2,1,1,2,...,0,1,0,0,0,0,0,0,0,1


In [182]:
total_rows = len(merged_df)
train_size = int(0.7 * total_rows)
validation_size = int(0.1 * total_rows)
test_size = total_rows - train_size - validation_size

train = merged_df[:train_size]
validation = merged_df[train_size:train_size + validation_size]
test = merged_df[train_size + validation_size:]

In [183]:
train.count()

A_Accepted                         12284
A_Cancelled                        12284
A_Complete                         12284
A_Concept                          12284
A_Create Application               12284
A_Denied                           12284
A_Incomplete                       12284
A_Pending                          12284
A_Submitted                        12284
A_Validating                       12284
case:RequestedAmount               12284
time:timestamp                     12284
Month                              12284
Year                               12284
label                              12284
LoanGoal_Boat                      12284
LoanGoal_Business goal             12284
LoanGoal_Car                       12284
LoanGoal_Caravan / Camper          12284
LoanGoal_Existing loan takeover    12284
LoanGoal_Extra spending limit      12284
LoanGoal_Home improvement          12284
LoanGoal_Motorcycle                12284
LoanGoal_Not speficied             12284
LoanGoal_Other, 

In [184]:
test.count()

A_Accepted                         3511
A_Cancelled                        3511
A_Complete                         3511
A_Concept                          3511
A_Create Application               3511
A_Denied                           3511
A_Incomplete                       3511
A_Pending                          3511
A_Submitted                        3511
A_Validating                       3511
case:RequestedAmount               3511
time:timestamp                     3511
Month                              3511
Year                               3511
label                              3511
LoanGoal_Boat                      3511
LoanGoal_Business goal             3511
LoanGoal_Car                       3511
LoanGoal_Caravan / Camper          3511
LoanGoal_Existing loan takeover    3511
LoanGoal_Extra spending limit      3511
LoanGoal_Home improvement          3511
LoanGoal_Motorcycle                3511
LoanGoal_Not speficied             3511
LoanGoal_Other, see explanation    3511


In [185]:
# Splitting the training set
X_train = train.drop('label', axis=1)
X_train = X_train.drop('time:timestamp', axis=1)
X_train = X_train.drop('Month', axis=1)
X_train = X_train.drop('Year', axis=1)
y_train = train['label']

# Splitting the validation set
X_validation = validation.drop('label', axis=1)
X_validation = X_validation.drop('time:timestamp', axis=1)
X_validation = X_validation.drop('Month', axis=1)
X_validation = X_validation.drop('Year', axis=1)
y_validation = validation['label']

# Splitting the testing set
X_test = test.drop('label', axis=1)
X_test = X_test.drop('time:timestamp', axis=1)
X_test = X_test.drop('Month', axis=1)
X_test = X_test.drop('Year', axis=1)
y_test = test['label']

In [186]:
X_train

Unnamed: 0_level_0,A_Accepted,A_Cancelled,A_Complete,A_Concept,A_Create Application,A_Denied,A_Incomplete,A_Pending,A_Submitted,A_Validating,...,LoanGoal_Extra spending limit,LoanGoal_Home improvement,LoanGoal_Motorcycle,LoanGoal_Not speficied,"LoanGoal_Other, see explanation",LoanGoal_Remaining debt home,LoanGoal_Tax payments,LoanGoal_Unknown,ApplicationType_Limit raise,ApplicationType_New credit
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Application_652823628,1,0,1,1,1,0,1,1,1,2,...,0,0,0,0,0,0,0,0,0,1
Application_1085880569,1,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
Application_1266995739,1,1,1,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,1
Application_1878239836,1,0,1,1,1,1,0,0,1,1,...,0,1,0,0,0,0,0,0,0,1
Application_619403287,1,0,1,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Application_1639015703,1,0,1,1,1,0,0,1,1,1,...,0,1,0,0,0,0,0,0,0,1
Application_1546234552,1,0,1,1,1,0,0,1,1,1,...,0,0,0,1,0,0,0,0,0,1
Application_2032209718,1,0,1,1,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
Application_1230226715,1,1,1,1,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [187]:
y_train

case:concept:name
Application_652823628       successful
Application_1085880569    unscucessful
Application_1266995739    unscucessful
Application_1878239836    unscucessful
Application_619403287       successful
                              ...     
Application_1639015703      successful
Application_1546234552      successful
Application_2032209718      successful
Application_1230226715    unscucessful
Application_1190132102      successful
Name: label, Length: 12284, dtype: object

# Prediction

## Decision Tree

In [188]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [189]:
accuracy = clf.score(X_test, y_test)
accuracy

1.0

## SVM

In [190]:
clf = svm.SVC()
clf.fit(X_train, y_train)

In [191]:
accuracy = clf.score(X_test, y_test)
accuracy

0.6832811164910282

## Gradient Boosting

In [197]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

## Logistic Regression

In [199]:
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

## Newral Network (MLP)

In [None]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)