In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime 

%matplotlib inline

# _1. Import an event log as a pandas dataframe


In [2]:
# Load the raw event log from CSV into a DataFrame.
# NOTE(review): the variable is called `small` but the file read is 'Large.csv' — confirm which dataset is intended.
small = pd.read_csv('Large.csv')
# Preview the first two events.
small.head(2)

Unnamed: 0,Case ID,Activity,Complete Timestamp,Variant,Variant index
0,case_466,Activity A,1970/01/01 09:00:00.000,Variant 8,8
1,case_466,Activity C,1970/01/01 10:00:00.000,Variant 8,8


In [3]:
# List the attribute (column) names available in the raw event log.
small.columns

Index(['Case ID', 'Activity', 'Complete Timestamp', 'Variant',
       'Variant index'],
      dtype='object')

# _2. Create an event log that retains only the attributes caseid, activity, and timestamp. Rename the attributes to “caseid, activity, ts” if the names are different.


In [4]:

# Keep only the case identifier, activity name and completion timestamp columns.
df_original = small[['Case ID', 'Activity', 'Complete Timestamp']]
df_original.head()

Unnamed: 0,Case ID,Activity,Complete Timestamp
0,case_466,Activity A,1970/01/01 09:00:00.000
1,case_466,Activity C,1970/01/01 10:00:00.000
2,case_466,Activity D,1970/01/01 11:00:00.000
3,case_466,Activity E,1970/01/01 12:00:00.000
4,case_466,Activity G,1970/01/01 13:00:00.000


In [5]:
# Short attribute names, assigned positionally: the order must match the
# three columns selected into df_original (case id, activity, timestamp).
cols = ['caseid', 'activity', 'ts']
# Renames the columns of df_original in place.
df_original.columns = cols
df_original.head()

Unnamed: 0,caseid,activity,ts
0,case_466,Activity A,1970/01/01 09:00:00.000
1,case_466,Activity C,1970/01/01 10:00:00.000
2,case_466,Activity D,1970/01/01 11:00:00.000
3,case_466,Activity E,1970/01/01 12:00:00.000
4,case_466,Activity G,1970/01/01 13:00:00.000


# _3. Create a function that does window-based encoding of an event log using size W (= number of events in a window).
For each case, a number of observations are extracted. For each observation, the features are the W events and the duration of the window.
The class label is the next event that will be executed.

In [6]:
def _ordinal(n):
  """Return the English ordinal label for a positive integer (1 -> '1st', 4 -> '4th')."""
  # 11th, 12th and 13th are irregular, so the teens are handled before the last digit.
  if 10 <= n % 100 <= 20:
    suffix = 'th'
  else:
    suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
  return str(n) + suffix


def func_window(df_sample, window_size):
  """Window-encode an event log for next-activity prediction.

  For every case, a window of ``window_size`` consecutive activities is slid
  over the case's events (ordered by ``ts``); the activity that immediately
  follows each window becomes the class label.

  Parameters
  ----------
  df_sample : pandas.DataFrame
    Event log with the columns ``caseid``, ``activity`` and ``ts``.
  window_size : int
    Number of events per window (W). Must be at least 1.

  Returns
  -------
  pandas.DataFrame
    One row per window with the columns ``caseid``, one ``<ordinal>_act``
    column per window position (``1st_act``, ``2nd_act``, ...) and ``label``.
    Cases with ``window_size`` or fewer events contribute no rows. Cases
    appear in order of first appearance in ``df_sample``.

  Raises
  ------
  ValueError
    If ``window_size`` is smaller than 1.

  Notes
  -----
  Only the activities are encoded; the duration of the window is not computed.
  """
  if window_size < 1:
    raise ValueError('window_size must be a positive integer')

  feature_cols = [_ordinal(pos) + '_act' for pos in range(1, window_size + 1)]
  columns = ['caseid'] + feature_cols + ['label']

  records = []
  # sort=False keeps the cases in order of first appearance instead of sorting the keys.
  for caseid, case_events in df_sample.groupby('caseid', sort=False):
    # sort_values returns a new frame, so its result has to be used; a stable
    # sort keeps the file order of events that share a timestamp.
    ordered = case_events.sort_values(by='ts', ascending=True, kind='mergesort')
    activities = ordered['activity'].tolist()

    # Each slice holds the W window activities followed by the next one (the label).
    for start in range(len(activities) - window_size):
      records.append([caseid] + activities[start:start + window_size + 1])

  # Build the frame once rather than appending row by row.
  return pd.DataFrame(records, columns=columns)

In [7]:
# Window-encode the renamed event log with W = 3 events per window.
window_3 = func_window(df_original, window_size=3)

In [8]:
# Inspect the first 20 windowed observations.
window_3.head(20)

Unnamed: 0,caseid,1st_act,2nd_act,3rd_act,label
0,case_466,Activity A,Activity C,Activity D,Activity E
1,case_466,Activity C,Activity D,Activity E,Activity G
2,case_466,Activity D,Activity E,Activity G,Activity F
3,case_466,Activity E,Activity G,Activity F,Activity I
4,case_466,Activity G,Activity F,Activity I,Activity M
5,case_466,Activity F,Activity I,Activity M,Activity N
6,case_466,Activity I,Activity M,Activity N,Activity J
7,case_466,Activity M,Activity N,Activity J,Activity B
8,case_12282,Activity A,Activity Q,Activity R,Activity S
9,case_12282,Activity Q,Activity R,Activity S,Activity T


In [9]:
def encorder(df):
  """One-hot encode the three window-position activity columns.

  Starts from the ``label`` and ``caseid`` columns of ``df`` and joins, on the
  row index, the dummy columns produced for ``1st_act``, ``2nd_act`` and
  ``3rd_act``. Dummy columns are named ``<prefix>_<activity>`` with the
  prefixes ``1st``, ``2nd`` and ``3rd``.

  Parameters
  ----------
  df : pandas.DataFrame
    Frame with the columns ``label``, ``caseid``, ``1st_act``, ``2nd_act``
    and ``3rd_act``.

  Returns
  -------
  pandas.DataFrame
    ``label`` and ``caseid`` followed by the dummy columns of each window
    position, in position order.
  """
  positions = (('1st_act', '1st'), ('2nd_act', '2nd'), ('3rd_act', '3rd'))

  encoded = df[['label', 'caseid']]
  for source_col, tag in positions:
    indicators = pd.get_dummies(df[source_col], prefix=tag, prefix_sep='_')
    encoded = encoded.merge(indicators, left_index=True, right_index=True)

  return encoded

In [10]:
# One-hot encode the W = 3 windows and preview the first three rows.
encoded_window_3 = encorder(window_3)
encoded_window_3.head(3)

Unnamed: 0,label,caseid,1st_Activity A,1st_Activity AB,1st_Activity AC,1st_Activity AD,1st_Activity AE,1st_Activity AF,1st_Activity AG,1st_Activity AH,...,3rd_Activity P,3rd_Activity R,3rd_Activity S,3rd_Activity T,3rd_Activity U,3rd_Activity V,3rd_Activity W,3rd_Activity X,3rd_Activity Y,3rd_Activity Z
0,Activity E,case_466,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Activity G,case_466,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Activity F,case_466,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# _6. Train a decision tree and a gradient boosting model from the scikit-learn package for next event prediction using the event log created at step 4 for W=3.
Split 70-30 for training-testing.

Hyperparameters:
DT max_depth = 5, random_state = 1234
GBM max_depth = 3, random_state = 1234


## Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score         # between pred_result and real outcome

### window_3


In [12]:
# Features: every column of the encoded frame except the case identifier and the target.
feature_columns = encoded_window_3.columns.difference(['caseid', 'label'])
X = encoded_window_3.loc[:, feature_columns]
# Target: the activity that follows each window.
y = encoded_window_3['label']

In [13]:
# Split the data 70:30 into train and test sets (X and y).
# The split is seeded so that Restart & Run All reproduces the same partition,
# and therefore the same accuracy figures for the models trained below.

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=1234 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((69906, 104), (29960, 104), (69906,), (29960,))

In [14]:
# Build the decision tree (depth capped at 5, fixed seed) and fit it on the training split.
clf = DecisionTreeClassifier(random_state=1234, max_depth=5).fit(X_train, y_train)

# Predict the next activity for the held-out split.
y_pred = clf.predict(X_test)

# Report test accuracy ('정확도' is Korean for "accuracy").
print('정확도', accuracy_score(y_test, y_pred))

정확도 0.3315086782376502


## Gradient Boosting Model
GBM max_depth = 3, random_state = 1234


In [15]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier (tree depth 3, fixed seed), fitted on the training split.
gb_clf = GradientBoostingClassifier(random_state=1234, max_depth=3).fit(X_train, y_train)

# Predict on the held-out split and report accuracy ('정확도' is Korean for "accuracy").
pred = gb_clf.predict(X_test)
print('정확도', accuracy_score(y_test, pred))

정확도 0.8694259012016021
