In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime 

%matplotlib inline

# _1. Import an event log as a pandas dataframe


In [2]:
# Load the raw helpdesk event log from CSV and preview its first two rows.
helpdesk = pd.read_csv('helpdesk.csv')
helpdesk.head(2)

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,2,1,4/3/2012 16:55
1,2,8,4/3/2012 16:55


In [3]:
# List the column names of the loaded event log.
helpdesk.columns

Index(['CaseID', 'ActivityID', 'CompleteTimestamp'], dtype='object')

# _2. Create an event log that retains only the attributes caseid, activity, timestamp. Rename the attributes as “caseid, activity, ts” if names are different.


In [4]:
# Keep only the case identifier, activity and completion-timestamp columns.
df_original = helpdesk[['CaseID', 'ActivityID', 'CompleteTimestamp']]
df_original.head()

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,2,1,4/3/2012 16:55
1,2,8,4/3/2012 16:55
2,2,6,4/5/2012 17:15
3,3,1,10/29/2010 18:14
4,3,8,11/4/2010 1:16


In [5]:
# Rename the three retained columns positionally to caseid / activity / ts.
# The assignment relies on the column order chosen when df_original was built.
cols = ['caseid', 'activity', 'ts']
df_original.columns = cols
df_original.head()

Unnamed: 0,caseid,activity,ts
0,2,1,4/3/2012 16:55
1,2,8,4/3/2012 16:55
2,2,6,4/5/2012 17:15
3,3,1,10/29/2010 18:14
4,3,8,11/4/2010 1:16


# _3. Create a function that does window-based encoding of an event log using size W (= number of events in a window).
For each case, a number of observations are extracted, for each observation, the features are the W events and the duration of the window. 
The class label is the next event that will be executed.

In [6]:
def _ordinal(position):
  """Return the English ordinal label for a positive integer (1 -> '1st', 4 -> '4th')."""
  if 10 <= position % 100 <= 20:
    suffix = 'th'
  else:
    suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
  return f'{position}{suffix}'


def func_window(df_sample, window_size):
  """Build sliding-window observations for next-event prediction.

  For every case, the events are ordered chronologically and each run of
  ``window_size`` consecutive activities becomes one observation whose label
  is the activity that immediately follows the window. Cases with
  ``window_size`` events or fewer contribute no observations.

  Parameters
  ----------
  df_sample : pandas.DataFrame
      Event log with the columns ``caseid``, ``activity`` and ``ts``.
      ``ts`` may hold timestamp strings, datetimes or numbers; strings are
      parsed with ``pd.to_datetime`` so ordering is chronological rather
      than lexicographic. The input frame is not modified.
  window_size : int
      Number of events per window (W). Must be at least 1.

  Returns
  -------
  pandas.DataFrame
      One row per observation with the columns ``caseid``, one ordinal
      activity column per window position (``1st_act``, ``2nd_act``,
      ``3rd_act``, ``4th_act``, ...) and ``label``. For ``window_size=3``
      the columns are ``caseid, 1st_act, 2nd_act, 3rd_act, label``.
      Cases appear in order of first occurrence in ``df_sample``.
      The window duration is not computed.

  Raises
  ------
  ValueError
      If ``window_size`` is smaller than 1.
  """
  if window_size < 1:
    raise ValueError('window_size must be a positive integer')

  feature_names = [f'{_ordinal(pos)}_act' for pos in range(1, window_size + 1)]
  columns = ['caseid'] + feature_names + ['label']

  # Parse textual timestamps so that e.g. '1/10/2020' sorts after '1/2/2020'.
  ts = df_sample['ts']
  if not (pd.api.types.is_numeric_dtype(ts) or pd.api.types.is_datetime64_any_dtype(ts)):
    ts = pd.to_datetime(ts)

  # Work on a private copy so the caller's frame is never touched.
  events = df_sample.loc[:, ['caseid', 'activity']].copy()
  events['ts'] = ts.values

  rows = []
  # sort=False keeps the cases in order of first appearance.
  for caseid, case_events in events.groupby('caseid', sort=False):
    # Stable sort: events sharing a timestamp keep their original log order.
    ordered = case_events.sort_values(by='ts', kind='mergesort')
    activities = ordered['activity'].tolist()
    for start in range(len(activities) - window_size):
      # window_size features followed by the next activity (the label).
      rows.append([caseid] + activities[start:start + window_size + 1])

  # Build the frame once instead of appending row by row.
  return pd.DataFrame(rows, columns=columns)

In [7]:
# Encode the renamed event log into windows of W=3 events.
window_3 = func_window(df_original, 3)

In [8]:
# Preview up to the first 20 window observations.
window_3.head(20)

Unnamed: 0,caseid,1st_act,2nd_act,3rd_act,label
0,5,1,8,6,8
1,5,8,6,8,6
2,9,3,1,8,6
3,16,1,8,8,6
4,22,1,8,9,6
5,26,1,8,9,8
6,26,8,9,8,6
7,37,1,8,6,6
8,37,8,6,6,8
9,37,6,6,8,9


In [9]:
def encorder(df):
  """One-hot encode the three window activity columns.

  Starts from the ``label`` and ``caseid`` columns of ``df`` and, for each of
  ``1st_act``, ``2nd_act`` and ``3rd_act``, merges in the indicator columns
  produced by ``pd.get_dummies`` (prefixed ``1st_``, ``2nd_``, ``3rd_``),
  matching rows on the index.

  Returns the merged DataFrame; ``df`` itself is left unchanged.
  """
  encoded = df[['label', 'caseid']]

  position_prefixes = (('1st_act', '1st'), ('2nd_act', '2nd'), ('3rd_act', '3rd'))
  for column, prefix in position_prefixes:
    indicators = pd.get_dummies(df[column], prefix=prefix, prefix_sep='_')
    encoded = encoded.merge(indicators, left_index=True, right_index=True)

  return encoded

In [10]:
# One-hot encode the W=3 window observations and preview the first three rows.
encoded_window_3 = encorder(window_3)
encoded_window_3.head(3)

Unnamed: 0,label,caseid,1st_1,1st_2,1st_3,1st_4,1st_6,1st_7,1st_8,1st_9,...,2nd_8,2nd_9,3rd_1,3rd_2,3rd_4,3rd_5,3rd_6,3rd_7,3rd_8,3rd_9
0,8,5,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,6,5,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,6,9,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Convert the label column to a numeric dtype before it is used as the target.
encoded_window_3['label'] = pd.to_numeric(encoded_window_3['label'])

# _6. Train a decision tree and a gradient boosting model from the scikit-learn package for next event prediction using the event log created at step 4 for W=3.
Split 70-30 for training-testing.

Hyperparameters:
DT max_depth = 5, random_state = 1234
GBM max_depth = 3, random_state = 1234


## Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score         # between pred_result and real outcome


### window_3 with case id


In [13]:
#  define X,y
X = encoded_window_3.loc[:, encoded_window_3.columns.difference(['label'])]
y = encoded_window_3['label']

In [14]:
# Hold out 30% of the observations for testing (70:30 train/test split).
# The split is seeded with the same value as the models so that the reported
# accuracies are reproducible after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1234
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1733, 25), (744, 25), (1733,), (744,))

In [15]:
# Decision tree (max_depth=5, seeded) fitted on the training split;
# fit() returns the estimator itself, so construction and training are chained.
clf = DecisionTreeClassifier(max_depth=5, random_state=1234).fit(X_train, y_train)

# Predict the held-out observations and report the accuracy
# (the printed Korean label means "accuracy").
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('정확도', test_accuracy)

정확도 0.6196236559139785


### window_3 without case id


In [16]:
# Reuse the existing train/test split but drop the caseid column from the features,
# so both variants are evaluated on the same rows.
X_train_2 = X_train.loc[:, X_train.columns.difference(['caseid'])]
X_test_2 = X_test.loc[:, X_test.columns.difference(['caseid'])]

X_train_2.shape,X_test_2.shape

((1733, 24), (744, 24))

In [17]:
# Same decision tree configuration (max_depth=5, seeded), this time fitted on the
# feature set without caseid; fit() returns the estimator, so the calls are chained.
clf = DecisionTreeClassifier(max_depth=5, random_state=1234).fit(X_train_2, y_train)

# Predict the held-out observations and report the accuracy
# (the printed Korean label means "accuracy").
y_pred = clf.predict(X_test_2)
test_accuracy = accuracy_score(y_test, y_pred)
print('정확도', test_accuracy)

정확도 0.6196236559139785


## a gradient boosting model
GBM max_depth = 3, random_state = 1234


In [18]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting (max_depth=3, seeded) fitted on the training split that still
# contains caseid; fit() returns the estimator, so the calls are chained.
gb_clf = GradientBoostingClassifier(max_depth=3, random_state=1234).fit(X_train, y_train)

# Predict the held-out observations and report the accuracy
# (the printed Korean label means "accuracy").
pred = gb_clf.predict(X_test)
gbm_accuracy = accuracy_score(y_test, pred)
print('정확도', gbm_accuracy)

정확도 0.6209677419354839
