Exercise Predictive monitoring - Part 2
Filtering prefixes, aggregation and index-based encoding. Repeat the following tasks for the 2 event logs: small (synthetic) and BPIC_2017 (real). Commit the solutions on your github repository.






In [2]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime 

%matplotlib inline

# _1. Import an event log as a pandas dataframe


In [3]:
small = pd.read_csv('small3insert.csv')
small.head(2)

Unnamed: 0,Case,Event,Activity,Timestamp,Resource,Resource_failure_rate,Resource_Pass/Fail,order,resource_anomaly_type,resource_parameter,trace_change_resource,variant_num
0,case_0,9367,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,0.026984,0,1,normal,,0,var_358
1,case_0,9368,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,0.018454,0,2,normal,,0,var_358


In [4]:
small.columns

Index(['Case', 'Event', 'Activity', 'Timestamp', 'Resource',
       'Resource_failure_rate', 'Resource_Pass/Fail', 'order',
       'resource_anomaly_type', 'resource_parameter', 'trace_change_resource',
       'variant_num'],
      dtype='object')

# _2.Create an event log that retains only the attributes caseid, activity, timestamp, resource and outcome. Rename the attributes as “caseid, activity, ts, resource, outcome” if names are different (note: we should indicate which one is the outcome label)


In [5]:
# resource_anomaly_type : outcome
df_original = small[['Case', 'Activity', 'Timestamp', 'Resource', 'resource_anomaly_type']]
df_original.head()

Unnamed: 0,Case,Activity,Timestamp,Resource,resource_anomaly_type
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal
1,case_0,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal
2,case_0,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,normal
3,case_0,Activity D,1970-01-01 12:00:00,Resource_Group2_res_2,normal
4,case_0,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,normal


In [6]:
cols = ['caseid', 'activity', 'ts', 'resource', 'outcome']
df_original.columns = cols
df_original.head()

Unnamed: 0,caseid,activity,ts,resource,outcome
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal
1,case_0,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal
2,case_0,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,normal
3,case_0,Activity D,1970-01-01 12:00:00,Resource_Group2_res_2,normal
4,case_0,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,normal


# _3.Create a function that does aggregation encoding of an event log:
Activity and resource are aggregated using frequency
Timestamp are aggregated using average


In [7]:
def aggregation(df):
  # create new df for result
  df_result = pd.DataFrame( columns= [ 'List_Activity', 'List_Resource'])

  # List of Activity and Resource
  df_result.loc[:,'List_Activity']= df.groupby('caseid')['activity'].apply(list)
  df_result.loc[:,'List_Resource']= df.groupby('caseid')['resource'].apply(list)

  # Average of timestamps
  df.loc[:,'ts'] = pd.to_datetime(df.loc[:,'ts'])
  tsMin = df.groupby('caseid')['ts'].min()
  df = pd.merge(df, tsMin.reset_index(), on= 'caseid', how='outer', suffixes = ('','_min'))
  df.loc[:,'ts_Gap'] = df['ts'] - df['ts_min']
  df_result.loc[:,'Avg_ts']= df.groupby('caseid')['ts_Gap'].apply(np.sum)/(df.groupby('caseid')['ts_Gap'].apply(len)-1)
  df_result.loc[:,'Avg_ts'] = df_result.loc[:,'Avg_ts'].dt.round('1s')

  return df_result


In [8]:
a = aggregation(df_original)
a.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0_level_0,List_Activity,List_Resource,Avg_ts
caseid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
case_0,"[Activity A, Activity B, Activity C, Activity ...","[Resource_Group2_res_1, Resource_Group0_res_0,...",03:30:00
case_1,"[Activity A, Activity B, Activity C, Activity ...","[Resource_Group2_res_0, Resource_Group0_res_0,...",04:32:36
case_10,"[Activity A, Activity B, Activity C, Activity ...","[Resource_Group2_res_1, Resource_Group0_res_0,...",03:30:00
case_100,"[Activity A, Activity B, Activity C, Activity ...","[Resource_Group2_res_1, Resource_Group0_res_0,...",05:00:00
case_1000,"[Activity A, Activity B, Activity C, Activity ...","[Resource_Group2_res_0, Resource_Group0_res_0,...",05:00:00


# -4. Create a function to filter the prefixes of length L for the event log created at step 2. This function retains only the first L events of all cases that have at least L events. 


In [9]:
def prefix_df(df, n = 2):

  # the min of the number of activities is 7
  result_df = df.sort_values(by = ['caseid','ts'], ascending=True).groupby('caseid').head(n)
  result_df.reset_index(inplace = True, drop = True)

  return result_df

In [10]:
prefix_2 = prefix_df(df_original)
prefix_2.head()

Unnamed: 0,caseid,activity,ts,resource,outcome
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal
1,case_0,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal
2,case_1,Activity A,1970-01-01 09:00:00,Resource_Group2_res_0,insert
3,case_1,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,insert
4,case_10,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal


In [11]:
prefix_5 = prefix_df(df_original,5)
prefix_5.head(10)

Unnamed: 0,caseid,activity,ts,resource,outcome
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group2_res_1,normal
1,case_0,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,normal
2,case_0,Activity C,1970-01-01 11:00:00,Resource_Group1_res_0,normal
3,case_0,Activity D,1970-01-01 12:00:00,Resource_Group2_res_2,normal
4,case_0,Activity E,1970-01-01 13:00:00,Resource_Group0_res_0,normal
5,case_1,Activity A,1970-01-01 09:00:00,Resource_Group2_res_0,insert
6,case_1,Activity B,1970-01-01 10:00:00,Resource_Group0_res_0,insert
7,case_1,Activity C,1970-01-01 11:00:00,Resource_Group1_res_1,insert
8,case_1,Activity G,1970-01-01 11:07:35,Resource_Group2_res_1,insert
9,case_1,Activity R,1970-01-01 11:51:06,Resource_Group1_res_1,insert


# _5. Create a function that does index-based encoding of a prefix-filtered event log.
Activity and resource are one-hot encoded
Timestamp is left as-is
*이탤릭체 텍스트*

In [43]:
def encorder(df, n = 2):

  df_result = df[['outcome', 'caseid','ts']]
  tmp_result = pd.DataFrame()

  for i in range(0,n):
    tmp = pd.get_dummies(df.groupby('caseid')['activity'].nth(i),  prefix_sep='_', prefix = f'{i+1}')
    tmp = df_result.groupby('caseid').nth(i).merge(tmp, how= 'left', left_on=None, right_on=None, left_index=True, right_index=True)
    tmp.reset_index(inplace = True)

    tmp_result = pd.concat([tmp_result, tmp], join = 'outer')

  tmp_result.fillna(0, inplace=True)
  tmp_result.sort_values(by = ['caseid', 'ts'] , ascending = True, inplace = True)
  tmp_result.reset_index(inplace = True, drop = True)

  return tmp_result

In [44]:
encorder(prefix_2).columns

Index(['caseid', 'outcome', 'ts', '1_Activity A', '2_Activity B',
       '2_Activity G', '2_Activity H', '2_Activity I', '2_Activity J',
       '2_Activity K', '2_Activity L', '2_Activity M', '2_Activity N',
       '2_Activity O', '2_Activity P', '2_Activity Q', '2_Activity R',
       '2_Activity S', '2_Activity T'],
      dtype='object')

In [45]:
encoded_pf_2 = encorder(prefix_2)
encoded_pf_2.head()

Unnamed: 0,caseid,outcome,ts,1_Activity A,2_Activity B,2_Activity G,2_Activity H,2_Activity I,2_Activity J,2_Activity K,2_Activity L,2_Activity M,2_Activity N,2_Activity O,2_Activity P,2_Activity Q,2_Activity R,2_Activity S,2_Activity T
0,case_0,normal,1970-01-01 09:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,case_0,normal,1970-01-01 10:00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,case_1,insert,1970-01-01 09:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,case_1,insert,1970-01-01 10:00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,case_10,normal,1970-01-01 09:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
encoded_pf_5 = encorder(prefix_5,5)
encoded_pf_5.head()

Unnamed: 0,caseid,outcome,ts,1_Activity A,2_Activity B,2_Activity G,2_Activity H,2_Activity I,2_Activity J,2_Activity K,...,5_Activity K,5_Activity L,5_Activity M,5_Activity N,5_Activity O,5_Activity P,5_Activity Q,5_Activity R,5_Activity S,5_Activity T
0,case_0,normal,1970-01-01 09:00:00,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,case_0,normal,1970-01-01 10:00:00,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,case_0,normal,1970-01-01 11:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,case_0,normal,1970-01-01 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,case_0,normal,1970-01-01 13:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# _6. Train a decision tree and a randomforest from the scikit-learn package for outcome label prediction using the event log created at step 3 and and step 6 for prefix length L=2 and L=5.  Split 70-30 for training-testing.

Hyperparameters
DT: max_depth=5
RF: n_estimators=100, oob_score=True


(Provide solutions for 2 event logs: artificial and real)

## Decision Tree

In [47]:
!pip install graphviz



In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score         # between pred_result and real outcome
from sklearn.tree import export_graphviz            # the creating of model
import graphviz                                     # the visualization of model

from graphviz import Digraph

### prefix_Length 2


In [49]:
# Generate DT algorithm
clf = DecisionTreeClassifier( random_state=0,  max_depth=5)

In [50]:
#  define X,y
X = encoded_pf_2.iloc[:,3:]
y= encoded_pf_2['outcome']

In [51]:
# Data Prepare-> 70:30,(train/test, X,y)

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7000, 16), (3000, 16), (7000,), (3000,))

In [52]:
# train
clf.fit( X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [53]:
# predict
y_pred = clf.predict( X_test )
y_pred

array(['normal', 'normal', 'normal', ..., 'normal', 'normal', 'normal'],
      dtype=object)

In [54]:
# check accuracy_score
accuracy_score( y_test, y_pred )

0.8656666666666667

#### Visualization 
- it's just for showing the structure of clf (the fitted decision tree)
- if you dont' need to check this, you can skip below 2 codes. 

In [55]:
# export the dump file of DT model

export_graphviz(clf, out_file = 'tree.model', 
                class_names   = y.unique(),
                feature_names = X.columns, 
                filled        = True)

In [56]:
# Load the model dump file
with open('tree.model') as f:
  model_graph = f.read()

# the visualization of DT
graphviz.Source( model_graph )

ExecutableNotFound: failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH

<graphviz.files.Source at 0x193bbf5f488>

### prefix_Length 5

In [57]:
# Generate DT algorithm
clf1 = DecisionTreeClassifier( random_state=0,  max_depth=5)

In [58]:
#  define X,y
x = encoded_pf_5.iloc[:,3:]
y= encoded_pf_5['outcome']

In [59]:
# Data Prepare-> 70:30,(train/test, X,y)

X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.3 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17500, 67), (7500, 67), (17500,), (7500,))

In [60]:
# train
clf1.fit( X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [61]:
# predict
y_pred = clf1.predict( X_test )
y_pred

array(['normal', 'normal', 'normal', ..., 'normal', 'normal', 'normal'],
      dtype=object)

In [62]:
# check accuracy_score
accuracy_score( y_test, y_pred )

0.8768

## Random Forest

In [63]:
# 랜덤포레스트 사용 -> 앙상블 기법중 배깅을 이용하여서 예측했다
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier( random_state=0, n_estimators=100, oob_score=True)

### prefix_Length 2


In [64]:
#  define X,y
x = encoded_pf_2.iloc[:,3:]
y= encoded_pf_2['outcome']

In [65]:
# Data Prepare-> 70:30,(train/test, X,y)

X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.3 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7000, 16), (3000, 16), (7000,), (3000,))

In [66]:
# RF 훈련
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

In [67]:
# RF 예측 및 정확도 체크
pred = rf_clf.predict( X_test )
accuracy_score( y_test, pred )

0.873

### prefix_Length 5


In [68]:
rf_clf1 = RandomForestClassifier( random_state=0, n_estimators=100, oob_score=True)

In [69]:
#  define X,y

x = encoded_pf_5.iloc[:,3:]
y= encoded_pf_5['outcome']

In [70]:
# Data Prepare-> 70:30,(train/test, X,y)

X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.3 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17500, 67), (7500, 67), (17500,), (7500,))

In [71]:
# RF 훈련

rf_clf1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=0, verbose=0,
                       warm_start=False)

In [72]:
# RF 예측 및 정확도 체크

pred = rf_clf1.predict( X_test )
accuracy_score( y_test, pred )

0.8969333333333334