In [2]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime 
from datetime import datetime as dt

%matplotlib inline

# Data : large (synthetic)

## _1. Import an event log as a pandas dataframe


In [3]:
original_data = pd.read_csv('Large_hw4.csv')
original_data.head()

Unnamed: 0,Case,Activity,Timestamp,Resource,is_anomalous
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group4_res_4,0
1,case_0,Activity AB,1970-01-01 10:00:00,Resource_Group1_res_1,0
2,case_0,Activity AF,1970-01-01 11:00:00,Resource_Group0_res_0,0
3,case_0,Activity AC,1970-01-01 12:00:00,Resource_Group4_res_4,0
4,case_0,Activity AG,1970-01-01 13:00:00,Resource_Group4_res_3,0


In [4]:
original_data.dtypes

Case            object
Activity        object
Timestamp       object
Resource        object
is_anomalous     int64
dtype: object

## _2. Create an event log that retains only the attributes caseid, activity, timestamp, resource, outcome. Rename the attributes as “caseid, activity, ts, resource, outcome” if names are different

In [5]:
# resource_anomaly_type : outcome
cols = ['caseid','activity','ts','resource','outcome']
df_original = pd.DataFrame(original_data)
df_original.columns = cols

form = "%Y-%m-%d %H:%M:%S"
df_original['ts'] = df_original['ts'].apply(lambda x: dt.strptime(x, form))

df_original.head()


Unnamed: 0,caseid,activity,ts,resource,outcome
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group4_res_4,0
1,case_0,Activity AB,1970-01-01 10:00:00,Resource_Group1_res_1,0
2,case_0,Activity AF,1970-01-01 11:00:00,Resource_Group0_res_0,0
3,case_0,Activity AC,1970-01-01 12:00:00,Resource_Group4_res_4,0
4,case_0,Activity AG,1970-01-01 13:00:00,Resource_Group4_res_3,0


In [6]:
df_original.dtypes

caseid              object
activity            object
ts          datetime64[ns]
resource            object
outcome              int64
dtype: object

## _3.Create a function that pre-processes an event log using index-based encoding and filters the prefixes of length L (you can reuse what you develop in the previous weeks)

In [7]:
df_prefix = df_original

In [8]:
def prefix_df(df, n = 2):

    result_df = df.sort_values(['caseid', 'ts'], ascending=[True,True]).groupby('caseid').head(n)
    result_df.reset_index(inplace = True, drop = True)

    return result_df

In [9]:
df_pre = prefix_df(df_prefix,3)
df_pre.head(10)

Unnamed: 0,caseid,activity,ts,resource,outcome
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group4_res_4,0
1,case_0,Activity AB,1970-01-01 10:00:00,Resource_Group1_res_1,0
2,case_0,Activity AF,1970-01-01 11:00:00,Resource_Group0_res_0,0
3,case_1,Activity A,1970-01-01 09:00:00,Resource_Group4_res_2,0
4,case_1,Activity AB,1970-01-01 10:00:00,Resource_Group1_res_1,0
5,case_1,Activity AF,1970-01-01 11:00:00,Resource_Group0_res_0,0
6,case_10,Activity A,1970-01-01 09:00:00,Resource_Group4_res_3,1
7,case_10,Activity AB,1970-01-01 10:00:00,Resource_Group1_res_0,1
8,case_10,Activity AF,1970-01-01 11:00:00,Resource_Group0_res_0,1
9,case_100,Activity A,1970-01-01 09:00:00,Resource_Group4_res_1,0


## _4. Create a function that, given an event log pre-processed as in step 3 (i.e., index-based encoded and containing only prefixes of a given length L), augments the log with the following resource-aware features:

- n-work-items: the number of work items executed by the current resource until the current event
  - 각 event의 resource가 한 일 중에 현재 event 보다 빨리 발생된 일 
- n-curr-case: the number of tasks executed by the current resources in the current case until the current event
  - 각 event의 resource가 한 일중에 동일 케이스 내 현재 event보다 이전에 발생된 일
- per-case: the polarity of cases involving the current resource, completed until the current event
  - 현재 이벤트의 리소스가 포함된 Case의 갯수 중 outcome이 1인 것 / 현재 이벤트의 리소스가 포함된 Case의 갯수 
- per-curr-ho: the polarity of cases involving at least one hand-off equal to the current one, completed until the current event
  - 직전 resource, 현재 resource의 세트가 동일한 경우를 가진 케이스 중 outcome이 1인 것/ 직전 resource, 현재 resource의 세트가 동일한 경우를 가진 케이스


In [10]:
def preprocessing(df):
    resource_list = df.resource.unique()
    df[['n-work-items','n-Curr-case', 'per-case','per-curr-ho']] =  pd.DataFrame([[np.nan, np.nan, np.nan,np.nan]], index=df.index)

    for i in resource_list:

        # 각 Resource의 df 생성
        df_target = df[df.resource == i]
        df_target['one'] = 1 
        df_target.loc[:,'previous_rcs'] = df.resource.shift(1)
        df_target.reset_index(inplace=True)

        for j in range(len(df_target)):
          # 해당 resource를 가진 event 추적 및 update

            # Keeping the original index of the event.
            idx = df_target['index'].loc[j].astype(int)
            # indicate the target time/case
            target_time = df_target.ts.loc[j]
            target_case = df_target.caseid.loc[j]

            # /n-work-items calculation 
            a = df_target.loc[(df_target['ts'] < target_time)]['one'].transform(pd.Series.cumsum).tail(1).values 

            # n-Curr-case calculation 
            b = df_target.loc[(df_target.loc[:,'caseid'] == target_case)&(df_target.ts < target_time)]['one'].transform(pd.Series.cumsum).tail(1).values

            # per-case calculation 
            c = len(df_target.loc[(df_target.ts< target_time)]['caseid'].unique())
            conditional_c = len(df_target.loc[(df_target.ts < target_time) & (df_target.loc[:,'outcome'].values==1)]['caseid'].unique())

            if a.size == 0:
                df.loc[idx,'n-work-items'] = 0
            else:
                a = a.astype(int)
                df.loc[idx,'n-work-items'] = a

            if b.size == 0:
                df.loc[idx,'n-Curr-case'] = 0
            else:
                b = b.astype(int)
                df.loc[idx,'n-Curr-case'] = b

            if c == 0 | conditional_c==0 | (c ==np.nan) | (conditional_c == np.nan):
                df.loc[idx,'per-case'] = 0
            else:
                df.loc[idx,'per-case'] = conditional_c/c

        # if there isn't the previous resource, the value of 'per-corr-ho' is 0.
        if df_target.previous_rcs.isnull().values.any() :
            df.loc[df_target['index'],'per-curr-ho'] = 0 


        list_reRcs = df_target.previous_rcs.unique()
        list_reRcs=[x for x in list_reRcs if x == x]

        for r in range(len(list_reRcs)):

            df_preRcs = df_target[df_target.previous_rcs == list_reRcs[r]]
            d = len(df_preRcs.caseid.unique())
            conditional_d = len(df_preRcs.loc[df_preRcs.outcome==1,'caseid'].unique())
            df.loc[df_preRcs['index'],'per-curr-ho']= conditional_d/d


    return df

In [11]:
df_final = preprocessing(df_pre)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [12]:
df_final

Unnamed: 0,caseid,activity,ts,resource,outcome,n-work-items,n-Curr-case,per-case,per-curr-ho
0,case_0,Activity A,1970-01-01 09:00:00,Resource_Group4_res_4,0,26.0,0.0,0.307692,0.000000
1,case_0,Activity AB,1970-01-01 10:00:00,Resource_Group1_res_1,0,26.0,0.0,0.961538,0.270886
2,case_0,Activity AF,1970-01-01 11:00:00,Resource_Group0_res_0,0,40.0,0.0,1.000000,0.204461
3,case_1,Activity A,1970-01-01 09:00:00,Resource_Group4_res_2,0,1.0,0.0,0.000000,0.187611
4,case_1,Activity AB,1970-01-01 10:00:00,Resource_Group1_res_1,0,26.0,0.0,0.961538,0.225287
...,...,...,...,...,...,...,...,...,...
37495,case_9998,Activity C,1970-01-01 10:00:00,Resource_Group3_res_1,0,13.0,0.0,1.000000,0.169014
37496,case_9998,Activity D,1970-01-01 11:00:00,Resource_Group4_res_4,0,2562.0,0.0,0.220319,0.192641
37497,case_9999,Activity A,1970-01-01 09:00:00,Resource_Group4_res_0,0,3.0,0.0,0.333333,0.268293
37498,case_9999,Activity C,1970-01-01 10:00:00,Resource_Group3_res_3,0,15.0,0.0,0.866667,0.199052


## Q5. Train a decision tree and a gradient boosting model from the scikit-learn package for outcome label prediction using the event log created at step 4 for prefix length L=2 and L=5.  Split 70-30 for training-testing.

- Hyperparameters:
  - DT max_depth = 5, random_state = 1234
  - GBM max_depth = 3, random_state = 1234


In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score         # between pred_result and real outcome
from sklearn import preprocessing                   # To convert 'Activity' into the float type of data

from sklearn.ensemble import GradientBoostingClassifier

### Q5-1. Decision Tree 

- DT max_depth = 5, random_state = 1234



In [14]:
df_final.columns

Index(['caseid', 'activity', 'ts', 'resource', 'outcome', 'n-work-items',
       'n-Curr-case', 'per-case', 'per-curr-ho'],
      dtype='object')

In [15]:
# ,'activity','caseid','resource'
encorded_pre2 = df_final

In [16]:
le = preprocessing.LabelEncoder()
encorded_pre2['new_activity']=le.fit_transform(df_final['activity'])
encorded_pre2['new_caseid']=le.fit_transform(df_final['caseid'])
encorded_pre2['new_resource']=le.fit_transform(df_final['resource'])
encorded_pre2['new_ts']=le.fit_transform(df_final['ts'])


In [17]:
#  define X,y
X = encorded_pre2.loc[:, encorded_pre2.columns.difference(['outcome','activity','caseid','resource','ts'])]
y = encorded_pre2['outcome']

In [18]:
# Data Prepare-> 70:30,(train/test, X,y)

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26250, 8), (11250, 8), (26250,), (11250,))

In [19]:
# Recreate the train/test data for dataset without the resource aware features. 
X_train_withoutRsc = X_train.loc[:, X_train.columns.difference(['n-Curr-case','n-work-items','per-case','per-curr-ho'])]
X_test_withoutRsc = X_test.loc[:, X_test.columns.difference(['n-Curr-case','n-work-items','per-case','per-curr-ho'])]


In [20]:
# Generate DT algorithm
clf = DecisionTreeClassifier( max_depth = 5, random_state = 1234)
# train
clf.fit( X_train, y_train)
# predict
y_pred = clf.predict( X_test )

print('Accuracy for data with the resource-aware features : ', accuracy_score(y_test, y_pred)  )

Accuracy for data with the resource-aware features :  0.8136888888888889


In [21]:
# Generate DT algorithm
clf = DecisionTreeClassifier( max_depth = 5, random_state = 1234)
# train
clf.fit( X_train_withoutRsc, y_train)
# predict
y_pred = clf.predict( X_test_withoutRsc )

print('Accuracy for data without the resource-aware features : ', accuracy_score(y_test, y_pred)  )

Accuracy for data without the resource-aware features :  0.8136


### Q5-2. a gradient boosting model

-  GBM max_depth = 3, random_state = 1234


In [22]:
# GBM
gb_clf = GradientBoostingClassifier(max_depth = 3, random_state = 1234 )
gb_clf.fit( X_train, y_train )
pred   = gb_clf.predict( X_test )
print('Accuracy for data with the resource-aware features :', accuracy_score(y_test, pred)  )

Accuracy for data with the resource-aware features : 0.8192


In [23]:
from sklearn.ensemble import GradientBoostingClassifier

# GBM
gb_clf = GradientBoostingClassifier(max_depth = 3, random_state = 1234 )
gb_clf.fit( X_train_withoutRsc, y_train )
pred   = gb_clf.predict( X_test_withoutRsc )
print('Accuracy for data with the resource-aware features :', accuracy_score(y_test, pred)  )

Accuracy for data with the resource-aware features : 0.8152


# Data : sepsis (real)

## _1. Import an event log as a pandas dataframe


In [24]:
original_data = pd.read_csv('sepsis_hw4-binary-label-diagnose-correct.csv')
original_data.head(5)

Unnamed: 0,case_id,activity,resource,timestamp,diagnose
0,A,ER Registration,A,2014-10-22 11:15:41,1
1,A,Leucocytes,B,2014-10-22 11:27:00,1
2,A,CRP,B,2014-10-22 11:27:00,1
3,A,LacticAcid,B,2014-10-22 11:27:00,1
4,A,ER Triage,C,2014-10-22 11:33:37,1


In [25]:
original_data.dtypes

case_id      object
activity     object
resource     object
timestamp    object
diagnose      int64
dtype: object

## _2. Create an event log that retains only the attributes caseid, activity, timestamp, resource, outcome. Rename the attributes as “caseid, activity, ts, resource, outcome” if names are different

In [26]:
# resource_anomaly_type : outcome
cols = ['caseid','activity','resource','ts','outcome']
df_original = pd.DataFrame(original_data)
df_original.columns = cols

form = "%Y-%m-%d %H:%M:%S"
df_original['ts'] = df_original['ts'].apply(lambda x: dt.strptime(x, form))

df_original.head()

Unnamed: 0,caseid,activity,resource,ts,outcome
0,A,ER Registration,A,2014-10-22 11:15:41,1
1,A,Leucocytes,B,2014-10-22 11:27:00,1
2,A,CRP,B,2014-10-22 11:27:00,1
3,A,LacticAcid,B,2014-10-22 11:27:00,1
4,A,ER Triage,C,2014-10-22 11:33:37,1


## _3.Create a function that pre-processes an event log using index-based encoding and filters the prefixes of length L (you can reuse what you develop in the previous weeks)

In [27]:
df_prefix = df_original

In [28]:
def prefix_df(df, n = 2):

    result_df = df.sort_values(['caseid', 'ts'], ascending=[True,True]).groupby('caseid').head(n)
    result_df.reset_index(inplace = True, drop = True)

    return result_df

In [29]:
df_prefix2 = prefix_df(df_prefix,2)
df_prefix2.head()

Unnamed: 0,caseid,activity,resource,ts,outcome
0,A,ER Registration,A,2014-10-22 11:15:41,1
1,A,ER Registration,A,2014-10-22 11:15:41,1
2,AAA,ER Registration,A,2014-11-19 03:16:21,1
3,AAA,ER Registration,A,2014-11-19 03:16:21,1
4,ABA,ER Registration,A,2014-10-12 11:22:24,1


In [30]:
df_prefix5 = prefix_df(df_prefix,5)
df_prefix5.head(10)

Unnamed: 0,caseid,activity,resource,ts,outcome
0,A,ER Registration,A,2014-10-22 11:15:41,1
1,A,ER Registration,A,2014-10-22 11:15:41,1
2,A,Leucocytes,B,2014-10-22 11:27:00,1
3,A,CRP,B,2014-10-22 11:27:00,1
4,A,LacticAcid,B,2014-10-22 11:27:00,1
5,AAA,ER Registration,A,2014-11-19 03:16:21,1
6,AAA,ER Registration,A,2014-11-19 03:16:21,1
7,AAA,ER Triage,C,2014-11-19 03:18:49,1
8,AAA,ER Triage,C,2014-11-19 03:18:49,1
9,AAA,ER Sepsis Triage,A,2014-11-19 03:19:09,1


## _4. Create a function that, given an event log pre-processed as in step 3 (i.e., index-based encoded and containing only prefixes of a given length L), augments the log with the following resource-aware features:


In [31]:
def preprocessing(df):
    resource_list = df.resource.unique()
    df[['n-work-items','n-Curr-case', 'per-case','per-curr-ho']] =  pd.DataFrame([[np.nan, np.nan, np.nan,np.nan]], index=df.index)

    for i in resource_list:

        # 각 Resource의 df 생성
        df_target = df[df.resource == i]
        df_target['one'] = 1 
        df_target.loc[:,'previous_rcs'] = df.resource.shift(1)
        df_target.reset_index(inplace=True)

        for j in range(len(df_target)):
          # 해당 resource를 가진 event 추적 및 update

            # Keeping the original index of the event.
            idx = df_target['index'].loc[j].astype(int)
            # indicate the target time/case
            target_time = df_target.ts.loc[j]
            target_case = df_target.caseid.loc[j]

            # /n-work-items calculation 
            a = df_target.loc[(df_target['ts'] < target_time)]['one'].transform(pd.Series.cumsum).tail(1).values 

            # n-Curr-case calculation 
            b = df_target.loc[(df_target.loc[:,'caseid'] == target_case)&(df_target.ts < target_time)]['one'].transform(pd.Series.cumsum).tail(1).values

            # per-case calculation 
            c = len(df_target.loc[(df_target.ts< target_time)]['caseid'].unique())
            conditional_c = len(df_target.loc[(df_target.ts < target_time) & (df_target.loc[:,'outcome'].values==1)]['caseid'].unique())

            if a.size == 0:
                df.loc[idx,'n-work-items'] = 0
            else:
                a = a.astype(int)
                df.loc[idx,'n-work-items'] = a

            if b.size == 0:
                df.loc[idx,'n-Curr-case'] = 0
            else:
                b = b.astype(int)
                df.loc[idx,'n-Curr-case'] = b

            if c == 0 | conditional_c==0 | (c ==np.nan) | (conditional_c == np.nan):
                df.loc[idx,'per-case'] = 0
            else:
                df.loc[idx,'per-case'] = conditional_c/c

        # if there isn't the previous resource, the value of 'per-corr-ho' is 0.
        if df_target.previous_rcs.isnull().values.any() :
            df.loc[df_target['index'],'per-curr-ho'] = 0 


        list_reRcs = df_target.previous_rcs.unique()
        list_reRcs=[x for x in list_reRcs if x == x]

        for r in range(len(list_reRcs)):

            df_preRcs = df_target[df_target.previous_rcs == list_reRcs[r]]
            d = len(df_preRcs.caseid.unique())
            conditional_d = len(df_preRcs.loc[df_preRcs.outcome==1,'caseid'].unique())
            df.loc[df_preRcs['index'],'per-curr-ho']= conditional_d/d


    return df

In [32]:
df_final = preprocessing(df_prefix5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


## Q5. Train a decision tree and a gradient boosting model from the scikit-learn package for outcome label prediction using the event log created at step 4 for prefix length L=2 and L=5.  Split 70-30 for training-testing.

- Hyperparameters:
  - DT max_depth = 5, random_state = 1234
  - GBM max_depth = 3, random_state = 1234


In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score         # between pred_result and real outcome
from sklearn import preprocessing                   # To convert 'Activity' into the float type of data

from sklearn.ensemble import GradientBoostingClassifier

### Q5-1. Decision Tree 

- DT max_depth = 5, random_state = 1234



In [34]:
df_final.columns

Index(['caseid', 'activity', 'resource', 'ts', 'outcome', 'n-work-items',
       'n-Curr-case', 'per-case', 'per-curr-ho'],
      dtype='object')

In [35]:
# ,'activity','caseid','resource'
df_encorded = df_final

In [36]:
le = preprocessing.LabelEncoder()
df_encorded['new_activity']=le.fit_transform(df_final['activity'])
df_encorded['new_caseid']=le.fit_transform(df_final['caseid'])
df_encorded['new_resource']=le.fit_transform(df_final['resource'])
df_encorded['new_ts']=le.fit_transform(df_final['ts'])


In [37]:
#  define X,y
X = df_encorded.loc[:, df_encorded.columns.difference(['outcome','activity','caseid','resource','ts'])]
y = df_encorded['outcome']

In [38]:
# Data Prepare-> 70:30,(train/test, X,y)

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3 )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2782, 8), (1193, 8), (2782,), (1193,))

In [39]:
# Recreate the train/test data for dataset without the resource aware features. 
X_train_withoutRsc = X_train.loc[:, X_train.columns.difference(['n-Curr-case','n-work-items','per-case','per-curr-ho'])]
X_test_withoutRsc = X_test.loc[:, X_test.columns.difference(['n-Curr-case','n-work-items','per-case','per-curr-ho'])]


In [40]:
# Generate DT algorithm
clf = DecisionTreeClassifier( max_depth = 5, random_state = 1234)
# train
clf.fit( X_train, y_train)
# predict
y_pred = clf.predict( X_test )

print('Accuracy for data with the resource-aware features : ', accuracy_score(y_test, y_pred)  )

Accuracy for data with the resource-aware features :  0.8625314333612741


In [41]:
# Generate DT algorithm
clf = DecisionTreeClassifier( max_depth = 5, random_state = 1234)
# train
clf.fit( X_train_withoutRsc, y_train)
# predict
y_pred = clf.predict( X_test_withoutRsc )

print('Accuracy for data without the resource-aware features : ', accuracy_score(y_test, y_pred)  )

Accuracy for data without the resource-aware features :  0.8633696563285834


### Q5-2. a gradient boosting model

-  GBM max_depth = 3, random_state = 1234


In [42]:
# GBM
gb_clf = GradientBoostingClassifier(max_depth = 3, random_state = 1234 )
gb_clf.fit( X_train, y_train )
pred   = gb_clf.predict( X_test )
print('Accuracy for data with the resource-aware features :', accuracy_score(y_test, pred)  )

Accuracy for data with the resource-aware features : 0.8575020955574183


In [43]:
from sklearn.ensemble import GradientBoostingClassifier

# GBM
gb_clf = GradientBoostingClassifier(max_depth = 3, random_state = 1234 )
gb_clf.fit( X_train_withoutRsc, y_train )
pred   = gb_clf.predict( X_test_withoutRsc )
print('Accuracy for data without the resource-aware features :', accuracy_score(y_test, pred)  )

Accuracy for data without the resource-aware features : 0.8625314333612741
