In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime 

%matplotlib inline

# 1. Import an event log (csv format only) as a pandas dataframe

In [2]:
# Load the raw event log from the CSV file and preview its first two rows.
small = pd.read_csv(filepath_or_buffer='Small.csv')
small.head(n=2)

Unnamed: 0,Case ID,Activity,Complete Timestamp,Variant,Variant index
0,case_1153,Activity A,1970/01/01 09:00:00.000,Variant 2,2
1,case_1153,Activity B,1970/01/01 10:00:00.000,Variant 2,2


In [10]:
# Show the column labels of the raw event log before renaming.
small.columns

Index(['Case ID', 'Activity', 'Complete Timestamp', 'Variant',
       'Variant index'],
      dtype='object')

# 2. Rename the attributes as “caseid, activity, ts” if names are different (ts is for timestamp!)

In [11]:
# Map the original column headers onto the short names used for the event log.
column_aliases = {
    'Case ID': 'caseid',
    'Activity': 'activity',
    'Complete Timestamp': 'ts',
}
small = small.rename(columns=column_aliases)

In [12]:
# Preview the first two rows to confirm the renamed columns.
small.head(2)

Unnamed: 0,caseid,activity,ts,Variant,Variant index
0,case_1153,Activity A,1970/01/01 09:00:00.000,Variant 2,2
1,case_1153,Activity B,1970/01/01 10:00:00.000,Variant 2,2


# 3. Create an event log (= a new dataframe) retaining only the caseid, activity and ts attributes

In [13]:
# Keep only the event-log columns. Any listed column that does not exist in
# `small` is created and filled with missing values.
event_columns = ['caseid', 'activity', 'ts', 'act']
event_small_df = small.reindex(columns=event_columns)
event_small_df.head(2)

Unnamed: 0,caseid,activity,ts,act
0,case_1153,Activity A,1970/01/01 09:00:00.000,
1,case_1153,Activity B,1970/01/01 10:00:00.000,


In [14]:
# Derive the short activity code: the second space-separated token of the
# activity label (e.g. 'Activity A' -> 'A').
activity_tokens = event_small_df['activity'].str.split(pat = ' ', expand=True)
event_small_df['act'] = activity_tokens[1]

event_small_df.head()

Unnamed: 0,caseid,activity,ts,act
0,case_1153,Activity A,1970/01/01 09:00:00.000,A
1,case_1153,Activity B,1970/01/01 10:00:00.000,B
2,case_1153,Activity C,1970/01/01 11:00:00.000,C
3,case_1153,Activity D,1970/01/01 12:00:00.000,D
4,case_1153,Activity E,1970/01/01 13:00:00.000,E


# 4. Create a new dataframe with columns caseid, list of events in a case (ordered by timestamp)


In [15]:
# Start with an empty per-case table that carries only the two column labels.
case_columns = ['Caseid', 'List of events ordered by timestamp']
event_df = pd.DataFrame(columns=case_columns)

In [16]:

# Case identifiers in order of first appearance, and the events grouped per case.
id_list = event_small_df['caseid'].unique().tolist()
group_tmp = event_small_df.groupby(by='caseid')

In [17]:
# Build one record per case and construct the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
case_records = []
for caseid in id_list:
    # NOTE(review): 'ts' appears to still be text at this point, so the sort is
    # lexicographic; that matches chronological order only for zero-padded,
    # year-first timestamps -- confirm against the input file.
    case_events = group_tmp.get_group(caseid).sort_values(by='ts', ascending=True)
    case_records.append({
        'Caseid': caseid,
        'List of events ordered by timestamp': case_events['act'].to_list(),
    })

# Passing `columns` keeps both labels even when there are no cases at all.
event_df = pd.DataFrame(
    case_records,
    columns=['Caseid', 'List of events ordered by timestamp'],
)

In [18]:
# Preview the first three cases with their ordered activity lists.
event_df.head(3)

Unnamed: 0,Caseid,List of events ordered by timestamp
0,case_1153,"[A, B, C, D, E, G, H, I, J, F]"
1,case_7327,"[A, B, C, D, E, T, F]"
2,case_5863,"[A, B, C, D, E, G, H, I, J, F]"


# 5. Augment the event log created at 4 with two new attributes: Duration, Event_number
- Duration = ts(last event in case) - ts(first event in case)
- Event_number = number of events in a case


In [19]:
# Add placeholder columns (all NaN) for the two per-case metrics.
for metric_column in ('Duration', 'Event_number'):
    event_df[metric_column] = np.nan
event_df.head(5)

Unnamed: 0,Caseid,List of events ordered by timestamp,Duration,Event_number
0,case_1153,"[A, B, C, D, E, G, H, I, J, F]",,
1,case_7327,"[A, B, C, D, E, T, F]",,
2,case_5863,"[A, B, C, D, E, G, H, I, J, F]",,
3,case_4346,"[A, B, C, D, E, G, H, I, J, F]",,
4,case_5961,"[A, B, C, D, E, T, F]",,


In [20]:
# Compute both per-case metrics in one grouped pass instead of building a
# boolean mask over event_df for every case (quadratic in the number of cases).
# Assigning whole columns also avoids writing Timedelta/int values into
# float64 NaN placeholders, which recent pandas versions reject as an
# incompatible-dtype assignment.
event_ts = pd.to_datetime(event_small_df['ts'])
ts_by_case = event_ts.groupby(event_small_df['caseid'])

# Duration = ts(last event in case) - ts(first event in case)
case_duration = ts_by_case.max() - ts_by_case.min()
# Number of event rows per case (rows are counted even if their ts is missing).
case_event_count = ts_by_case.size()

# Align the per-case results with event_df through its 'Caseid' column.
# 'Event_number' becomes an integer column rather than the float produced by
# filling NaN placeholders.
event_df['Duration'] = event_df['Caseid'].map(case_duration)
event_df['Event_number'] = event_df['Caseid'].map(case_event_count)

In [21]:
# Preview the first five cases with their duration and event count filled in.
event_df.head(5)

Unnamed: 0,Caseid,List of events ordered by timestamp,Duration,Event_number
0,case_1153,"[A, B, C, D, E, G, H, I, J, F]",0 days 09:00:00,10.0
1,case_7327,"[A, B, C, D, E, T, F]",0 days 06:00:00,7.0
2,case_5863,"[A, B, C, D, E, G, H, I, J, F]",0 days 09:00:00,10.0
3,case_4346,"[A, B, C, D, E, G, H, I, J, F]",0 days 09:00:00,10.0
4,case_5961,"[A, B, C, D, E, T, F]",0 days 06:00:00,7.0


In [22]:
# Check the last rows as well, to confirm every case received its metrics.
event_df.tail()

Unnamed: 0,Caseid,List of events ordered by timestamp,Duration,Event_number
12495,case_11819,"[A, B, C, D, E, T, F]",0 days 06:00:00,7.0
12496,case_1768,"[A, B, C, D, E, T, F]",0 days 06:00:00,7.0
12497,case_10585,"[A, B, C, D, E, T, F]",0 days 06:00:00,7.0
12498,case_12401,"[A, B, C, D, E, T, F]",0 days 06:00:00,7.0
12499,case_3542,"[A, B, C, D, E, K, O, P, L, F]",0 days 09:00:00,10.0


# 6. Find the longest/shortest case by duration - duration = ts(last event in case) - ts(first event in case)

In [23]:
# Collect every case whose duration equals the maximum duration (ties are kept),
# then show how many there are together with their ids.
longest_duration = event_df['Duration'].max()
has_longest_duration = event_df['Duration'] == longest_duration
caseList_maxDuration = event_df.loc[has_longest_duration, 'Caseid'].to_list()
len(caseList_maxDuration), caseList_maxDuration

(7244,
 ['case_1153',
  'case_5863',
  'case_4346',
  'case_10126',
  'case_11902',
  'case_6503',
  'case_9347',
  'case_10955',
  'case_11690',
  'case_778',
  'case_507',
  'case_1955',
  'case_6185',
  'case_9151',
  'case_12229',
  'case_396',
  'case_3630',
  'case_12010',
  'case_930',
  'case_1259',
  'case_6888',
  'case_6635',
  'case_9285',
  'case_1200',
  'case_11291',
  'case_9119',
  'case_10455',
  'case_4984',
  'case_2324',
  'case_6527',
  'case_6961',
  'case_888',
  'case_2746',
  'case_10741',
  'case_10565',
  'case_6718',
  'case_9528',
  'case_6832',
  'case_1894',
  'case_3016',
  'case_4884',
  'case_6926',
  'case_7776',
  'case_11027',
  'case_8440',
  'case_3628',
  'case_4846',
  'case_6937',
  'case_1623',
  'case_2644',
  'case_9410',
  'case_5666',
  'case_11191',
  'case_3728',
  'case_928',
  'case_8790',
  'case_4196',
  'case_66',
  'case_8853',
  'case_724',
  'case_943',
  'case_12266',
  'case_1968',
  'case_2046',
  'case_10613',
  'case_11126'

In [24]:
# Collect every case whose duration equals the minimum duration (ties are kept),
# then show how many there are together with their ids.
shortest_duration = event_df['Duration'].min()
has_shortest_duration = event_df['Duration'] == shortest_duration
caseList_minDuration = event_df.loc[has_shortest_duration, 'Caseid'].to_list()
len(caseList_minDuration), caseList_minDuration

(4188,
 ['case_7327',
  'case_5961',
  'case_360',
  'case_7224',
  'case_8114',
  'case_1841',
  'case_6899',
  'case_4847',
  'case_4941',
  'case_4816',
  'case_634',
  'case_3703',
  'case_1679',
  'case_2907',
  'case_2378',
  'case_10944',
  'case_9168',
  'case_3130',
  'case_7397',
  'case_12106',
  'case_2910',
  'case_6542',
  'case_5631',
  'case_6732',
  'case_11204',
  'case_10908',
  'case_5065',
  'case_7775',
  'case_2656',
  'case_10301',
  'case_5574',
  'case_1518',
  'case_3372',
  'case_6098',
  'case_4600',
  'case_2597',
  'case_3121',
  'case_4128',
  'case_10181',
  'case_5504',
  'case_1503',
  'case_196',
  'case_11517',
  'case_8669',
  'case_1813',
  'case_4700',
  'case_7509',
  'case_4291',
  'case_5110',
  'case_7154',
  'case_6661',
  'case_4778',
  'case_7555',
  'case_1073',
  'case_3048',
  'case_5455',
  'case_6706',
  'case_509',
  'case_7226',
  'case_10206',
  'case_6621',
  'case_1233',
  'case_4104',
  'case_584',
  'case_2189',
  'case_5033',


# 7. Find the longest case(s) by number of events

In [25]:
# Collect every case that has the largest number of events (ties are kept)
# and show how many such cases there are.
most_events = event_df['Event_number'].max()
has_most_events = event_df['Event_number'] == most_events
list_max_eventno = event_df.loc[has_most_events, 'Caseid'].to_list()
len(list_max_eventno)

7244

In [26]:
# Display the ids of all cases that have the maximum number of events.
# NOTE(review): this prints the entire list; consider showing a slice such as
# list_max_eventno[:10] if the list is long.
list_max_eventno

['case_1153',
 'case_5863',
 'case_4346',
 'case_10126',
 'case_11902',
 'case_6503',
 'case_9347',
 'case_10955',
 'case_11690',
 'case_778',
 'case_507',
 'case_1955',
 'case_6185',
 'case_9151',
 'case_12229',
 'case_396',
 'case_3630',
 'case_12010',
 'case_930',
 'case_1259',
 'case_6888',
 'case_6635',
 'case_9285',
 'case_1200',
 'case_11291',
 'case_9119',
 'case_10455',
 'case_4984',
 'case_2324',
 'case_6527',
 'case_6961',
 'case_888',
 'case_2746',
 'case_10741',
 'case_10565',
 'case_6718',
 'case_9528',
 'case_6832',
 'case_1894',
 'case_3016',
 'case_4884',
 'case_6926',
 'case_7776',
 'case_11027',
 'case_8440',
 'case_3628',
 'case_4846',
 'case_6937',
 'case_1623',
 'case_2644',
 'case_9410',
 'case_5666',
 'case_11191',
 'case_3728',
 'case_928',
 'case_8790',
 'case_4196',
 'case_66',
 'case_8853',
 'case_724',
 'case_943',
 'case_12266',
 'case_1968',
 'case_2046',
 'case_10613',
 'case_11126',
 'case_944',
 'case_10683',
 'case_4542',
 'case_5206',
 'case_9977',
 