In [23]:
from datetime import datetime
import pandas as pd
import random
import numpy as np

## BPI12 - Reduce Size

In [36]:
# File name of the event log to process; later cells interpolate it into '../data/...' paths
eventlog = "bpi_12_w.csv"

In [88]:
# Read the selected event log from the ../data directory into a DataFrame
df = pd.read_csv('../data/%s' % eventlog)

In [4]:
# Preview the first three rows of the loaded log
df.head(3)

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,173688,3,2011-10-01 19:45:13
1,173688,5,2011-10-01 20:17:08
2,173688,5,2011-10-09 00:32:00


In [5]:
# Array of the distinct case identifiers that occur in the log
CaseID = df['CaseID'].unique()

In [6]:
# Fix the RNG seed so the same cases are drawn on every run
random.seed(42)

# Draw half of the distinct case IDs (rounded down), without replacement
half_count = len(CaseID) // 2
sampled_CaseID = random.sample(list(CaseID), k=half_count)

In [7]:
# Keep every event that belongs to one of the sampled cases
in_sample = df['CaseID'].isin(sampled_CaseID)
filtered_df = df.loc[in_sample]

In [8]:
# Write the sampled events to ../data/randomsampled_<eventlog>, leaving out the DataFrame index
filtered_df.to_csv('../data/randomsampled_%s' % eventlog, index=False)

## BPI12 - Increase Representation

In [38]:
# Group the activity IDs by case once and reuse the grouping below
activities_by_case = df.groupby('CaseID')['ActivityID']

# Number of distinct activities that occur in each case
unique_activities_per_case = activities_by_case.nunique()

# Number of (non-null) activity entries recorded for each case
total_activities_per_case = activities_by_case.count()

# Rework = entries beyond the first occurrence of each activity within a case
rework_activities_per_case = total_activities_per_case - unique_activities_per_case

# Assemble a per-case summary table and print it
summary_columns = {
    'CaseID': unique_activities_per_case.index,
    'UniqueActivityCount': unique_activities_per_case.values,
    'ReworkActivityCount': rework_activities_per_case.values,
}
results = pd.DataFrame(summary_columns)
print(results)

      CaseID  UniqueActivityCount  ReworkActivityCount
0     173688                    3                    2
1     173691                    3                    6
2     173694                    3                   13
3     173703                    1                    1
4     173706                    2                    2
...      ...                  ...                  ...
9653  214361                    3                    4
9654  214364                    2                    4
9655  214370                    1                    0
9656  214373                    3                    0
9657  214376                    1                    0

[9658 rows x 3 columns]


In [69]:
# Select the cases whose rework count is strictly below 7 (i.e. at most 6).
# NOTE(review): this comment previously said "less than 8" while the code
# compares against 7 (which also matches the '7sampled_' output name) —
# confirm the intended cut-off.
low_rework_mask = rework_activities_per_case < 7
filtered_cases = rework_activities_per_case[low_rework_mask].index

# Keep every event of the selected cases
belongs_to_selected = df['CaseID'].isin(filtered_cases)
filtered_df2 = df[belongs_to_selected]

In [70]:
# Write the low-rework subset to ../data/7sampled_<eventlog>, leaving out the DataFrame index
filtered_df2.to_csv('../data/7sampled_%s' % eventlog, index=False)

In [64]:
# Select the cases that contain no rework (no activity occurs more than once)
# and that have more than two events.
#
# The per-case statistics are broadcast back onto the event rows with
# transform(), so this cell depends only on `df` (not on Series computed in
# another cell) and avoids calling a Python lambda once per case.
activity_groups = df.groupby('CaseID')['ActivityID']

# No rework: every non-null activity entry in the case is distinct
has_no_rework = activity_groups.transform('count') == activity_groups.transform('nunique')

# Case length is the number of rows in the case, matching len(group) > 2
is_long_enough = activity_groups.transform('size') > 2

# Distinct IDs of the qualifying cases, in order of first appearance
norework_cases = df.loc[has_no_rework & is_long_enough, 'CaseID'].unique()

# Filter the original dataset based on the selected cases
norework = df[df['CaseID'].isin(norework_cases)]

In [66]:
# Write the no-rework subset to ../data/norework_<eventlog>, leaving out the DataFrame index
norework.to_csv('../data/norework_%s' % eventlog, index=False)

## helpdesk - Increase Size

In [90]:
# Switch to the helpdesk event log. This rebinds `eventlog` and `df`, so any
# cell that reads them afterwards operates on the helpdesk data.
eventlog = "helpdesk.csv"
df = pd.read_csv('../data/%s' % eventlog)

In [91]:
# Distinct case identifiers present in the currently loaded log
CaseID = df['CaseID'].unique()

# Seed the RNG so the draw below is reproducible
random.seed(42)

# Draw half of the cases (rounded down) without replacement
n_sampled = len(CaseID) // 2
sampled_CaseID = random.sample(list(CaseID), k=n_sampled)

In [92]:
# Retain only the events whose case was drawn in the sample
is_sampled_case = df['CaseID'].isin(sampled_CaseID)
filtered_df = df.loc[is_sampled_case]

In [93]:
# Write the sampled events to ../data/randomsampled_<eventlog>, leaving out the DataFrame index
filtered_df.to_csv('../data/randomsampled_%s' % eventlog, index=False)

## Env Permit - Reduce Variability

In [108]:
# Switch to the environmental-permit event log. This rebinds `eventlog` and
# `df`, so any cell that reads them afterwards operates on this data.
eventlog = "preprocessed_env_frequent.csv"
df = pd.read_csv('../data/%s' % eventlog)

In [114]:
# Replicate the event log `repeat_times` times. In each copy the CaseID is
# prefixed with the zero-padded copy number, so cases coming from different
# copies get different IDs (CaseID becomes a string column as a result).
repeat_times = 30

# Pad the copy number to at least two digits (the established format) and
# widen automatically for three-or-more-digit repeat counts so that every
# prefix has the same length.
prefix_width = max(2, len(str(repeat_times)))

# Build all copies first and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies the accumulated rows on every iteration.
copies = [
    df.assign(CaseID=str(repeat).zfill(prefix_width) + df['CaseID'].astype(str))
    for repeat in range(1, repeat_times + 1)
]

# ignore_index=True gives the result a fresh 0..n-1 index. pd.concat rejects
# an empty list, so fall back to an empty frame when repeat_times is 0.
new_data = pd.concat(copies, ignore_index=True) if copies else df.iloc[:0].copy()

In [115]:
 # Display the replicated event log (the notebook renders the cell's last expression)
 new_data

Unnamed: 0,CaseID,ActivityID,CompleteTimestamp
0,012876753,23,2010-11-23 09:00:00
1,012876753,28,2010-11-23 09:00:00
2,012876753,25,2010-11-24 00:00:09
3,012876753,26,2010-11-24 00:00:16
4,012876753,27,2010-11-26 20:17:18
...,...,...,...
7135,304573766,26,2012-02-14 17:54:21
7136,305271267,23,2012-07-06 08:00:00
7137,305271267,24,2012-07-09 18:19:26
7138,305271267,25,2012-07-09 18:19:26


In [117]:
# Write the replicated log to ../data/30times_<eventlog>, leaving out the DataFrame index.
# NOTE: the '30times_' prefix is hard-coded and does not follow `repeat_times`;
# keep the two in sync if the repeat count is changed.
new_data.to_csv('../data/30times_%s' % eventlog, index=False)