In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from pathlib import Path
#from torch import cdist
import os
import time

In [18]:
from catboost import CatBoostClassifier
# from hmmlearn import hmm
import sklearn 
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline 
from sklearn.compose import make_column_selector, make_column_transformer

# import math
# from statsmodels.tsa.arima.model import ARIMA

In [2]:
import category_encoders as ce

In [3]:
import warnings
# Suppress every warning for the rest of the session. This also hides
# pandas/numpy diagnostics (e.g. chained-assignment or divide-by-zero warnings),
# so comment this line out when debugging unexpected results.
warnings.filterwarnings('ignore')

In [4]:
# Walk upwards from the current working directory until a folder that contains
# a "data" entry is found, then make that folder the working directory.
# NOTE(review): presumably this is what lets the `data.*` / `processors`
# imports in the following cell resolve - confirm against the repo layout.
ROOT_DIR = Path().cwd()
while not ROOT_DIR.joinpath("data").exists():
    if ROOT_DIR.parent == ROOT_DIR:
        # The parent of a filesystem root is the root itself, so without this
        # guard the loop would spin forever when no ancestor contains "data".
        raise FileNotFoundError(
            f"No 'data' directory found in {Path().cwd()} or any of its parents"
        )
    ROOT_DIR = ROOT_DIR.parent
os.chdir(ROOT_DIR)

In [12]:
from processors import processor_factory
from data.DatasetManager import DatasetManager
import data.EncoderFactory as EncoderFactory

In [11]:
def get_first_n_cases(df, n, case_id_col=None, timestamp_col=None):
    """Return the events of the ``n`` cases that started earliest.

    A case's start is the minimum of ``timestamp_col`` over its events.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with one row per event.
    n : int
        Number of cases to keep.
    case_id_col, timestamp_col : str, optional
        Column names. When omitted they are read from the notebook-level
        ``dataset_manager`` (the original behaviour), so existing two-argument
        calls keep working.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows, in their original order, so
        callers may assign into it without touching ``df``.
    """
    if case_id_col is None:
        case_id_col = dataset_manager.case_id_col
    if timestamp_col is None:
        timestamp_col = dataset_manager.timestamp_col

    case_start_times = df.groupby(case_id_col)[timestamp_col].min()
    # A stable sort keeps the selection deterministic when several cases share
    # the same start time.
    first_case_ids = case_start_times.sort_values(kind="mergesort").index[:n]
    return df[df[case_id_col].isin(first_case_ids)].copy()

In [13]:
def return_last_row(group):
    """Select the row of ``group`` that carries the largest ``event_nr``.

    ``idxmax`` yields the index label of the first maximum; the function
    returns whatever ``group.loc`` gives for that label (a single row when the
    index labels are unique).
    """
    last_event_label = group['event_nr'].idxmax()
    return group.loc[last_event_label]

In [9]:
# Dataset references to analyse. A reference is either a concrete dataset name
# or the name of a family that expands to several datasets (see the mapping).
dataset_ref = [
    "bpic2011",
    "bpic2015",
    "bpic2012_cancelled",
    "sepsis_cases",
    "bpic2012_accepted",
    "bpic2012_declined",
    'bpic2017_accepted',
    'bpic2017_cancelled',
    'bpic2017_refused',
    'production',
    'traffic_fines_1',
]
dataset_ref_to_datasets = {
    "bpic2011": ["bpic2011_f%s" % formula for formula in range(1, 5)],
    "bpic2015": ["bpic2015_%s_f2" % (municipality) for municipality in range(1, 6)],
    "sepsis_cases": ["sepsis_cases_1", "sepsis_cases_2", "sepsis_cases_4"],
}

# Expand family references in place; any other reference stands for itself.
datasets = [
    name
    for ref in dataset_ref
    for name in dataset_ref_to_datasets.get(ref, [ref])
]

print(datasets)

['bpic2011_f1', 'bpic2011_f2', 'bpic2011_f3', 'bpic2011_f4', 'bpic2015_1_f2', 'bpic2015_2_f2', 'bpic2015_3_f2', 'bpic2015_4_f2', 'bpic2015_5_f2', 'bpic2012_cancelled', 'sepsis_cases_1', 'sepsis_cases_2', 'sepsis_cases_4', 'bpic2012_accepted', 'bpic2012_declined', 'bpic2017_accepted', 'bpic2017_cancelled', 'bpic2017_refused', 'production', 'traffic_fines_1']


In [19]:
# For every dataset: build the per-prefix feature table (`merged_df`) and record
# its shape before and after the numeric/categorical preprocessing step.
#
# NOTE: `dataset_name`, `dataset_manager` and `merged_df` keep the values of the
# LAST iteration once this loop ends. Later cells in this notebook read them, so
# those cells only ever operate on the final entry of `datasets`.
MIN_ACTIVITY_COUNT = 100  # activities seen fewer times are collapsed into "other"
SHAPE_COLUMNS = ['dataset_name', 'num_rows', 'num_columns']

# Collect plain records and build the DataFrames once after the loop instead of
# re-allocating a DataFrame with pd.concat on every iteration.
dataset_shape_records = []
transformed_shape_records = []

for dataset_name in datasets:
    dataset_manager = DatasetManager(dataset_name)
    df = dataset_manager.read_dataset()
    df.sort_values([dataset_manager.case_id_col, dataset_manager.timestamp_col], inplace=True)
    max_case_num = df[dataset_manager.case_id_col].nunique()
    print(f"Number of cases in {dataset_name}: {max_case_num}")

    # All cases are kept (n == number of cases). The explicit copy guarantees
    # that the relabelling below writes to an independent frame rather than to
    # a filtered slice of `df`.
    filtered_df = get_first_n_cases(df, max_case_num).copy()
    for col in [dataset_manager.activity_col]:
        counts = filtered_df[col].value_counts()
        mask = filtered_df[col].isin(counts[counts >= MIN_ACTIVITY_COUNT].index)
        filtered_df.loc[~mask, col] = "other"

    # Determine min and max (truncated) prefix lengths
    min_prefix_length = 1
    if "traffic_fines" in dataset_name:
        max_prefix_length = 10
    elif "bpic2017" in dataset_name:
        max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(filtered_df, 0.90))
    else:
        max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(filtered_df, 0.90))

    start_test_prefix_generation = time.time()
    print("Generating prefix data...")
    dt_prefixes = dataset_manager.generate_prefix_data(filtered_df, min_prefix_length, max_prefix_length)
    test_prefix_generation_time = time.time() - start_test_prefix_generation

    encoder = EncoderFactory.get_encoder(method='agg', case_id_col=dataset_manager.case_id_col, static_cat_cols=dataset_manager.static_cat_cols,
                                     static_num_cols=dataset_manager.static_num_cols, dynamic_cat_cols=dataset_manager.dynamic_cat_cols,
                                     dynamic_num_cols=dataset_manager.dynamic_num_cols, fillna=True, max_events=None,
                                     activity_col=dataset_manager.activity_col, resource_col=None,
                                     timestamp_col=dataset_manager.timestamp_col, scale_model=None)

    dt_transformed = encoder.transform(dt_prefixes)

    # Keep only the last observed event of every prefix case together with the
    # static attributes.
    subset = dt_prefixes[[dataset_manager.case_id_col, dataset_manager.timestamp_col, dataset_manager.activity_col, dataset_manager.label_col, 'event_nr', 'case_length'] + dataset_manager.static_num_cols+dataset_manager.static_cat_cols]
    subset = subset.groupby(dataset_manager.case_id_col).apply(return_last_row).reset_index(drop=True)
    # 'finished' is 1 when the prefix already covers the whole case
    subset['finished'] = (subset['event_nr'] == subset['case_length']).astype(int)

    merged_df = pd.merge(subset, dt_transformed, on=[dataset_manager.case_id_col])

    # Object columns that hold nothing but textual booleans become nullable
    # boolean columns.
    object_columns = merged_df.select_dtypes(include=['object']).columns
    for col in object_columns:
        if merged_df[col].isin(['True', 'False', 'TRUE', 'FALSE', 'true', 'false']).all():
            merged_df[col] = merged_df[col].str.lower().map({'true': True, 'false': False})
            merged_df[col] = merged_df[col].astype('boolean')

    dataset_shape_records.append({
        'dataset_name': dataset_name,
        'num_rows': merged_df.shape[0],
        'num_columns': merged_df.shape[1]
    })

    print(merged_df.shape)

    preprocessor = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=['int64', 'float64'])),  # Scale continuous variables
        (ce.quantile_encoder.SummaryEncoder(), make_column_selector(dtype_include=['object', 'category'])),  # Encode categorical variables
        remainder='drop')

    features_used = merged_df.columns.difference([dataset_manager.label_col, dataset_manager.timestamp_col, dataset_manager.case_id_col, 'event_nr', 'case_length', 'finished'], sort=False)
    merged_df_transformed = preprocessor.fit_transform(merged_df[features_used], merged_df[dataset_manager.label_col])

    transformed_shape_records.append({
        'dataset_name': dataset_name,
        'num_rows': merged_df_transformed.shape[0],
        'num_columns': merged_df_transformed.shape[1]
    })

dataset_shapes = pd.DataFrame(dataset_shape_records, columns=SHAPE_COLUMNS)
transformed_shapes = pd.DataFrame(transformed_shape_records, columns=SHAPE_COLUMNS)

# Print the final DataFrame with dataset names and shapes
print(transformed_shapes.head(10))

Number of cases in bpic2011_f1: 1140
Generating prefix data...
Activity code
(24176, 229)
Number of cases in bpic2011_f2: 1140
Generating prefix data...
Activity code
(31235, 265)
Number of cases in bpic2011_f3: 1121
Generating prefix data...
Activity code
(20534, 223)
Number of cases in bpic2011_f4: 1140
Generating prefix data...
Activity code
(30928, 253)
Number of cases in bpic2015_1_f2: 696
Generating prefix data...
Activity
(23786, 201)
Number of cases in bpic2015_2_f2: 753
Generating prefix data...
Activity
(28377, 205)
Number of cases in bpic2015_3_f2: 1328
Generating prefix data...
Activity
(48333, 212)
Number of cases in bpic2015_4_f2: 577
Generating prefix data...
Activity
(20464, 163)
Number of cases in bpic2015_5_f2: 1051
Generating prefix data...
Activity
(40530, 213)
Number of cases in bpic2012_cancelled: 4685
Generating prefix data...
Activity
(155783, 144)
Number of cases in sepsis_cases_1: 782
Generating prefix data...
Activity
(12189, 122)
Number of cases in sepsis_ca

In [16]:
# Persist the per-dataset shapes of the merged (pre-preprocessing) feature tables.
dataset_shapes.to_csv("dataset_sizes.csv", index=False)

In [20]:
# Persist the per-dataset shapes after the StandardScaler / SummaryEncoder step.
transformed_shapes.to_csv("transformed_dataset_sizes.csv", index=False)

In [24]:
# Case-level class-balance statistics for every dataset.
#
# NOTE: this loop rebinds `dataset_name` and `dataset_manager`; after it ends
# they refer to the last entry of `datasets`.
results = []
for dataset_name in datasets:
    dataset_manager = DatasetManager(dataset_name)
    df = dataset_manager.read_dataset()
    print(f"Number of cases in {dataset_name}: {df[dataset_manager.case_id_col].nunique()}")

    # One label per case: the most frequent label among the case's events.
    # The grouping does not depend on row order, so no sort is needed, and
    # every case is used, so no case filtering is needed either.
    case_class = (
        df.groupby(dataset_manager.case_id_col)[dataset_manager.label_col]
        .agg(lambda labels: labels.mode()[0])
    )

    # Count how many cases are "regular" and how many are "deviant"
    regular_count = (case_class == "regular").sum()
    deviant_count = (case_class == "deviant").sum()

    # Class proportions among the labelled cases
    total_count = regular_count + deviant_count
    p_regular = regular_count / total_count
    p_deviant = deviant_count / total_count

    # The counts are numpy integers, so an empty class yields inf/nan here
    # rather than raising ZeroDivisionError.
    majority_to_minority_ratio = max(regular_count, deviant_count) / min(regular_count, deviant_count)
    gini_index = 1 - (p_regular**2 + p_deviant**2)

    # 'imbalance_ratio' and 'majority_to_minority_ratio' are the same quantity;
    # both keys are kept so the exported CSV keeps its columns.
    results.append({
        "dataset_name": dataset_name,
        "imbalance_ratio": majority_to_minority_ratio,
        "gini_index": gini_index,
        "majority_to_minority_ratio": majority_to_minority_ratio
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

Number of cases in bpic2011_f1: 1140
Number of cases in bpic2011_f2: 1140
Number of cases in bpic2011_f3: 1121
Number of cases in bpic2011_f4: 1140
Number of cases in bpic2015_1_f2: 696
Number of cases in bpic2015_2_f2: 753
Number of cases in bpic2015_3_f2: 1328
Number of cases in bpic2015_4_f2: 577
Number of cases in bpic2015_5_f2: 1051
Number of cases in bpic2012_cancelled: 4685
Number of cases in sepsis_cases_1: 782
Number of cases in sepsis_cases_2: 782
Number of cases in sepsis_cases_4: 782
Number of cases in bpic2012_accepted: 4685
Number of cases in bpic2012_declined: 4685
Number of cases in bpic2017_accepted: 31413
Number of cases in bpic2017_cancelled: 31413
Number of cases in bpic2017_refused: 31413
Number of cases in production: 220
Number of cases in traffic_fines_1: 129615


In [25]:
# Persist the per-dataset class-imbalance metrics computed in the previous cell.
results_df.to_csv("dataset_imbalance_metrics.csv", index=False)


In [None]:
def find_threshold(proba_values, true_values, thresholds=None):
    """Find the probability cut-off(s) that maximise accuracy.

    A sample is predicted "regular" when its probability is strictly greater
    than the threshold.

    Parameters
    ----------
    proba_values : array-like of float
        Predicted probability of the "regular" class, one value per sample.
    true_values : array-like of str
        True labels; every entry must be ``'regular'`` or ``'deviant'``.
        Any array-like is accepted (Series, ndarray, list).
    thresholds : array-like of float, optional
        Candidate cut-offs. Defaults to ``np.arange(0, 1.0, 0.05)``.

    Returns
    -------
    best_threshold : numpy.ndarray
        All candidate thresholds that reach the best accuracy (several may tie).
    best_accuracy : float
        The best accuracy reached.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or contain a label other
        than ``'regular'`` / ``'deviant'``.
    """
    if thresholds is None:
        thresholds = np.arange(0, 1.0, 0.05)
    else:
        thresholds = np.asarray(thresholds, dtype=float).ravel()

    probabilities = np.asarray(proba_values, dtype=float).ravel()

    # Map labels positionally (the index of a passed Series is ignored) so the
    # comparison below can never be affected by index alignment.
    mapped_labels = pd.Series(np.asarray(true_values).ravel()).map({'regular': True, 'deviant': False})
    if mapped_labels.isna().any():
        raise ValueError("true_values may only contain 'regular' or 'deviant'")
    is_regular = mapped_labels.to_numpy(dtype=bool)

    if probabilities.shape[0] != is_regular.shape[0]:
        raise ValueError("proba_values and true_values must have the same length")
    if probabilities.size == 0:
        raise ValueError("cannot select a threshold from empty input")

    # Rows: thresholds, columns: samples.
    predicted_regular = probabilities[np.newaxis, :] > thresholds[:, np.newaxis]
    accuracies = (predicted_regular == is_regular[np.newaxis, :]).mean(axis=1)

    best_accuracy = accuracies.max()
    best_threshold = thresholds[accuracies == best_accuracy]
    return best_threshold, best_accuracy

In [None]:
# Accumulator for per-batch prediction frames; the experiment cell below
# re-initialises it before use.
results = []
# Classifier used by the experiment and baseline cells. The branches there
# recognise 'Catboost', 'HMM' and 'LogisticRegression'.
method = 'Catboost'

In [None]:
# Just-in-time learning (JITL) experiment.
#
# For every (num_nearest_neighbors, distance_metric) configuration the data is
# split into a historic and a current part. The current part is processed in
# batches: for each batch the nearest historic neighbours of its rows are
# retrieved, a fresh classifier is trained and calibrated on those neighbours,
# the batch is predicted, and the cases that finished inside the batch are
# added to the historic pool before the neighbour index is refitted.
#
# NOTE: relies on `dataset_name`, `dataset_manager`, `merged_df` and `method`
# left behind by earlier cells (i.e. the last dataset that was processed).
results = []
trainingtimes = []
AUCs = []
num_nearest_neighbors_values = [300]
distance_metrics = ['euclidean']
results_df = pd.DataFrame()

batch_size = 100
# Columns CatBoost must treat as categorical.
cat_feature_cols = [dataset_manager.activity_col] + dataset_manager.static_cat_cols

# Loop through the values and create processors
for num_nearest_neighbors in num_nearest_neighbors_values:
    for distance_metric in distance_metrics:
        print(f"Experimenting with num_nearest_neighbors={num_nearest_neighbors} and distance_metric={distance_metric}")

        # Start every configuration with empty accumulators; otherwise the
        # predictions, timings and AUCs of earlier configurations would leak
        # into the metrics and the CSV of later ones.
        results = []
        trainingtimes = []
        AUCs = []

        # Create the processor with the current values
        processor = processor_factory.get_processor(dataset_name, use_encoding=False, use_bucketing=False, num_nearest_neighbors=num_nearest_neighbors, distance_metric=distance_metric)

        data = merged_df.sort_values([dataset_manager.case_id_col, dataset_manager.timestamp_col], ascending=True, kind='mergesort')
        historic, current = processor.split_data_strict(data, train_ratio=0.5)
        historic = historic.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')
        current = current.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')

        # Everything except identifiers, the label and bookkeeping columns
        features_used = historic.columns.difference([dataset_manager.label_col, dataset_manager.timestamp_col, dataset_manager.case_id_col, 'event_nr', 'case_length', 'finished'], sort=False)

        initial_start_time = time.time()

        # Feature transformation used ONLY for the neighbour search
        preprocessor = make_column_transformer(
            (StandardScaler(), make_column_selector(dtype_include=['int64', 'float64'])),  # Scale continuous variables
            (ce.quantile_encoder.SummaryEncoder(), make_column_selector(dtype_include=['object', 'category'])),  # Encode categorical variables
            remainder='drop')  # Drop other columns

        nn_model = NearestNeighbors(n_neighbors=num_nearest_neighbors, metric=distance_metric)

        historic_transformed = preprocessor.fit_transform(historic[features_used], historic[dataset_manager.label_col])
        nn_model.fit(historic_transformed)

        initial_end_time = time.time()

        initial_training_time = initial_end_time - initial_start_time
        print("Initial Model Training time: ", initial_training_time, " seconds")

        for start in range(0, len(current), batch_size):
            end = start + batch_size
            # Copy so that the prediction columns added below are written to an
            # independent frame instead of a slice of `current`.
            batch = current.iloc[start:end].copy()

            # `indices` holds one row of neighbour positions per batch row;
            # flattening it row by row gives the neighbours in query order.
            distances, indices = nn_model.kneighbors(preprocessor.transform(batch[features_used]))
            nearest_neighbors = historic.iloc[indices.ravel()]
            print(nearest_neighbors.shape)

            target = nearest_neighbors[dataset_manager.label_col].values

            # Split into training and calibration sets
            X_train, X_cal, y_train, y_cal = train_test_split(nearest_neighbors, target, test_size=0.3, random_state=42)

            start_time = time.time()
            if method == 'Catboost':
                model = CatBoostClassifier(iterations=100, loss_function='Logloss', eval_metric='AUC', verbose=0, cat_features=cat_feature_cols)
                print('Now training')
                model.fit(X_train[features_used], y_train, cat_features=cat_feature_cols)
            elif method == 'HMM':
                # NOTE(review): `hmm` is only defined if the commented-out
                # `from hmmlearn import hmm` import is restored.
                model = hmm.GaussianHMM(n_components=7)  # Specify the number of hidden states
                model.fit(nearest_neighbors[features_used])
            elif method == 'LogisticRegression':
                # Use a separate name: reusing `preprocessor` here would
                # replace the transformer the neighbour search depends on.
                lr_preprocessor = make_column_transformer(
                    (OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include='object')),
                    remainder='passthrough'
                )
                model = make_pipeline(
                    lr_preprocessor,
                    LogisticRegression()
                )
                model.fit(X_train[features_used], y_train)
            else:
                # Fail loudly instead of silently reusing a model left over
                # from a previous batch or cell.
                raise ValueError(f"Unsupported method: {method!r}")

            end_time = time.time()

            training_time = (end_time - start_time) / 60
            print("Training time: ", training_time, " minutes")
            trainingtimes.append(training_time)

            # Calibrate the fitted model on the held-out neighbours
            calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')
            calibrated_model.fit(X_cal[features_used], y_cal)

            # Hard predictions come from the uncalibrated model, probabilities
            # from the calibrated wrapper.
            print('Now predicting')
            preds = np.asarray(model.predict(batch[features_used])).ravel()
            # NOTE(review): column 1 is treated as P('regular'); this assumes
            # the class order is ['deviant', 'regular'] - confirm via model.classes_.
            probs = calibrated_model.predict_proba(batch[features_used])[:, 1]
            y_true = batch[dataset_manager.label_col].values
            # ROC AUC is undefined when the batch contains a single class
            if len(set(y_true)) > 1:
                AUCs.append(roc_auc_score(y_true, probs))
            else:
                print("Warning: Only one class present in y_true. Skipping ROC AUC calculation.")

            # Pick the accuracy-maximising cut-off on the calibration split;
            # ties are resolved by taking the median of the tied thresholds.
            cal_probs = calibrated_model.predict_proba(X_cal[features_used])[:, 1]
            cal_true = X_cal[dataset_manager.label_col]
            best_threshold, best_accuracy = find_threshold(cal_probs, cal_true)
            preds_thr = probs > np.median(best_threshold)

            # Check if array has null values
            has_null = pd.Series(preds, index=batch.index).isna().any()
            if has_null:
                print("Preds has null values")

            batch['predicted_value'] = preds
            batch['proba_of_regular'] = np.asarray(probs).ravel()
            # Build the label column directly instead of first storing booleans
            # and then overwriting them with strings.
            batch['predicted_thr'] = np.where(np.asarray(preds_thr).ravel(), 'regular', 'deviant')
            is_null = batch['predicted_value'].isna().any()
            if is_null:
                print("Batch has null values")

            results.append(batch)

            # Cases that finished inside this batch become part of the history
            finished_case_ids = batch[batch['finished'] == 1][dataset_manager.case_id_col].unique()
            finished_cases = current[current[dataset_manager.case_id_col].isin(finished_case_ids)]
            historic = pd.concat([historic, finished_cases], ignore_index=True)
            historic = historic.sort_values([dataset_manager.case_id_col, dataset_manager.timestamp_col], ascending=True, kind='mergesort')
            # Refit on the updated (and re-ordered) history so that neighbour
            # positions keep matching the rows of `historic`.
            historic_transformed = preprocessor.fit_transform(historic[features_used], historic[dataset_manager.label_col])
            nn_model.fit(historic_transformed)

        results_df = pd.concat(results)

        # Calculate metrics
        true_values = results_df[dataset_manager.label_col]
        predicted_values = results_df['predicted_value']
        proba_values = results_df['proba_of_regular']

        accuracy = np.mean(true_values == predicted_values)
        accuracy_thr = np.mean(true_values == results_df['predicted_thr'])

        # Averages are reported as NaN when nothing was collected
        mean_training_time = sum(trainingtimes) / len(trainingtimes) if trainingtimes else float('nan')
        mean_auc = sum(AUCs) / len(AUCs) if AUCs else float('nan')

        # Print metrics
        print(f"Accuracy: {accuracy}")
        print(f"Accuracy (Threshold): {accuracy_thr}")
        print(f"f1_score: {f1_score(true_values, predicted_values, average='weighted')}")
        print(f"Training Time: {mean_training_time}")
        print(f"AUC: {mean_auc}")

        output_dir = Path(f"results/{dataset_name}")
        output_dir.mkdir(parents=True, exist_ok=True)

        results_df.to_csv(f'{output_dir}/{method}_{num_nearest_neighbors}_{distance_metric}.csv', index=False)
        print('***********************************')


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, auc, roc_curve

In [None]:
# Histogram of the per-batch AUC values collected in `AUCs`.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(AUCs, bins=10, color='skyblue', edgecolor='black')
ax.set_xlabel('AUC Values')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of AUC Values')
ax.grid(axis='y', alpha=0.75)
plt.show()


In [None]:
# ROC curve of the JIT predictions over the whole `current` split.
# NOTE(review): `proba_values` is the probability stored as 'proba_of_regular';
# confirm that dataset_manager.get_label_numeric encodes the same class as
# positive and that its output is row-aligned with `proba_values`.
numeric_labels = dataset_manager.get_label_numeric(current)
fpr, tpr, thresholds = roc_curve(numeric_labels, proba_values)

# Area under the curve
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)  # chance diagonal
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid()
plt.show()

In [None]:
# Show the class-label order of the most recently fitted `model`; the
# predict_proba columns used above are expected to follow this order.
print(model.classes_)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
####### Baseline Version without JITL ########
# A single model is trained once on the historic split and used to predict the
# whole current split (no neighbour selection, no incremental updates).
#
# NOTE: relies on `dataset_name`, `dataset_manager`, `merged_df`, `method`,
# `num_nearest_neighbors` and `distance_metric` left behind by earlier cells.

print(f"Experimenting with baseline version")

# Create the processor with the current values
processor = processor_factory.get_processor(dataset_name, use_encoding=False, use_bucketing=False, num_nearest_neighbors=num_nearest_neighbors, distance_metric=distance_metric)

# NOTE(review): the JIT cell sorts by (case id, timestamp) before splitting,
# this cell by timestamp only - confirm both yield the same split.
data = merged_df.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')

historic, current = processor.split_data_strict(data, train_ratio=0.5)
historic = historic.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')
current = current.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')

features_used = historic.columns.difference([dataset_manager.label_col, dataset_manager.timestamp_col, dataset_manager.case_id_col, 'case_length', 'event_nr', 'finished'])

target = historic[dataset_manager.label_col].values
# Columns CatBoost must treat as categorical.
cat_feature_cols = [dataset_manager.activity_col] + dataset_manager.static_cat_cols

start_time = time.time()
if method == 'Catboost':
    model = CatBoostClassifier(iterations=100, loss_function='Logloss', eval_metric='AUC', verbose=0, cat_features=cat_feature_cols)
    print('Now training')
    model.fit(historic[features_used], target, cat_features=cat_feature_cols)
elif method == 'HMM':
    # NOTE(review): `hmm` is only defined if the commented-out
    # `from hmmlearn import hmm` import is restored.
    model = hmm.GaussianHMM(n_components=7)  # Specify the number of hidden states
    model.fit(historic[features_used])
elif method == 'LogisticRegression':
    # Separate name so the neighbour-search `preprocessor` of the JIT cell is
    # not overwritten.
    lr_preprocessor = make_column_transformer(
        (OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include='object')),
        remainder='passthrough'
    )
    model = make_pipeline(
        lr_preprocessor,
        LogisticRegression()
    )
    # Train on the historic split; `X_train` / `y_train` would be leftovers
    # from the last batch of the JIT cell.
    model.fit(historic[features_used], target)
else:
    # Fail loudly instead of silently reusing a model from an earlier cell.
    raise ValueError(f"Unsupported method: {method!r}")

end_time = time.time()
training_time = (end_time - start_time) / 60
print("Training time: ", training_time, " minutes")

print("Now predicting")
preds = model.predict(current[features_used])
# NOTE(review): column 1 is treated as P('regular'); confirm via model.classes_.
probs = model.predict_proba(current[features_used])[:, 1]
y_true = current[dataset_manager.label_col].values
# Stored under its own name so the `auc` function imported from
# sklearn.metrics stays callable when the ROC cell is re-run.
baseline_auc = roc_auc_score(y_true, probs)

results_baseline = current.copy()
results_baseline['predicted_value'] = np.asarray(preds).ravel()
results_baseline['proba_of_regular'] = np.asarray(probs).ravel()

# Calculate metrics
true_values = results_baseline[dataset_manager.label_col]
predicted_values = results_baseline['predicted_value']

accuracy = np.mean(true_values == predicted_values)

# Print metrics
print(f"Accuracy: {accuracy}")
print(f"f1_score: {f1_score(true_values, predicted_values, average='weighted')}")
print(f"Training Time: {training_time}")
print(f"AUC: {baseline_auc}")

# Create the output folder so this cell does not depend on the JIT cell having
# created it.
output_dir = Path(f"results/{dataset_name}")
output_dir.mkdir(parents=True, exist_ok=True)
results_baseline.to_csv(f'results/{dataset_name}/baseline_{method}.csv', index=False)
print('***********************************')


In [None]:
# Timestamp-ordered copy of the baseline predictions (stable sort); the
# original `results_baseline` frame is left untouched.
tmp = results_baseline.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')

In [None]:
# Timestamp-ordered copy of the JIT predictions (stable sort); the original
# `results_df` frame is left untouched.
tmp2 = results_df.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')

In [None]:
def calculate_moving_avg_f1(df, true_col, pred_col, window_size=5):
    """Cumulative weighted F1 over time plus its moving average.

    The rows are ordered by ``dataset_manager.timestamp_col`` (the input frame
    is not modified). For every prefix of the ordered rows with at least two
    rows, the weighted F1 between ``true_col`` and ``pred_col`` is computed.

    Returns a DataFrame with columns ``num_rows`` (prefix size, starting at 2),
    ``f1`` and ``moving_avg_f1`` (rolling mean of ``f1`` over ``window_size``
    entries, using shorter windows at the start).

    Note: every prefix is scored from scratch, so the cost grows
    quadratically with the number of rows.
    """
    ordered = df.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')

    y_true = ordered[true_col].to_numpy()
    y_pred = ordered[pred_col].to_numpy()

    # Prefix sizes 2 .. len(rows)
    prefix_sizes = np.arange(2, len(y_true) + 1)
    prefix_scores = np.array(
        [f1_score(y_true[:size], y_pred[:size], average='weighted') for size in prefix_sizes],
        dtype=float,
    )

    f1_df = pd.DataFrame({'num_rows': prefix_sizes, 'f1': prefix_scores})
    f1_df['moving_avg_f1'] = f1_df['f1'].rolling(window=window_size, min_periods=1).mean()
    return f1_df

In [None]:
def calculate_moving_avg_acc_fast(df, true_col, pred_col, window_size=5):
    """
    Cumulative accuracy over time, plus its moving average.

    The rows are ordered by ``dataset_manager.timestamp_col`` with a stable
    sort. The sort produces a new frame, so the caller's DataFrame is not
    reordered as a side effect.

    Parameters:
        df (pd.DataFrame): Frame holding the timestamp, true-label and
            predicted-label columns.
        true_col (str): Name of the ground-truth label column.
        pred_col (str): Name of the predicted label column.
        window_size (int): Window of the rolling mean. The first
            ``window_size - 1`` rows of the moving average are NaN because no
            ``min_periods`` is given.

    Returns:
        pd.DataFrame: Columns 'num_rows' (prefix length, starting at 2),
        'mae' and 'moving_avg_mae'. Despite their names, both metric columns
        hold accuracy values (fraction of correct predictions), not an
        absolute error; the names are kept because callers index by them.
    """
    # Non-mutating sort: the original in-place sort silently reordered the
    # DataFrame passed in by the caller.
    ordered = df.sort_values([dataset_manager.timestamp_col], ascending=True, kind='mergesort')

    # Convert columns to numpy arrays
    true_values = ordered[true_col].to_numpy()
    predicted_values = ordered[pred_col].to_numpy()

    # Calculate cumulative accuracy (vectorized): correct-so-far / rows-so-far
    cumulative_correct = np.cumsum(true_values == predicted_values)
    num_rows = np.arange(1, len(true_values) + 1)
    cumulative_accuracy = cumulative_correct / num_rows

    # Create DataFrame for results, skipping the single-row prefix
    mae_df = pd.DataFrame({
        'num_rows': num_rows[1:],  # Start from 2
        'mae': cumulative_accuracy[1:]  # Start from 2
    })

    # Moving average of the cumulative accuracy
    mae_df['moving_avg_mae'] = mae_df['mae'].rolling(window=window_size).mean()

    return mae_df

In [None]:
# Cumulative / moving-average accuracy for `tmp` (time-sorted baseline results).
# The helper's output columns are named 'mae' / 'moving_avg_mae' but hold
# accuracy values (share of correct predictions), not an error measure.
mae_df_tmp = calculate_moving_avg_acc_fast(tmp, dataset_manager.label_col, 'predicted_value')

In [None]:
# Cumulative / moving-average accuracy for `tmp2` (time-sorted `results_df`).
# The helper's output columns are named 'mae' / 'moving_avg_mae' but hold
# accuracy values (share of correct predictions), not an error measure.
mae_df_tmp2 = calculate_moving_avg_acc_fast(tmp2, dataset_manager.label_col, 'predicted_value')

In [None]:


# Compare the moving-average accuracy curves of the two result sets
# (the 'moving_avg_mae' column holds accuracy values).
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(mae_df_tmp['num_rows'], mae_df_tmp['moving_avg_mae'], label='baseline', color='blue')
ax.plot(mae_df_tmp2['num_rows'], mae_df_tmp2['moving_avg_mae'], label='JIT-Cat', color='red')
# ax.plot(mae_df_tmp3['num_rows'], mae_df_tmp3['moving_avg_mae'], label='euclidean', color='green')
# ax.plot(mae_df_tmp4['num_rows'], mae_df_tmp4['moving_avg_mae'], label='DTW', color='black')
ax.set_xlabel('Number of Observed Events over Time')
ax.set_ylabel('Moving Average Accuracy')
ax.set_title(f'{dataset_name} Moving Average Accuracy')
ax.legend()
ax.grid(True)

# Write the figure as both a raster image and a PDF, then release it
# (the figure is saved to disk, not shown inline).
fig.savefig(f"results/{dataset_name}/moving_avg_accuracy_score.png", dpi=600, bbox_inches='tight')
fig.savefig(f"results/{dataset_name}/moving_avg_accuracy_score.pdf", dpi=600, bbox_inches='tight', format="pdf")
plt.close(fig)

In [None]:
# Plot the moving average F1 score for both DataFrames.
# The accuracy frames (mae_df_tmp / mae_df_tmp2) have no 'moving_avg_f1'
# column, so the F1 curves are computed here with calculate_moving_avg_f1.
f1_df_tmp = calculate_moving_avg_f1(tmp, dataset_manager.label_col, 'predicted_value')
f1_df_tmp2 = calculate_moving_avg_f1(tmp2, dataset_manager.label_col, 'predicted_value')

plt.figure(figsize=(10, 6))
plt.plot(f1_df_tmp['num_rows'], f1_df_tmp['moving_avg_f1'], label='baseline', color='blue')
plt.plot(f1_df_tmp2['num_rows'], f1_df_tmp2['moving_avg_f1'], label='JIT-Cat', color='red')
plt.xlabel('Number of Observed Events over Time')
plt.ylabel('Moving Average F1 Score')
plt.title(f'{dataset_name} Moving Average F1 Score')
plt.legend()
plt.grid(True)
# plt.show()
# NOTE(review): the PNG uses 'f1' and the PDF uses 'F1' in the file name;
# left unchanged in case something downstream reads these exact paths.
plt.savefig(f"results/{dataset_name}/moving_avg_f1_score.png", dpi=600, bbox_inches='tight')
plt.savefig(f"results/{dataset_name}/moving_avg_F1_score.pdf", dpi=600, bbox_inches='tight', format="pdf")  # Save as PDF
plt.close()

In [None]:
def save_results_to_csv(dataset_name, proposed_metrics, baseline_metrics, file_path='results_metrics.csv'):
    """
    Record one dataset's metrics in the shared results CSV.

    The file holds one row per dataset. A missing file is created with a
    header; a dataset not yet listed is appended; a dataset already listed
    has its old row dropped and the new row written at the end.

    Parameters:
        dataset_name (str): Name of the dataset.
        proposed_metrics (dict): Metrics of the proposed method
            (keys: accuracy, f1_score, auc, precision, recall).
        baseline_metrics (dict): Metrics of the baseline method (same keys).
        file_path (str): Path to the results CSV file.
    """
    metric_names = ('accuracy', 'f1_score', 'auc', 'precision', 'recall')

    # Flatten both metric dicts into one record; insertion order doubles as
    # the column order of the CSV.
    record = {'dataset_name': dataset_name}
    for prefix, source in (('proposed', proposed_metrics), ('baseline', baseline_metrics)):
        for metric in metric_names:
            record[f'{prefix}_{metric}'] = source[metric]
    column_order = list(record)
    row_frame = pd.DataFrame([record])

    # No file yet: write header + row.
    if not os.path.exists(file_path):
        row_frame.to_csv(file_path, index=False, columns=column_order)
        return

    stored = pd.read_csv(file_path)

    # Dataset not recorded yet: append the row without repeating the header.
    if dataset_name not in stored['dataset_name'].values:
        row_frame.to_csv(file_path, mode='a', index=False, header=False, columns=column_order)
        return

    # Dataset already recorded: swap its row for the new one and rewrite the file.
    print(f"Dataset '{dataset_name}' already exists in the results file. Updating the row.")
    kept_rows = stored[stored['dataset_name'] != dataset_name]
    refreshed = pd.concat([kept_rows, row_frame], ignore_index=True)
    refreshed.to_csv(file_path, mode='w', index=False, columns=column_order)

In [None]:
def calculate_metrics(results_df):
    """
    Summarise classification quality for one results DataFrame.

    Parameters:
        results_df (pd.DataFrame): Must contain the column named by
            ``dataset_manager.label_col`` (ground-truth labels),
            'predicted_value' (predicted labels) and 'proba_of_regular'
            (the score handed to ``roc_auc_score``).

    Returns:
        dict: Keys 'accuracy', 'f1_score', 'auc', 'precision' and 'recall'.
        F1, precision and recall use the 'weighted' average.
    """
    y_true = results_df[dataset_manager.label_col]
    y_pred = results_df['predicted_value']
    y_score = results_df['proba_of_regular']

    # Shared averaging option for the label-based scores.
    weighted = {'average': 'weighted'}

    metrics = {}
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['f1_score'] = f1_score(y_true, y_pred, **weighted)
    metrics['auc'] = roc_auc_score(y_true, y_score)
    metrics['precision'] = precision_score(y_true, y_pred, **weighted)
    metrics['recall'] = recall_score(y_true, y_pred, **weighted)
    return metrics

In [None]:
# Metrics for the proposed method, computed from `results_df`
# (accuracy, weighted F1 / precision / recall, and AUC).
proposed_metrics = calculate_metrics(results_df)

# Same metrics for the baseline method, computed from `results_baseline`.
baseline_metrics = calculate_metrics(results_baseline)

# Store both metric sets as this dataset's row in the results CSV
# (default path 'results_metrics.csv'; an existing row for the dataset is replaced).
save_results_to_csv(f'{dataset_name}', proposed_metrics, baseline_metrics)