In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl

from pathlib import Path
from torch import cdist
import os


In [2]:
import math

from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from hmmlearn import hmm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.neighbors import NearestNeighbors
# from statsmodels.tsa.arima.model import ARIMA

In [3]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells will not be shown either — consider narrowing it.
warnings.filterwarnings('ignore')

In [4]:
# Walk upwards from the current working directory until a folder containing
# "data" is found, then make that folder the working directory so the relative
# paths used below resolve.
ROOT_DIR = Path().cwd()
while not ROOT_DIR.joinpath("data").exists():
    # The parent of the filesystem root is the root itself; without this guard
    # the loop would spin forever when no ancestor contains "data".
    if ROOT_DIR.parent == ROOT_DIR:
        raise FileNotFoundError(
            f"No 'data' directory found in {Path().cwd()} or any of its parents"
        )
    ROOT_DIR = ROOT_DIR.parent
os.chdir(ROOT_DIR)

In [5]:
from processors import processor_factory
from data.DatasetManager import DatasetManager

AMBR
CSL312
Astrazeneca
bpic2011_f1
bpic2011_f2
bpic2011_f3
bpic2011_f4


In [6]:
# Event-log schema: which columns identify the case, order the events,
# and hold the label to predict.
case_id_col = "Case ID"
timestamp_col = "time:timestamp"
target_col = "Activity code"

# Key used to look up the dataset-specific manager / processor.
dataset_name = "bpic2011_f1"

In [7]:
# NOTE(review): the two lists disagree on units for several analytes
# (mmol/L in cols_to_keep vs g/L in features), and none of these measurement
# columns appear in the df.columns output of the BPIC11 log loaded below —
# confirm whether this cell is still needed for this dataset.

# Identifier/label columns followed by the process measurements to retain.
cols_to_keep = [case_id_col, timestamp_col, target_col] + [
    'BR Temp (°C)',
    'pH (Internal)',
    'Viability (%)',
    'VCD (10^6 cells/mL)',
    'Glutamine (mmol/L)',
    'Glutamate (mmol/L)',
    'Glucose (g/L)',
    'Lactate (g/L)',
    'NH4+ (mmol/L)',
    'Na+ (mmol/L)',
    'K+ (mmol/L)',
    'Ca2+ (g/L)',
]

# Measurement columns used as model inputs.
features = [
    'BR Temp (°C)',
    'pH (Internal)',
    'Viability (%)',
    'VCD (10^6 cells/mL)',
    'Glutamine (g/L)',
    'Glutamate (g/L)',
    'Glucose (g/L)',
    'Lactate (g/L)',
    'NH4+ (g/L)',
    'Na+ (g/L)',
    'K+ (g/L)',
    'Ca2+ (g/L)',
]

In [8]:
# Load the semicolon-separated BPIC11_f1 event log (path is relative to the
# working directory set from ROOT_DIR above).
df = pd.read_csv('data/processed_benchmark_event_logs/BPIC11_f1.csv', sep=';')

In [9]:
# Convert the timestamp column to pandas datetimes so events can be ordered in time.
df[timestamp_col] = pd.to_datetime(df[timestamp_col])

In [10]:
# Order events by case and, within a case, by timestamp (mutates df in place).
df.sort_values([case_id_col, timestamp_col], inplace=True)

In [11]:
# Project helper bound to the configured dataset; used below for split_data_strict.
dataset_manager = DatasetManager(dataset_name)

In [13]:
# Number of distinct cases in the log.
df[case_id_col].nunique()

1140

In [14]:
# Inspect which columns the loaded log provides.
df.columns

Index(['Diagnosis', 'Treatment code', 'Diagnosis code', 'Specialism code',
       'Diagnosis Treatment Combination ID', 'Age', 'Case ID', 'label',
       'Activity code', 'Producer code', 'Section', 'Specialism code.1',
       'group', 'Number of executions', 'time:timestamp', 'timesincemidnight',
       'month', 'weekday', 'hour', 'timesincelastevent', 'timesincecasestart',
       'event_nr', 'open_cases'],
      dtype='object')

In [16]:
# Earliest event timestamp of every case (Series indexed by case id).
earliest_timestamps = df.groupby(case_id_col)[timestamp_col].min()


Case ID
0      2005-01-02 23:00:00
1      2005-01-02 23:00:00
2      2005-01-02 23:00:00
3      2005-01-02 23:00:00
4      2005-01-02 23:00:00
               ...        
1138   2008-03-11 23:00:00
1139   2008-03-11 23:00:00
1140   2008-03-16 23:00:00
1141   2008-03-17 23:00:00
1142   2008-03-19 23:00:00
Name: time:timestamp, Length: 1140, dtype: datetime64[ns]

In [18]:
def get_first_n_cases(df, n, case_col=None, ts_col=None):
    """Return all events belonging to the ``n`` cases that started earliest.

    A case's start is the minimum timestamp among its events.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log containing the case-id and timestamp columns.
    n : int
        Number of cases to keep; must not be negative.
    case_col, ts_col : str, optional
        Column names for the case id and the timestamp. When omitted, the
        notebook-level ``case_id_col`` / ``timestamp_col`` are used.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (original order and index) whose case is among the
        first ``n`` cases.

    Raises
    ------
    ValueError
        If ``n`` is negative (a negative slice would silently select
        "all but the last |n|" cases).
    """
    case_col = case_id_col if case_col is None else case_col
    ts_col = timestamp_col if ts_col is None else ts_col
    if n is not None and n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    earliest_timestamps = df.groupby(case_col)[ts_col].min()
    # Many cases share the same start time; a stable sort keeps tied cases in
    # case-id order (groupby sorts its keys), so the selection is reproducible.
    sorted_cases = earliest_timestamps.sort_values(kind='mergesort').index[:n]
    return df[df[case_col].isin(sorted_cases)]

In [24]:
# Restrict the log to the 100 cases with the earliest start time.
filtered_df = get_first_n_cases(df, 100)

In [25]:
# Split the reduced log with the dataset manager (0.5 is presumably the train ratio — confirm).
# NOTE(review): df_train / df_test are not referenced again in the visible cells;
# the experiment below re-splits the full `df` through the processor — confirm which is intended.
df_train, df_test = dataset_manager.split_data_strict(filtered_df, 0.5)

In [26]:
# Labels describing the current run; both end up in the output file names.
# Other configurations that have been used:
#   'no_encoding_bucketing', 'encoding_no_bucketing', 'encoding_bucketing'
config = 'no_encoding_no_bucketing'
method = 'Catboost'

# Accumulator for per-batch prediction frames.
results = list()

In [None]:
# from data.dataset_confs import dataset_configs

# config = dataset_configs['CSL_5L']
# Processor settings: raw features (no encoding / bucketing) and the
# neighbourhood definition handed to the processor factory.
use_encoding = False
use_bucketing = False
num_nearest_neighbors = 100
distance_metric = 'euclidean'

# processor = CSL_5LProcessor("5L", use_encoding, use_bucketing, num_nearest_neighbors, distance_metric)
# NOTE(review): the experiment cell that follows builds its own processor inside
# the loop, so this instance is replaced there.
processor = processor_factory.get_processor(dataset_name, use_encoding, use_bucketing, num_nearest_neighbors, distance_metric)

In [None]:
# Just-in-time learning (JITL) experiment.
# For every (num_nearest_neighbors, distance_metric) combination:
#   1. label each event with the next activity of its case ('Target'),
#   2. split the log into historic / current parts,
#   3. for each batch of current events, fit a model on the batch's nearest
#      historic neighbours, predict, then fold the batch into the historic set
#      and refit the neighbour models.

# Define the values to experiment with
num_nearest_neighbors_values = [200]
distance_metrics = ['euclidean']

# Loop through the values and create processors
for num_nearest_neighbors in num_nearest_neighbors_values:
    for distance_metric in distance_metrics:
        print(f"Experimenting with num_nearest_neighbors={num_nearest_neighbors} and distance_metric={distance_metric}")

        # Start each configuration from an empty list so its metrics and CSV
        # do not contain predictions from a previous configuration.
        results = []

        # Create the processor with the current values
        processor = processor_factory.get_processor(dataset_name, use_encoding=False, use_bucketing=False, num_nearest_neighbors=num_nearest_neighbors, distance_metric=distance_metric)

        # NOTE(review): this uses the full `df`, not `filtered_df` — confirm.
        data = df.sort_values([case_id_col, timestamp_col], ascending=True, kind='mergesort')
        # Next activity within the same case; the last event of a case has none,
        # so it inherits the previous target via the per-case forward fill.
        data['Target'] = data.groupby(case_id_col)[target_col].shift(-1)
        # NOTE(review): the trailing .bfill() is not grouped, so a case with a
        # single event takes its target from the following case — confirm.
        data['Target'] = data.groupby(case_id_col)['Target'].ffill().bfill()

        historic, current = processor.split_data_strict(data, train_ratio=0.5)
        historic.sort_values([case_id_col, timestamp_col], ascending=True, kind='mergesort', inplace=True)
        # Current events are replayed in global time order.
        current.sort_values([timestamp_col], ascending=True, kind='mergesort', inplace=True)

        cat_cols = processor.static_cat_cols + processor.dynamic_cat_cols
        features_used = processor.static_num_cols + processor.dynamic_num_cols + cat_cols
        if config == 'no_encoding_bucketing' or config == 'encoding_bucketing':
            features_used = features_used + ['Cluster']

        batch_size = 100

        num_model, cat_model = processor.train_nn_model_bpic(historic)

        for start in range(0, len(current), batch_size):
            # Copy so that adding the prediction column below writes to an
            # independent frame rather than to a slice of `current`.
            batch = current.iloc[start:start + batch_size].copy()

            # Find the n nearest neighbors for every row of the batch
            _, indices = processor.find_nearest_neighbors(cat_model, num_model, batch[features_used])
            # `indices` are positions into `historic` as it was when the
            # neighbour models were last fitted.
            nearest_neighbors = pd.concat([historic.iloc[indices[i]] for i in range(len(batch))])
            print(nearest_neighbors.shape)

            target = nearest_neighbors['Target'].values

            if method == 'Catboost':
                model = CatBoostClassifier(iterations=100, loss_function='MultiClass', verbose=0, cat_features=cat_cols) # , max_ctr_complexity=1
                print('Now training')
                model.fit(nearest_neighbors[features_used], target, cat_features=cat_cols)
            elif method == 'HMM':
                # Create an instance of the HMM model
                model = hmm.GaussianHMM(n_components=7)  # Specify the number of hidden states
                model.fit(nearest_neighbors[features_used])
            else:
                raise ValueError(f"Unsupported method: {method!r}")

            # Make predictions on the testing data
            print('Now predicting')
            preds = model.predict(batch[features_used])

            predicted = pd.Series(preds.flatten(), index=batch.index)
            if predicted.isna().any():
                print("Preds has null values")

            batch['predicted_value'] = predicted
            if batch['predicted_value'].isna().any():
                print("Batch has null values")

            results.append(batch)

            # Fold the processed batch into the historic data and refit the
            # neighbour models. The refit must replace num_model / cat_model:
            # after the re-sort, positions returned by the old models no longer
            # line up with the rows of `historic`.
            historic = pd.concat([historic, batch], ignore_index=True)
            historic.sort_values([case_id_col, timestamp_col], ascending=True, kind='mergesort', inplace=True)
            num_model, cat_model = processor.train_nn_model_bpic(historic)

        results_df = pd.concat(results)

        # The models are trained on 'Target' (the next activity), so accuracy
        # has to be measured against 'Target', not the current activity.
        true_values = results_df['Target']
        predicted_values = results_df['predicted_value']

        accuracy = np.mean(true_values == predicted_values)

        # Print metrics
        print(f"Accuracy: {accuracy}")

        output_path = f'results/{dataset_name}/BPIC2011_{method}_{config}_{num_nearest_neighbors}_{distance_metric}.csv'
        # to_csv does not create missing directories.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        results_df.to_csv(output_path, index=False)
        print('***********************************')


In [None]:
# Display the predictions collected by the experiment above.
results_df

In [None]:
####### Baseline Version without JITL ########
# One model is fitted once on the historic split and then used to predict the
# current split batch by batch (no neighbour search, no refitting).
#
# NOTE(review): df_normalized and work_day_col are not defined in the visible
# cells; this cell looks carried over from another dataset's notebook — confirm
# before running.

print(f"Experimenting with num_nearest_neighbors={num_nearest_neighbors} and distance_metric={distance_metric}")

# Create the processor with the current values
processor = processor_factory.get_processor(dataset_name, use_encoding=False, use_bucketing=False, num_nearest_neighbors=num_nearest_neighbors, distance_metric=distance_metric)

data = df_normalized.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort')
# Next value of the target column within each case; the per-case forward fill
# covers the last row of a case.
data['Target'] = data.groupby(case_id_col)[target_col].shift(-1)
data['Target'] = data.groupby(case_id_col)['Target'].ffill().bfill()

historic, current = processor.split_data(data, train_ratio=0.5, split="temporal sim")
historic.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort', inplace=True)
current.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort', inplace=True)

if config == 'no_encoding_bucketing' or config == 'encoding_bucketing':
    features_used = features + ['Cluster']
else:
    features_used = features

# Training labels must exist before the model is fitted (previously `target`
# was only assigned inside the prediction loop, after its first use).
target = historic['Target'].values

if method == 'Catboost':
    model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, loss_function='MAE', verbose=0)
    model.fit(historic[features_used], target)
elif method == 'HMM':
    # Create an instance of the HMM model
    model = hmm.GaussianHMM(n_components=7)  # Specify the number of hidden states
    model.fit(historic[features_used])
else:
    raise ValueError(f"Unsupported method: {method!r}")

# Fresh accumulator so frames from an earlier experiment are not mixed in.
results = []
batch_size = 50

for start in range(0, len(current), batch_size):
    # Copy so the prediction column is written to an independent frame.
    batch = current.iloc[start:start + batch_size].copy()

    # Make predictions on the testing data
    batch['predicted_value'] = model.predict(batch[features_used])

    results.append(batch)

results_df = pd.concat(results)

# The model was fitted on 'Target', so that is the column to score against.
true_values = results_df['Target']
predicted_values = results_df['predicted_value']

MAE_t = mean_absolute_error(true_values, predicted_values)
MSE_t = mean_squared_error(true_values, predicted_values)
RMSE_t = math.sqrt(MSE_t)
r2_t = r2_score(true_values, predicted_values)
mape_t = mean_absolute_percentage_error(true_values, predicted_values)

# Print metrics
print(f"MAE: {MAE_t}")
print(f"MSE: {MSE_t}")
print(f"RMSE: {RMSE_t}")
print(f"R2: {r2_t}")
print(f"MAPE: {mape_t}")

output_path = f'results/{dataset_name}/baseline_CSL312_{method}_{config}.csv'
# to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results_df.to_csv(output_path, index=False)
print('***********************************')


In [None]:
# Turn the collected prediction records into a table and pull out the
# ground-truth / prediction columns.
results_df = pd.DataFrame(results)
true_values, predicted_values = results_df['true_value'], results_df['predicted_value']

# Error metrics (RMSE is derived from the MSE).
mape_t = mean_absolute_percentage_error(true_values, predicted_values)
r2_t = r2_score(true_values, predicted_values)
MSE_t = mean_squared_error(true_values, predicted_values)
RMSE_t = math.sqrt(MSE_t)
MAE_t = mean_absolute_error(true_values, predicted_values)

# Report one metric per line.
print(
    f"MAE: {MAE_t}",
    f"MSE: {MSE_t}",
    f"RMSE: {RMSE_t}",
    f"R2: {r2_t}",
    f"MAPE: {mape_t}",
    sep="\n",
)

In [None]:
# Persist the result table. The file name hard-codes "CSL312" and "200"
# rather than using dataset_name / num_nearest_neighbors.
results_df.to_csv(f'results/CSL312_{method}_{config}_200_results.csv', index=False)

In [None]:
import importlib
import sys
import time

In [None]:
# Bounds for prefix generation: from a single event up to the largest value
# found in the work-day column.
# NOTE(review): df_normalized and work_day_col are not defined in the visible cells — confirm.
min_prefix_length = 1
max_prefix_length = df_normalized[work_day_col].max()

In [None]:
# Force a fresh import of project modules that may have been edited since the
# kernel started: drop them from the import cache, import them again, then reload.
# NOTE(review): the cache keys are top-level names ('DatasetManager'), while an
# earlier cell imports `data.DatasetManager`; confirm the keys match how the
# modules were actually imported, otherwise nothing is evicted.
# Remove both EncoderFactory and AggregateTransformer from sys.modules
modules_to_remove = ['EncoderFactory', 'transformers.AggregateTransformer', 'DatasetManager', "dataset_confs"]
for module in modules_to_remove:
    if module in sys.modules:
        del sys.modules[module]

# Re-import the modules
import EncoderFactory
import transformers.AggregateTransformer
import DatasetManager
# After this line the name `DatasetManager` refers to the class, not the module.
from DatasetManager import DatasetManager

# Reload the modules
importlib.reload(transformers.AggregateTransformer)
importlib.reload(EncoderFactory)
# importlib.reload(DatasetManager)

In [None]:
# Rebind dataset_manager to the "CSL312" dataset, replacing the instance that
# was created earlier from dataset_name.
dataset_manager = DatasetManager("CSL312")

In [None]:
# Build the prefix table for lengths min_prefix_length..max_prefix_length and
# record how long the generation took (wall-clock seconds).
start_test_prefix_generation = time.time()
dt_prefixes = dataset_manager.generate_prefix_data(df_normalized, min_prefix_length, max_prefix_length)
test_prefix_generation_time = time.time() - start_test_prefix_generation

In [None]:
# Count prefix rows whose target value is missing.
dt_prefixes[target_col].isnull().sum()

In [None]:
# Display the generated prefix table.
dt_prefixes

In [None]:
# Build the 'agg' encoder from the project's EncoderFactory: 'Cluster' is passed
# as the only dynamic categorical column, `features` as dynamic numeric columns,
# and both the target and 'Titer (g/L) original' as target columns.
# NOTE(review): work_day_col is not defined in the visible cells — confirm.
encoder = EncoderFactory.get_encoder(method='agg', case_id_col=case_id_col, static_cat_cols=None, static_num_cols=None, dynamic_cat_cols=['Cluster'],
                dynamic_num_cols=features, target_cols=[target_col,'Titer (g/L) original'], work_day_col=work_day_col, fillna=True, max_events=None, activity_col=None, resource_col=None, timestamp_col=timestamp_col,
                scale_model=None)

In [None]:
# Encode the prefix table and list the resulting columns.
dt_transformed = encoder.transform(dt_prefixes)
dt_transformed.columns

In [None]:
# Move the index back into regular columns.
dt_transformed = dt_transformed.reset_index()

# The case id is underscore-separated: the first two parts form the real case
# id, the third part (when present) is the work day.
id_parts = dt_transformed[case_id_col].str.split('_')
work_day_part = id_parts.str[2]
dt_transformed[work_day_col] = work_day_part
dt_transformed[work_day_col] = dt_transformed[work_day_col].fillna(0)
dt_transformed[case_id_col] = id_parts.str[0] + '_' + id_parts.str[1]


In [None]:

# Display the encoded table after the case-id / work-day split.
dt_transformed


In [None]:
features_new = [col for col in dt_transformed.columns if col not in ["Cluster", "'Titer (g/L) original'", case_id_col, target_col, work_day_col]]

In [None]:
# Row-by-row JITL run on the prefix table: for each current row, take its 200
# nearest historic rows, fit a model on them, predict the row, then append the
# row to the historic set and refit the neighbour index.
#
# NOTE(review): several names used here are not defined in the visible cells
# (split_data, CatBoostRegressor, scalers, df_without_last, work_day_col) — confirm.
# NOTE(review): config is 'encoding_no_bucketing' but features_used is built from
# `features`, not `features_new` — confirm which is intended.
# NOTE(review): the lines that would create 'Target' / 'Target_orig' are commented
# out, yet both columns are read below — confirm dt_prefixes already has them.
results = []
method = 'Catboost'
# config = 'no_encoding_no_bucketing'
# config = 'no_encoding_bucketing'
config = 'encoding_no_bucketing'
# config = 'encoding_bucketing'

data = dt_prefixes.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort')
# data['Target'] = data.groupby(case_id_col)[target_col].shift(-1)
# data['Target_orig'] = data.groupby(case_id_col)['Titer (g/L) original'].shift(-1)
# data['Target'] = data.groupby(case_id_col)['Target'].ffill()
# data['Target_orig'] = data.groupby(case_id_col)['Target_orig'].ffill()

historic, current = split_data(data, train_ratio=0.5, split="temporal sim")
historic.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort', inplace=True)
current.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort', inplace=True)

# 'Cluster' is only part of the inputs for the bucketing configurations.
if config == 'no_encoding_bucketing' or config == 'encoding_bucketing':
    features_used = features + ['Cluster']
else:
    features_used = features

# Initialize the NearestNeighbors model
nn_model = NearestNeighbors(n_neighbors=200)
# Fit the model on the historic data
nn_model.fit(historic[features_used])

for index, row in current.iterrows():
    # Find the n nearest neighbors for the selected row
    distances, indices = nn_model.kneighbors([row[features_used]])
    # indices are positions into `historic` in the order it had when nn_model was fitted.
    nearest_neighbors = historic.iloc[indices[0]]

    target = nearest_neighbors['Target'].values
    target_test = row['Target']

    # Only skips when the value is literally None; a NaN target is not caught here.
    if target_test is None:
        continue
    

    if method == 'Catboost':
        # Create the CatBoostRegressor model
        model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, loss_function='MAE', verbose=0)
        model.fit(nearest_neighbors[features_used], target)

    if method == 'HMM':
        # Create an instance of the HMM model
        model = hmm.GaussianHMM(n_components=7)  # Specify the number of hidden states
        # NOTE(review): fitted on df_without_last with features+['Cluster'], not on
        # the neighbours / features_used used everywhere else in this loop — confirm.
        model.fit(df_without_last[features+['Cluster']])

    # Make predictions on the testing data
    preds = model.predict(row[features_used])

    true_conc_glu = row['Target_orig']
    # Map the prediction back through the target column's scaler.
    preds_scaled = scalers[target_col].inverse_transform(preds.reshape(-1, 1))
    
    key = row[case_id_col] + '_' + str(row[work_day_col])
    results.append({
        'key': key,
        'row_data': row.to_dict(),
        'true_value': true_conc_glu,
        'predicted_value': preds_scaled[0][0]
    })

    # Add the current row with its prediction to the historic data
    row_with_prediction = row.copy()
    # row_with_prediction[target_col] = preds_scaled[0][0]
    # With the assignment above commented out, the row is appended unchanged.
    historic = pd.concat([historic, pd.DataFrame([row_with_prediction])], ignore_index=True)
    historic.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort', inplace=True)
    nn_model.fit(historic[features_used])  # Refit the model with the updated historic data


In [None]:
# Turn the collected prediction records into a table and pull out the
# ground-truth / prediction columns.
results_df = pd.DataFrame(results)
true_values, predicted_values = results_df['true_value'], results_df['predicted_value']

# Error metrics (RMSE is derived from the MSE).
mape_t = mean_absolute_percentage_error(true_values, predicted_values)
r2_t = r2_score(true_values, predicted_values)
MSE_t = mean_squared_error(true_values, predicted_values)
RMSE_t = math.sqrt(MSE_t)
MAE_t = mean_absolute_error(true_values, predicted_values)

# Report one metric per line.
print(
    f"MAE: {MAE_t}",
    f"MSE: {MSE_t}",
    f"RMSE: {RMSE_t}",
    f"R2: {r2_t}",
    f"MAPE: {mape_t}",
    sep="\n",
)

In [None]:
# Persist the result table; the file name hard-codes "CSL312" rather than using dataset_name.
results_df.to_csv(f'results/CSL312_{method}_{config}_results.csv', index=False)

In [None]:
# Turn the collected prediction records into a table and pull out the
# ground-truth / prediction columns.
results_df = pd.DataFrame(results)
true_values, predicted_values = results_df['true_value'], results_df['predicted_value']

# Error metrics (RMSE is derived from the MSE).
mape_t = mean_absolute_percentage_error(true_values, predicted_values)
r2_t = r2_score(true_values, predicted_values)
MSE_t = mean_squared_error(true_values, predicted_values)
RMSE_t = math.sqrt(MSE_t)
MAE_t = mean_absolute_error(true_values, predicted_values)

# Report one metric per line.
print(
    f"MAE: {MAE_t}",
    f"MSE: {MSE_t}",
    f"RMSE: {RMSE_t}",
    f"R2: {r2_t}",
    f"MAPE: {mape_t}",
    sep="\n",
)

In [None]:
# Display the result table built in the previous cell.
results_df

#################################################################################################

In [None]:
# Row-by-row baseline without JITL: a single model is fitted on the historic
# split and used to predict every row of the current split.
#
# NOTE(review): df_normalized, work_day_col, split_data and scalers are not
# defined in the visible cells — confirm before running.
results = []
method = 'Catboost'
config = 'no_encoding_no_bucketing'
# config = 'no_encoding_bucketing'
# config = 'encoding_no_bucketing'
# config = 'encoding_bucketing'

data = df_normalized.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort')
# Next-step targets within each case, on the scaled and on the original scale.
data['Target'] = data.groupby(case_id_col)[target_col].shift(-1)
data['Target_orig'] = data.groupby(case_id_col)['Titer (g/L) original'].shift(-1)
data['Target'] = data.groupby(case_id_col)['Target'].ffill().bfill()
data['Target_orig'] = data.groupby(case_id_col)['Target_orig'].ffill().bfill()

historic, current = split_data(data, train_ratio=0.5, split="temporal sim")
historic.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort', inplace=True)
current.sort_values([case_id_col, work_day_col], ascending=True, kind='mergesort', inplace=True)

if config == 'no_encoding_bucketing' or config == 'encoding_bucketing':
    features_used = features + ['Cluster']
else:
    features_used = features

# `historic` never changes in this cell, so the model is fitted once here
# instead of being refitted identically for every row.
target = historic['Target'].values

if method == 'Catboost':
    model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, loss_function='MAE', verbose=0, random_state=123)
    model.fit(historic[features_used], target)
elif method == 'HMM':
    # Create an instance of the HMM model
    model = hmm.GaussianHMM(n_components=7)  # Specify the number of hidden states
    # Fit on the same rows/columns used for prediction (the previous code
    # referenced an undefined `df_without_last`).
    model.fit(historic[features_used])
else:
    raise ValueError(f"Unsupported method: {method!r}")

for index, row in current.iterrows():
    # Skip rows without a usable target (a missing value shows up as NaN, not None).
    if pd.isna(row['Target']):
        continue

    # Make predictions on the testing data
    preds = model.predict(row[features_used])

    true_conc_glu = row['Target_orig']
    # Map the prediction back through the target column's scaler.
    preds_scaled = scalers[target_col].inverse_transform(preds.reshape(-1, 1))

    key = row[case_id_col] + '_' + str(row[work_day_col])
    results.append({
        'key': key,
        'row_data': row.to_dict(),
        'true_value': true_conc_glu,
        'predicted_value': preds_scaled[0][0]
    })

In [None]:
# Turn the collected prediction records into a table and pull out the
# ground-truth / prediction columns.
results_df = pd.DataFrame(results)
true_values, predicted_values = results_df['true_value'], results_df['predicted_value']

# Error metrics (RMSE is derived from the MSE).
mape_t = mean_absolute_percentage_error(true_values, predicted_values)
r2_t = r2_score(true_values, predicted_values)
MSE_t = mean_squared_error(true_values, predicted_values)
RMSE_t = math.sqrt(MSE_t)
MAE_t = mean_absolute_error(true_values, predicted_values)

# Report one metric per line.
print(
    f"MAE: {MAE_t}",
    f"MSE: {MSE_t}",
    f"RMSE: {RMSE_t}",
    f"R2: {r2_t}",
    f"MAPE: {mape_t}",
    sep="\n",
)

In [None]:
# Persist the baseline result table; the file name hard-codes "CSL312" rather than using dataset_name.
results_df.to_csv(f'results/CSL312_baseline_{method}_{config}_results.csv', index=False)

In [None]:
# Load four previously saved result files for comparison.
# NOTE(review): the numeric suffixes in the variable names do not describe the
# files: _400 is the cosine run, _300 the chebyshev run, _200 the "200" run and
# _100 the DTW run.
results_df_400 = pd.read_csv('results/CSL312_Catboost_no_encoding_bucketing_cosine_results.csv')
results_df_300 = pd.read_csv('results/CSL312_Catboost_no_encoding_bucketing_chebyshev_results.csv')
results_df_200 = pd.read_csv('results/CSL312_Catboost_no_encoding_bucketing_200_results.csv')
results_df_100 = pd.read_csv('results/CSL312_Catboost_no_encoding_bucketing_DTW_results.csv')

In [None]:
# Expand the packed `row_data` column of the cosine run into separate columns.
# NOTE(review): after read_csv the column holds text; confirm json_normalize
# receives dict-like records here.
row_data_df = pd.json_normalize(results_df_400['row_data'])

# Put the expanded columns next to the remaining result columns ...
without_row_data = results_df_400.drop(columns=['row_data'])
tmp = pd.concat([without_row_data, row_data_df], axis=1)

# ... and keep only truth, prediction and the expanded row fields.
tmp = tmp[['true_value', 'predicted_value', *row_data_df.columns]]

In [None]:
# Expand the packed `row_data` column of the chebyshev run into separate columns.
# NOTE(review): after read_csv the column holds text; confirm json_normalize
# receives dict-like records here.
row_data_df = pd.json_normalize(results_df_300['row_data'])

# Put the expanded columns next to the remaining result columns ...
without_row_data = results_df_300.drop(columns=['row_data'])
tmp2 = pd.concat([without_row_data, row_data_df], axis=1)

# ... and keep only truth, prediction and the expanded row fields.
tmp2 = tmp2[['true_value', 'predicted_value', *row_data_df.columns]]

In [None]:
# Expand the packed `row_data` column of the "200" run into separate columns.
# NOTE(review): after read_csv the column holds text; confirm json_normalize
# receives dict-like records here.
row_data_df = pd.json_normalize(results_df_200['row_data'])

# Put the expanded columns next to the remaining result columns ...
without_row_data = results_df_200.drop(columns=['row_data'])
tmp3 = pd.concat([without_row_data, row_data_df], axis=1)

# ... and keep only truth, prediction and the expanded row fields.
tmp3 = tmp3[['true_value', 'predicted_value', *row_data_df.columns]]

In [None]:
# Expand the packed `row_data` column of the DTW run into separate columns.
# NOTE(review): after read_csv the column holds text; confirm json_normalize
# receives dict-like records here.
row_data_df = pd.json_normalize(results_df_100['row_data'])

# Put the expanded columns next to the remaining result columns ...
without_row_data = results_df_100.drop(columns=['row_data'])
tmp4 = pd.concat([without_row_data, row_data_df], axis=1)

# ... and keep only truth, prediction and the expanded row fields.
tmp4 = tmp4[['true_value', 'predicted_value', *row_data_df.columns]]

In [None]:
def calculate_moving_avg_mae(df, true_col, pred_col, window_size=5):
    """Cumulative MAE over the first i rows (i >= 2) plus its rolling mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the true and predicted values, in evaluation order.
    true_col, pred_col : str
        Names of the ground-truth and prediction columns.
    window_size : int, default 5
        Window of the rolling mean applied to the cumulative MAE.

    Returns
    -------
    pandas.DataFrame
        Columns ``num_rows`` (2..len(df)), ``mae`` (MAE of the first
        ``num_rows`` rows) and ``moving_avg_mae`` (rolling mean of ``mae``;
        NaN until ``window_size`` values are available). Empty when ``df``
        has fewer than two rows.

    Notes
    -----
    The cumulative MAE is computed from a running sum of absolute errors, so
    the cost is linear in the number of rows instead of recomputing the MAE
    for every prefix. Missing values propagate as NaN rather than raising.
    """
    abs_errors = np.abs(
        df[true_col].to_numpy(dtype=float) - df[pred_col].to_numpy(dtype=float)
    )

    # Prefix sizes start at 2, matching the first reported point.
    num_rows = np.arange(2, len(abs_errors) + 1)
    # MAE of the first i rows = (sum of the first i absolute errors) / i.
    cumulative_mae = np.cumsum(abs_errors)[1:] / num_rows

    mae_df = pd.DataFrame({'num_rows': num_rows, 'mae': cumulative_mae})
    mae_df['moving_avg_mae'] = mae_df['mae'].rolling(window=window_size).mean()

    return mae_df

# Moving-average MAE curve for each loaded result set.
mae_df_tmp = calculate_moving_avg_mae(tmp, 'true_value', 'predicted_value')    # cosine run
mae_df_tmp2 = calculate_moving_avg_mae(tmp2, 'true_value', 'predicted_value')  # chebyshev run
mae_df_tmp3 = calculate_moving_avg_mae(tmp3, 'true_value', 'predicted_value')  # "200" run
mae_df_tmp4 = calculate_moving_avg_mae(tmp4, 'true_value', 'predicted_value')  # DTW run

# Plot the moving average MAE of all four runs on one figure.
plt.figure(figsize=(10, 6))
plt.plot(mae_df_tmp['num_rows'], mae_df_tmp['moving_avg_mae'], label='cosine', color='blue')
# tmp2 comes from the chebyshev result file, so it is labelled accordingly
# (it was previously labelled 'levenshtein').
plt.plot(mae_df_tmp2['num_rows'], mae_df_tmp2['moving_avg_mae'], label='chebyshev', color='red')
# NOTE(review): tmp3 comes from the "..._200_results.csv" file; confirm that run used the euclidean metric.
plt.plot(mae_df_tmp3['num_rows'], mae_df_tmp3['moving_avg_mae'], label='euclidean', color='green')
plt.plot(mae_df_tmp4['num_rows'], mae_df_tmp4['moving_avg_mae'], label='DTW', color='black')
plt.xlabel('Number of Observed Rows')
plt.ylabel('Moving Average MAE')
plt.title('CSL312')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# catboost regular
# Show floats with two decimals in displayed output.
pd.options.display.float_format = '{:.2f}'.format

# Transpose the collected results and report the mean of every column.
df_results = pd.DataFrame(results).transpose()
df_results[list(df_results.columns)].mean()

In [None]:
# HMM regular
# Show floats with two decimals in displayed output.
pd.options.display.float_format = '{:.2f}'.format

# Transpose the collected results and report the mean of every column.
df_results = pd.DataFrame(results).transpose()
df_results[list(df_results.columns)].mean()