In [None]:
import pandas as pd
import os

# Location of the result file. The commented-out assignments are alternative
# inputs that can be swapped in.
dir_path = "../.."
#dir_path = "e2_results"
file_data = "Out_E2_bis-FaceAll.csv"
#file_data = "Out_240214_bis-GesturePebble.csv"

# The file is read with header=None, so the column names are supplied here.
colnames = [
    'dataset', 'num', 'num_data_pts', 'num_outliers_removed', 'num_coeffs', 'method',
    'max_dist', 'MSE', 'MAE', 'mean_diff',
    'max_fst_derivative', 'max_snd_derivative',
    'min_fst_derivative', 'min_snd_derivative',
    'avg_fst_derivative', 'avg_snd_derivative',
]

csv_path = os.path.join(dir_path, file_data)
raw_results = pd.read_csv(csv_path, names=colnames, header=None)
# Exact duplicate rows are dropped before any analysis.
e2 = raw_results.drop_duplicates()
e2

# for each method: accuracy measures (max_dist, MSE, MAE)
Question: How does our method compare to other methods for time series approximation and/or compression? Can it be improved by adding an L1 fitting step?

In [None]:
import pandas as pd
# Per-method mean of each accuracy measure.
# Optional: uncomment to show floats with eight decimals.
#pd.set_option('display.float_format', '{:.8f}'.format)
accuracy_columns = ['max_dist', 'MSE', 'MAE']
mean_values = (
    e2.groupby('method')
      .agg(dict.fromkeys(accuracy_columns, 'mean'))
      .reset_index()
)
mean_values

In [None]:
import matplotlib.pyplot as plt

# Plot the per-method mean max. distance and mean MAE as grouped bars.
# The two series are placed side by side; drawing both at the same x position
# makes the bars cover each other and reads like a stacked chart.
fig, ax = plt.subplots(figsize=(10, 6))

group_positions = list(range(len(mean_values)))
group_bar_width = 0.4
left_centres = [pos - group_bar_width / 2 for pos in group_positions]
right_centres = [pos + group_bar_width / 2 for pos in group_positions]

ax.bar(left_centres, mean_values['max_dist'], width=group_bar_width, color='b', label='Mean Max Dist')
ax.bar(right_centres, mean_values['MAE'], width=group_bar_width, color='g', label='Mean MAE')
# MSE is not drawn in this figure; uncomment (and adjust the positions) to add it.
#ax.bar(group_positions, mean_values['MSE'], width=group_bar_width, color='r', label='Mean MSE')

ax.set_xlabel('Method')
ax.set_ylabel('Mean Value')
# The title names only the measures that are actually plotted.
ax.set_title('Mean max. distance and MAE per method')
ax.legend()
# One tick per method, centred under its pair of bars.
ax.set_xticks(group_positions)
ax.set_xticklabels(mean_values['method'], rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One panel per accuracy measure, each showing the per-method mean as bars.
methods = mean_values['method']
metrics = ['max_dist', 'MSE', 'MAE']

bar_width = 0.2
num_methods = len(methods)
r = np.arange(num_methods)
# Bar centres; the same positions are reused for the ticks so every label
# sits directly under its bar.
bar_centres = r + bar_width / 2

fig, axs = plt.subplots(len(metrics), figsize=(10, 15))

for ax, metric in zip(axs, metrics):
    ax.bar(bar_centres, mean_values[metric], color='b', width=bar_width)#, label='?')

    ax.set_xlabel('Method', fontweight='bold')
    ax.set_ylabel(f'Mean {metric}', fontweight='bold')
    ax.set_xticks(bar_centres)
    ax.set_xticklabels(methods, rotation=45, ha='right')
    ax.set_title(f'Comparison of mean {metric}')
    #ax.legend()

plt.tight_layout()
plt.show()

# for each method: outliers vs. no outliers
Question: Does removing outliers improve the accuracy of the approximated time series?
(If so, the removed outliers need to be stored separately!)

In [None]:
# Split the rows by num_outliers_removed: positive counts go to
# df_no_outliers, zero counts go to df_with_outliers. The size of each
# subset is printed.
outliers_removed_mask = e2['num_outliers_removed'] > 0
df_no_outliers = e2[outliers_removed_mask]
print(len(df_no_outliers.index))
none_removed_mask = e2['num_outliers_removed'] == 0
df_with_outliers = e2[none_removed_mask]
print(len(df_with_outliers.index))

# The same aggregation is applied to both subsets.
accuracy_means = {'max_dist': 'mean', 'MSE': 'mean', 'MAE': 'mean'}

# Means for the rows with removed outliers get a ' w/o outliers' suffix so
# that both sets of columns can live in one table.
mean_values_no_outliers = (
    df_no_outliers.groupby('method')
    .agg(accuracy_means)
    .reset_index()
    .rename(columns={name: f'{name} w/o outliers' for name in accuracy_means})
)

# Means for the rows without outlier removal keep the plain column names.
mean_values_with_outliers = (
    df_with_outliers.groupby('method')
    .agg(accuracy_means)
    .reset_index()
)

# Outer merge: a method that appears in only one subset is still listed.
mean_values_combined = pd.merge(
    mean_values_with_outliers,
    mean_values_no_outliers,
    on='method',
    how='outer',
)
mean_values_combined

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One panel per accuracy measure, comparing the per-method mean with and
# without outlier removal as two bars per method.
methods = mean_values_combined['method']
metrics = ['max_dist', 'MSE', 'MAE']

bar_width = 0.2
num_methods = len(methods)
r = np.arange(num_methods)

# Centres of the two bars of each group and the midpoint between them.
r1 = r + bar_width / 2
r2 = r1 + bar_width
# Ticks go at the group midpoint so each label is centred under both bars
# instead of under the first bar only.
group_centres = (r1 + r2) / 2

fig, axs = plt.subplots(len(metrics), figsize=(10, 15))

for ax, metric in zip(axs, metrics):
    metric_with_outliers = mean_values_combined[metric]
    metric_without_outliers = mean_values_combined[f'{metric} w/o outliers']

    ax.bar(r1, metric_with_outliers, color='b', width=bar_width, label='With Outliers')
    ax.bar(r2, metric_without_outliers, color='r', width=bar_width, label='Without Outliers')

    ax.set_xlabel('Method', fontweight='bold')
    ax.set_ylabel(f'Mean {metric}', fontweight='bold')
    ax.set_xticks(group_centres)
    ax.set_xticklabels(methods, rotation=45, ha='right')
    ax.set_title(f'Comparison of Mean {metric} with and without Outlier Removal')
    ax.legend()

plt.tight_layout()
plt.show()

# For the 'L8 and L1' method: correlation between accuracy measures and time series properties
Question: Does our method approximate time series with certain properties more/less accurately?

In [None]:
# Keep only the rows whose method label is 'L8 and L1'.
is_max_l1 = e2['method'] == 'L8 and L1'
df_max_l1 = e2[is_max_l1]

In [None]:
# Pairwise correlations between the accuracy measures and a selection of
# the other numeric columns.
correlation_columns = [
    'max_dist', 'MSE', 'MAE',
    'num_data_pts', 'mean_diff',
    'max_fst_derivative', 'max_snd_derivative',
]
correlation_matrix = df_max_l1[correlation_columns].corr()
print("Correlation Matrix:")
correlation_matrix

In [None]:
# Show every column from the third one onwards (skips 'dataset' and 'num').
columns_from_third = df_max_l1.columns[2:]
df_max_l1[columns_from_third]

In [None]:
import seaborn as sns
# Pair plot over a few selected columns of the 'L8 and L1' rows, coloured
# by the method column.
sns.set_theme(style="ticks")
pairplot_columns = ['num_data_pts', 'method', 'max_dist', 'mean_diff', 'max_fst_derivative']
pairplot_data = df_max_l1[pairplot_columns]
sns.pairplot(pairplot_data, hue="method")
# Alternative kept for reference:
#sns.pairplot(df_max_l1[2:],hue="method")

In [None]:
# One relational plot per column listed below, each with max_dist on the y-axis.
x_axis_columns = [
    'mean_diff',
    'max_fst_derivative', 'max_snd_derivative',
    'min_fst_derivative', 'min_snd_derivative',
    'avg_fst_derivative', 'avg_snd_derivative',
]
for measurement in x_axis_columns:
    sns.relplot(data=df_max_l1, x=measurement, y="max_dist")

In [None]:
# Grid of scatter plots over the listed columns of the 'L8 and L1' rows.
grid_columns = [
    'num_outliers_removed', 'num_data_pts', 'max_dist',
    'max_fst_derivative', 'max_snd_derivative',
    'min_fst_derivative', 'min_snd_derivative',
    'avg_fst_derivative', 'avg_snd_derivative',
]
pair_grid = sns.PairGrid(df_max_l1[grid_columns])
g = pair_grid.map(sns.scatterplot)

In [None]:
# This cell is a single triple-quoted string literal, so none of the code
# inside it runs; it only keeps a disabled pair-plot experiment for reference.
"""import seaborn as sns
sns.set_theme(style="ticks")
# for method in ['L8', 'L8 and L1', 'LSQ', 'PAA', 'PLA', 'DFT']:
df = e2[e2['method']=='L8 and L1']
#sns.pairplot(df[['num_data_pts','method','max_dist','mean_diff','max_fst_derivative']],hue="method")
sns.pairplot(df[2:],hue="method")"""