In [None]:
from Declare4Py.Encodings.Aggregate import Aggregate
from sklearn.linear_model import LogisticRegression

from Declare4Py.Encodings.IndexBased import IndexBased

import re
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
max_ev = 20
min_ev = 20

# Padding levels swept by the experiment loop: min_padding .. max_padding.
min_padding = 3
max_padding = max_ev
# Name of the helper column that records the padding level of each padded row.
padded_column_name = "padding_len"

# Seed handed to train_test_split and to the seeded classifiers below.
RNG = 0

# Each value selects the input file "experimental_model_pos_neg_<noise>.csv"
# and is shown in the plot title.
noise_list = [0, 5, 10, 15]

# Result accumulators filled by the experiment loop: key -> list of padding
# levels and key -> list of F1 scores, respectively.
padding_dict = {}
f1_score_dict = {}

# Encoder instance -> display name.  The instance is the key, so the loop
# iterates ``encoders.items()`` as (encoder, name) pairs.
encoders = {
    Aggregate(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        boolean=True,
    ): "Boolean",
    Aggregate(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        boolean=False,
    ): "Frequency",
    Aggregate(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        num_cols=['valore', 'age'],
        boolean=False,
        aggregation_functions=['min', 'mean', 'max'],
    ): "Aggregate",
    IndexBased(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        create_dummies=True,
    ): "SimpleIdx",
    IndexBased(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        num_cols=['valore', 'age'],
        create_dummies=True,
    ): "ComplexIdx",
}

# Classifier key -> estimator.  The MLP keeps its own seed (1); every other
# seeded estimator uses RNG.
classifiers = {
    "LogRegr": LogisticRegression(random_state=RNG),
    "SVC_rbf": SVC(kernel='rbf'),
    "Perceptron": Perceptron(tol=1e-3, random_state=RNG),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=5, random_state=RNG),
    "GradientBoostingClassifier": GradientBoostingClassifier(max_depth=5, random_state=RNG),
    "RandomForestClassifier": RandomForestClassifier(max_depth=5, random_state=RNG),
    "MLPClassifier": MLPClassifier(
        random_state=1,
        activation='tanh',
        hidden_layer_sizes=(100, 100),
        max_iter=1000,
    ),
}

# Plot styling per classifier key: legend label and marker, solid line for all.
clf_styles = {
    key: {"name": label, "marker": marker, "linestyle": "-"}
    for key, label, marker in (
        ("LogRegr", "Log. Regr.", "*"),
        ("SVC_rbf", "SVM", "o"),
        ("Perceptron", "Perceptron", "D"),
        ("DecisionTreeClassifier", "Decision Tree", "v"),
        ("GradientBoostingClassifier", "Gradient Boosting", "X"),
        ("RandomForestClassifier", "Random Forest", "P"),
        ("MLPClassifier", "DNN", "s"),
    )
}

# Plot styling per encoder name: the legend label equals the encoder name.
enc_styles = {
    key: {"name": key, "marker": marker, "linestyle": "-"}
    for key, marker in (
        ("ComplexIdx", "*"),
        ("Boolean", "o"),
        ("Frequency", "D"),
        ("Aggregate", "v"),
        ("SimpleIdx", "X"),
    )
}


def _columns_matching_padding(columns, padding: int) -> list:
    """Return the names in ``columns`` that ``pad_row`` zeroes at ``padding``.

    A name matches when it ends in ``_<padding>`` or contains
    ``_<padding>_`` (case-insensitive search).
    """
    patterns = [
        re.compile(rf"_{padding}$", re.IGNORECASE),
        re.compile(rf"_{padding}_", re.IGNORECASE),
    ]
    return [
        col_name
        for col_name in columns
        if any(pattern.search(col_name) for pattern in patterns)
    ]


def pad_row(row: dict, padding: int):
    """Append one padded copy of ``row`` to ``padded_list`` per padding level.

    Levels run from ``padding`` down to ``min_padding`` (inclusive).  At each
    level the row gets ``padded_column_name`` set to the level and every
    column listed for that level in ``zero_cols_by_padding`` set to 0, then
    it is appended.  Zeroing is cumulative: each level starts from a copy of
    the previous level's row.  Nothing is appended when ``padding`` is below
    ``min_padding``.

    Reads the module-level ``zero_cols_by_padding`` and ``padded_list`` that
    the experiment loop rebinds for every encoded frame; a level missing from
    ``zero_cols_by_padding`` raises ``KeyError``.  The dict passed in is
    mutated (it becomes the first appended row).
    """
    while padding >= min_padding:
        row[padded_column_name] = padding
        for col_name in zero_cols_by_padding[padding]:
            row[col_name] = 0
        padded_list.append(row)
        row = row.copy()
        padding -= 1


# For every noise level, fit the gradient-boosting classifier on each encoding
# and plot F1 (positive class) against the padding level, one line per encoder.
for noise in noise_list:
    for encoder, enc_name in encoders.items():
        clf = classifiers['GradientBoostingClassifier']

        print(enc_name, clf, noise)
        result_dataframe = pd.read_csv(f"experimental_model_pos_neg_{noise}.csv")
        mean_valore = result_dataframe['valore'].mean()
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the selected column: the in-place call on a chained selection is
        # deprecated and leaves the frame untouched under pandas Copy-on-Write.
        result_dataframe['valore'] = result_dataframe['valore'].fillna(mean_valore)

        enc_df: pd.DataFrame = encoder.fit_transform(result_dataframe)
        target_df = result_dataframe[["case:concept:name", "case:label"]].drop_duplicates()
        enc_df = pd.merge(enc_df, target_df, on="case:concept:name").drop(["case:concept:name"], axis=1)

        # APPLY padding
        # The columns to zero depend only on the column names, so resolve them
        # once per encoded frame rather than once per row and level.
        zero_cols_by_padding = {
            level: _columns_matching_padding(enc_df.columns, level)
            for level in range(min_padding, max_padding + 1)
        }
        padded_list: list = []

        for index in enc_df.index:
            pad_row(dict(enc_df.loc[index]), max_padding)

        padded_df: pd.DataFrame = pd.DataFrame(padded_list)
        padded_df.reset_index(drop=True, inplace=True)

        x_cols = list(padded_df.columns)
        x_cols.remove('case:label')
        y_cols = ['case:label', padded_column_name]

        # NOTE(review): the split is over padded rows, so rows derived from the
        # same encoded case can land in both the training and the test part.
        # Confirm this is intended.
        x_train, x_test, y_train, y_test = train_test_split(padded_df[x_cols], padded_df[y_cols], test_size=0.2, random_state=RNG)
        # The padding level is bookkeeping only; it is not a training feature.
        y_train = y_train.drop(axis=1, labels=[padded_column_name])
        x_train = x_train.drop(axis=1, labels=[padded_column_name])

        # Test rows grouped by padding level, with the bookkeeping column removed.
        x_test_dict = {}
        y_test_dict = {}

        for padding in range(min_padding, max_padding + 1):
            x_test_dict[padding] = x_test[x_test[padded_column_name] == padding].copy().drop(axis=1, labels=[padded_column_name])
            y_test_dict[padding] = y_test[y_test[padded_column_name] == padding].copy().drop(axis=1, labels=[padded_column_name])

        padding_dict[enc_name] = []
        f1_score_dict[enc_name] = []

        clf.fit(x_train, y_train.values.ravel())

        # NOTE(review): max_padding has a test subset above but is not scored
        # here (range end is exclusive). Confirm this is intended.
        for padding in range(min_padding, max_padding):
            filtered_x_test = x_test_dict[padding]
            filtered_y_test = y_test_dict[padding]
            y_pred = clf.predict(filtered_x_test)

            padding_dict[enc_name].append(padding)
            f1_score_dict[enc_name].append(round(f1_score(list(filtered_y_test["case:label"]), y_pred, average="binary", pos_label="Positive"), 5))

    # One figure per noise level, one line per encoder.
    plt.style.use('paper.mplstyle')
    plt.figure(figsize=(10, 10), dpi=80)

    for name in padding_dict:
        paddings = padding_dict[name]
        f1_scores = f1_score_dict[name]
        style = enc_styles[name]
        plt.plot(paddings, f1_scores, label=style["name"], marker=style["marker"], linestyle=style["linestyle"])

    plt.title(f"F1 score with {noise}\\% of noise")
    plt.xlabel("Prefix length")
    plt.ylim(0.35, 1.02)
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"F1_{noise}_encoders.pdf")
    plt.show()

In [None]:
from Declare4Py.Encodings.Aggregate import Aggregate
from sklearn.linear_model import LogisticRegression

# ---------------------------------------------------------------------------
# Experiment configuration (classifier comparison)
# ---------------------------------------------------------------------------
# Padding levels swept by the experiment loop: min_padding .. max_padding.
# ``max_ev`` is not defined in this cell; it must already exist in the session.
min_padding = 3
max_padding = max_ev
# Name of the helper column that records the padding level of each padded row.
padded_column_name = "padding_len"
# Placeholder; the plotting loop below rebinds ``name`` to each result key.
name = "NULL"

# Seed handed to train_test_split and to the seeded classifiers below.
RNG = 0

# Each value selects the input file "experimental_model_pos_neg_<noise>.csv"
# and is shown in the plot title.
noise_list = [0, 5, 10, 15]

# Result accumulators filled by the experiment loop: key -> list of padding
# levels and key -> list of F1 scores, respectively.
padding_dict = {}
f1_score_dict = {}

# Encoder instance -> display name (the instance is the dict key).
encoders = {
    Aggregate(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        boolean=True,
    ): "Boolean",
    Aggregate(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        boolean=False,
    ): "Frequency",
    Aggregate(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        num_cols=['valore', 'age'],
        boolean=False,
        aggregation_functions=['min', 'mean', 'max'],
    ): "Aggregate",
    IndexBased(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        create_dummies=True,
    ): "SimpleIdx",
    IndexBased(
        case_id_col="case:concept:name",
        cat_cols=['concept:name'],
        num_cols=['valore', 'age'],
        create_dummies=True,
    ): "ComplexIdx",
}

# Classifier key -> estimator.  The MLP keeps its own seed (1); every other
# seeded estimator uses RNG.
classifiers = {
    "LogRegr": LogisticRegression(random_state=RNG),
    "SVC_rbf": SVC(kernel='rbf'),
    "Perceptron": Perceptron(tol=1e-3, random_state=RNG),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=5, random_state=RNG),
    "GradientBoostingClassifier": GradientBoostingClassifier(max_depth=5, random_state=RNG),
    "RandomForestClassifier": RandomForestClassifier(max_depth=5, random_state=RNG),
    "MLPClassifier": MLPClassifier(
        random_state=1,
        activation='tanh',
        hidden_layer_sizes=(100, 100),
        max_iter=1000,
    ),
}

# Plot styling per classifier key: legend label and marker, solid line for all.
clf_styles = {
    key: {"name": label, "marker": marker, "linestyle": "-"}
    for key, label, marker in (
        ("LogRegr", "Log. Regr.", "*"),
        ("SVC_rbf", "SVM", "o"),
        ("Perceptron", "Perceptron", "D"),
        ("DecisionTreeClassifier", "Decision Tree", "v"),
        ("GradientBoostingClassifier", "Gradient Boosting", "X"),
        ("RandomForestClassifier", "Random Forest", "P"),
        ("MLPClassifier", "DNN", "s"),
    )
}

# Plot styling per encoder name: the legend label equals the encoder name.
enc_styles = {
    key: {"name": key, "marker": marker, "linestyle": "-"}
    for key, marker in (
        ("ComplexIdx", "*"),
        ("Boolean", "o"),
        ("Frequency", "D"),
        ("Aggregate", "v"),
        ("SimpleIdx", "X"),
    )
}


def _columns_matching_padding(columns, padding: int) -> list:
    """Return the names in ``columns`` that ``pad_row`` zeroes at ``padding``.

    A name matches when it ends in ``_<padding>`` or contains
    ``_<padding>_`` (case-insensitive search).
    """
    patterns = [
        re.compile(rf"_{padding}$", re.IGNORECASE),
        re.compile(rf"_{padding}_", re.IGNORECASE),
    ]
    return [
        col_name
        for col_name in columns
        if any(pattern.search(col_name) for pattern in patterns)
    ]


def pad_row(row: dict, padding: int):
    """Append one padded copy of ``row`` to ``padded_list`` per padding level.

    Levels run from ``padding`` down to ``min_padding`` (inclusive).  At each
    level the row gets ``padded_column_name`` set to the level and every
    column listed for that level in ``zero_cols_by_padding`` set to 0, then
    it is appended.  Zeroing is cumulative: each level starts from a copy of
    the previous level's row.  Nothing is appended when ``padding`` is below
    ``min_padding``.

    Reads the module-level ``zero_cols_by_padding`` and ``padded_list`` that
    the experiment loop rebinds for every encoded frame; a level missing from
    ``zero_cols_by_padding`` raises ``KeyError``.  The dict passed in is
    mutated (it becomes the first appended row).
    """
    while padding >= min_padding:
        row[padded_column_name] = padding
        for col_name in zero_cols_by_padding[padding]:
            row[col_name] = 0
        padded_list.append(row)
        row = row.copy()
        padding -= 1


# For every noise level, fit each classifier on the index-based encoding and
# plot F1 (positive class) against the padding level, one line per classifier.
for noise in noise_list:
    # Loading, encoding, padding and splitting depend only on the noise level,
    # so they run once here instead of once per classifier.
    # NOTE(review): this assumes encoder.fit_transform is deterministic for a
    # given input frame - confirm against the encoder implementation.
    encoder = IndexBased(case_id_col="case:concept:name", cat_cols=['concept:name'], num_cols=['valore', 'age'], create_dummies=True)

    result_dataframe = pd.read_csv(f"experimental_model_pos_neg_{noise}.csv")
    mean_valore = result_dataframe['valore'].mean()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the selected column: the in-place call on a chained selection is
    # deprecated and leaves the frame untouched under pandas Copy-on-Write.
    result_dataframe['valore'] = result_dataframe['valore'].fillna(mean_valore)

    enc_df: pd.DataFrame = encoder.fit_transform(result_dataframe)
    target_df = result_dataframe[["case:concept:name", "case:label"]].drop_duplicates()
    enc_df = pd.merge(enc_df, target_df, on="case:concept:name").drop(["case:concept:name"], axis=1)

    # APPLY padding
    # The columns to zero depend only on the column names, so resolve them
    # once per encoded frame rather than once per row and level.
    zero_cols_by_padding = {
        level: _columns_matching_padding(enc_df.columns, level)
        for level in range(min_padding, max_padding + 1)
    }
    padded_list: list = []

    for index in enc_df.index:
        pad_row(dict(enc_df.loc[index]), max_padding)

    padded_df: pd.DataFrame = pd.DataFrame(padded_list)
    padded_df.reset_index(drop=True, inplace=True)

    x_cols = list(padded_df.columns)
    x_cols.remove('case:label')
    y_cols = ['case:label', padded_column_name]

    # NOTE(review): the split is over padded rows, so rows derived from the
    # same encoded case can land in both the training and the test part.
    # Confirm this is intended.
    x_train, x_test, y_train, y_test = train_test_split(padded_df[x_cols], padded_df[y_cols], test_size=0.2, random_state=RNG)
    # The padding level is bookkeeping only; it is not a training feature.
    y_train = y_train.drop(axis=1, labels=[padded_column_name])
    x_train = x_train.drop(axis=1, labels=[padded_column_name])

    # Test rows grouped by padding level, with the bookkeeping column removed.
    x_test_dict = {}
    y_test_dict = {}

    for padding in range(min_padding, max_padding + 1):
        x_test_dict[padding] = x_test[x_test[padded_column_name] == padding].copy().drop(axis=1, labels=[padded_column_name])
        y_test_dict[padding] = y_test[y_test[padded_column_name] == padding].copy().drop(axis=1, labels=[padded_column_name])

    for clf_name, clf in classifiers.items():
        print(encoder, clf_name, noise)

        padding_dict[clf_name] = []
        f1_score_dict[clf_name] = []

        clf.fit(x_train, y_train.values.ravel())

        # NOTE(review): max_padding has a test subset above but is not scored
        # here (range end is exclusive). Confirm this is intended.
        for padding in range(min_padding, max_padding):
            filtered_x_test = x_test_dict[padding]
            filtered_y_test = y_test_dict[padding]
            y_pred = clf.predict(filtered_x_test)

            padding_dict[clf_name].append(padding)
            f1_score_dict[clf_name].append(round(f1_score(list(filtered_y_test["case:label"]), y_pred, average="binary", pos_label="Positive"), 5))

    # One figure per noise level, one line per classifier.
    plt.style.use('paper.mplstyle')
    plt.figure(figsize=(10, 10), dpi=80)

    for name in padding_dict:
        paddings = padding_dict[name]
        f1_scores = f1_score_dict[name]
        style = clf_styles[name]
        plt.plot(paddings, f1_scores, label=style["name"], marker=style["marker"], linestyle=style["linestyle"])

    plt.title(f"F1 score with {noise}\\% of noise")
    plt.xlabel("Prefix length")
    plt.ylim(0.45, 1.02)
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"F1_{noise}_classifiers.pdf")
    plt.show()