In [94]:
import pandas as pd
pd.set_option("display.float_format", "{:,.2f}".format)
from tabulate import tabulate

In [95]:
# Results of the validation experiment (one CSV per configuration run).
table_source = "../logs/smartRPA/202511-results/validation_experiment_results_word2vec_compCorpus_safety100_overlap_ratio_0_8_workstation_rho06.csv"

# Shorten the motif0-* column names so they are easier to reference in the pivots below.
short_column_names = {
    "motif0-length": "motLength",
    "motif0-occurances": "motOcc",
    "motif0-shuffle": "motShuffle",
}
df = pd.read_csv(table_source).rename(columns=short_column_names)

Grouping columns used for the pivot tables below (`>>` marks a column that is renamed on load):

percentageMotifsOverLog

logLength

noOfMotifs

motif0-length >> motLength 

motif0-occurances >> motOcc

motif0-shuffle >> motShuffle

In [96]:
# Confusion counts that are summed within each group.
count_columns = ["total_tp", "total_fp", "total_fn"]

# One pivot per experiment parameter: rows are the parameter's values,
# columns are the summed TP / FP / FN counts over all matching runs.
pivotPercentage = df.pivot_table(index=["percentageMotifsOverLog"], values=count_columns, aggfunc='sum')
pivotLogLength = df.pivot_table(index=["logLength"], values=count_columns, aggfunc='sum')
pivotNoOfMotifs = df.pivot_table(index=["noOfMotifs"], values=count_columns, aggfunc='sum')
pivotMotOcc = df.pivot_table(index=["motOcc"], values=count_columns, aggfunc='sum')
pivotMotLength = df.pivot_table(index=["motLength"], values=count_columns, aggfunc='sum')
pivotMotShuffle = df.pivot_table(index=["motShuffle"], values=count_columns, aggfunc='sum')

# NOTE(review): pivotLogLength is built above but is not part of this list,
# so it is never printed by the reporting loop - confirm this is intentional.
list_of_pivots = [
    pivotPercentage,
    pivotNoOfMotifs,
    pivotMotOcc,
    pivotMotLength,
    pivotMotShuffle,
]

In [97]:
def calculate_classification_metrics(df):
    """Append recall, precision and F1-score columns to a table of confusion counts.

    Args:
        df: DataFrame with the columns ``total_tp``, ``total_fp`` and
            ``total_fn``. It is not modified.

    Returns:
        A copy of ``df`` with three extra columns:

        * ``recall``    = TP / (TP + FN)
        * ``precision`` = TP / (TP + FP)
        * ``f1-score``  = 2 * precision * recall / (precision + recall)

        Any metric that comes out as NaN (e.g. a 0 / 0 division) is
        reported as 0.0.
    """
    # Work on a copy so the caller's frame (possibly a slice) stays untouched.
    metrics = df.copy()

    true_pos = metrics["total_tp"]
    false_neg = metrics["total_fn"]
    false_pos = metrics["total_fp"]

    # Recall: share of the actual motifs that were found.
    metrics["recall"] = true_pos / (true_pos + false_neg)

    # Precision: share of the reported motifs that were correct.
    metrics["precision"] = true_pos / (true_pos + false_pos)

    # F1-score: harmonic mean of precision and recall.
    precision = metrics["precision"]
    recall = metrics["recall"]
    metrics["f1-score"] = 2 * (precision * recall) / (precision + recall)

    # Rows with a zero denominator yield NaN; report those as 0.0.
    metric_columns = ["recall", "precision", "f1-score"]
    metrics[metric_columns] = metrics[metric_columns].fillna(0.0)

    return metrics

# Report: extend every pivot with recall / precision / F1 and print it
# as a psql-style text table (one table per grouping column).
for pivot in map(calculate_classification_metrics, list_of_pivots):
    print(tabulate(pivot, headers='keys', tablefmt='psql', numalign="right"))

+---------------------------+------------+------------+------------+----------+-------------+------------+
|   percentageMotifsOverLog |   total_fn |   total_fp |   total_tp |   recall |   precision |   f1-score |
|---------------------------+------------+------------+------------+----------+-------------+------------|
|                        10 |        349 |       1082 |        146 | 0.294949 |    0.118893 |   0.169472 |
|                        25 |        977 |       1467 |        358 | 0.268165 |    0.196164 |   0.226582 |
|                        50 |       1735 |       2072 |        695 | 0.286008 |    0.251175 |   0.267462 |
|                        75 |       1534 |       1200 |        926 | 0.376423 |     0.43556 |   0.403838 |
|                       100 |       2310 |        564 |        435 |  0.15847 |    0.435435 |   0.232372 |
+---------------------------+------------+------------+------------+----------+-------------+------------+
+--------------+------------+--------

In [98]:
# Drop the runs whose percentageMotifsOverLog is 1.00 or 100.00 before pivoting.
# NOTE(review): presumably both spellings denote "motifs cover the whole log"
# (fraction vs. percent encoding) - confirm against the CSV.
excluded_percentages = [1.00, 100.00]
is_excluded = df["percentageMotifsOverLog"].isin(excluded_percentages)
dfFilteredOne = df[~is_excluded]

# Confusion counts that are summed within each group.
filtered_count_columns = ["total_tp", "total_fp", "total_fn"]

# Same per-parameter pivots as for the full data, built on the filtered runs.
pivotPercentageFiltered = dfFilteredOne.pivot_table(index=["percentageMotifsOverLog"], values=filtered_count_columns, aggfunc='sum')
pivotLogLengthFiltered = dfFilteredOne.pivot_table(index=["logLength"], values=filtered_count_columns, aggfunc='sum')
pivotNoOfMotifsFiltered = dfFilteredOne.pivot_table(index=["noOfMotifs"], values=filtered_count_columns, aggfunc='sum')
pivotMotOccFiltered = dfFilteredOne.pivot_table(index=["motOcc"], values=filtered_count_columns, aggfunc='sum')
pivotMotLengthFiltered = dfFilteredOne.pivot_table(index=["motLength"], values=filtered_count_columns, aggfunc='sum')
pivotMotShuffleFiltered = dfFilteredOne.pivot_table(index=["motShuffle"], values=filtered_count_columns, aggfunc='sum')

# NOTE(review): pivotLogLengthFiltered is built above but is not part of this
# list, so it is never printed - confirm this is intentional.
list_of_filtered_pivots = [
    pivotPercentageFiltered,
    pivotNoOfMotifsFiltered,
    pivotMotOccFiltered,
    pivotMotLengthFiltered,
    pivotMotShuffleFiltered,
]

In [99]:
# Report for the filtered runs: extend every pivot with recall / precision / F1
# and print it as a psql-style text table (one table per grouping column).
for pivot in map(calculate_classification_metrics, list_of_filtered_pivots):
    print(tabulate(pivot, headers='keys', tablefmt='psql', numalign="right"))

+---------------------------+------------+------------+------------+----------+-------------+------------+
|   percentageMotifsOverLog |   total_fn |   total_fp |   total_tp |   recall |   precision |   f1-score |
|---------------------------+------------+------------+------------+----------+-------------+------------|
|                        10 |        349 |       1082 |        146 | 0.294949 |    0.118893 |   0.169472 |
|                        25 |        977 |       1467 |        358 | 0.268165 |    0.196164 |   0.226582 |
|                        50 |       1735 |       2072 |        695 | 0.286008 |    0.251175 |   0.267462 |
|                        75 |       1534 |       1200 |        926 | 0.376423 |     0.43556 |   0.403838 |
+---------------------------+------------+------------+------------+----------+-------------+------------+
+--------------+------------+------------+------------+----------+-------------+------------+
|   noOfMotifs |   total_fn |   total_fp |   total