In [None]:
import sys
sys.path.insert(0, '../scripts/')

from pathlib import Path
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.filtering.log.attributes import attributes_filter
from pm4py.algo.filtering.log.timestamp import timestamp_filter
from pm4py.algo.filtering.log.start_activities import start_activities_filter
from pm4py.statistics.traces.generic.log import case_statistics
from pm4py.visualization.graphs import visualizer as graphs_visualizer
from pm4py import view_dotted_chart, view_events_distribution_graph

from utils import export_dictionary

# Location of the XES event log to analyse. Later cells also use the final
# path component (Path(file_name).name) as a prefix for exported files.
file_name = "../data/logs"

# Parse the event log once; the analysis cells below all read from `log`.
log = xes_importer.apply(file_name)

In [None]:
# Variant statistics, ordered from the most to the least frequent variant,
# then exported under a name derived from the log file.
variants_count = sorted(
    case_statistics.get_variant_statistics(log),
    key=lambda variant: variant['count'],
    reverse=True,
)
export_dictionary(
    variants_count,
    "data/statistics/variants",
    f"{Path(file_name).name}_variants",
)

In [None]:
# Start activities with their occurrence counts, most frequent first.
# `start_activities_count` ends up as a list of (activity, count) pairs;
# `sorted_start_activities_count` is the same data as an ordered dict.
start_activities_count = sorted(
    start_activities_filter.get_start_activities(log).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_start_activities_count = dict(start_activities_count)
export_dictionary(
    sorted_start_activities_count,
    "data/statistics/start_activities",
    f"{Path(file_name).name}_start_activities",
)

In [None]:
# Distribution of events over time: shows in which time intervals the
# greatest number of events is recorded.
# Step 1: density estimate over the event timestamps.
x, y = attributes_filter.get_kde_date_attribute(log, attribute="time:timestamp")

# Step 2: plot the estimate on a date axis and display it.
gviz = graphs_visualizer.apply_plot(
    x,
    y,
    variant=graphs_visualizer.Variants.DATES,
)
graphs_visualizer.view(gviz)

In [None]:
# Event distribution chart, aggregated by the chosen time unit.
# Accepted values for distr_type: hours, days_week, days_month, months, years.
distr_type = "years"

view_events_distribution_graph(
    log,
    distr_type=distr_type,
    format="png",
)

In [None]:
# Dotted chart: a classic visualization of the events of an event log across
# different dimensions (X, Y, colors). Each event is drawn as one point:
# X = timestamp, Y = case index, colors = concept:name (tx function name).

# Rendering the full log raised an out-of-memory exception, so the chart is
# built from a time-bounded subset (per the original note, only April 2022 is
# left out).
# The subset is bound to its own name instead of overwriting `log`: rebinding
# `log` here would make every earlier cell silently run on the truncated log
# when re-executed after this one.
log_dotted_chart = timestamp_filter.filter_traces_contained(
    log, "2018-01-01 00:00:00", "2022-03-15 13:15:47")
view_dotted_chart(log_dotted_chart, format="svg")
