### Installing dependencies

In [None]:
# Path of the Python interpreter, stored under the "PYTHON" environment key.
# NOTE(review): presumably consumed by PyCall when it is built — confirm; the path is
# specific to one Windows machine and has to be adapted elsewhere.
ENV["PYTHON"] = "C:\\Users\\lap2r\\AppData\\Local\\Programs\\Python\\Python311\\python.exe"

### Environment configuration

In [None]:
using PyCall
# Put the parent of this file's directory at the front of Python's module search path
# so that the `paths_rel` Python module can be imported.
py_search_path = PyVector(pyimport("sys")["path"])
pushfirst!(py_search_path, joinpath(@__DIR__, ".."))
paths_rel = pyimport("paths_rel")

# Location of the alarms data file: "../" followed by the value exported by `paths_rel`.
# NOTE(review): this is resolved against the current working directory — confirm.
alarms_data_file_path = string("../", paths_rel.REL_ALARMS_DATA_FILE)

## Alarms dataset EDA

##### Viewing dataset parameters


In [None]:
using DataFrames
using CSV
using StatsPlots

# Read the alarms CSV file and materialise it as a DataFrame.
alarms = CSV.File(alarms_data_file_path) |> DataFrame

In [None]:
# Table dimensions as a (rows, columns) tuple.
(nrow(alarms), ncol(alarms))

In [None]:
# Element type of every column, in column order.
[eltype(column) for column in eachcol(alarms)]

In [None]:
# Per-column summary of the alarms table.
alarms |> describe

##### Analysing dataset contents


In [None]:
# Smallest value of the `start` column.
println("First event date: $(minimum(alarms.start))")

# Largest value of the `end` column.
println("Latest event date: $(maximum(alarms[!, :end]))")

In [None]:
using Dates
using Statistics
using StatsBase

# Alarm length = parsed `end` timestamp minus parsed `start` timestamp, stored as whole
# minutes (milliseconds divided by 60_000, rounded to the nearest integer).
alarms[!, :duration] = let stamp_format = "yyyy-mm-dd HH:MM:SS"
    began = DateTime.(alarms.start, stamp_format)
    ended = DateTime.(alarms[!, :end], stamp_format)
    elapsed_ms = Dates.value.(ended - began)
    Minute.(round.(Int, elapsed_ms / (1000 * 60)))
end

shortest, longest = extrema(alarms.duration)
println("minimum duration: ", shortest)
println("maximum duration: ", longest)

# Preview the ten shortest alarms, then the ten longest. Both sorts happen in place,
# so `alarms` is left ordered by descending duration afterwards.
sort!(alarms, :duration)
println("Sorted by minimum duration:")
println(first(alarms, 10))
println("Sorted by maximum duration:")
sort!(alarms, :duration; rev=true)
println(first(alarms, 10))

In [None]:
using StatsBase
# Frequency of each alarm duration, keyed by the plain integer number of minutes
# (the `Minute` keys produced by `countmap` are unwrapped through their `value` field).
duration_counts_v1 = Dict{Int64, Int64}(
    span.value => occurrences for (span, occurrences) in countmap(alarms.duration)
)

In [None]:
# PLOTS_DEFAULTS = Dict(:dpi => 600)

# https://docs.juliaplots.org/latest/generated/attributes_axis/
# https://docs.juliaplots.org/latest/generated/attributes_plot/
# https://docs.juliaplots.org/latest/generated/attributes_subplot/

using Plots.PlotMeasures

# One accent colour shared by the series fill and its outline.
accent_color = RGB(250 / 255, 135 / 255, 117 / 255)
# The same margin is applied on all four sides.
uniform_margin = 5mm

# Defaults applied to every subsequent plot unless a call overrides them.
default(;
    size=(800, 800),
    dpi=600,
    legend=true,
    left_margin=uniform_margin,
    right_margin=uniform_margin,
    top_margin=uniform_margin,
    bottom_margin=uniform_margin,
    xrotation=90,
    draw_arrow=true,
    grid=false,
    minorgrid=false,
    color=accent_color,
    linecolor=accent_color,
)

In [None]:
using Plots;

# Largest duration (dictionary key) and largest frequency (dictionary value).
max_x = maximum(first, duration_counts_v1)
max_y = maximum(last, duration_counts_v1)

# Axis upper bounds leave some headroom beyond the largest observed values.
x_upper = max_x + 50
y_upper = max_y + 15

# Bar chart: how many alarms lasted a given number of minutes.
bar(
    duration_counts_v1;
    title="\nAlarm duration to count",
    xlabel="Duration (minutes)",
    ylabel="Count",
    label="Count",
    xlims=(-20, x_upper),
    ylims=(-10, y_upper),
    xticks=0:60:x_upper,
    yticks=0:15:y_upper,
    grid=true,
    minorgrid=true,
    size=(1200, 800),
)

In [None]:
# Tally how often each duration occurs and order the (duration => count) pairs from the
# most to the least frequent.
duration_counts = sort(collect(countmap(alarms.duration)); by=last, rev=true)
# Keep the 50 most common durations. `first(v, n)` returns the whole vector when it has
# fewer than `n` entries, whereas indexing with `1:50` would throw a `BoundsError`.
duration_counts = first(duration_counts, 50)

# Split the pairs into two parallel vectors: durations and their counts.
duration, counts = first.(duration_counts), last.(duration_counts)
# Final result: a two-column table with columns `duration` and `counts`.
duration_counts = DataFrame(duration=duration, counts=counts)

In [None]:
# Number of alarms per region as (region_title => count) pairs, most alarms first.
region_counts = sort(collect(countmap(alarms.region_title)); by=last, rev=true)
# Split the pairs into two parallel vectors: region names and their counts.
region_title, counts = first.(region_counts), last.(region_counts)
# Final result: a two-column table with columns `region_title` and `counts`.
region_counts = DataFrame(region_title=region_title, counts=counts)

In [None]:
# Bar chart of alarm counts per region, in the row order of `region_counts`.
bar(
    region_counts[!, :region_title],
    region_counts[!, :counts];
    title="\nAlarms Count by region",
    xlabel="Region name",
    ylabel="Count",
    label="Count",
    xticks=:all,
    xrotation=90,
    yticks=0:100:3500,
)

In [None]:
# Evaluate the table on its own so that it is shown as this cell's result.
alarms

In [None]:
using Dates

# Parse the `start` timestamps once and derive every calendar column from the parsed
# values, instead of re-parsing the whole column for each derived column.
start_times = DateTime.(alarms.start, "yyyy-mm-dd HH:MM:SS")
alarms[!, :year] = Dates.year.(start_times)
alarms[!, :month] = Dates.month.(start_times)
alarms[!, :day] = Dates.day.(start_times)
alarms[!, :week] = Dates.week.(start_times)
alarms[!, :dayname] = Dates.dayname.(start_times)

# Rows starting on 2023-01-01 are relabelled as week 0.
# NOTE(review): presumably because `Dates.week` uses ISO week numbering, which assigns
# that day to the last week of the previous year — confirm.
starts_on_new_year_2023 =
    (alarms.day .== 1) .& (alarms.month .== 1) .& (alarms.year .== 2023)
alarms[starts_on_new_year_2023, :week] .= 0
# Show the relabelled rows as the cell result.
alarms[starts_on_new_year_2023, :]

In [None]:
# All alarms whose `year` column equals 2023.
filter(:year => ==(2023), alarms)

In [None]:
# Number of alarms per week number for alarms whose `year` is 2022.
week_counts_2022 = countmap(alarms.week[alarms.year .== 2022])
# Keys and values of a dictionary iterate in matching order, so these stay aligned.
week_2022 = collect(keys(week_counts_2022))
counts_2022 = collect(values(week_counts_2022))

max_x, max_y = maximum(week_2022), maximum(counts_2022)

bar(
    week_2022,
    counts_2022;
    title="\nAlarms count by week in 2022",
    xlabel="Week",
    ylabel="Count",
    label="Count",
    xticks=0:1:max_x,
    yticks=0:100:max_y,
    xrotation=0,
    bar_width=0.3,
    size=(1400, 800),
)

In [None]:
# Number of alarms per week number for alarms whose `year` is 2023.
week_counts_2023 = countmap(alarms.week[alarms.year .== 2023])
# Keys and values of a dictionary iterate in matching order, so these stay aligned.
week_2023 = collect(keys(week_counts_2023))
counts_2023 = collect(values(week_counts_2023))

max_x, max_y = maximum(week_2023), maximum(counts_2023)

bar(
    week_2023,
    counts_2023;
    title="\nAlarms count by week in 2023",
    xlabel="Week",
    ylabel="Count",
    label="Count",
    xticks=0:1:max_x,
    yticks=0:25:max_y,
    xrotation=0,
    bar_width=0.3,
    size=(600, 600),
)

In [None]:
using Dates
using DataFrames

# Work on an independent copy so the duration conversion below never writes to `alarms`
# (a plain `df = alarms` would only alias the same table).
df = copy(alarms)

# Make sure the durations are `Minute` periods before they are summed.
df.duration = Dates.Minute.(df.duration)
# Keep only the rows whose `year` is 2022.
df_2022 = filter(:year => ==(2022), df)
# Total alarm duration per week; the summed column is read back as `duration_sum` below.
cumulative_duration = combine(groupby(df_2022, :week), :duration => sum)
# The bars show summed durations, so the value axis and legend are labelled as durations
# rather than counts.
bar(
    cumulative_duration[!, :week],
    cumulative_duration[!, :duration_sum];
    xlabel="Week",
    ylabel="Duration (minutes)",
    title="\nAlarms cumulative duration by week in 2022",
    label="Duration",
    bar_width=0.3,
    xticks=0:1:52,
    yticks=0:5000:100000,
    size=(1400, 800),
)

In [None]:
using Dates
using DataFrames

# Work on an independent copy so the duration conversion below never writes to `alarms`
# (a plain `df = alarms` would only alias the same table).
df = copy(alarms)

# Make sure the durations are `Minute` periods before they are summed.
df.duration = Dates.Minute.(df.duration)
# Keep only the rows whose `year` is 2023.
df_2023 = filter(:year => ==(2023), df)
# Total alarm duration per week; the summed column is read back as `duration_sum` below.
cumulative_duration = combine(groupby(df_2023, :week), :duration => sum)
# The bars show summed durations, so the value axis and legend are labelled as durations
# rather than counts.
bar(
    cumulative_duration[!, :week],
    cumulative_duration[!, :duration_sum];
    xlabel="Week",
    ylabel="Duration (minutes)",
    title="\nAlarms cumulative duration by week in 2023",
    label="Duration",
    bar_width=0.3,
    xticks=0:1:52,
    yticks=0:5000:100000,
    rotation=0,
    size=(800, 800),
)

In [None]:
# Sum the alarm durations within each region ...
alarms_by_region = groupby(alarms, :region_title)
grouped_alarms = combine(alarms_by_region, :duration => sum => :duration_sum)
# ... and list the regions with the largest total first.
duration_regions = sort(grouped_alarms, :duration_sum; rev=true)

In [None]:
# Bar chart of the total alarm duration per region, in the row order of
# `duration_regions`. The y ticks are left to the plotting defaults.
bar(
    duration_regions[!, :region_title],
    duration_regions[!, :duration_sum];
    title="\nAlarms duration by region",
    xlabel="Region name",
    ylabel="Duration (minutes)",
    label="Duration",
    xticks=:all,
    xrotation=90,
)

In [None]:
# Display order for weekday names.
dayofweek_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Number of alarms per weekday name as (name => count) pairs, arranged by the position
# of each name in `dayofweek_order`.
dayofweek_counts = sort(
    collect(countmap(alarms.dayname));
    by=entry -> findfirst(==(first(entry)), dayofweek_order),
)
# Split the pairs into two parallel vectors: weekday names and their counts.
dayofweek_title, counts = first.(dayofweek_counts), last.(dayofweek_counts)

In [None]:
# Bar chart of alarm counts per weekday name, in the order prepared in `dayofweek_title`.
bar(
    dayofweek_title,
    counts;
    title="\nAlarms count by day of week",
    xlabel="\nDay of week",
    ylabel="Count",
    label="Count",
    legend=false,
    xticks=:all,
    yticks=0:100:3500,
    xrotation=0,
    bar_width=0.3,
    size=(600, 600),
)

In [None]:
# Alarms for the city "Київ" with `year` 2023. Both conditions are applied to the same
# rows; filtering `alarms` a second time would silently drop the city restriction.
alarms_kyiv = filter(row -> row.region_city == "Київ" && row.year == 2023, alarms)

# Number of alarms per weekday name as (name => count) pairs, arranged by the position
# of each name in `dayofweek_order`.
city_title_counts = sort(
    collect(countmap(alarms_kyiv.dayname));
    by=entry -> findfirst(==(first(entry)), dayofweek_order),
)
# Split the pairs into two parallel vectors: weekday names and their counts.
city_title_title, counts = first.(city_title_counts), last.(city_title_counts)

In [None]:
# Bar chart of the weekday counts currently held in `city_title_title` / `counts`.
bar(
    city_title_title,
    counts;
    title="\nAlarms count by day of week in Kyiv 2023",
    xlabel="\nDay of week",
    ylabel="Count",
    label="Count",
    legend=false,
    xticks=:all,
    yticks=0:100:3500,
    xrotation=0,
    bar_width=0.3,
    size=(600, 600),
)

In [None]:
# Alarms for the city "Київ" with `year` 2022 and `month` 9 through 12. All three
# conditions are applied to the same rows; restarting each filter from `alarms` would
# keep only the last condition.
alarms_kyiv = filter(
    row -> row.region_city == "Київ" && row.year == 2022 && row.month in 9:12,
    alarms,
)

# Number of alarms per weekday name as (name => count) pairs, arranged by the position
# of each name in `dayofweek_order`.
city_title_counts = sort(
    collect(countmap(alarms_kyiv.dayname));
    by=entry -> findfirst(==(first(entry)), dayofweek_order),
)
# Split the pairs into two parallel vectors: weekday names and their counts.
city_title_title, counts = first.(city_title_counts), last.(city_title_counts)

In [None]:
# Bar chart of the weekday counts currently held in `city_title_title` / `counts`.
bar(
    city_title_title,
    counts;
    title="\nAlarms count by day of week\nin Kyiv 2022 (September - December)",
    xlabel="\nDay of week",
    ylabel="Count",
    label="Count",
    legend=false,
    xticks=:all,
    yticks=0:100:3500,
    xrotation=0,
    bar_width=0.3,
    size=(600, 600),
)

In [None]:
# Alarms for the city "Київ" with `year` 2022 and `month` 4 through 8. All three
# conditions are applied to the same rows; restarting each filter from `alarms` would
# keep only the last condition.
alarms_kyiv = filter(
    row -> row.region_city == "Київ" && row.year == 2022 && row.month in 4:8,
    alarms,
)

# Number of alarms per weekday name as (name => count) pairs, arranged by the position
# of each name in `dayofweek_order`.
city_title_counts = sort(
    collect(countmap(alarms_kyiv.dayname));
    by=entry -> findfirst(==(first(entry)), dayofweek_order),
)
# Split the pairs into two parallel vectors: weekday names and their counts.
city_title_title, counts = first.(city_title_counts), last.(city_title_counts)

In [None]:
# Bar chart of the weekday counts currently held in `city_title_title` / `counts`.
bar(
    city_title_title,
    counts;
    title="\nAlarms count by day of week\nin Kyiv 2022 (April - August)",
    xlabel="\nDay of week",
    ylabel="Count",
    label="Count",
    legend=false,
    xticks=:all,
    yticks=0:100:3500,
    xrotation=0,
    bar_width=0.3,
    size=(600, 600),
)

In [None]:
# Alarms for the city "Київ" with `year` 2022 and `month` 2 through 3. All three
# conditions are applied to the same rows; restarting each filter from `alarms` would
# keep only the last condition.
alarms_kyiv = filter(
    row -> row.region_city == "Київ" && row.year == 2022 && row.month in 2:3,
    alarms,
)

# Number of alarms per weekday name as (name => count) pairs, arranged by the position
# of each name in `dayofweek_order`.
city_title_counts = sort(
    collect(countmap(alarms_kyiv.dayname));
    by=entry -> findfirst(==(first(entry)), dayofweek_order),
)
# Split the pairs into two parallel vectors: weekday names and their counts.
city_title_title, counts = first.(city_title_counts), last.(city_title_counts)

In [None]:
# Bar chart of the weekday counts currently held in `city_title_title` / `counts`.
bar(
    city_title_title,
    counts;
    title="\nAlarms count by day of week\nin Kyiv 2022 (February - March)",
    xlabel="\nDay of week",
    ylabel="Count",
    label="Count",
    legend=false,
    xticks=:all,
    yticks=0:100:3500,
    xrotation=0,
    bar_width=0.3,
    size=(600, 600),
)