### Installing dependencies


In [None]:
# Point the PYTHON environment variable at a specific interpreter.
# NOTE(review): absolute, machine-specific Windows path — presumably consumed when PyCall
# is built; adjust it for the machine this runs on.
ENV["PYTHON"] = "C:\\Users\\lap2r\\AppData\\Local\\Programs\\Python\\Python311\\python.exe"

# using Pkg
# Pkg.add("PyCall")
# Pkg.build("PyCall")


# Pkg.add("OrderedCollections")
# Pkg.add("PrettyTables")
# Pkg.add("DataFrames")
# Pkg.add("DotEnv")
# Pkg.add("CSV")
# Pkg.add("Plots")
# Pkg.add("Dates")
# Pkg.add("Gadfly")
# Pkg.add("StatsPlots")
# Pkg.add("StatsBase")
# Pkg.add("JSON")
# Pkg.add("Makie")


# using Conda
# Conda.add("nltk")
# Conda.add("num2words")

### Environment configuration


In [None]:
using PyCall

# Put the parent directory at the front of Python's `sys.path` so the `paths_rel`
# module can be imported. `."path"` fetches the attribute as a raw PyObject (the
# `o["path"]` string-indexing form is deprecated in PyCall), so the insertion is
# applied to Python's own list.
pushfirst!(PyVector(pyimport("sys")."path"), joinpath(@__DIR__, ".."))
paths_rel = pyimport("paths_rel")

# Input file locations: each `paths_rel` constant prefixed with "../".
alarms_data_file_path = "../" * paths_rel.REL_ALARMS_DATA_FILE
regions_data_file_path = "../" * paths_rel.REL_REGIONS_DATA_FILE
weather_data_file_path = "../" * paths_rel.REL_WEATHER_DATA_FILE
tfidf_csv_path = "../" * paths_rel.REL_ISW_TF_IDF_RESULT

## Alarms dataset analysis


##### Viewing dataset parameters


In [None]:
using DataFrames
using CSV
using StatsPlots

# Read the alarms CSV and materialise it as a DataFrame.
alarms = CSV.File(alarms_data_file_path) |> DataFrame

In [None]:
# (rows, columns) of the alarms table.
(nrow(alarms), ncol(alarms))

In [None]:
# Element type of every column, in column order.
map(eltype, eachcol(alarms))

In [None]:
# Per-column summary of the alarms table.
describe(alarms)

##### Analysing dataset contents


In [None]:
# Smallest value in the :start column.
println("First event date: $(minimum(alarms[!, :start]))")

# Largest value in the :end column.
println("Latest event date: $(maximum(alarms[!, :end]))")

In [None]:
using Dates
using Statistics

# Raw alarm length: parsed :end minus parsed :start, one period per row.
alarms[!, :duration] =
    DateTime.(alarms[!, :end], "yyyy-mm-dd HH:MM:SS") .-
    DateTime.(alarms[!, :start], "yyyy-mm-dd HH:MM:SS")

# Re-express the length in minutes: the period's integer value is divided by
# 1000 * 60 and rounded to the nearest integer before being wrapped in `Minute`.
alarms[!, :duration] =
    Minute.(round.(Int, Dates.value.(alarms[!, :duration]) ./ (1000 * 60)))

println("minimum duration: $(minimum(alarms.duration))")
println("maximum duration: $(maximum(alarms.duration))")

# Ten shortest alarms (sorts `alarms` in place, ascending).
sort!(alarms, :duration)
println("Sorted by minimum duration:")
println(first(alarms, 10))

# Ten longest alarms (re-sorts `alarms` in place, descending).
println("Sorted by maximum duration:")
sort!(alarms, :duration; rev=true)
println(first(alarms, 10))

In [None]:
using StatsBase;

# Number of alarms per distinct duration, keyed by the duration as a plain integer
# (Dict{Minute,Int64} -> Dict{Int64,Int64}). `Dates.value` is the public accessor for
# a period's integer count; it replaces direct access to the `value` field.
duration_counts_v1 = Dict{Int64,Int64}(
    Dates.value(k) => v for (k, v) in countmap(alarms.duration)
)

In [None]:
# PLOTS_DEFAULTS = Dict(:dpi => 600)

# https://docs.juliaplots.org/latest/generated/attributes_axis/
# https://docs.juliaplots.org/latest/generated/attributes_plot/
# https://docs.juliaplots.org/latest/generated/attributes_subplot/

using Plots.PlotMeasures

# Plots defaults applied through `default`; individual plot calls may override them.
default(;
    # canvas
    size=(800, 800),
    dpi=600,
    background_color=:white,
    # margins (the `mm` unit comes from Plots.PlotMeasures)
    left_margin=5mm,
    right_margin=5mm,
    top_margin=5mm,
    bottom_margin=5mm,
    # series appearance
    color=:orangered,
    linecolor=:orangered,
    legend=true,
    # axes
    xrotation=90,
    draw_arrow=true,
    grid=false,
    minorgrid=false,
)

In [None]:
using Plots;

# Largest key and largest value of the duration -> count map; both size the axes below.
max_x, max_y = maximum(keys(duration_counts_v1)), maximum(values(duration_counts_v1))

# Bar chart of alarm count per duration, with a tick every 60 on x and every 15 on y.
bar(
    duration_counts_v1;
    title="\nAlarm duration to count",
    label="Count",
    xlabel="Duration (minutes)",
    ylabel="Count",
    xlims=(-20, max_x + 15),
    ylims=(-10, max_y + 15),
    xticks=0:60:(max_x + 15),
    yticks=0:15:(max_y + 15),
    grid=true,
    minorgrid=true,
)

In [None]:
# (duration => count) pairs, most frequent duration first.
duration_counts = sort(collect(countmap(alarms.duration)); by=last, rev=true)

# Keep the 50 most frequent durations. `first(v, n)` returns at most `n` items, so
# having fewer than 50 distinct durations does not raise a BoundsError.
duration_counts = first(duration_counts, 50)

# Split the pairs into two parallel vectors and assemble the result table.
duration = first.(duration_counts)
counts = last.(duration_counts)
duration_counts = DataFrame(duration=duration, counts=counts)

In [None]:
# create a vector of count to region_title
region_counts = countmap(alarms.region_title)
# convert dictionary to double array
region_counts = [k => v for (k, v) in region_counts]
# sort the array by count
region_counts = sort(region_counts, by=x -> x[2], rev=true)
# take each pair and seprate them into 2 arrays first value of pair to first array and second value of pair to second array
region_title, counts = [x[1] for x in region_counts], [x[2] for x in region_counts]
# create dataframe from items
region_counts = DataFrame(region_title=region_title, counts=counts)

In [None]:
# Bar chart of alarm count per region, with every region labelled on the x axis.
bar(
    region_counts[!, :region_title],
    region_counts[!, :counts];
    title="Alarms Count by region",
    label="Count",
    xlabel="Region name",
    ylabel="Count",
    xticks=:all,
    xrotation=90,
    yticks=0:100:3500,
)

In [None]:
# Display the current contents of `alarms`.
alarms

In [None]:
using Dates

# Parse :start once, with a precompiled format, and derive every calendar column from
# that single parsed vector instead of re-parsing the strings for each column.
let fmt = dateformat"yyyy-mm-dd HH:MM:SS"
    started = map(s -> DateTime(s, fmt), alarms.start)
    alarms[!, :year] = Dates.year.(started)
    alarms[!, :month] = Dates.month.(started)
    alarms[!, :day] = Dates.day.(started)
    alarms[!, :week] = Dates.week.(started)
end

# `Dates.week` follows ISO week numbering, under which 1 January 2023 belongs to the
# last week of 2022; relabel those rows as week 0 of 2023.
is_jan1_2023 = alarms.day .== 1 .&& alarms.month .== 1 .&& alarms.year .== 2023
alarms[is_jan1_2023, :week] .= 0
alarms[is_jan1_2023, :]

In [None]:
# Display only the rows whose :year is 2023.
alarms[alarms.year .== 2023, :]

In [None]:
# Alarm count per week number for rows whose :year is 2022.
week_counts_2022 = countmap(alarms.week[alarms.year .== 2022])
week_2022 = collect(keys(week_counts_2022))
counts_2022 = collect(values(week_counts_2022))

# Upper ends of the tick ranges.
max_x, max_y = maximum(week_2022), maximum(counts_2022)

bar(
    week_2022,
    counts_2022;
    title="Alarms count by week in 2022",
    label="Count",
    xlabel="Week",
    ylabel="Count",
    xticks=0:1:max_x,
    yticks=0:100:max_y,
    xrotation=0,
    bar_width=0.3,
    size=(1400, 800),
)

In [None]:
# Alarm count per week number for rows whose :year is 2023.
week_counts_2023 = countmap(alarms.week[alarms.year .== 2023])
week_2023 = collect(keys(week_counts_2023))
counts_2023 = collect(values(week_counts_2023))

# Upper ends of the tick ranges.
max_x, max_y = maximum(week_2023), maximum(counts_2023)

bar(
    week_2023,
    counts_2023;
    title="Alarms count by week in 2023",
    label="Count",
    xlabel="Week",
    ylabel="Count",
    xticks=0:1:max_x,
    yticks=0:25:max_y,
    xrotation=0,
    bar_width=0.3,
    size=(600, 600),
)

In [None]:
using Dates
using DataFrames

# Work on an actual copy: a plain `df = alarms` only aliases the table, so the
# column assignment below would write straight back into `alarms`.
df = copy(alarms)

# Make sure :duration is expressed as `Minute` before summing.
df.duration = Dates.Minute.(df.duration)
# Keep only the rows whose :year is 2022.
df_2022 = filter(:year => ==(2022), df)
# Sum the durations per week; `combine` names the result column :duration_sum.
cumulative_duration = combine(groupby(df_2022, :week), :duration => sum)

# The plotted quantity is a summed duration, so the axis and legend say so
# (they were previously labelled "Count").
bar(
    cumulative_duration[!, :week],
    cumulative_duration[!, :duration_sum];
    xlabel="Week",
    ylabel="Duration (minutes)",
    title="Alarms cumulative duration by week in 2022",
    label="Duration",
    bar_width=0.3,
    xticks=0:1:52,
    yticks=0:5000:100000,
    size=(1400, 800),
)

In [None]:
using Dates
using DataFrames

# Work on an actual copy: a plain `df = alarms` only aliases the table, so the
# column assignment below would write straight back into `alarms`.
df = copy(alarms)

# Make sure :duration is expressed as `Minute` before summing.
df.duration = Dates.Minute.(df.duration)
# Keep only the rows whose :year is 2023.
df_2023 = filter(:year => ==(2023), df)
# Sum the durations per week; `combine` names the result column :duration_sum.
cumulative_duration = combine(groupby(df_2023, :week), :duration => sum)

# The plotted quantity is a summed duration, so the axis and legend say so
# (they were previously labelled "Count").
bar(
    cumulative_duration[!, :week],
    cumulative_duration[!, :duration_sum];
    xlabel="Week",
    ylabel="Duration (minutes)",
    title="Alarms cumulative duration by week in 2023",
    label="Duration",
    bar_width=0.3,
    xticks=0:1:52,
    yticks=0:5000:100000,
    size=(800, 800),
)

In [None]:
# Total alarm duration per region.
grouped_alarms = combine(
    groupby(alarms, :region_title),
    :duration => sum => :duration_sum,
)
# Same table, region with the largest total first.
duration_regions = sort(grouped_alarms, order(:duration_sum; rev=true))

In [None]:
# Bar chart of total alarm duration per region, with every region labelled on the x axis.
bar(
    duration_regions[!, :region_title],
    duration_regions[!, :duration_sum];
    title="Alarms duration by region",
    label="Duration",
    xlabel="Region name",
    ylabel="Duration (minutes)",
    xticks=:all,
    xrotation=90,
    # yticks=0:100:3500,
)

**TF-IDF**


In [None]:
tfidf = DataFrame(CSV.File(tfidf_csv_path))

println(size(tfidf))
println(names(tfidf))
println(describe(tfidf))

using JSON

# Each :Keywords cell is text with single-quoted strings; swap the single quotes for
# double quotes so the cell parses as JSON into a word => score dictionary.
# NOTE(review): a keyword containing an apostrophe would break this replacement —
# confirm the upstream data never contains one.
tfidf_keywords = [
    JSON.parse(replace(raw, "'" => "\""); dicttype=Dict{String,Float64}) for
    raw in tfidf[!, :Keywords]
]

# :Count = number of entries in each row's keyword dictionary.
tfidf[!, :Count] = length.(tfidf_keywords)

# Rows with the most keywords first.
sort!(tfidf, :Count; rev=true)
# Show the :Name, :Date and :Count columns. A bare expression in the middle of a cell
# is discarded, so the table is displayed explicitly.
display(tfidf[!, [:Name, :Date, :Count]])

# x-axis window: from 2022-02-01 up to today.
start_date = Date("2022-02-01", "yyyy-mm-dd")
end_date = Dates.today()

p = plot(
    tfidf[!, :Date],
    tfidf[!, :Count];
    seriestype=:scatter,
    xlabel="Date",
    ylabel="Count",
    # limits are given as a (min, max) tuple of the dates' integer values
    xlims=(Dates.value(start_date), Dates.value(end_date)),
    title="Date vs Unique words count",
    legend=true,
    label="count",
    grid=false,
    size=(1200, 600),
);

p

### ISW Data Analysis


In [None]:
using Dates

"""
    read_isw_reports(dir)

Recursively read every `.txt` file under `dir`.

Return `(contents, word_counts)`:
- `contents`: the text of all files, each preceded by a single space;
- `word_counts`: a `DataFrame` with one row per file — `Date`, parsed from the file
  name once `assessment-` and `.txt` are removed, and `Count`, the number of
  whitespace-separated tokens in that file.
"""
function read_isw_reports(dir)
    # An IOBuffer avoids re-copying the whole accumulated text for every file, and
    # plain vectors avoid rebuilding a DataFrame per file.
    buffer = IOBuffer()
    dates = Date[]
    counts = Int[]
    for (root, _, files) in walkdir(dir)
        for file in files
            endswith(file, ".txt") || continue
            text = read(joinpath(root, file), String)
            print(buffer, " ", text)

            stem = replace(replace(file, "assessment-" => ""), ".txt" => "")
            push!(dates, Date(stem, dateformat"yyyy-mm-dd"))
            push!(counts, length(split(text)))
        end
    end
    return String(take!(buffer)), DataFrame(Date=dates, Count=counts)
end

# `report_word_counts` keeps the per-report word counts under their own name, so the
# word-frequency table built further down does not overwrite them.
file_contents, report_word_counts = read_isw_reports("../0_data_scrapping/results/isw/")

# Call do_preprocessing() from 2_data_preparation/text_preprocessing.py through PyCall.
# `."path"` fetches `sys.path` as a raw PyObject (the `o["path"]` form is deprecated).
using PyCall
pushfirst!(PyVector(pyimport("sys")."path"), joinpath(@__DIR__, "..", "2_data_preparation"))
text_processing = pyimport("text_preprocessing")
file_contents = text_processing.do_preprocessing(file_contents)

# NOTE(review): `countmap` counts the elements of whatever `do_preprocessing` returns —
# presumably a list of tokens; confirm it is not a single string, which would be
# counted character by character.
wordFrequency = countmap(file_contents)
wordFrequencyToDateFrame = DataFrame(wordFrequency)
# NOTE(review): `rename!` is called without any name pairs, so it presumably renames
# nothing; the `describe` result below is not displayed or stored either.
rename!(wordFrequencyToDateFrame)
describe(wordFrequencyToDateFrame)

using OrderedCollections

# Order the frequency map by count, highest first, and keep the first 100 entries.
od = sort!(OrderedDict(wordFrequency); byvalue=true, rev=true)
top_words = first(od, 100)

# [word, count-as-string] rows, plus the same data as two parallel columns.
od_vector = [[word, string(n)] for (word, n) in top_words]
words_for_table = first.(top_words)
counts_for_table = string.(last.(top_words))

# NOTE(review): this adds a `Base.displaysize` method for `DataFrame`, a function and a
# type this notebook does not own (type piracy); `show(...; allrows=true)` below
# already requests every row — consider removing it.
Base.displaysize(x::DataFrame) = (100, 100)
dadasdasd = DataFrame(Word=words_for_table, Count=counts_for_table)
show(dadasdasd, allrows=true)

In [None]:
# using Plots
# Plots.plot(words_for_table, counts_for_table)



# visualize this Vector{Vector{Any}} (od_vector)
# using PrettyTables
# pretty_table(od_vector)



# # visualize it
# using Plots

# sortedWordFrequency = sort(collect(wordFrequency), by=x->x[2], rev=true)
# arrr = map(x -> [x.first, x.second], sortedWordFrequency)

# display large table of first 50 words in wordFrequency



# convert it to dataframe
# wordFrequencyToDateFrame = DataFrame(wordFrequency)
# sort by :Count
# sort!(wordFrequencyToDateFrame, :Count, rev=true)


# using PrettyTables
# # Sort the dictionary by keys
# sorted_keys = sort(collect(keys(wordFrequency)))
# sorted_values = [wordFrequency[k] for k in sorted_keys]
# pretty_table([sorted_keys sorted_values], ["Key", "Value"])
# # Sort the dictionary by values
# sorted_pairs = sort(collect(wordFrequency), by = x -> x[2])
# pretty_table(sorted_pairs, ["Key", "Value"])

In [None]:
# create dataframe
# 3×7 DataFrame
#  Row │ variable  mean     min                                median   max                                nmissing  eltype   
#      │ Symbol    Nothing  Any                                Nothing  Any                                Int64     DataType 
# ─────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#    1 │ Name               assessment-2022-02-24                       assessment-2023-04-03                     0  String31
#    2 │ Date               2022-02-24                                  2023-04-03                                0  Date
#    3 │ Keywords           {'agent': 0.20139877526758002, '…           {'znpp': 0.26604239982226846, 'c…         0  String

# sort it


# sort!(wordFrequencyToDateFrame, :Count, rev=true)

# println(describe(wordFrequencyToDateFrame))

# using Plots
# start_date = Date.("2022-02-01", "yyyy-mm-dd")
# end_date = Date.(Dates.now())
# p = plot(wordFrequencyToDateFrame[!, :Date], wordFrequencyToDateFrame[!, :Count], seriestype=:scatter, xlabel="Date", ylabel="Count",
#     xlims=Dates.value.([start_date, end_date]), title="Date vs Total article words count",
#     legend=true, label="count", color="white", background_color="black",
#     grid=false);
# println(p)
# println(word_collection)