### Installing dependencies

In [None]:
# Tell PyCall which Python interpreter to use (PyCall consults ENV["PYTHON"] when it
# is built — see the PyCall documentation).
# NOTE(review): this is a machine-specific absolute path; adjust it for your environment.
ENV["PYTHON"] = "C:\\Users\\lap2r\\AppData\\Local\\Programs\\Python\\Python311\\python.exe"

### Environment configuration

In [None]:
using PyCall

# Put the parent directory at the front of Python's `sys.path` so the `paths_rel`
# module can be imported. `sys."path"` is PyCall's property-access form; indexing a
# PyObject with a string (`sys["path"]`) is deprecated.
pushfirst!(PyVector(pyimport("sys")."path"), joinpath(@__DIR__, ".."))
paths_rel = pyimport("paths_rel")

# Location of the TF-IDF result CSV, expressed relative to the parent directory.
tfidf_csv_path = "../" * paths_rel.REL_ISW_TF_IDF_RESULT

In [None]:
# Notebook-wide plot defaults.
# Attribute reference:
#   https://docs.juliaplots.org/latest/generated/attributes_axis/
#   https://docs.juliaplots.org/latest/generated/attributes_plot/
#   https://docs.juliaplots.org/latest/generated/attributes_subplot/

# `default` and `RGB` are used below, so Plots itself must be loaded in this cell;
# `using Plots.PlotMeasures` alone only brings the measurement units (`mm`) into scope.
using Plots
using Plots.PlotMeasures

# One accent colour shared by fills, lines and marker outlines. `let` keeps the helper
# binding out of the notebook's global namespace.
let accent = RGB(250 / 255, 135 / 255, 117 / 255)
    default(;
        legend=true,
        left_margin=5mm,
        right_margin=5mm,
        top_margin=5mm,
        bottom_margin=5mm,
        xrotation=90,
        draw_arrow=true,
        grid=false,
        minorgrid=false,
        dpi=600,
        size=(800, 800),
        color=accent,
        linecolor=accent,
        markerstrokecolor=accent,
    )
end

## TF-IDF dataset EDA

In [None]:
using CSV
using DataFrames
using JSON
using Dates
using Plots

# Load the TF-IDF results and print a quick structural overview.
tfidf = DataFrame(CSV.File(tfidf_csv_path))
for overview in (size(tfidf), names(tfidf), describe(tfidf))
    println(overview)
end

# Each `Keywords` cell is dictionary text written with single quotes; swapping them for
# double quotes lets JSON.parse read it as a word => score dictionary.
tfidf_keywords = [
    JSON.parse(replace(raw_keywords, "'" => "\""), dicttype=Dict{String,Float64})
    for raw_keywords in tfidf[!, :Keywords]
]

# Number of keywords per row, then order the rows from most to fewest keywords.
tfidf[!, :Count] = length.(tfidf_keywords)
sort!(tfidf, :Count; rev=true)

# X-axis window and one tick per month inside it.
start_date = Date("2022-02-01", "yyyy-mm-dd")
end_date = Date(Dates.now())
monthly_ticks = collect(start_date:Dates.Month(1):end_date)

p = scatter(
    tfidf[!, :Date],
    tfidf[!, :Count];
    title="\nUnique words per day (report)\nin calculated TFIDF\n",
    xlabel="\nDate\n",
    ylabel="Count",
    label="Count",
    legend=true,
    grid=false,
    xlims=Dates.value.([start_date, end_date]),
    size=(1200, 800),
);

xticks!(p, Dates.value.(monthly_ticks), Dates.format.(monthly_ticks, "yyyy-mm"))

p

In [None]:
using DataFrames
using Dates
using PyCall
using StatsBase
using OrderedCollections

# --- 1. Read every scraped report (*.txt) below the results directory ----------------

allFilenamesV = String[]
allWordsInArticle = Dict{Date,Int64}()

# The corpus is accumulated in a buffer: growing a String with `*` (or a Vector with
# `vcat`) inside the loop re-copies everything read so far on each iteration. Mutating
# containers also avoids reassigning globals from inside a top-level loop, which only
# works under the notebook's soft-scope rules.
corpus_buffer = IOBuffer()
for (root, dirs, files) in walkdir("../0_data_scrapping/results/isw/")
    for file in files
        endswith(file, ".txt") || continue

        file_path = joinpath(root, file)
        push!(allFilenamesV, file_path)

        file_content = read(file_path, String)
        # Same layout as before: every document is preceded by a single space.
        print(corpus_buffer, " ", file_content)

        # The date is what remains of the file name once the "assessment-" prefix and
        # the ".txt" extension are removed.
        date_text = replace(replace(file, "assessment-" => ""), ".txt" => "")
        report_date = Date(date_text, "yyyy-mm-dd")

        # Whitespace-separated token count of the raw (unprocessed) report.
        allWordsInArticle[report_date] = length(split(file_content))
    end
end
file_contents = String(take!(corpus_buffer))

# --- 2. Preprocess the corpus with the project's Python helper -----------------------

# `sys."path"` is PyCall's property-access form; `sys["path"]` is deprecated.
pushfirst!(PyVector(pyimport("sys")."path"), joinpath(@__DIR__, "..", "1_data_preparation"))
text_processing = pyimport("text_preprocessing")
# NOTE(review): the return type of `do_preprocessing` is not visible here; `countmap`
# below presumably expects a collection of tokens — confirm.
file_contents = text_processing.do_preprocessing(file_contents)

# --- 3. Count occurrences --------------------------------------------------------------

wordFrequency = countmap(file_contents)
# The per-file (Date, Count) DataFrame that used to be built inside the loop above was
# overwritten by this assignment before anything read it, so it is no longer built.
wordFrequencyToDateFrame = DataFrame(wordFrequency)
# NOTE(review): `rename!` is called without any name pairs — confirm this is intended.
rename!(wordFrequencyToDateFrame)
describe(wordFrequencyToDateFrame)

# --- 4. Table of the 100 most frequent entries ---------------------------------------

od = OrderedDict(wordFrequency)
od_vector = sort!(od; byvalue=true, rev=true)
od_vector = first(od_vector, 100)
# Each entry becomes a two-element [word, count-as-string] vector.
od_vector = map(x -> [x.first, string(x.second)], od_vector)

words_for_table = first.(od_vector)
counts_for_table = last.(od_vector)

# NOTE(review): this adds a method to a Base function for a type owned by DataFrames
# (type piracy); `show(...; allrows=true)` below already prints every row — confirm
# whether this definition is still needed.
Base.displaysize(x::DataFrame) = (100, 100)
dadasdasd = DataFrame(Word=words_for_table, Count=counts_for_table)
dadasdasd.Count = parse.(Int64, dadasdasd.Count)
show(dadasdasd, allrows=true)

In [None]:
# X-axis window (2022-02-20 up to today) and one tick per month inside it.
start_date = Date("2022-02-20", "yyyy-mm-dd")
end_date = Date(Dates.now())
monthly_ticks = collect(start_date:Dates.Month(1):end_date)

# Report dates and their word counts, taken from the same dictionary so the two
# vectors line up element by element.
article_dates = collect(keys(allWordsInArticle))
article_word_counts = collect(values(allWordsInArticle))

b = bar(
    article_dates,
    article_word_counts;
    title="\nArticle words to date\n",
    xlabel="\nDate\n",
    ylabel="Count",
    label="Count",
    legend=true,
    grid=false,
    xlims=Dates.value.([start_date, end_date]),
    xticks=:all,
    size=(1200, 800),
)
# Replace the ticks with monthly "yyyy-mm" labels; as the cell's last expression this
# also displays the figure.
xticks!(b, Dates.value.(monthly_ticks), Dates.format.(monthly_ticks, "yyyy-mm"))


In [None]:
using Printf

# Bar chart of the word/count table built earlier (`dadasdasd`).
bar(
    dadasdasd.Word,
    dadasdasd.Count;
    # Text
    title="Word frequency",
    label="Number of occurances",
    xlabel="Word",
    ylabel="Count",
    # X axis: a tick for every word, labels rotated to vertical
    xticks=:all,
    xrotation=90,
    # Y axis: a tick every 1000 up to 50000, printed as plain integers
    yticks=0:1000:50000,
    yrotation=0,
    yformatter=(tick -> @sprintf("%d", tick)),
    # Geometry
    bar_width=0.3,
    aspect_ratio=:none,
    size=(1500, 1000),
)

In [None]:
# All distinct words that appear as a key in any of the per-document TF-IDF dictionaries.
tfidf_keywords_keys = collect(
    Set{String}(word for doc_keywords in tfidf_keywords for word in keys(doc_keywords))
)

# Sum every word's TF-IDF score across ALL documents.
# The previous loop ran `i` over `1:length(tfidf_keywords_keys)` (the number of unique
# words) while indexing `tfidf_keywords[i]` (the documents): that raises a BoundsError
# when there are more unique words than documents and silently skips documents when
# there are fewer. Iterating the documents directly removes the mismatch.
word_to_cum_tfidf_val = Dict{String,Float64}()
for doc_keywords in tfidf_keywords
    for (word, value) in doc_keywords
        word_to_cum_tfidf_val[word] = get(word_to_cum_tfidf_val, word, 0.0) + value
    end
end

println(word_to_cum_tfidf_val)


# NOTE(review): `wordcloud` and `generate!` are presumably provided by WordCloud.jl, but
# no `using WordCloud` is visible in this notebook — confirm it is loaded elsewhere.
wc = wordcloud(
    word_to_cum_tfidf_val,
    fonts="Tahoma",
    colors=:seaborn_dark,
    density=0.5,
) |> generate!
# paint(wc, "collection_wordcloud.png")
