/
text.jl
91 lines (77 loc) · 2.83 KB
/
text.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Converts a String to Languages.Language (using STR_TO_LANG)
convert(::Type{L}, lang::S) where {L<:Languages.Language, S<:AbstractString} = begin
TypeLang = get(STR_TO_LANG, strip(lowercase(lang)), Languages.English)
return TypeLang()
end
# Converts Languages.Language to String (using LANG_TO_STR)
convert(::Type{S}, lang::Type{L}) where {L<:Languages.Language, S<:AbstractString} =
get(LANG_TO_STR, lang, string(L))
convert(::Type{S}, lang::L) where {L<:Languages.Language, S<:AbstractString} =
convert(S, L)
# Overload ismatch to work matching any value within a vector
occursin(r::Regex, strings::T) where T<:AbstractArray{<:AbstractString} =
any(occursin(r, si) for si in sv);
# Overload lowercase function to work with vectors of strings
lowercase(v::T) where T<:AbstractArray{S} where S<:AbstractString =
Base.lowercase.(v)
"""
detect_language(text [; default=DEFAULT_LANGUAGE])
Detects the language of a piece of `text`. Returns a language of
type `Languages.Language`. If the text is empty of the confidence
is low, return the `default` language.
"""
function detect_language(text::AbstractString; default=DEFAULT_LANGUAGE)
isempty(text) && return default
detector = LanguageDetector()
l, _, c = detector(text) # returns (language, script, confidence)
if c < 0.15
return default()
else
return l
end
end
"""
summarize(sentences [;ns=1, flags=DEFAULT_SUMMARIZATION_STRIP_FLAGS])
Build a summary of the text's `sentences`. The resulting summary will be
a `ns` sentence document; each sentence is pre-procesed using the
`flags` option.
"""
function summarize(sentences::Vector{S};
ns::Int=1,
flags::UInt32=DEFAULT_SUMMARIZATION_STRIP_FLAGS
) where S<:AbstractString
# Get document term matrix
s = prepare.(sentences, flags)
filter!(d->occursin(r"[a-zA-Z0-9]", d), s)
t = dtm(DocumentTermMatrix{Float32}(s))
tf_idf!(t)
# Page rank
α = 0.85 # damping factor
n = 100 # number of iterations
ϵ = 1.0e-6 # convergence threhshold
G = Graph(t' * t)
try
p = pagerank(G, α, n, ϵ)
# Sort sentences and return
text_summary = sentences[sort(sortperm(p, rev=true)[1:min(ns, length(p))])]
return text_summary
catch
@warn "Summarization failed during TextRank. No summarization done."
return sentences
end
end
"""
Post-processes a string to fit a certain length, adding … if necessary
at the end of its choped represenation.
"""
function chop_to_length(input, len)
input = replace(input, "\n" => "")
idxs = collect(eachindex(input))
_idx = findlast(Base.:<=(len), idxs)
if _idx == nothing
_len=0
else
_len = idxs[findlast(Base.:<=(len), idxs)]
end
length(input) > len ? input[1:_len] * "…" : input
end