/
lsa.jl
419 lines (362 loc) · 14.9 KB
/
lsa.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
"""
LSAModel{S<:AbstractString, T<:AbstractFloat, A<:AbstractMatrix{T}, H<:Integer}
LSA (latent semantic analysis) model. It constructs from a document term matrix (dtm)
a model that can be used to embed documents in a latent semantic space pertaining to
the data. The model requires that the document term matrix be a
`DocumentTermMatrix{T<:AbstractFloat}` because the elements of the matrices resulted
from the SVD operation are floating point numbers and these have to match or be
convertible to type `T`.
# Fields
* `vocab::Vector{S}` a vector with all the words in the corpus
* `vocab_hash::OrderedDict{S,H}` a word to index in word embeddings matrix mapping
* `Σinv::A` diagonal of the inverse singular value matrix
* `Uᵀ::A` transpose of the word embedding matrix
* `stats::Symbol` the statistical measure to use for word importances in documents. Available values are: `:count` (term count), `:tf` (term frequency), `:tfidf` (default, term frequency-inverse document frequency) and `:bm25` (Okapi BM25)
* `idf::Vector{T}` inverse document frequencies for the words in the vocabulary
* `nwords::T` averge number of words in a document
* `ngram_complexity::Int` ngram complexity
* `κ::Int` the `κ` parameter of the BM25 statistic
* `β::Float64` the `β` parameter of the BM25 statistic
* `tol::T` minimum size of the vector components (default `T(1e-15)`)
# SVD matrices `U`, `Σinv` and `V`:
If `X` is a `m`×`n` document-term-matrix with `n` documents and `m` words so that
`X[i,j]` represents a statistical indicator of the importance of term `i` in document `j`
then:
* `U, Σ, V = svd(X)`
* `Σinv = diag(inv(Σ))`
* `Uᵀ = U'`
* `X ≈ U * Σ * V'`
The matrix `V` of document embeddings is not actually stored in the model.
# Examples
```
julia> using StringAnalysis
doc1 = StringDocument("This is a text about an apple. There are many texts about apples.")
doc2 = StringDocument("Pears and apples are good but not exotic. An apple a day keeps the doctor away.")
doc3 = StringDocument("Fruits are good for you.")
doc4 = StringDocument("This phrase has nothing to do with the others...")
doc5 = StringDocument("Simple text, little info inside")
crps = Corpus(AbstractDocument[doc1, doc2, doc3, doc4, doc5])
prepare!(crps, strip_punctuation)
update_lexicon!(crps)
dtm = DocumentTermMatrix{Float32}(crps, collect(keys(crps.lexicon)))
### Build LSA Model ###
lsa_model = LSAModel(dtm, k=3, stats=:tf)
query = StringDocument("Apples and an exotic fruit.")
idxs, corrs = cosine(lsa_model, crps, query)
println("Query: \"\$(query.text)\"")
for (idx, corr) in zip(idxs, corrs)
println("\$corr -> \"\$(crps[idx].text)\"")
end
Query: "Apples and an exotic fruit."
0.9746108 -> "Pears and apples are good but not exotic An apple a day keeps the doctor away "
0.870703 -> "This is a text about an apple There are many texts about apples "
0.7122063 -> "Fruits are good for you "
0.22725986 -> "This phrase has nothing to do with the others "
0.076901935 -> "Simple text little info inside "
```
# References:
* [The LSA wiki page](https://en.wikipedia.org/wiki/Latent_semantic_analysis)
* [Deerwester et al. 1990](http://lsa.colorado.edu/papers/JASIS.lsi.90.pdf)
"""
struct LSAModel{S<:AbstractString, T<:AbstractFloat, A<:AbstractMatrix{T}, H<:Integer}
vocab::Vector{S} # vocabulary
vocab_hash::OrderedDict{S,H} # term to column index in U
Σinv::Vector{T} # diagonal of inverse of Σ
Uᵀ::A # word vectors (transpose of U)
stats::Symbol # term/document importance
idf::Vector{T} # inverse document frequencies
nwords::T # average words/document in corpus
ngram_complexity::Int # ngram complexity
κ::Int # κ parameter for Okapi BM25 (used if stats==:bm25)
β::Float64 # β parameter for Okapi BM25 (used if stats==:bm25)
tol::T # Minimum size of vector elements
end
function LSAModel(dtm::DocumentTermMatrix{T}; kwargs...) where T<:Integer
throw(ErrorException(
"""A LSA model requires a that the document term matrix
be a DocumentTermMatrix{<:AbstractFloat}!"""))
end
function LSAModel(dtm::DocumentTermMatrix{T};
k::Int=size(dtm.dtm, 2),
stats::Symbol=:tfidf,
ngram_complexity::Int=DEFAULT_NGRAM_COMPLEXITY,
tol::T=T(1e-15),
κ::Int=BM25_KAPPA,
β::Float64=BM25_BETA
) where T<:AbstractFloat
p, n = size(dtm.dtm)
k = clamp(k, 1, min(k, n))
zeroval = zero(T)
minval = T(tol)
# Checks
length(dtm.terms) != p &&
throw(DimensionMismatch("Dimensions inside dtm are inconsistent."))
if !(stats in [:count, :tf, :tfidf, :bm25])
@warn "stats has to be either :tf, :tfidf or :bm25; defaulting to :tfidf"
stats = :tfidf
end
# Calculate inverse document frequency, mean document size
documents_containing_term = vec(sum(dtm.dtm .> 0, dims=2)) .+ one(T)
idf = log.(n ./ documents_containing_term) .+ one(T)
nwords = mean(sum(dtm.dtm, dims=1))
# Get X
if stats == :count
X = dtm.dtm
elseif stats == :tf
X = tf(dtm.dtm)
elseif stats == :tfidf
X = tf_idf(dtm.dtm)
elseif stats == :bm25
X = bm_25(dtm.dtm, κ=κ, β=β)
end
# Decompose document-word statistic
local U₀::Matrix{T}, Σ::Vector{T}
try
U₀, Σ, _ = tsvd(X, k)
catch
# Try regular svd as a
# last resort
U₀, Σ, _ = svd(Matrix(X))
U₀ = U₀[:, 1:k]
Σ = Σ[1:k]
end
# Build model components
Σinv = 1 ./ Σ
Σinv[abs.(Σinv) .< minval] .= zeroval
U₀[abs.(U₀).< minval] .= zeroval
U = Matrix{T}(U₀')
# Return the model
return LSAModel(dtm.terms, dtm.row_indices,
Σinv, U, stats, idf, nwords,
ngram_complexity, κ, β, minval)
end
function Base.show(io::IO, lm::LSAModel{S,T,A,H}) where {S,T,A,H}
len_vecs, num_terms = size(lm.Uᵀ)
print(io, "LSA Model ($(lm.stats)), $(num_terms) terms, " *
"dimensionality $(len_vecs), $(T) vectors")
end
"""
lsa(X [;k=<num documents>, stats=:tfidf, κ=2, β=0.75, tol=1e-15])
Constructs a LSA model. The input `X` can be a `Corpus` or a `DocumentTermMatrix`.
Use `?LSAModel` for more details. Vector components smaller than `tol` will be
zeroed out.
"""
function lsa(dtm::DocumentTermMatrix{T};
k::Int=size(dtm.dtm, 2),
stats::Symbol=:tfidf,
ngram_complexity::Int=DEFAULT_NGRAM_COMPLEXITY,
tol::T=T(1e-15),
κ::Int=BM25_KAPPA,
β::Float64=BM25_BETA) where T<:AbstractFloat
LSAModel(dtm, k=k, stats=stats, ngram_complexity=ngram_complexity, κ=κ, β=β, tol=tol)
end
function lsa(crps::Corpus,
::Type{T} = DEFAULT_FLOAT_TYPE;
k::Int=length(crps),
stats::Symbol=:tfidf,
ngram_complexity::Int=DEFAULT_NGRAM_COMPLEXITY,
tol::T=T(1e-15),
κ::Int=BM25_KAPPA,
β::Float64=BM25_BETA) where T<:AbstractFloat
lex = create_lexicon(crps, ngram_complexity)
lsa(DocumentTermMatrix{T}(crps, lex, ngram_complexity=ngram_complexity),
k=k, stats=stats, ngram_complexity=ngram_complexity,
κ=κ, β=β, tol=tol)
end
"""
vocabulary(lm)
Return the vocabulary as a vector of words of the LSA model `lm`.
"""
vocabulary(lm::LSAModel) = lm.vocab
"""
in_vocabulary(lm, word)
Return `true` if `word` is part of the vocabulary of the LSA model `lm` and
`false` otherwise.
"""
in_vocabulary(lm::LSAModel, word::AbstractString) = word in lm.vocab
"""
size(lm)
Return a tuple containin input and output dimensionalities of the LSA model `lm`.
"""
size(lm::LSAModel) = size(lm.Uᵀ,2), length(lm.Σinv)
"""
index(lm, word)
Return the index of `word` from the LSA model `lm`.
"""
index(lm::LSAModel, word) = lm.vocab_hash[word]
"""
get_vector(lm, word)
Returns the vector representation of `word` from the LSA model `lm`.
"""
function get_vector(lm::LSAModel{S,T,A,H}, word) where {S,T,A,H}
default = zeros(T, length(lm.Σinv))
idx = get(lm.vocab_hash, word, 0)
if idx == 0
return default
else
return lm.Uᵀ[:, idx]
end
end
"""
embed_document(lm, doc)
Return the vector representation of `doc`, obtained using the LSA model `lm`.
`doc` can be an `AbstractDocument`, `Corpus` or DTV or DTM.
"""
embed_document(lm::LSAModel{S,T,A,H}, doc::AbstractDocument) where {S,T,A,H} =
# Hijack vocabulary hash to use as lexicon (only the keys needed)
embed_document(lm, dtv(doc, lm.vocab_hash, T,
ngram_complexity=lm.ngram_complexity,
lex_is_row_indices=true))
embed_document(lm::LSAModel{S,T,A,H}, doc::AbstractString) where {S,T,A,H} =
embed_document(lm, NGramDocument{S}(doc, DocumentMetadata(), lm.ngram_complexity))
embed_document(lm::LSAModel{S,T,A,H}, doc::Vector{S2}) where {S,T,A,H,S2<:AbstractString} =
embed_document(lm, TokenDocument{S}(doc))
# Actual embedding function: takes as input the LSA model `lm` and a document
# term vector `dtv`. Returns the representation of `dtv` in the embedding space.
function embed_document(lm::LSAModel{S,T,A,H}, dtv::AbstractVector{T}) where {S,T,A,H}
words_in_document = sum(dtv)
# Calculate document vector
if lm.stats == :count
v = dtv
elseif lm.stats == :tf
v = sqrt.(dtv ./ max(words_in_document, one(T)))
elseif lm.stats == :tfidf
v = sqrt.(dtv ./ max(words_in_document, one(T))) .* lm.idf
elseif lm.stats == :bm25
k = T(lm.κ)
b = T(lm.β)
tf = sqrt.(dtv ./ max(words_in_document, one(T)))
v = lm.idf .* (k + 1) .* tf ./
(k * (one(T) - b + b * words_in_document/lm.nwords) .+ tf)
end
# Embed
d̂ = (lm.Σinv .* lm.Uᵀ) * v # embed
d̂ = d̂ ./ (norm(d̂,2) .+ eps(T)) # normalize
d̂[abs.(d̂) .< lm.tol] .= zero(T) # zero small elements
return d̂
end
function embed_document(lm::LSAModel{S,T,A,H}, dtm::DocumentTermMatrix{T}) where {S,T,A,H}
n = size(dtm.dtm,1)
k = size(lm.Uᵀ, 1)
if lm.stats == :count
X = dtm.dtm
elseif lm.stats == :tf
X = tf(dtm)
elseif lm.stats == :tfidf
X = tf_idf(dtm)
elseif lm.stats == :bm25
X = bm_25(dtm, κ=lm.κ, β=lm.β)
end
V = (lm.Σinv .* lm.Uᵀ) * X
V ./= (sqrt.(sum(V.^2, dims=1)) .+ eps(T))
V[abs.(V) .< lm.tol] .= zero(T)
return V
end
function embed_document(lm::LSAModel{S,T,A,H}, crps::Corpus) where {S,T,A,H}
lex = create_lexicon(crps, lm.ngram_complexity)
embed_document(lm, DocumentTermMatrix{T}(crps, lex, ngram_complexity=lm.ngram_complexity))
end
"""
embed_word(lm, word)
Return the vector representation of `word` using the LSA model `lm`.
"""
function embed_word(lm::LSAModel, word)
throw(ErrorException(
"""Word embedding is not supported as it would require storing
all documents in the model in order to determine the counts
of `word` across the corpus."""))
end
"""
cosine(model, docs, doc, n=10)
Return the positions of the `n` closest neighboring documents to `doc`
found in `docs`. `docs` can be a corpus or document term matrix.
The vector representations of `docs` and `doc` are obtained with the
`model` which can be either a `LSAModel` or `RPModel`.
"""
function cosine(model, docs, doc, n=10)
metrics = embed_document(model, docs)' * embed_document(model, doc)
n = min(n, length(metrics))
topn_positions = sortperm(metrics[:], rev = true)[1:n]
topn_metrics = metrics[topn_positions]
return topn_positions, topn_metrics
end
"""
similarity(model, doc1, doc2)
Return the cosine similarity value between two documents `doc1` and `doc2`
whose vector representations have been obtained using the `model`,
which can be either a `LSAModel` or `RPModel`.
"""
function similarity(model, doc1, doc2)
return embed_document(model, doc1)' * embed_document(model, doc2)
end
"""
save(lm, filename)
Saves an LSA model `lm` to disc in file `filename`.
"""
function save_lsa_model(lm::LSAModel{S,T,A,H}, filename::AbstractString) where {S,T,A,H}
k, nwords = size(lm.Uᵀ)
open(filename, "w") do fid
println(fid, "LSA Model saved at $(Dates.now())")
println(fid, "$nwords $k") # number of documents, words, k
println(fid, lm.stats)
writedlm(fid, lm.idf', " ")
println(fid, lm.nwords)
println(fid, lm.ngram_complexity)
println(fid, lm.κ)
println(fid, lm.β)
println(fid, lm.tol)
# Σinv diagonal
writedlm(fid, lm.Σinv', " ")
# Word embeddings
idxs = [lm.vocab_hash[word] for word in lm.vocab]
writedlm(fid, [lm.vocab lm.Uᵀ[:, idxs]'], " ")
end
end
"""
load_lsa_model(filename, eltype; [sparse=false])
Loads an LSA model from `filename` into an LSA model object. The embeddings matrix
element type is specified by `eltype` (default `DEFAULT_FLOAT_TYPE`) while the keyword argument
`sparse` specifies whether the matrix should be sparse or not.
"""
function load_lsa_model(filename::AbstractString, ::Type{T}=DEFAULT_FLOAT_TYPE;
sparse::Bool=false) where T<: AbstractFloat
# Matrix type for LSA model
A = ifelse(sparse, SparseMatrixCSC{T, Int}, Matrix{T})
# Define parsed variables local to outer scope of do statement
local vocab, vocab_hash, Σinv, Uᵀ, stats, idf, nwords, ngram_complexity, κ, β, tol
open(filename, "r") do fid
readline(fid) # first line, header
line = readline(fid)
vocab_size, k = map(x -> parse(Int, x), split(line, ' '))
# Preallocate
vocab = Vector{String}(undef, vocab_size)
vocab_hash = OrderedDict{String, Int}()
if sparse
Uᵀ = SparseMatrixCSC{T, Int}(UniformScaling(0), k, vocab_size)
Σinv = spzeros(T, k)
else
Uᵀ = Matrix{T}(undef, k, vocab_size)
Σinv = zeros(T, k)
end
# Start parsing the rest of the file
stats = Symbol(strip(readline(fid)))
idf = map(x->parse(T, x), split(readline(fid), ' '))
nwords = parse(T, readline(fid))
ngram_complexity = parse(Int, readline(fid))
κ = parse(Int, readline(fid))
β = parse(Float64, readline(fid))
tol = parse(Float64, readline(fid))
Σinv = map(x->parse(T, x), split(readline(fid), ' '))
for i in 1:vocab_size
line = strip(readline(fid))
parts = split(line, ' ')
word = parts[1]
vector = map(x-> parse(T, x), parts[2:end])
vocab[i] = word
push!(vocab_hash, word=>i)
Uᵀ[:, i] = vector
end
end
return LSAModel{String,T,A,Int}(vocab, vocab_hash, Σinv, Uᵀ, stats, idf,
nwords, ngram_complexity, κ, β, tol)
end