-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
18 changed files
with
1,549 additions
and
201 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
module SnowballPromptingToolsExt

using PromptingTools
const PT = PromptingTools

using PromptingTools.Experimental.RAGTools
const RT = PromptingTools.Experimental.RAGTools

using Snowball

# Route RAGTools' `_stem` hook to Snowball's `stem` for Snowball stemmers.
function RT._stem(stemmer::Snowball.Stemmer, text::AbstractString)
    return Snowball.stem(stemmer, text)
end

"""
    get_keywords(processor::KeywordsProcessor, docs::AbstractVector{<:AbstractString};
        verbose::Bool = true,
        stemmer = nothing,
        stopwords::Set{String} = Set(STOPWORDS),
        return_keywords::Bool = false,
        kwargs...)

Tokenize `docs` with the given `stemmer` and `stopwords`, then build a document-term
matrix from the resulting keywords via `document_term_matrix`.

When `return_keywords = true`, the preprocessed keywords are returned directly and no
matrix is built.

# Arguments
- `processor`: The `KeywordsProcessor` that selects this method.
- `docs`: A vector of strings to be processed.

# Keywords
- `verbose`: If `true`, log an info message once the matrix has been built.
  Default is `true`.
- `stemmer`: The stemmer passed to `preprocess_tokens`. Default is `nothing`, in which
  case `Snowball.Stemmer("english")` is created.
- `stopwords`: A set of stopwords passed to `preprocess_tokens`.
  Default is `Set(STOPWORDS)`.
- `return_keywords`: If `true`, return the keywords instead of the matrix.
  Default is `false`. Useful for query processing at search time.
- `kwargs`: Accepted and ignored.

# Throws
- An error (via `error`) if the `RAGToolsExperimentalExt` extension of PromptingTools
  is not loaded.
"""
function RT.get_keywords(
        processor::RT.KeywordsProcessor, docs::AbstractVector{<:AbstractString};
        verbose::Bool = true,
        stemmer = nothing,
        stopwords::Set{String} = Set(RT.STOPWORDS),
        return_keywords::Bool = false,
        kwargs...)
    # The RAGTools extension must be loaded before anything else is attempted.
    if Base.get_extension(PromptingTools, :RAGToolsExperimentalExt) === nothing
        error("You need to also import LinearAlgebra and SparseArrays to use this function")
    end

    # Fall back to an English Snowball stemmer when the caller did not supply one.
    active_stemmer = isnothing(stemmer) ? Snowball.Stemmer("english") : stemmer

    # Tokenization runs in a single call; the original author notes that the stemmer
    # is not thread-safe.
    tokens = RT.preprocess_tokens(
        docs, active_stemmer; stopwords = stopwords, min_length = 3)

    # Search-time callers only need the keywords, so skip building the matrix.
    if return_keywords
        return tokens
    end

    matrix = RT.document_term_matrix(tokens)

    if verbose
        @info "Done processing DocumentTermMatrix."
    end
    return matrix
end

end # end of module
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.