Skip to content

Commit

Permalink
Adding example at bottom of file.
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Jul 4, 2020
1 parent d74420c commit f57fe86
Showing 1 changed file with 24 additions and 0 deletions.
24 changes: 24 additions & 0 deletions src/tech/ml/dataset/text/bag_of_words.clj
Expand Up @@ -84,3 +84,27 @@
(ds-base/->>dataset)))
([path bag-of-words-colname token->idx-map]
(path-token-map->bag-of-words path bag-of-words-colname token->idx-map {})))


(comment
  ;; REPL walkthrough: building a bag-of-words dataset from the "abstract"
  ;; column of the sample medical-text CSV.  Everything lives inside a
  ;; `comment` form, so nothing here is evaluated when the namespace loads;
  ;; evaluate the forms one at a time, top to bottom.

  ;; Step 1: scan the CSV once.  The result is looked up with :token-table
  ;; below, so it is expected to be a map carrying at least that key.
  (def dataset-and-tokens
    (path->dataset-master-token-table "test/data/medical-text.csv" "abstract"))

  ;; Step 2: order the token-table entries by their second element, largest
  ;; first (presumably the occurrence count -- confirm against
  ;; path->dataset-master-token-table), keep the top 1000, and retain only
  ;; the first element of each entry (the token) as a vector.
  (def most-common-tokens
    (into []
          (comp (take 1000) (map first))
          (sort-by second > (:token-table dataset-and-tokens))))

  ;; Step 3: give every retained token its position in that ranking, yielding
  ;; a token -> index lookup map.
  (def token->idx-map
    (zipmap most-common-tokens (range)))

  ;; Step 4: read the same file and column a second time, now encoding each
  ;; row against the token -> index map built above.
  (def bow-ds
    (path-token-map->bag-of-words "test/data/medical-text.csv"
                                  "abstract"
                                  token->idx-map))

  )

0 comments on commit f57fe86

Please sign in to comment.