From f57fe86741fd0c0158a643c946404b78bd09c90a Mon Sep 17 00:00:00 2001
From: Chris Nuernberger
Date: Sat, 4 Jul 2020 08:12:00 -0600
Subject: [PATCH] Adding example at bottom of file.

---
Review notes (free-form text placed ahead of the diffstat; it is commentary
only and adds nothing to the change below):

* The hunk appends a 24-line `(comment ...)` form after the 3-argument arity
  of `path-token-map->bag-of-words`. The three context lines are untouched.
* The forms inside the `(comment ...)` block, in order:
  1. `dataset-and-tokens` - result of `path->dataset-master-token-table`
     called with "test/data/medical-text.csv" and the column name "abstract".
  2. `most-common-tokens` - takes `(:token-table dataset-and-tokens)`, sorts
     its entries on their second element in descending order (`>`), keeps
     the first 1000, and collects the first element of each into a vector.
  3. `token->idx-map` - pairs every token with its position in that vector
     and pours the pairs into a map, giving token -> index.
  4. `bow-ds` - result of the 3-argument `path-token-map->bag-of-words` called
     with the same path, the same column name, and `token->idx-map`.
* Presumably each `:token-table` entry is a [token count] pair; the patch only
  shows that the second element is compared with `>` - confirm against
  `path->dataset-master-token-table`.
* NOTE(review): the leading whitespace of the hunk lines was reconstructed
  from a flattened copy of this patch - confirm the context-line indentation
  against the target file before applying.

 src/tech/ml/dataset/text/bag_of_words.clj | 24 +++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/tech/ml/dataset/text/bag_of_words.clj b/src/tech/ml/dataset/text/bag_of_words.clj
index 86e8e4df..0cf44057 100644
--- a/src/tech/ml/dataset/text/bag_of_words.clj
+++ b/src/tech/ml/dataset/text/bag_of_words.clj
@@ -84,3 +84,27 @@
         (ds-base/->>dataset)))
   ([path bag-of-words-colname token->idx-map]
    (path-token-map->bag-of-words path bag-of-words-colname token->idx-map {})))
+
+
+(comment
+
+  (def dataset-and-tokens (path->dataset-master-token-table
+                           "test/data/medical-text.csv"
+                           "abstract"))
+
+  (def most-common-tokens
+    (->> (:token-table dataset-and-tokens)
+         (sort-by second >)
+         (take 1000)
+         (mapv first)))
+
+  (def token->idx-map (->> most-common-tokens
+                           (map-indexed (fn [idx tkn] [tkn idx]))
+                           (into {})))
+
+  (def bow-ds (path-token-map->bag-of-words
+               "test/data/medical-text.csv"
+               "abstract"
+               token->idx-map))
+
+  )