;; Extracted from the `src/tech/v3/dataset/missing.clj` hunk of patch 7ae0c783
;; ("Fixes #273").  The same patch also registers this function in the
;; replace-missing strategy dispatch as
;;   :abb (replace-missing-with-abb col missing)
;; and documents the `:abb` strategy in the `replace-missing` docstring.
(defn- replace-missing-with-abb
  "Impute the missing entries of `col` with an approximate Bayesian bootstrap
  (ABB).

  Arguments:
  - `col`     - column to impute.
  - `missing` - RoaringBitmap holding the missing row indexes of `col`.

  Steps:
  1. Draw, with replacement, as many values from the non-missing rows of `col`
     as there are non-missing rows.
  2. Draw, with replacement, one value from that first sample for every
     missing row.
  3. Pass the second sample to `replace-missing-with-value` as the
     replacement values.

  When `col` has no non-missing rows there is nothing to sample from, so `col`
  is returned unchanged instead of indexing into an empty sample.

  Sampling goes through `rand-int`, so repeated calls generally produce
  different imputations."
  [col ^RoaringBitmap missing]
  (let [;; Row indexes that hold a value: the range [0, ecount) minus `missing`.
        ^RoaringBitmap non-missing (doto (RoaringBitmap.)
                                     (.add 0 (dtype/ecount col))
                                     (.andNot missing))
        non-missing-cnt (.getCardinality non-missing)]
    (if (zero? non-missing-cnt)
      ;; Every row is missing: both bootstrap rounds would have an empty
      ;; population, so leave the column as it is.
      col
      (let [;; Round 1: `.select` maps a random rank in [0, non-missing-cnt)
            ;; to the row index of the non-missing value with that rank.
            samples1 (col/select col
                                 (repeatedly non-missing-cnt
                                             #(.select non-missing
                                                       (rand-int non-missing-cnt))))
            ;; Round 2: one draw from the round-1 sample per missing row.
            samples2 (col/select samples1
                                 (repeatedly (.getCardinality missing)
                                             #(rand-int (dtype/ecount samples1))))]
        (replace-missing-with-value col missing samples2)))))
;; Extracted from the `test/tech/v3/dataset_test.clj` hunk of patch 7ae0c783
;; ("Fixes #273").
;; ABB reference cited by the `:abb` entry of the `replace-missing` docstring:
;; https://search.r-project.org/CRAN/refmans/LaplacesDemon/html/ABB.html
(deftest replace-missing-abb
  (let [raw-a [nil nil nil 1.0 2 nil nil nil
               nil nil 4 nil 11 nil nil]
        raw-b [2 2 2 nil nil nil nil nil
               nil 13 nil 3 4 5 5]
        dtds (ds/->dataset {:a raw-a
                            :b raw-b})
        fds (ds/replace-missing dtds :abb)
        ;; Compare as doubles so the check does not depend on the numeric
        ;; datatype the dataset parser picked for each column.
        observed (fn [raw] (set (map double (remove nil? raw))))]
    ;; Every missing slot must have been filled.
    (is (= 0 (dtype/ecount (ds/missing fds))))
    ;; Imputation must not add or drop rows.
    (is (= (ds/row-count dtds) (ds/row-count fds)))
    ;; ABB only resamples values already present in the column, so every
    ;; value in the result must be one of the originally observed values.
    (doseq [[cname raw] [[:a raw-a] [:b raw-b]]]
      (is (every? (observed raw) (map double (ds/column fds cname)))))))