From b060a58ec1b29f21baa1d0e263f8c734e0877672 Mon Sep 17 00:00:00 2001 From: Chris Nuernberger Date: Thu, 1 Apr 2021 09:17:24 -0600 Subject: [PATCH] Faster CSV parser (about 2x) in the case of a file with a lot of doubles. --- project.clj | 2 +- src/tech/v3/dataset/impl/column_base.clj | 5 ++- src/tech/v3/dataset/io/column_parsers.clj | 48 ++++++++--------------- 3 files changed, 21 insertions(+), 34 deletions(-) diff --git a/project.clj b/project.clj index 56153e4e..2f1f3791 100644 --- a/project.clj +++ b/project.clj @@ -5,7 +5,7 @@ :url "http://www.eclipse.org/legal/epl-v10.html"} :dependencies [[org.clojure/clojure "1.10.2"] [camel-snake-kebab "0.4.2"] - [cnuernber/dtype-next "7.01"] + [cnuernber/dtype-next "7.02"] [techascent/tech.io "4.03" :exclusions [org.apache.commons/commons-compress]] [com.univocity/univocity-parsers "2.9.0"] diff --git a/src/tech/v3/dataset/impl/column_base.clj b/src/tech/v3/dataset/impl/column_base.clj index ceb6d58d..7368bf2b 100644 --- a/src/tech/v3/dataset/impl/column_base.clj +++ b/src/tech/v3/dataset/impl/column_base.clj @@ -39,8 +39,9 @@ (defn datatype->missing-value [dtype] - (let [dtype (if (packing/packed-datatype? dtype) - dtype + (let [packed? (packing/packed-datatype? dtype) + dtype (if packed? + (packing/unpack-datatype dtype) (casting/un-alias-datatype dtype))] (get dtype->missing-val-map dtype (when (casting/numeric-type? dtype) diff --git a/src/tech/v3/dataset/io/column_parsers.clj b/src/tech/v3/dataset/io/column_parsers.clj index 82a2fbff..10202793 100644 --- a/src/tech/v3/dataset/io/column_parsers.clj +++ b/src/tech/v3/dataset/io/column_parsers.clj @@ -38,6 +38,7 @@ (def parse-failure :tech.ml.dataset.parse/parse-failure) (def missing :tech.ml.dataset.parse/missing) + (defn make-safe-parse-fn [parser-fn] (fn [str-val] @@ -46,13 +47,6 @@ (catch Throwable e parse-failure)))) -(defn- packed-parser - [datatype str-parse-fn] - (make-safe-parse-fn (packing/wrap-with-packing - datatype - #(if (= datatype (dtype/elemwise-datatype %)) - % - (str-parse-fn %))))) (def default-coercers (merge @@ -142,24 +136,13 @@ (defn add-missing-values! [^PrimitiveList container ^RoaringBitmap missing missing-value ^long idx] - (let [n-elems (.lsize container) - simple-dtype (casting/simple-operation-space - (dtype/elemwise-datatype missing-value))] - (loop [n-elems n-elems] - (when (< n-elems idx) - (case simple-dtype - :bool - (.addBoolean container false) - :boolean - (.addBoolean container false) - :int64 - (.addLong container (long missing-value)) - :float64 - (.addDouble container (double missing-value)) - :object - (.addObject container missing-value)) - (.add missing n-elems) - (recur (unchecked-inc n-elems)))))) + (let [n-elems (.lsize container)] + (when (< n-elems idx) + (loop [n-elems n-elems] + (when (< n-elems idx) + (.addObject container missing-value) + (.add missing n-elems) + (recur (unchecked-inc n-elems))))))) (defn finalize-parser-data! @@ -178,10 +161,13 @@ (defn- missing-value? [value] - (or (nil? value) - (.equals "" value) - (= value :tech.ml.dataset.parse/missing) - (and (string? value) (.equalsIgnoreCase ^String value "na")))) + ;;fastpath for numbers + (if (instance? Number value) + false + (or (nil? value) + (.equals "" value) + (identical? value :tech.ml.dataset.parse/missing) + (and (string? value) (.equalsIgnoreCase ^String value "na"))))) (defn- not-missing? @@ -335,14 +321,14 @@ (let [idx (long idx)] (let [value-dtype (dtype/elemwise-datatype value)] (cond - (= value-dtype container-dtype) + (identical? value-dtype container-dtype) (do (add-missing-values! container missing missing-value idx) (.add container value)) parse-fn (let [parsed-value (parse-fn value)] (cond - (= parsed-value parse-failure) + (identical? :tech.ml.dataset.parse/parse-failure parsed-value) (let [start-idx (argops/index-of (mapv first promotion-list) container-dtype) n-elems (.size promotion-list) next-idx (if (== start-idx -1)