Skip to content

Commit

Permalink
Faster CSV parser (about 2x) in the case of a file with a lot of doubles.
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Apr 1, 2021
1 parent b203e6d commit b060a58
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 34 deletions.
2 changes: 1 addition & 1 deletion project.clj
Expand Up @@ -5,7 +5,7 @@
:url "http://www.eclipse.org/legal/epl-v10.html"}
:dependencies [[org.clojure/clojure "1.10.2"]
[camel-snake-kebab "0.4.2"]
[cnuernber/dtype-next "7.01"]
[cnuernber/dtype-next "7.02"]
[techascent/tech.io "4.03"
:exclusions [org.apache.commons/commons-compress]]
[com.univocity/univocity-parsers "2.9.0"]
Expand Down
5 changes: 3 additions & 2 deletions src/tech/v3/dataset/impl/column_base.clj
Expand Up @@ -39,8 +39,9 @@

(defn datatype->missing-value
[dtype]
(let [dtype (if (packing/packed-datatype? dtype)
dtype
(let [packed? (packing/packed-datatype? dtype)
dtype (if packed?
(packing/unpack-datatype dtype)
(casting/un-alias-datatype dtype))]
(get dtype->missing-val-map dtype
(when (casting/numeric-type? dtype)
Expand Down
48 changes: 17 additions & 31 deletions src/tech/v3/dataset/io/column_parsers.clj
Expand Up @@ -38,6 +38,7 @@
(def parse-failure :tech.ml.dataset.parse/parse-failure)
(def missing :tech.ml.dataset.parse/missing)


(defn make-safe-parse-fn
[parser-fn]
(fn [str-val]
Expand All @@ -46,13 +47,6 @@
(catch Throwable e
parse-failure))))

;; Builds a parse function for the packed datatype `datatype`.
;; The inner anonymous function returns its argument unchanged when the
;; argument's elemwise datatype already equals `datatype`; otherwise it passes
;; the argument to `str-parse-fn`.
;; That function is handed to packing/wrap-with-packing together with
;; `datatype` (presumably so results come back in packed form -- TODO confirm
;; against the packing namespace), and the result is wrapped by
;; make-safe-parse-fn, whose visible catch clause returns parse-failure when a
;; Throwable is caught.
(defn- packed-parser
[datatype str-parse-fn]
(make-safe-parse-fn (packing/wrap-with-packing
datatype
#(if (= datatype (dtype/elemwise-datatype %))
%
(str-parse-fn %)))))

(def default-coercers
(merge
Expand Down Expand Up @@ -142,24 +136,13 @@
(defn add-missing-values!
[^PrimitiveList container ^RoaringBitmap missing
missing-value ^long idx]
(let [n-elems (.lsize container)
simple-dtype (casting/simple-operation-space
(dtype/elemwise-datatype missing-value))]
(loop [n-elems n-elems]
(when (< n-elems idx)
(case simple-dtype
:bool
(.addBoolean container false)
:boolean
(.addBoolean container false)
:int64
(.addLong container (long missing-value))
:float64
(.addDouble container (double missing-value))
:object
(.addObject container missing-value))
(.add missing n-elems)
(recur (unchecked-inc n-elems))))))
(let [n-elems (.lsize container)]
(when (< n-elems idx)
(loop [n-elems n-elems]
(when (< n-elems idx)
(.addObject container missing-value)
(.add missing n-elems)
(recur (unchecked-inc n-elems)))))))


(defn finalize-parser-data!
Expand All @@ -178,10 +161,13 @@

(defn- missing-value?
[value]
(or (nil? value)
(.equals "" value)
(= value :tech.ml.dataset.parse/missing)
(and (string? value) (.equalsIgnoreCase ^String value "na"))))
;;fastpath for numbers
(if (instance? Number value)
false
(or (nil? value)
(.equals "" value)
(identical? value :tech.ml.dataset.parse/missing)
(and (string? value) (.equalsIgnoreCase ^String value "na")))))


(defn- not-missing?
Expand Down Expand Up @@ -335,14 +321,14 @@
(let [idx (long idx)]
(let [value-dtype (dtype/elemwise-datatype value)]
(cond
(= value-dtype container-dtype)
(identical? value-dtype container-dtype)
(do
(add-missing-values! container missing missing-value idx)
(.add container value))
parse-fn
(let [parsed-value (parse-fn value)]
(cond
(= parsed-value parse-failure)
(identical? :tech.ml.dataset.parse/parse-failure parsed-value)
(let [start-idx (argops/index-of (mapv first promotion-list) container-dtype)
n-elems (.size promotion-list)
next-idx (if (== start-idx -1)
Expand Down

0 comments on commit b060a58

Please sign in to comment.