Skip to content

Commit

Permalink
Fixes #190
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Dec 31, 2020
1 parent 3d215ef commit f472ff6
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 45 deletions.
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
:url "http://www.eclipse.org/legal/epl-v10.html"}
:dependencies [[org.clojure/clojure "1.10.2-alpha1"]
[camel-snake-kebab "0.4.2"]
[cnuernber/dtype-next "6.00-beta-14"]
[cnuernber/dtype-next "6.00-beta-16"]
[techascent/tech.io "4.02"
:exclusions [org.apache.commons/commons-compress]]
[com.univocity/univocity-parsers "2.9.0"]
Expand Down
15 changes: 11 additions & 4 deletions src/tech/v3/dataset/column.clj
Original file line number Diff line number Diff line change
Expand Up @@ -122,18 +122,25 @@ Implementations should check their metadata before doing calculations."
must return one of parsed-value, :tech.ml.dataset.parse/missing in which case a
missing value will be added or :tech.ml.dataset.parse/parse-failure in which case
a missing index will be added and the string value will be recorded in the metadata's
:unparsed-data, :unparsed-indexes entries."
([datatype col]
:unparsed-data, :unparsed-indexes entries.
Options:
Same options roughly as ->dataset, specifically of interest may be `:text-temp-dir`.
"
([datatype col options]
(let [colname (column-name col)
col-reader (dtype/emap #(when % (str %)) :string col)
col-parser (column-parsers/make-fixed-parser colname datatype)
col-parser (column-parsers/make-fixed-parser colname datatype options)
n-elems (dtype/ecount col-reader)]
(dotimes [iter n-elems]
(column-parsers/add-value! col-parser iter (col-reader iter)))

(let [{:keys [data missing metadata]}
(column-parsers/finalize! col-parser n-elems)]
(new-column colname data metadata missing)))))
(new-column colname data metadata missing))))
([datatype col]
(parse-column datatype col nil)))


(defn new-column
Expand Down
44 changes: 33 additions & 11 deletions src/tech/v3/dataset/impl/column_base.clj
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
(ns tech.v3.dataset.impl.column-base
(:require [tech.v3.datatype.datetime :as dtype-dt]
(:require [tech.v3.dataset.string-table :as str-table]
[tech.v3.dataset.file-backed-text :as file-backed-text]
[tech.v3.datatype.datetime :as dtype-dt]
[tech.v3.datatype.packing :as packing]
[tech.v3.datatype.casting :as casting]
[tech.v3.dataset.string-table :as str-table]
[tech.v3.dataset.file-backed-text :as file-backed-text]
[tech.v3.datatype :as dtype])
[tech.v3.datatype :as dtype]
[clojure.tools.logging :as log])
(:import [java.util Map List]
[tech.v3.datatype PrimitiveList]
[tech.v3.dataset Text]))
Expand Down Expand Up @@ -46,14 +47,35 @@
(casting/cast 0 dtype)))))


;; Latch checked by make-container: it is set to true the first time creating
;; file-backed text throws, and the fallback warning is only logged while it
;; is still false.
(defonce ^:private warn-atom* (atom false))

;; Switch read by make-container; when it holds false, :text containers are
;; created with dtype/make-list instead of file-backed text.
(defonce file-backed-text-enabled* (atom true))

(defn set-file-backed-text-enabled
  "Store `enabled` in `file-backed-text-enabled*` and return the stored value."
  [enabled]
  ;; reset! returns the value it just stored, which is `enabled` itself.
  (reset! file-backed-text-enabled* enabled))

(defn make-container
(^PrimitiveList [dtype n-elems]
(^PrimitiveList [dtype options]
(case dtype
:string (str-table/make-string-table n-elems "")
:text (let [^PrimitiveList list-data (file-backed-text/file-backed-text)]
(dotimes [iter n-elems]
(.add list-data nil))
:string (str-table/make-string-table 0 "")
:text
(let [^PrimitiveList list-data
(try
(if (and (not= false (:text-temp-dir options))
@file-backed-text-enabled*)
(let [tmp-dir (:text-temp-dir options)]
(file-backed-text/file-backed-text (merge
{:suffix ".txt"}
(when tmp-dir
{:temp-dir tmp-dir}))))
(dtype/make-list :text))
(catch Throwable e
(when-not @warn-atom*
(reset! warn-atom* true)
(log/warn e "File backed text failed. Falling back to in-memory"))
(dtype/make-list :text)))]
list-data)
(dtype/make-container :list dtype n-elems)))
(dtype/make-list dtype)))
(^PrimitiveList [dtype]
(make-container dtype 0)))
(make-container dtype nil)))
21 changes: 15 additions & 6 deletions src/tech/v3/dataset/io.clj
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,19 @@
- `:max-num-columns` - Defaults to 8192. CSV,TSV files with more columns than this
will fail to parse. For more information on this option, please visit:
https://github.com/uniVocity/univocity-parsers/issues/301
- `:n-initial-skip-rows` - Skip N rows initially. This currently may include the header
row. Works across both csv and spreadsheet datasets.
- `:text-temp-dir` - The temporary directory to use for file-backed text. Setting
this value to boolean 'false' turns off file backed text. If a tech.v3.resource
stack context is opened the file will be deleted when the context closes else it
will be deleted when the gc cleans up the dataset. A shutdown hook is added as
a last resort to ensure the file is cleaned up. Each column's data file will
be created in `(System/getProperty \"java.io.tmpdir\")` by default.
- `:n-initial-skip-rows` - Skip N rows initially. This currently may include the
header row. Works across both csv and spreadsheet datasets.
- `:parser-fn` -
- `keyword?` - all columns parsed to this datatype. For example: `{:parser-fn :string}`
- `map?` - `{column-name parse-method}` parse each column with specified `parse-method`.
- `keyword?` - all columns parsed to this datatype. For example:
`{:parser-fn :string}`
- `map?` - `{column-name parse-method}` parse each column with specified
`parse-method`.
The `parse-method` can be:
- `keyword?` - parse the specified column to this datatype. For example:
`{:parser-fn {:answer :boolean :id :int32}}`
Expand All @@ -159,10 +167,11 @@
added to missing, and the column's :unparsed-values and
:unparsed-indexes will be updated.
- `string?` - for datetime types, this will turned into a DateTimeFormatter via
DateTimeFormatter/ofPattern. For encoded-text, this has to be a valid
argument to Charset/forName.
DateTimeFormatter/ofPattern. For `:text` you can specify the backing file
to use.
- `DateTimeFormatter` - use with the appropriate temporal parse static function
to parse the value.
- `map?` - the header-name-or-idx is used to lookup value. If not nil, then
value can be any of the above options. Else the default column parser
is used.
Expand Down
48 changes: 29 additions & 19 deletions src/tech/v3/dataset/io/column_parsers.clj
Original file line number Diff line number Diff line change
Expand Up @@ -264,20 +264,21 @@
(DateTimeFormatter/ofPattern parser-fn))
(and (dtype-dt/datetime-datatype? parser-datatype)
(instance? DateTimeFormatter parser-fn))
(datetime-formatter-parser-fn parser-datatype
parser-fn)
(datetime-formatter-parser-fn parser-datatype parser-fn)
(= :text parser-datatype)
[(find-fixed-parser parser-datatype)]
:else
(errors/throwf "Unrecoginzed parser fn type: %s" (type parser-fn)))]))
[parser-kwd [(find-fixed-parser parser-kwd) false]]))


(defn make-fixed-parser
[cname parser-kwd]
[cname parser-kwd options]
(let [[dtype [parse-fn relaxed?]] (parser-entry->parser-tuple parser-kwd)
[failed-values failed-indexes] (when relaxed?
[(dtype/make-container :list :object 0)
(bitmap/->bitmap)])
container (column-base/make-container dtype)
container (column-base/make-container dtype options)
missing-value (column-base/datatype->missing-value dtype)
missing (bitmap/->bitmap)]
(FixedTypeParser. container dtype missing-value parse-fn
Expand All @@ -297,10 +298,9 @@


(defn- promote-container
^PrimitiveList [old-container ^RoaringBitmap missing new-dtype]
^PrimitiveList [old-container ^RoaringBitmap missing new-dtype options]
(let [n-elems (dtype/ecount old-container)
container (column-base/make-container
new-dtype 0)
container (column-base/make-container new-dtype options)
missing-value (column-base/datatype->missing-value new-dtype)
;;Ensure we unpack a container if we have to promote it.
old-container (packing/unpack old-container)]
Expand All @@ -323,7 +323,8 @@
;;List of datatype,parser-fn tuples
^List promotion-list
column-name
^:unsynchronized-mutable ^long max-idx]
^:unsynchronized-mutable ^long max-idx
options]
dtype-proto/PECount
(ecount [this] (inc max-idx))
PParser
Expand Down Expand Up @@ -384,7 +385,8 @@
(new-parser-fn value)
value)
new-container (promote-container container missing
parser-datatype)
parser-datatype
options)
new-missing-value (column-base/datatype->missing-value
parser-datatype)]
(set! container new-container)
Expand All @@ -407,19 +409,24 @@


(defn promotional-string-parser
([column-name parser-datatype-sequence]
([column-name parser-datatype-sequence options]
(let [first-dtype (first parser-datatype-sequence)]
(PromotionalStringParser. (column-base/make-container (if (= :bool first-dtype) :boolean first-dtype))
(PromotionalStringParser. (column-base/make-container
(if (= :bool first-dtype)
:boolean
first-dtype)
options)
first-dtype
false
(default-coercers first-dtype)
(bitmap/->bitmap)
(mapv (juxt identity default-coercers)
parser-datatype-sequence)
column-name
-1)))
([column-name]
(promotional-string-parser column-name default-parser-datatype-sequence)))
-1
options)))
([column-name options]
(promotional-string-parser column-name default-parser-datatype-sequence options)))


(deftype PromotionalObjectParser [^{:unsynchronized-mutable true
Expand All @@ -428,7 +435,8 @@
^{:unsynchronized-mutable true} missing-value
^RoaringBitmap missing
column-name
^:unsynchronized-mutable ^long max-idx]
^:unsynchronized-mutable ^long max-idx
options]
dtype-proto/PECount
(ecount [this] (inc max-idx))
PParser
Expand All @@ -443,7 +451,7 @@
(= container-dtype packed-dtype))
(do
(when (== 0 container-ecount)
(set! container (column-base/make-container packed-dtype))
(set! container (column-base/make-container packed-dtype options))
(set! container-dtype packed-dtype)
(set! missing-value (column-base/datatype->missing-value packed-dtype)))
(when-not (== container-ecount idx)
Expand All @@ -454,7 +462,8 @@
org-datatype)]
(when-not (= widest-datatype container-dtype)
(let [new-container (promote-container container
missing widest-datatype)]
missing widest-datatype
options)]
(set! container new-container)
(set! container-dtype widest-datatype)
(set! missing-value (column-base/datatype->missing-value
Expand All @@ -468,10 +477,11 @@


(defn promotional-object-parser
[column-name]
[column-name options]
(PromotionalObjectParser. (dtype/make-container :list :boolean 0)
:boolean
false
(bitmap/->bitmap)
column-name
-1))
-1
options))
8 changes: 4 additions & 4 deletions src/tech/v3/dataset/io/context.clj
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
(key-fn cname-or-index))]
(cond
(nil? parser-descriptor)
(default-parse-fn cname)
(default-parse-fn cname options)
(map? parser-descriptor)
(if-let [col-parser-desc (or (get parser-descriptor cname)
(get parser-descriptor cname-or-index))]
(column-parsers/make-fixed-parser cname col-parser-desc)
(default-parse-fn cname))
(column-parsers/make-fixed-parser cname col-parser-desc options)
(default-parse-fn cname options))
:else
(column-parsers/make-fixed-parser cname parser-descriptor))))))
(column-parsers/make-fixed-parser cname parser-descriptor options))))))


(defn- make-colname
Expand Down

0 comments on commit f472ff6

Please sign in to comment.