Skip to content

Commit

Permalink
Issues 2020 04 21 (#35)
Browse files Browse the repository at this point in the history
* Fixes #33

* Fixes #34

* Fixing fallout from issues
  • Loading branch information
cnuernber committed Apr 21, 2020
1 parent b535beb commit 43a52c3
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 27 deletions.
2 changes: 1 addition & 1 deletion deps.edn
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
:deps {org.clojure/clojure {:mvn/version "1.10.1"}
camel-snake-kebab {:mvn/version "0.4.0"}
techascent/tech.datatype {:mvn/version "5.0-beta-10"}
techascent/tech.io {:mvn/version "3.16"}
techascent/tech.io {:mvn/version "3.17"}
com.univocity/univocity-parsers {:mvn/version "2.7.5"}
org.apache.poi/poi-ooxml {:mvn/version "4.1.2"}
org.dhatim/fastexcel-reader {:mvn/version "0.10.12"}
Expand Down
5 changes: 4 additions & 1 deletion src/tech/ml/dataset.clj
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
[tech.ml.dataset.column :as ds-col]
[tech.ml.dataset.categorical :as categorical]
[tech.ml.dataset.pipeline.column-filters :as col-filters]
[tech.ml.dataset.parse.name-values-seq :as parse-nvs]
[tech.ml.dataset.impl.dataset :as ds-impl]
[tech.ml.dataset.base]
[tech.ml.dataset.modelling]
Expand Down Expand Up @@ -96,7 +97,9 @@


(par-util/export-symbols tech.ml.dataset.impl.dataset
new-dataset
new-dataset)

(par-util/export-symbols tech.ml.dataset.parse.name-values-seq
name-values-seq->dataset)


Expand Down
21 changes: 0 additions & 21 deletions src/tech/ml/dataset/impl/dataset.clj
Original file line number Diff line number Diff line change
Expand Up @@ -196,27 +196,6 @@
(str item-val)))


(defn name-values-seq->dataset
"Given a sequence of [name data-seq], produce a dataset of columns. If data-seq is
of unknown (:object) datatype, the first item is checked. If it is a number,
then doubles are used. If it is a string, then strings are used for the
column datatype.
All sequences must be the same length; an ex-info is thrown otherwise.
Options:
:dataset-name - name given to the new dataset, defaults to _unnamed.
Returns a new dataset"
[name-values-seq & {:keys [dataset-name]
:or {dataset-name "_unnamed"}}]
;;Collect the distinct element counts across all of the value sequences.
(let [sizes (->> (map (comp dtype/ecount second) name-values-seq)
distinct)]
;;Anything other than exactly one distinct size is rejected.
;;NOTE(review): sizes is a lazy seq, so %s renders the seq object's identity
;;rather than the sizes themselves.
(when-not (= 1 (count sizes))
(throw (ex-info (format "Different sized columns detected: %s" sizes) {})))
(->> name-values-seq
(map (fn [[colname values-seq]]
;;A map in the value position goes through ensure-column as-is,
;;everything else becomes a new column named colname.
(if (map? values-seq)
(ds-col/ensure-column values-seq)
(ds-col/new-column colname values-seq))))
(new-dataset dataset-name))))


(defn parse-dataset
([input options]
(->> (ds-parse/csv->columns input options)
Expand Down
54 changes: 54 additions & 0 deletions src/tech/ml/dataset/parse/name_values_seq.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
(ns tech.ml.dataset.parse.name-values-seq
(:require [tech.v2.datatype.protocols :as dtype-proto]
[tech.v2.datatype :as dtype]
[tech.v2.datatype.casting :as casting]
[tech.ml.dataset.parse.mapseq :as parse-mapseq]
[tech.ml.dataset.impl.dataset :as ds-impl]
[tech.ml.protocols.dataset :as ds-proto]
[tech.ml.dataset.column :as ds-col]))


(defn name-values-seq->dataset
  "Given a sequence of [name data-seq] pairs, produce a dataset with one column
  per pair, selected back into the order in which the pairs were given.

  Each pair is handled in one of three ways:
  * data-seq is a map - passed to `ds-col/ensure-column` unchanged, which allows
    explicit missing/etc to be passed in.
  * data-seq has a numeric container datatype - made into a column directly with
    `ds-col/new-column`; the data is not scanned.
  * anything else - the remaining columns are transposed into a sequence of
    row maps and handed to `parse-mapseq/mapseq->dataset`, so the resulting
    datatype is whatever that parser decides after looking at the values.

  All sequences must be the same length; an ex-info is thrown otherwise.

  Options:
  :dataset-name - name given to the new dataset, defaults to _unnamed.

  Returns a new dataset"
  [name-values-seq & {:keys [dataset-name]
                      :or {dataset-name "_unnamed"}}]
  (let [sizes (->> (map (comp dtype/ecount second) name-values-seq)
                   distinct)
        ;;sizes is lazy; pour it into a vector so %s prints the actual sizes
        ;;instead of the lazy seq's object identity.
        _ (when-not (= 1 (count sizes))
            (throw (ex-info (format "Different sized columns detected: %s"
                                    (vec sizes))
                            {})))
        name-order (map first name-values-seq)
        ;;Allow explicit missing/etc to be passed in.
        map-data (filter (comp map? second) name-values-seq)
        ;;fastpaths for primitive arrays - no need to scan the data.
        known-container-data (filter (comp casting/numeric-type?
                                           dtype/get-datatype
                                           second)
                                     name-values-seq)
        half-dataset (->> (concat map-data known-container-data)
                          (map (fn [[colname values-seq]]
                                 (if (map? values-seq)
                                   (ds-col/ensure-column values-seq)
                                   (ds-col/new-column colname values-seq))))
                          (ds-impl/new-dataset dataset-name))
        colname-set (set (ds-proto/column-names half-dataset))
        ;;contains? instead of calling the set: invoking a set returns the
        ;;matched element, so a column named false or nil would look absent
        ;;and end up being processed twice.
        leftover (remove #(contains? colname-set (first %)) name-values-seq)
        ;;Object columns mean we have to scan everything manually.  The
        ;;transposition only makes sense when something is left over, so it
        ;;lives entirely inside the guard.
        leftover-ds (when (seq leftover)
                      (let [colnames (map first leftover)
                            n-data (count leftover)]
                        (->> (map second leftover)
                             (apply interleave)
                             (partition n-data)
                             (map #(zipmap colnames %))
                             (parse-mapseq/mapseq->dataset))))]
    (-> (ds-impl/new-dataset dataset-name
                             {}
                             (concat (ds-proto/columns half-dataset)
                                     (when leftover-ds
                                       (ds-proto/columns leftover-ds))))
        (ds-proto/select name-order :all))))
19 changes: 18 additions & 1 deletion test/tech/ml/dataset/object_columns_test.clj
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
(ns tech.ml.dataset.object-columns-test
(:require [clojure.test :refer :all]
[tech.ml.dataset :as ds]
[tech.v2.datatype :as dtype]))
[tech.v2.datatype :as dtype]
[tech.v2.datatype.datetime :as dtype-dt]
[tech.v2.tensor :as dtt]))



Expand All @@ -13,3 +15,18 @@
(dtype/get-datatype (src-ds :b))))
(is (= (vec (repeat 10 {:a 1 :b 2}))
(vec (dtype/->reader (src-ds :b)))))))



(deftest involved-object-columns
  ;; Builds a dataset from five differently-typed value sequences and checks
  ;; the set of column datatypes that comes back.
  (let [column-data {:dates (list "2000-01-01" "2000-02-01" "2000-03-01"
                                  "2000-04-01" "2000-05-01")
                     :integers (range 5)
                     :durations (repeat 5 (dtype-dt/duration))
                     :doubles (map double (range 5))
                     :tensors (repeat 5 (dtt/->tensor (partition 2 (range 4))))}
        dataset (ds/name-values-seq->dataset column-data)
        found-datatypes (set (map dtype/get-datatype dataset))]
    (is (= #{:float64 :packed-local-date :int64 :object
             :packed-duration}
           found-datatypes))))
6 changes: 3 additions & 3 deletions test/tech/ml/dataset_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,11 @@
(let [ds (ds/name-values-seq->dataset {:a [1 nil 2 nil 3]
:b (list 1 nil 2 nil 3)})
rec (ds-pipe/replace-missing ds :all 5)]
(is (= #{:float64}
(is (= #{:int64}
(set (map dtype/get-datatype ds))))
(is (= [1.0 5.0 2.0 5.0 3.0]
(is (= [1 5 2 5 3]
(vec (rec :a))))
(is (= [1.0 5.0 2.0 5.0 3.0]
(is (= [1 5 2 5 3]
(vec (rec :b))))))


Expand Down

0 comments on commit 43a52c3

Please sign in to comment.