Skip to content

Commit

Permalink
Upgrades to various functions.
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Nov 15, 2019
1 parent bc5a63a commit fa92e66
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 42 deletions.
1 change: 0 additions & 1 deletion src/tech/ml/dataset.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
[tech.v2.datatype.functional.impl :as fn-impl]
[tech.v2.datatype.functional :as dfn]
[tech.ml.dataset.column :as ds-col]
[tech.parallel.require :as parallel-req]
[tech.ml.dataset.categorical :as categorical]
[tech.ml.dataset.pipeline.column-filters :as col-filters]
[tech.ml.dataset.base]
Expand Down
90 changes: 59 additions & 31 deletions src/tech/ml/dataset/base.clj
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
[tech.v2.datatype.readers.concat :as reader-concat]
[tech.ml.dataset.column :as ds-col]
[tech.ml.protocols.dataset :as ds-proto]
[tech.io :as io]
[tech.parallel.require :as parallel-req])
(:import [java.io InputStream]
[tech.v2.datatype ObjectReader]
Expand Down Expand Up @@ -316,35 +317,38 @@ the correct type."

;; Concatenate `dataset` and `other-datasets` into a single dataset, column by
;; column.
;; NOTE(review): this span looks like a diff view without +/- markers -- the
;; first `let` form appears to be the pre-change body and the second `let`
;; form (whose `datasets` binding ends in `seq`) the post-change body of the
;; same function. Confirm against the real file before relying on this.
;;
;; Both bodies:
;;  - take each column's metadata, add :column and :table-name, and group the
;;    resulting entries by :name across all datasets,
;;  - merge the :label-map metadata of all datasets,
;;  - throw ex-info "Dataset is missing a column" unless every column name
;;    occurs the same number of times,
;;  - concatenate the readers for each column name and build a new column from
;;    the first column of the group, reusing its datatype and metadata,
;;  - build the result via from-prototype using `dataset` and its name, then
;;    set {:label-map label-map} as metadata.
;; The second body additionally tolerates a nil `dataset` argument and returns
;; nil (via when-let) when no non-nil dataset remains.
(defn ds-concat
[dataset & other-datasets]
(let [datasets (concat [dataset] (remove nil? other-datasets))
column-list
(->> datasets
(mapcat (fn [dataset]
(->> (columns dataset)
(mapv (fn [col]
(assoc (ds-col/metadata col)
:column
col
:table-name (dataset-name dataset)))))))
(group-by :name))
label-map (->> datasets
(map (comp :label-map metadata))
(apply merge))]
(when-not (= 1 (count (->> (vals column-list)
(map count)
distinct)))
(throw (ex-info "Dataset is missing a column" {})))
(->> column-list
(map (fn [[_colname columns]]
(let [columns (map :column columns)
column-values (reader-concat/concat-readers columns)
first-col (first columns)]
(ds-col/new-column first-col
(dtype/get-datatype first-col)
column-values
(ds-col/metadata first-col)))))
(ds-proto/from-prototype dataset (dataset-name dataset))
(#(set-metadata % {:label-map label-map})))))
(let [datasets (->> (concat [dataset] (remove nil? other-datasets))
(remove nil?)
seq)]
(when-let [dataset (first datasets)]
(let [column-list
(->> datasets
(mapcat (fn [dataset]
(->> (columns dataset)
(mapv (fn [col]
(assoc (ds-col/metadata col)
:column
col
:table-name (dataset-name dataset)))))))
(group-by :name))
label-map (->> datasets
(map (comp :label-map metadata))
(apply merge))]
(when-not (= 1 (count (->> (vals column-list)
(map count)
distinct)))
(throw (ex-info "Dataset is missing a column" {})))
(->> column-list
(map (fn [[_colname columns]]
(let [columns (map :column columns)
column-values (reader-concat/concat-readers columns)
first-col (first columns)]
(ds-col/new-column first-col
(dtype/get-datatype first-col)
column-values
(ds-col/metadata first-col)))))
(ds-proto/from-prototype dataset (dataset-name dataset))
(#(set-metadata % {:label-map label-map})))))))


(defn unique-by
Expand Down Expand Up @@ -536,11 +540,35 @@ the correct type."
(cond
(satisfies? ds-proto/PColumnarDataset dataset)
dataset
(or (instance? InputStream dataset)
(string? dataset))
(instance? InputStream dataset)
(apply
(parallel-req/require-resolve 'tech.libs.tablesaw/path->tablesaw-dataset)
dataset (apply concat options))

(string? dataset)
(let [^String dataset dataset
gzipped? (.endsWith dataset ".gz")
json? (or (.endsWith dataset ".json")
(.endsWith dataset ".json.gz"))
tsv? (or (.endsWith dataset ".tsv")
(.endsWith dataset ".tsv.gz"))
options (if (and tsv? (not (contains? options :separator)))
(assoc options :separator \tab)
options)
options (if (and json? (not (contains? options :key-fn)))
(assoc options :key-fn keyword)
options)
open-fn (if json?
#(-> (apply io/get-json % (apply concat options))
(map-seq->dataset options))
#(apply
(parallel-req/require-resolve
'tech.libs.tablesaw/path->tablesaw-dataset)
% (apply concat options)))]
(with-open [istream (if gzipped?
(io/gzip-input-stream dataset)
(io/input-stream dataset))]
(open-fn istream)))
:else
(map-seq->dataset dataset options))]
(if table-name
Expand Down
19 changes: 16 additions & 3 deletions src/tech/ml/dataset/generic_columnar_dataset.clj
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,18 @@
(declare make-dataset)


(defn- new-column-datatype
  "Pick the datatype to use for a column built from `coldata`.

  When `coldata` reports any datatype other than :object, that datatype is
  returned as-is.  For :object data the first element alone decides:
  integers map to :int64, any other number to :float64, and everything
  else (including nil / empty data) to :string."
  [coldata]
  (let [reported-dtype (dtype/get-datatype coldata)]
    (if-not (= :object reported-dtype)
      reported-dtype
      ;; Only the first element is inspected; later elements are not checked.
      (let [sample (first coldata)]
        (cond
          (integer? sample) :int64
          (number? sample)  :float64
          :else             :string)))))


(deftype GenericColumnarDataset [table-name
column-names
colmap
Expand Down Expand Up @@ -62,22 +74,23 @@
{:col-name col-name
:col-names (keys colmap)})))
(let [col (get colmap col-name)
new-col-data (col-fn col)]
new-col-data (col-fn col)
new-col-dtype (new-column-datatype col)]
(GenericColumnarDataset.
table-name
column-names
(assoc colmap col-name
(if (ds-col/is-column? new-col-data)
(ds-col/set-name new-col-data col-name)
(ds-col/new-column col (dtype/get-datatype new-col-data)
(ds-col/new-column col new-col-dtype
new-col-data {:name (ds-col/column-name col)})))
metadata)))

(add-or-update-column [dataset col-name new-col-data]
(let [col-data (if (ds-col/is-column? new-col-data)
(ds-col/set-name new-col-data col-name)
(ds-col/new-column (first dataset)
(dtype/get-datatype new-col-data)
(new-column-datatype new-col-data)
new-col-data {:name col-name}))]
(if (contains? colmap col-name)
(ds/update-column dataset col-name (constantly col-data))
Expand Down
41 changes: 34 additions & 7 deletions test/tech/ml/dataset_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[tech.ml.dataset.column :as ds-col]
[tech.ml.dataset.pipeline :as ds-pipe]
[tech.v2.tensor :as tens]
[tech.v2.datatype.functional :as dtype-fn]
[tech.v2.datatype.functional :as dfn]
[tech.ml.dataset.tensor :as ds-tens]
[tech.ml.dataset.pca :as pca]
[clojure.test :refer :all]
Expand Down Expand Up @@ -90,7 +90,7 @@
(into-array (Class/forName "[D"))))
(tens/->tensor))]
;;Make sure we get the same answer as smile.
(is (dtype-fn/equals trans-tens
(is (dfn/equals trans-tens
smile-transformed-ds
0.01))))

Expand Down Expand Up @@ -163,7 +163,7 @@
(dtype/shape (dataset/unique-by :width ds))))
(is (= [7 24]
(dtype/shape (dataset/unique-by-column :width ds))))
(is (dtype-fn/equals [5.8 5.9 6.0 6.1 6.2 6.3 6.5
(is (dfn/equals [5.8 5.9 6.0 6.1 6.2 6.3 6.5
6.7 6.8 6.9 7.0 7.1 7.2 7.3
7.4 7.5 7.6 7.7 7.8 8.0 8.4
9.0 9.2 9.6]
Expand All @@ -173,7 +173,34 @@
vec)))))


(deftest aggregate-by-test
  ;; Placeholder test: the fruit dataset is constructed but nothing is
  ;; asserted about it.
  (let [fruit-ds (-> (mapseq-fruit-dataset)
                     (dataset/->dataset))]))
(deftest ds-concat-nil-pun
  ;; ds-concat should ignore nil arguments wherever they appear and return
  ;; nil when every argument is nil.
  (let [base-ds (-> (mapseq-fruit-dataset)
                    (dataset/->dataset)
                    (dataset/select :all (range 10)))
        expected-names (vec (base-ds :fruit-name))
        leading-nil (dataset/ds-concat nil base-ds)
        trailing-nils (dataset/ds-concat base-ds nil nil)
        only-nils (dataset/ds-concat nil nil nil)]
    (is (= expected-names
           (vec (leading-nil :fruit-name))))
    (is (= expected-names
           (vec (trailing-nils :fruit-name))))
    (is (nil? only-nils))))


(deftest update-column-datatype-detect
  ;; Replacing :width with a plain lazy sequence (not a column) must still
  ;; yield a usable numeric column through both update-column and
  ;; add-or-update-column.
  (let [times-ten (fn [values]
                    (map (fn [v] (* 10 v)) values))
        base-ds (-> (mapseq-fruit-dataset)
                    (dataset/->dataset)
                    (dataset/select :all (range 10)))
        via-update (dataset/update-column base-ds :width times-ten)
        via-add-or-update (dataset/add-or-update-column
                           base-ds :width (times-ten (base-ds :width)))
        expected-width (vec (times-ten (base-ds :width)))]
    (is (dfn/equals expected-width
                    (via-update :width)))
    (is (dfn/equals expected-width
                    (via-add-or-update :width)))))

0 comments on commit fa92e66

Please sign in to comment.