Skip to content

Commit

Permalink
Fixes #139. Fixes #140. Fixes #141
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Oct 19, 2020
1 parent 1258e64 commit bc8d3b8
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 35 deletions.
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
:url "http://www.eclipse.org/legal/epl-v10.html"}
:dependencies [[org.clojure/clojure "1.10.2-alpha1"]
[camel-snake-kebab "0.4.0"]
[cnuernber/dtype-next "0.4.23"]
[cnuernber/dtype-next "6.00-alpha-5"]
[techascent/tech.io "3.20"
:exclusions [org.apache.commons/commons-compress]]
[com.univocity/univocity-parsers "2.9.0"]
Expand Down
8 changes: 8 additions & 0 deletions src/tech/v3/dataset/base.clj
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,14 @@
index-seq - either keyword :all or list of indexes. May contain duplicates.
"
[dataset colname-seq index-seq]
(when (dtype-proto/has-constant-time-min-max? index-seq)
(let [lmin (long (dtype-proto/constant-time-min index-seq))
lmax (long (dtype-proto/constant-time-max index-seq))]
(errors/when-not-errorf
(and (< lmax (row-count dataset))
(>= lmin 0))
"Index sequence range [%d-%d] out of dataset row range [0-%d]"
lmin lmax (dec (row-count dataset)))))
(let [index-seq (if (number? index-seq)
[index-seq]
index-seq)]
Expand Down
2 changes: 1 addition & 1 deletion src/tech/v3/dataset/categorical.clj
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@
#(str src-name "-" (safe-str %)))
one-hot-map (->> lookup-table
(map (fn [[k v]]
[k (name-fn v)]))
[k (name-fn k)]))
(into {}))]
(map->OneHotMap
{:one-hot-table one-hot-map
Expand Down
29 changes: 19 additions & 10 deletions src/tech/v3/dataset/io/column_parsers.clj
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,18 @@
^RoaringBitmap missing
^PrimitiveList failed-values
^RoaringBitmap failed-indexes
column-name]
column-name
^:unsynchronized-mutable ^long max-idx]
dtype-proto/PECount
(ecount [this] (.lsize container))
(ecount [this] (inc max-idx))
PParser
(add-value! [p idx value]
(set! max-idx (max (long idx) max-idx))
(when-not (missing-value? value)
(let [idx (long idx)]
(let [value-dtype (dtype/datatype value)]
(if (= value-dtype container-dtype)
(if (and (not= container-dtype :string)
(= value-dtype container-dtype))
(do
(add-missing-values! container missing missing-value idx)
(.add container value))
Expand Down Expand Up @@ -272,7 +275,7 @@
missing (bitmap/->bitmap)]
(FixedTypeParser. container dtype missing-value parse-fn
missing failed-values failed-indexes
cname)))
cname -1)))


(defn parser-kwd-list->parser-tuples
Expand Down Expand Up @@ -312,11 +315,13 @@
^RoaringBitmap missing
;;List of datatype,parser-fn tuples
^List promotion-list
column-name]
column-name
^:unsynchronized-mutable ^long max-idx]
dtype-proto/PECount
(ecount [this] (.lsize container))
(ecount [this] (inc max-idx))
PParser
(add-value! [p idx value]
(set! max-idx (max (long idx) max-idx))
(when-not (missing-value? value)
(let [idx (long idx)]
(let [value-dtype (dtype/elemwise-datatype value)]
Expand Down Expand Up @@ -403,7 +408,8 @@
(bitmap/->bitmap)
(mapv (juxt identity default-coercers)
parser-datatype-sequence)
column-name)))
column-name
-1)))
([column-name]
(promotional-string-parser column-name default-parser-datatype-sequence)))

Expand All @@ -413,11 +419,13 @@
^{:unsynchronized-mutable true} container-dtype
^{:unsynchronized-mutable true} missing-value
^RoaringBitmap missing
column-name]
column-name
^:unsynchronized-mutable ^long max-idx]
dtype-proto/PECount
(ecount [this] (.lsize container))
(ecount [this] (inc max-idx))
PParser
(add-value! [p idx value]
(set! max-idx (max (long idx) max-idx))
(when-not (missing-value? value)
(let [idx (long idx)
org-datatype (dtype/datatype value)
Expand Down Expand Up @@ -457,4 +465,5 @@
:boolean
false
(bitmap/->bitmap)
column-name))
column-name
-1))
20 changes: 10 additions & 10 deletions test/tech/v3/dataset/mapseq_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -167,15 +167,15 @@
(ds-mod/set-inference-target :fruit-name)
(ds/categorical->one-hot [:fruit-name]))]
(is (= {:one-hot-table
{:orange :fruit-name-0,
:mandarin :fruit-name-1,
:apple :fruit-name-2,
:lemon :fruit-name-3},
{:orange :fruit-name-orange,
:mandarin :fruit-name-mandarin,
:apple :fruit-name-apple,
:lemon :fruit-name-lemon},
:src-column :fruit-name,
:result-datatype :float64}
(into {} (first (ds-cat/dataset->one-hot-maps dataset)))))
(is (= #{:mass :fruit-name-1 :fruit-name-0 :width :fruit-name-2 :color-score
:fruit-name-3 :height}
(is (= #{:mass :fruit-name-orange :fruit-name-mandarin :width :fruit-name-apple :color-score
:fruit-name-lemon :height}
(->> (ds/columns dataset)
(map ds-col/column-name)
set)))
Expand All @@ -187,10 +187,10 @@
vec)))

(is (= {:color-score :regression,
:fruit-name-0 :classification,
:fruit-name-1 :classification,
:fruit-name-2 :classification,
:fruit-name-3 :classification,
:fruit-name-orange :classification,
:fruit-name-lemon :classification,
:fruit-name-mandarin :classification,
:fruit-name-apple :classification,
:height :regression
:width :regression,
:mass :regression,
Expand Down
51 changes: 38 additions & 13 deletions test/tech/v3/dataset_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -665,19 +665,6 @@
(is (= ds thawed-ds))))


(deftest select-range-intersection
  ;; Expectation encoded by this test: selecting rows with an index range wider
  ;; than the dataset yields only the rows that actually exist.
  (let [two-rows (ds/->dataset [{:a 1} {:a 3}])
        clipped (ds/select-rows two-rows (range -100 100))]
    (is (= 2 (ds/row-count clipped))))
  ;; Same expectation with strided ranges, walking both down and up.
  (let [hundred (ds/->dataset {:a (range 100)})
        a-values (fn [idx-range]
                   (vec (ds/column (ds/select-rows hundred idx-range) :a)))]
    (is (= (vec (range 80 -1 -20))
           (a-values (range 100 -100 -20))))
    (is (= (vec (range 0 100 20))
           (a-values (range -100 100 20))))))


(deftest unique-by-nil-regression
(-> (ds/->dataset [])
(ds/add-column (ds-col/new-column :abc [nil nil]))
Expand Down Expand Up @@ -715,6 +702,44 @@
(first)))))


(deftest parse-nils
  ;; An all-nil column is expected to produce the same row count and the same
  ;; number of missing entries whether the data is supplied as a map of columns
  ;; or as a sequence of row maps.
  (let [ds-a (ds/->dataset {:a [nil nil]})
        ds-b (ds/->dataset [{:a nil} {:a nil}])]
    (is (= (ds/row-count ds-a)
           (ds/row-count ds-b)))
    ;; Each check gets its own `is`: a second form passed to `is` is treated as
    ;; the failure message and is never asserted.
    (is (= 2 (dtype/ecount (ds/missing ds-a))))
    (is (= 2 (dtype/ecount (ds/missing ds-b))))))


(deftest parser-fn-failing-on-csv-entries
  ;; Parses the :date column as a string through a custom fn that keeps only the
  ;; first five characters, then checks the first parsed value.
  (let [parse-opts {:key-fn keyword
                    :parser-fn {:date [:string #(subs % 0 5)]}}
        stocks (ds/->dataset "test/data/stocks.csv" parse-opts)
        date-col (stocks :date)]
    (is (= "Jan 1" (first date-col)))))

(deftest one-hot-failing
  ;; One-hot encodes column b for both string-named and keyword-named datasets
  ;; and checks that the generated column names are built from the category
  ;; values ("AA", "BB").
  (let [one-hot (fn [rows colname]
                  (ds/categorical->one-hot (ds/->dataset rows) [colname]))
        str-ds (one-hot [{"a" 1 "b" "AA"}
                         {"a" 2 "b" "AA"}
                         {"a" 3 "b" "BB"}
                         {"a" 4 "b" "BB"}]
                        "b")
        kwd-ds (one-hot [{:a 1 :b "AA"}
                         {:a 2 :b "AA"}
                         {:a 3 :b "BB"}
                         {:a 4 :b "BB"}]
                        :b)
        name-set (fn [dataset] (set (ds/column-names dataset)))]
    (is (= #{"a" "b-AA" "b-BB"} (name-set str-ds)))
    (is (= #{:a :b-AA :b-BB} (name-set kwd-ds)))))


(deftest select-memory
  (let [source-ds (ds/->dataset [{:a 0} {:a 1} {:a 2} {:a 3} {:a 4}])
        first-four (ds/select-rows source-ds (range 4))
        expected (vec (range 4))]
    (is (= expected (vec (first-four :a))))
    ;; Index 4 is present in the five-row source but not in the four-row
    ;; selection, so selecting it from the selection is expected to throw.
    (is (thrown? Throwable
                 (-> (ds/select-rows first-four 4)
                     :a
                     vec)))))


(comment

(def test-ds (ds/->dataset
Expand Down

0 comments on commit bc8d3b8

Please sign in to comment.