Skip to content

Commit

Permalink
Better parsing of data and nice printing of items.
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Aug 24, 2019
1 parent d88c45f commit 38ac5d0
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 39 deletions.
9 changes: 5 additions & 4 deletions deps.edn
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{:paths ["src"]
:deps {org.clojure/clojure {:mvn/version "1.10.1"}
camel-snake-kebab {:mvn/version "0.4.0"}
techascent/tech.datatype {:mvn/version "4.8"}
tech.tablesaw/tablesaw-core {:mvn/version "0.34.2"}
com.github.haifengl/smile-core {:mvn/version "1.5.3"}
camel-snake-kebab {:mvn/version "0.4.0"}
techascent/tech.datatype {:mvn/version "4.9"}
techascent/tech.io {:mvn/version "3.6"}
tech.tablesaw/tablesaw-core {:mvn/version "0.34.2"}
com.github.haifengl/smile-core {:mvn/version "1.5.3"}
com.github.haifengl/smile-netlib {:mvn/version "1.5.3"}}}
94 changes: 63 additions & 31 deletions src/tech/libs/tablesaw.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,20 @@
[tech.v2.datatype :as dtype]
[tech.v2.datatype.base :as dtype-base]
[tech.v2.datatype.protocols :as dtype-proto]
[tech.v2.datatype.pprint :as dtype-pp]
[clojure.set :as c-set]
[tech.ml.dataset.seq-of-maps :as ds-seq-of-maps]
[tech.ml.dataset.generic-columnar-dataset :as columnar-dataset]
[tech.jna :as jna])
[tech.io :as io])
(:import [tech.tablesaw.api Table ColumnType
NumericColumn DoubleColumn
StringColumn BooleanColumn]
[tech.tablesaw.columns Column]
[tech.tablesaw.io.csv CsvReadOptions
CsvReadOptions$Builder]
[java.util UUID]
[java.io InputStream]
[java.io InputStream BufferedInputStream
ByteArrayInputStream]
[org.apache.commons.math3.stat.descriptive.moment Skewness]))


Expand All @@ -40,14 +42,14 @@
(declare make-column)


(defrecord TablesawColumn [^Column col metadata cache]
(deftype TablesawColumn [^Column col metadata cache]
col-proto/PIsColumn
(is-column? [this] true)

col-proto/PColumn
(column-name [this] (or (:name metadata) (.name col)))
(set-name [this colname]
(->TablesawColumn col (assoc metadata :name colname) {}))
(TablesawColumn. col (assoc metadata :name colname) {}))

(supported-stats [this] (col-proto/supported-stats col))

Expand All @@ -57,12 +59,12 @@
:datatype (dtype/get-datatype col)}))

(set-metadata [this data-map]
(->TablesawColumn col data-map cache))
(TablesawColumn. col data-map cache))

(cache [this] cache)

(set-cache [this cache-map]
(->TablesawColumn col metadata cache-map))
(TablesawColumn. col metadata cache-map))

(missing [this] (col-proto/missing col))

Expand All @@ -88,7 +90,9 @@
(col-proto/stats col missing-stats))))

(correlation [this other-column correlation-type]
(col-proto/correlation col (:col other-column) correlation-type))
(col-proto/correlation col
(.col ^TablesawColumn other-column)
correlation-type))

(column-values [this] (col-proto/column-values col))

Expand All @@ -111,7 +115,8 @@
(col-proto/column-values this)
metadata))

(to-double-array [this error-missing?] (col-proto/to-double-array col error-missing?))
(to-double-array [this error-missing?]
(col-proto/to-double-array col error-missing?))

dtype-proto/PDatatype
(get-datatype [this] (dtype-base/get-datatype col))
Expand Down Expand Up @@ -165,7 +170,7 @@

dtype-proto/PBuffer
(sub-buffer [item offset length]
(->TablesawColumn
(TablesawColumn.
(dtype-proto/sub-buffer col offset length)
metadata {}))

Expand All @@ -174,14 +179,33 @@
(->array-copy [src] (dtype-proto/->array-copy col))

dtype-proto/PCountable
(ecount [item] (dtype-proto/ecount col)))
(ecount [item] (dtype-proto/ecount col))

Object
(toString [item]
(let [n-items (dtype/ecount item)
format-str (if (> n-items 20)
"#tablesaw-column<%s>%s\n%s\n[%s...]"
"#tablesaw-column<%s>%s\n%s\n[%s]")]
(format format-str
(name (dtype/get-datatype item))
[n-items]
(col-proto/column-name item)
(-> (dtype/->reader item)
(dtype-proto/sub-buffer 0 (min 20 n-items))
(dtype-pp/print-reader-data))))))


;; Printing a TablesawColumn (REPL, pr, prn) emits exactly the text produced
;; by the column's own toString implementation.
(defmethod print-method TablesawColumn
  [column ^java.io.Writer writer]
  (let [^String rendered (.toString ^Object column)]
    (.write writer rendered)))


(defn make-column
[datatype-col metadata & [cache]]
(if (instance? TablesawColumn datatype-col)
(throw (ex-info "Nested" {})))
(->TablesawColumn datatype-col metadata cache))
(TablesawColumn. datatype-col metadata cache))


(defmethod dtype-proto/make-container :tablesaw-column
Expand All @@ -197,22 +221,26 @@
(make-column options {})))


(defn ^tech.tablesaw.io.csv.CsvReadOptions$Builder
->csv-builder [path & {:keys [separator header? date-format]}]
(let [^CsvReadOptions$Builder builder
(cond
(instance? InputStream path)
(CsvReadOptions/builder ^InputStream path)
(string? path)
(CsvReadOptions/builder ^String path)
:else
(throw (ex-info "Failed to make builder" {})))]
(if separator
(doto builder
(.separator separator)
(.header (boolean header?)))
(doto builder
(.header (boolean header?))))))
(defn autodetect-csv-separator
  "Peek at the head of `input-stream` and ask `io/autodetect-csv-separator`
  which separator it uses.

  The stream is marked with a read limit of 200 bytes, up to 200 bytes are
  read into a scratch buffer, and the stream is then reset so the peeked
  bytes can be read again by the caller.  The bytes that were read are
  wrapped in a ByteArrayInputStream and handed, together with `options`, to
  `io/autodetect-csv-separator`; its result is returned unchanged."
  [^BufferedInputStream input-stream & options]
  (let [peek-size 200
        scratch (byte-array peek-size)]
    ;; Mark before reading so that the reset below rewinds to this point.
    (.mark input-stream peek-size)
    (let [n-peeked (.read input-stream scratch)]
      (.reset input-stream)
      (->> options
           (cons (ByteArrayInputStream. scratch 0 n-peeked))
           (apply io/autodetect-csv-separator)))))


(defn ^CsvReadOptions$Builder
  ->csv-builder
  "Create a tablesaw CsvReadOptions builder for `path`.

  `path` and `options` are passed to `io/buffered-input-stream` to obtain the
  input stream; the separator is whatever `autodetect-csv-separator` returns
  for that stream and the same `options`.

  `options` is a flat sequence of keyword/value pairs.  The only key read
  directly here is `:header?`:
    - absent or nil -> the first row is treated as a header (default true)
    - any other value -> coerced with `boolean`, so an explicit `false`
      disables header parsing.

  Throws if `options` has an odd number of elements (from `hash-map`)."
  [path & options]
  (let [^BufferedInputStream input-stream (apply io/buffered-input-stream
                                                 path options)
        separator (apply autodetect-csv-separator input-stream options)
        opt-map (apply hash-map options)
        requested-header (:header? opt-map)
        ;; The previous `(or (:header? opt-map) true)` always evaluated to
        ;; true, so `:header? false` was silently ignored.  Only fall back to
        ;; the default when the caller did not supply a value.
        header? (if (nil? requested-header)
                  true
                  (boolean requested-header))]
    (doto (CsvReadOptions/builder input-stream)
      (.separator separator)
      (.header header?))))


(defn tablesaw-columns->tablesaw-dataset
Expand All @@ -236,10 +264,14 @@


(defn path->tablesaw-dataset
[path & {:keys [separator quote]}]
(-> (Table/read)
(.csv (->csv-builder path :separator separator :header? true))
->tablesaw-dataset))
[path & options]
(let [input (if (and (string? path)
(.endsWith ^String path ".gz"))
(io/gzip-input-stream path)
path)]
(-> (Table/read)
(.csv ^CsvReadOptions$Builder (apply ->csv-builder input options))
->tablesaw-dataset)))


(defn col-dtype-cast
Expand Down
11 changes: 7 additions & 4 deletions src/tech/ml/dataset/math.clj
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
(vec missing-columns)))
non-numeric (->> (columns dataset)
(map ds-col/metadata)
(remove #(ml-utils/numeric-datatype? (:datatype %)))
(remove #(ml-utils/numeric-datatype?
(:datatype %)))
(map :name)
seq)
_ (when non-numeric
Expand All @@ -44,7 +45,8 @@
dataset (select dataset
(->> (columns dataset)
(map ds-col/column-name)
(remove (set (concat (map :column-name missing-columns)
(remove (set (concat (map ds-col/column-name
missing-columns)
non-numeric))))
:all)
lhs-colseq (if (seq colname-seq)
Expand All @@ -56,10 +58,11 @@
[(ds-col/column-name lhs)
(->> rhs-colseq
(map (fn [rhs]
(when-not rhs
(throw (ex-info "Failed" {})))
(let [corr (ds-col/correlation lhs rhs correlation-type)]
(if (dfn/valid? corr)
[(ds-col/column-name rhs)
(ds-col/correlation lhs rhs correlation-type)]
[(ds-col/column-name rhs) corr]
(do
(log/warnf "Correlation failed: %s-%s"
(ds-col/column-name lhs)
Expand Down

0 comments on commit 38ac5d0

Please sign in to comment.