-
-
Notifications
You must be signed in to change notification settings - Fork 33
/
dataset.clj
147 lines (134 loc) · 6.1 KB
/
dataset.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
(ns tech.ml.dataset
"Column major dataset abstraction for efficiently manipulating
in memory datasets."
(:require [tech.v2.datatype :as dtype]
[tech.v2.datatype.functional.impl :as fn-impl]
[tech.ml.dataset.column :as ds-col]
[tech.ml.protocols.dataset :as ds-proto]
[tech.ml.utils :as ml-utils]
[tech.parallel :as parallel]
[clojure.core.matrix :as m]
[clojure.core.matrix.macros :refer [c-for]]
[clojure.set :as c-set]
[tech.ml.dataset.categorical :as categorical]
[tech.ml.dataset.pipeline.column-filters :as col-filters]
[tech.ml.dataset.options :as options]
[tech.ml.dataset.base]
[tech.ml.dataset.modelling]
[tech.ml.dataset.math])
(:import [smile.clustering KMeans GMeans XMeans PartitionClustering]))
;; Turn on the compiler's reflection warnings for the forms compiled after this
;; point, so interop calls lacking type hints are reported.
(set! *warn-on-reflection* true)
;; Pass the listed tech.ml.dataset.base symbols to fn-impl/export-symbols.
;; Code later in this file calls some of them unqualified (`columns` and
;; `column` inside ->flyweight).
;; NOTE(review): presumably this macro re-defines each symbol in this namespace
;; so callers can use tech.ml.dataset/<name> -- confirm against the macro source.
(fn-impl/export-symbols tech.ml.dataset.base
dataset-name
metadata
set-metadata
maybe-column
column
columns
column-map
column-names
columns-with-missing-seq
add-column
new-column
remove-column
remove-columns
update-column
order-column-names
update-columns
select
select-columns
add-or-update-column
ds-group-by
ds-sort-by
ds-filter
ds-concat
ds-take-nth
ds-map-values
ds-column-map
->dataset
from-prototype)
;; Pass the listed tech.ml.dataset.modelling symbols to fn-impl/export-symbols.
;; `dataset-label-map` and `reduce-column-names` are called unqualified by
;; ->flyweight and labels later in this file.
;; NOTE(review): presumably this makes them available as tech.ml.dataset/<name>
;; -- confirm against the macro source.
(fn-impl/export-symbols tech.ml.dataset.modelling
set-inference-target
column-label-map
inference-target-label-map
dataset-label-map
inference-target-label-inverse-map
num-inference-classes
feature-ecount
model-type
column-values->categorical
reduce-column-names
has-column-label-map?
->k-fold-datasets
->train-test-split
->row-major)
;; Pass the listed tech.ml.dataset.math symbols to fn-impl/export-symbols.
;; None of them are referenced by the code visible in this file.
;; NOTE(review): presumably exported purely for users of this namespace --
;; confirm against the macro source.
(fn-impl/export-symbols tech.ml.dataset.math
correlation-table
k-means
g-means
x-means
compute-centroid-and-global-means
impute-missing-by-centroid-averages)
(defn ->flyweight
  "Convert a dataset into a lazy sequence of maps, one map per row, keyed by
  column name.

  Keyword options:
    :column-name-seq          - names of the columns to include.  When absent,
                                every column of the dataset is used; if
                                :number->string? is also set those names are
                                first run through reduce-column-names.
    :error-on-missing-values? - defaults to true.  When true, an ex-info is
                                thrown for the first included column (outside
                                the label map) that reports missing entries.
                                When false no check is made and the column is
                                read as-is.
    :number->string?          - when truthy, the dataset's label map is looked
                                up and every included column present in it is
                                produced by
                                categorical/column-values->categorical instead
                                of being read directly (presumably yielding the
                                original labels rather than encoded numbers --
                                confirm against that function).

  Columns found in the label map are not checked for missing values."
  [dataset & {:keys [column-name-seq
                     error-on-missing-values?
                     number->string?]
              :or {error-on-missing-values? true}}]
  (let [label-map (when number->string?
                    (dataset-label-map dataset))
        colnames (or column-name-seq
                     (let [all-names (map ds-col/column-name (columns dataset))]
                       (if number->string?
                         (reduce-column-names dataset all-names)
                         all-names)))
        ;; Produces the sequence of values for a single column.
        read-column
        (fn [colname]
          (if (contains? label-map colname)
            (categorical/column-values->categorical dataset colname label-map)
            (let [col (column dataset colname)]
              (when (and error-on-missing-values?
                         (not= 0 (count (ds-col/missing col))))
                (throw (ex-info (format "Column %s has missing values"
                                        (ds-col/column-name col))
                                {})))
              (dtype/->reader col))))
        ;; Realized eagerly so a missing-value error surfaces at call time.
        value-seqs (mapv read-column colnames)
        n-columns (count value-seqs)]
    ;; Transpose columns into rows: interleaving yields
    ;; c0r0 c1r0 ... c0r1 c1r1 ..., which is then cut into row-sized groups.
    (->> (apply interleave value-seqs)
         (partition n-columns)
         (map (fn [row] (zipmap colnames row))))))
(defn labels
  "Given a dataset, generate a sequence of label values.

  The label column names are taken from col-filters/inference? and passed
  through reduce-column-names; the rows are then produced by ->flyweight with
  :number->string? true, so columns that have an entry in the dataset label map
  go through the categorical conversion.

  When exactly one label column results, a sequence of that column's values is
  returned.  Otherwise the result is in flyweight (sequence of maps) format.

  Throws an ex-info when col-filters/target? yields nothing for the dataset."
  [dataset]
  ;; NOTE(review): the guard consults target? while the names below come from
  ;; inference? -- confirm the two filters are meant to agree.
  (if-not (seq (col-filters/target? dataset))
    (throw (ex-info "No label columns indicated" {}))
    (let [label-colnames (reduce-column-names dataset
                                              (col-filters/inference? dataset))
          label-rows (->flyweight dataset
                                  :column-name-seq label-colnames
                                  :number->string? true)]
      (if (= 1 (count label-colnames))
        ;; Single label column: unwrap each row map to its bare value.
        (let [only-colname (first label-colnames)]
          (map (fn [row] (get row only-colname)) label-rows))
        label-rows))))
(defn dataset->string
  "Return, as a string, whatever tech.ml.dataset.print/print-dataset writes to
  *out* when called with `ds`.

  The print function is looked up by symbol at call time through
  parallel/require-resolve; tech.ml.dataset.print is not in this namespace's
  :require list."
  ^String [ds]
  (with-out-str
    ;; Resolution stays inside with-out-str so that anything written to *out*
    ;; during the lookup is captured as well.
    (let [print-dataset-fn (parallel/require-resolve
                            'tech.ml.dataset.print/print-dataset)]
      (print-dataset-fn ds))))