-
Notifications
You must be signed in to change notification settings - Fork 28
/
ml.clj
246 lines (225 loc) · 7.38 KB
/
ml.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
(ns zero-one.geni.ml
(:refer-clojure :exclude [Double])
(:require
[camel-snake-kebab.core :refer [->kebab-case]]
[clojure.walk :refer [keywordize-keys]]
[potemkin :refer [import-vars]]
[zero-one.geni.ml-classification]
[zero-one.geni.ml-clustering]
[zero-one.geni.ml-evaluation]
[zero-one.geni.ml-feature]
[zero-one.geni.ml-fpm]
[zero-one.geni.ml-recommendation]
[zero-one.geni.ml-regression]
[zero-one.geni.ml-tuning]
[zero-one.geni.ml-xgb]
[zero-one.geni.interop :as interop])
(:import
(org.apache.spark.ml Pipeline
PipelineStage)
(org.apache.spark.ml.stat ChiSquareTest
Correlation)))
(import-vars
[zero-one.geni.ml-clustering
bisecting-k-means
gaussian-mixture
gmm
k-means
lda
latent-dirichlet-allocation])
(import-vars
[zero-one.geni.ml-evaluation
binary-classification-evaluator
clustering-evaluator
multiclass-classification-evaluator
regression-evaluator])
;; TODO: stop words remover, chi-sq-selector
(import-vars
[zero-one.geni.ml-feature
binariser
binarizer
bucketiser
bucketizer
bucketed-random-projection-lsh
count-vectoriser
count-vectorizer
dct
discrete-cosine-transform
elementwise-product
feature-hasher
hashing-tf
idf
imputer
index-to-string
interaction
max-abs-scaler
min-hash-lsh
min-max-scaler
n-gram
normaliser
normalizer
one-hot-encoder
one-hot-encoder-estimator
pca
polynomial-expansion
quantile-discretiser
quantile-discretizer
regex-tokeniser
regex-tokenizer
sql-transformer
standard-scaler
string-indexer
tokeniser
tokenizer
vector-assembler
vector-indexer
vector-size-hint
word2vec])
(import-vars
[zero-one.geni.ml-classification
decision-tree-classifier
gbt-classifier
linear-svc
logistic-regression
multilayer-perceptron-classifier
mlp-classifier
naive-bayes
one-vs-rest
random-forest-classifier])
(import-vars
[zero-one.geni.ml-fpm
fp-growth
frequent-pattern-growth
prefix-span])
(import-vars
[zero-one.geni.ml-regression
aft-survival-regression
decision-tree-regressor
gbt-regressor
generalised-linear-regression
generalized-linear-regression
glm
isotonic-regression
linear-regression
random-forest-regressor])
(import-vars
[zero-one.geni.ml-recommendation
als
alternating-least-squares
recommend-for-all-items
recommend-for-all-users
recommend-for-item-subset
recommend-for-user-subset
recommend-items
recommend-users])
(import-vars
[zero-one.geni.ml-tuning
param-grid
cross-validator
train-validation-split])
(import-vars
[zero-one.geni.ml-xgb
xgboost-classifier
xgboost-regressor])
(defn corr
  "Computes the correlation matrix for the column `col-name` of `dataframe`
  by delegating to `org.apache.spark.ml.stat.Correlation/corr`.

  Arities:
    [dataframe col-name]        - uses Spark's default correlation method.
    [dataframe col-name method] - `method` names the correlation method; it is
                                  passed through `name`, so either a string or
                                  a keyword (e.g. \"spearman\" / :spearman) works.

  Returns whatever `Correlation/corr` returns."
  ([dataframe col-name]
   (Correlation/corr dataframe col-name))
  ([dataframe col-name method]
   ;; `name` lets callers pass a keyword as well as a plain string.
   (Correlation/corr dataframe col-name (name method))))
(def correlation
  "Alias of `corr`."
  corr)
(defn chi-square-test
  "Runs `org.apache.spark.ml.stat.ChiSquareTest/test` on `dataframe`, using
  `features-col` and `label-col` as the feature and label column names.
  Returns the result of that call unchanged."
  [dataframe features-col label-col]
  (. ChiSquareTest test dataframe features-col label-col))
(defn pipeline
  "Creates a new `Pipeline` and sets its stages to the given `stages`
  (packed into a `PipelineStage` array). Returns the value of `.setStages`."
  [& stages]
  (let [blank-pipeline (Pipeline.)
        stage-array    (into-array PipelineStage stages)]
    (.setStages blank-pipeline stage-array)))
(defn fit
  "Calls `.fit` on `estimator` with `dataframe` and returns the result.
  Note the dataframe-first argument order, the reverse of the method call."
  [dataframe estimator]
  (-> estimator
      (.fit dataframe)))
(defn transform
  "Calls `.transform` on `transformer` with `dataframe` and returns the result.
  Note the dataframe-first argument order, the reverse of the method call."
  [dataframe transformer]
  (-> transformer
      (.transform dataframe)))
(defn evaluate
  "Calls `.evaluate` on `evaluator` with `dataframe` and returns the result.
  Note the dataframe-first argument order, the reverse of the method call."
  [dataframe evaluator]
  (-> evaluator
      (.evaluate dataframe)))
(defn params
  "Returns the parameters of `stage` as a Clojure map.

  The stage's extracted param map is turned into a vector of pairs; each pair
  becomes an entry whose key is the kebab-cased param name and whose value is
  the param value converted with `interop/->clojure`. String keys are finally
  keywordised with `keywordize-keys`."
  [stage]
  (let [pairs   (interop/scala-seq->vec (.toSeq (.extractParamMap stage)))
        ->entry (fn [pair]
                  (let [param-name  (->kebab-case (.name (.param pair)))
                        param-value (interop/->clojure (.value pair))]
                    [param-name param-value]))]
    (keywordize-keys (into {} (map ->entry) pairs))))
;; TODO: turn summary into maps
(defn approx-nearest-neighbours
  "Calls `.approxNearestNeighbors` on `model` for `dataset`.

  `key-v` is converted with `interop/->scala-coll` before being passed on;
  `n-nearest` and the optional `dist-col` are forwarded unchanged.
  Returns the result of the method call."
  ([dataset model key-v n-nearest]
   (let [scala-key (interop/->scala-coll key-v)]
     (.approxNearestNeighbors model dataset scala-key n-nearest)))
  ([dataset model key-v n-nearest dist-col]
   (let [scala-key (interop/->scala-coll key-v)]
     (.approxNearestNeighbors model dataset scala-key n-nearest dist-col))))
(defn approx-similarity-join
  "Calls `.approxSimilarityJoin` on `model` with `dataset-a`, `dataset-b` and
  `threshold`, plus `dist-col` when the five-argument arity is used.
  Returns the result of the method call."
  ([dataset-a dataset-b model threshold]
   (-> model
       (.approxSimilarityJoin dataset-a dataset-b threshold)))
  ([dataset-a dataset-b model threshold dist-col]
   (-> model
       (.approxSimilarityJoin dataset-a dataset-b threshold dist-col))))
(defn association-rules
  "Returns the result of `.associationRules` on `model`."
  [model]
  (-> model .associationRules))
(defn binary-summary
  "Returns the result of `.binarySummary` on `model`."
  [model]
  (-> model .binarySummary))
(defn boundaries
  "Returns the value of `.boundaries` on `model`, converted with
  `interop/->clojure`."
  [model]
  (-> model
      .boundaries
      interop/->clojure))
(defn category-maps
  "Returns the value of `.categoryMaps` on `model` as a Clojure map.

  Both levels are converted with `interop/scala-map->map`: the outer map
  first, then each of its values."
  [model]
  (let [outer (interop/scala-map->map (.categoryMaps model))]
    (into {}
          (for [[outer-key inner] outer]
            [outer-key (interop/scala-map->map inner)]))))
(defn category-sizes
  "Returns `seq` of the value of `.categorySizes` on `model`."
  [model]
  (-> model
      .categorySizes
      seq))
(defn cluster-centers
  "Returns a lazy sequence of the elements of `.clusterCenters` on `model`,
  each converted with `interop/->clojure`."
  [model]
  (let [centers (seq (.clusterCenters model))]
    (map interop/->clojure centers)))
(defn coefficient-matrix
  "Returns the value of `.coefficientMatrix` on `model`, converted with
  `interop/matrix->seqs`."
  [model]
  (-> model
      .coefficientMatrix
      interop/matrix->seqs))
(defn coefficients
  "Returns the value of `.coefficients` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .coefficients
      interop/vector->seq))
(defn compute-cost
  "Calls `.computeCost` on `model` with `dataset` and returns the result.
  Note the dataset-first argument order, the reverse of the method call."
  [dataset model]
  (-> model
      (.computeCost dataset)))
(defn depth
  "Returns the result of `.depth` on `model`."
  [model]
  (-> model .depth))
(defn describe-topics
  "Calls `.describeTopics` on `model` and returns the result.

  Arities:
    [model]                     - the no-argument overload (same as before).
    [model max-terms-per-topic] - forwards the maximum number of terms per
                                  topic, coerced to `int`, to the one-argument
                                  overload.

  Written as a `defn` rather than a bare `memfn` so the var carries an
  arglist and docstring like the other accessors in this namespace."
  ([model]
   (.describeTopics model))
  ([model max-terms-per-topic]
   (.describeTopics model (int max-terms-per-topic))))
(defn estimated-doc-concentration
  "Returns the value of `.estimatedDocConcentration` on `model`, converted
  with `interop/->clojure`."
  [model]
  (-> model
      .estimatedDocConcentration
      interop/->clojure))
(defn feature-importances
  "Returns the value of `.featureImportances` on `model`, converted with
  `interop/->clojure`."
  [model]
  (-> model
      .featureImportances
      interop/->clojure))
(defn find-frequent-sequential-patterns
  "Calls `.findFrequentSequentialPatterns` on `prefix-span` with `dataset`
  and returns the result. Note the dataset-first argument order."
  [dataset prefix-span]
  (-> prefix-span
      (.findFrequentSequentialPatterns dataset)))
(def find-patterns
  "Alias of `find-frequent-sequential-patterns`."
  find-frequent-sequential-patterns)
(defn frequent-item-sets
  "Returns the result of `.freqItemsets` on `model`."
  [model]
  (-> model .freqItemsets))
(def freq-itemsets
  "Alias of `frequent-item-sets`."
  frequent-item-sets)
(defn gaussians-df
  "Returns the result of `.gaussiansDF` on `model`."
  [model]
  (-> model .gaussiansDF))
(defn get-num-trees
  "Returns the result of `.getNumTrees` on `model`."
  [model]
  (-> model .getNumTrees))
(defn get-size
  "Returns the result of `.getSize` on `model`."
  [model]
  (-> model .getSize))
(defn idf-vector
  "Returns the value of `.idf` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .idf
      interop/vector->seq))
(defn intercept
  "Returns the result of `.intercept` on `model`."
  [model]
  (-> model .intercept))
(defn intercept-vector
  "Returns the value of `.interceptVector` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .interceptVector
      interop/vector->seq))
(defn is-distributed
  "Returns the result of `.isDistributed` on `model`."
  [model]
  (-> model .isDistributed))
(def distributed?
  "Alias of `is-distributed`."
  is-distributed)
(defn log-likelihood
  "Calls `.logLikelihood` on `model` with `dataset` and returns the result.
  Note the dataset-first argument order, the reverse of the method call."
  [dataset model]
  (-> model
      (.logLikelihood dataset)))
(defn log-perplexity
  "Calls `.logPerplexity` on `model` with `dataset` and returns the result.
  Note the dataset-first argument order, the reverse of the method call."
  [dataset model]
  (-> model
      (.logPerplexity dataset)))
(defn max-abs
  "Returns the value of `.maxAbs` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .maxAbs
      interop/vector->seq))
(defn mean
  "Returns the value of `.mean` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .mean
      interop/vector->seq))
(defn num-classes
  "Returns the result of `.numClasses` on `model`."
  [model]
  (-> model .numClasses))
(defn num-features
  "Returns the result of `.numFeatures` on `model`."
  [model]
  (-> model .numFeatures))
(defn num-nodes
  "Returns the result of `.numNodes` on `model`."
  [model]
  (-> model .numNodes))
(defn original-max
  "Returns the value of `.originalMax` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .originalMax
      interop/vector->seq))
(defn original-min
  "Returns the value of `.originalMin` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .originalMin
      interop/vector->seq))
(defn pc
  "Returns the value of `.pc` on `model`, converted with
  `interop/matrix->seqs`."
  [model]
  (-> model
      .pc
      interop/matrix->seqs))
(def principal-components
  "Alias of `pc`."
  pc)
(defn pi
  "Returns the value of `.pi` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .pi
      interop/vector->seq))
(defn root-node
  "Returns the result of `.rootNode` on `model`."
  [model]
  (-> model .rootNode))
(defn scale
  "Returns the result of `.scale` on `model`."
  [model]
  (-> model .scale))
(defn summary
  "Returns the result of `.summary` on `model`, unconverted."
  [model]
  (-> model .summary))
(defn supported-optimizers
  "Returns `seq` of the value of `.supportedOptimizers` on `model`."
  [model]
  (-> model
      .supportedOptimizers
      seq))
(def supported-optimisers
  "Alias of `supported-optimizers` (British spelling)."
  supported-optimizers)
(defn stages
  "Returns `seq` of the value of `.stages` on `model`."
  [model]
  (-> model
      .stages
      seq))
(defn std
  "Returns the value of `.std` on `model`, converted with
  `interop/vector->seq`."
  [model]
  (-> model
      .std
      interop/vector->seq))
(defn surrogate-df
  "Returns the result of `.surrogateDF` on `model`."
  [model]
  (-> model .surrogateDF))
(defn theta
  "Returns the value of `.theta` on `model`, converted with
  `interop/matrix->seqs`."
  [model]
  (-> model
      .theta
      interop/matrix->seqs))
(defn total-num-nodes
  "Returns the result of `.totalNumNodes` on `model`."
  [model]
  (-> model .totalNumNodes))
(defn tree-weights
  "Returns `seq` of the value of `.treeWeights` on `model`."
  [model]
  (-> model
      .treeWeights
      seq))
(defn trees
  "Returns `seq` of the value of `.trees` on `model`."
  [model]
  (-> model
      .trees
      seq))
(defn uid
  "Returns the result of `.uid` on `model`."
  [model]
  (-> model .uid))
(defn vocab-size
  "Returns the result of `.vocabSize` on `model`."
  [model]
  (-> model .vocabSize))
(defn vocabulary
  "Returns `seq` of the value of `.vocabulary` on `model`."
  [model]
  (-> model
      .vocabulary
      seq))
(defn weights
  "Returns `seq` of the value of `.weights` on `model`."
  [model]
  (-> model
      .weights
      seq))
;; TODO: read-stage
(defn write-stage!
  "Saves `model` to `path` by chaining `.write`, `.overwrite` and
  `.save path` on it. Returns the result of the final `.save` call."
  [model path]
  (-> model
      .write
      .overwrite
      (.save path)))
;; Scratch forms for interactive (REPL) use; `comment` ignores its body, so
;; nothing below runs when the namespace is loaded.
(comment
;; Bring the XGBoost regressor class into scope for the experiment below.
(import '(ml.dmlc.xgboost4j.scala.spark XGBoostRegressor))
;; Inspect the params of a freshly constructed regressor.
(params (XGBoostRegressor.))
true)