/
metamorph.clj
1480 lines (1144 loc) · 48.4 KB
/
metamorph.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(ns tech.v3.dataset.metamorph
;;Autogenerated from tech.v3.dataset.metamorph-api-- DO NOT EDIT
"This is an auto-generated api system - it scans the namespaces and changes the first
to be metamorph-compliant which means transforming an argument that is just a dataset into
an argument that is a metamorph context - a map of `{:metamorph/data ds}`. They also return
their result as a metamorph context."
(:require [tech.v3.dataset.metamorph-api])
(:refer-clojure :exclude [filter group-by sort-by concat take-nth shuffle rand-nth update]))
(defn add-column
  "Append a new column; a name collision is an error.

  Delegates to `tech.v3.dataset.metamorph-api/add-column`."
  [column]
  (tech.v3.dataset.metamorph-api/add-column column))
(defn add-or-update-column
  "Replace the column when it already exists, otherwise append it as new.

  Both arities forward their arguments unchanged to
  `tech.v3.dataset.metamorph-api/add-or-update-column`."
  ([colname column]
   (tech.v3.dataset.metamorph-api/add-or-update-column colname column))
  ([column]
   (tech.v3.dataset.metamorph-api/add-or-update-column column)))
(defn append-columns
  "Forward `column-seq` to `tech.v3.dataset.metamorph-api/append-columns`
  and return whatever that call yields."
  [column-seq]
  (tech.v3.dataset.metamorph-api/append-columns column-seq))
(defn assoc-ds
  "Dataset-safe `assoc`: a non-nil dataset goes straight to
  `clojure.core/assoc`, while a nil dataset is first replaced by a new empty
  dataset. Unlike plain `assoc`, the result is guaranteed to be a dataset.

  `cname`, `cdata` and any trailing `args` are applied to
  `tech.v3.dataset.metamorph-api/assoc-ds`."
  [cname cdata & args]
  (apply tech.v3.dataset.metamorph-api/assoc-ds cname cdata args))
(defn assoc-metadata
  "Set metadata across a set of columns.

  Applies `tech.v3.dataset.metamorph-api/assoc-metadata` to `filter-fn-or-ds`,
  `k`, `v` and any trailing `args`."
  [filter-fn-or-ds k v & args]
  (apply tech.v3.dataset.metamorph-api/assoc-metadata filter-fn-or-ds k v args))
(defn brief
  "Brief description of a dataset in mapseq form. A brief description is the
  mapseq form of the descriptive stats.

  Both arities delegate to `tech.v3.dataset.metamorph-api/brief`, with or
  without `options`."
  ([options]
   (tech.v3.dataset.metamorph-api/brief options))
  ([]
   (tech.v3.dataset.metamorph-api/brief)))
(defmacro build-pipelined-function
  "Macro wrapper: expands to a call of
  `tech.v3.dataset.metamorph-api/build-pipelined-function` with the forms `f`
  and `m` spliced in as its two arguments."
  [f m]
  `(tech.v3.dataset.metamorph-api/build-pipelined-function ~f ~m))
(defn categorical->number
  "Convert columns into a discrete, numeric representation.
  See `tech.v3.dataset.categorical/fit-categorical-map`.

  Every arity passes its arguments unchanged to
  `tech.v3.dataset.metamorph-api/categorical->number`."
  ([filter-fn-or-ds]
   (tech.v3.dataset.metamorph-api/categorical->number filter-fn-or-ds))
  ([filter-fn-or-ds table-args]
   (tech.v3.dataset.metamorph-api/categorical->number filter-fn-or-ds table-args))
  ([filter-fn-or-ds table-args result-datatype]
   (tech.v3.dataset.metamorph-api/categorical->number
    filter-fn-or-ds table-args result-datatype)))
(defn categorical->one-hot
  "Convert string columns to numeric columns.
  See `tech.v3.dataset.categorical/fit-one-hot`.

  Every arity passes its arguments unchanged to
  `tech.v3.dataset.metamorph-api/categorical->one-hot`."
  ([filter-fn-or-ds]
   (tech.v3.dataset.metamorph-api/categorical->one-hot filter-fn-or-ds))
  ([filter-fn-or-ds table-args]
   (tech.v3.dataset.metamorph-api/categorical->one-hot filter-fn-or-ds table-args))
  ([filter-fn-or-ds table-args result-datatype]
   (tech.v3.dataset.metamorph-api/categorical->one-hot
    filter-fn-or-ds table-args result-datatype)))
(defn column
  "Forward `colname` to `tech.v3.dataset.metamorph-api/column` and return
  the result of that call."
  [colname]
  (tech.v3.dataset.metamorph-api/column colname))
(defn column->dataset
  "Turn a column into a sequence of maps via `transform-fn`, then return the
  dataset built from that sequence of maps.

  Delegates to `tech.v3.dataset.metamorph-api/column->dataset`; `options` is
  passed along only by the 3-argument arity."
  ([colname transform-fn options]
   (tech.v3.dataset.metamorph-api/column->dataset colname transform-fn options))
  ([colname transform-fn]
   (tech.v3.dataset.metamorph-api/column->dataset colname transform-fn)))
(defn column-cast
  "Cast a column to a new datatype. The cast is never lazy. When the old and
  new datatypes match and no cast-fn is supplied, `dtype/clone` is called on
  the column.

  * `colname` - a scalar, or a tuple of `[src-col dst-col]`.
  * `datatype` - a datatype enumeration, or a tuple of `[datatype cast-fn]`
    where cast-fn may return either a new value, `:tech.v3.dataset/missing`,
    or `:tech.v3.dataset/parse-failure`.

  Exceptions are propagated to the caller. The new column has at least the
  existing missing set (if no attempt returns `:missing` or `:cast-failure`).
  `:cast-failure` means the value gets added to metadata key `:unparsed-data`
  and the index gets added to `:unparsed-indexes`.

  If the existing datatype is string, then
  `tech.v3.datatype.column/parse-column` is called.

  * Casts between numeric datatypes need no cast-fn but one may be provided.
  * Casts to string need no cast-fn but one may be provided.
  * Casts from string to anything will call
    `tech.v3.dataset.column/parse-column`.

  Delegates to `tech.v3.dataset.metamorph-api/column-cast`."
  [colname datatype]
  (tech.v3.dataset.metamorph-api/column-cast colname datatype))
(defn column-count
  "Zero-argument wrapper: calls `tech.v3.dataset.metamorph-api/column-count`
  and returns its result."
  []
  (tech.v3.dataset.metamorph-api/column-count))
(defn column-labeled-mapseq
  "Given a dataset, produce a sequence of maps in which several columns are
  all stored under a `:value` key while a `:label` key holds the column name.
  Handy for quickly building timeseries or scatterplot labeled graphs.
  Returns a lazy sequence, not a reader!

  See also `columnwise-concat`.

  Each map in the returned sequence looks like:

  ```clojure
  {... - columns not in colname-seq
   :value - value from one of the value columns
   :label - name of the column the value came from
  }
  ```

  Delegates to `tech.v3.dataset.metamorph-api/column-labeled-mapseq`."
  [value-colname-seq]
  (tech.v3.dataset.metamorph-api/column-labeled-mapseq value-colname-seq))
(defn column-map
  "Produce a new (or updated) column as the result of mapping a fn over
  columns. This function is never lazy - all results are immediately
  calculated.

  * `dataset` - dataset (not an explicit argument of this wrapper's arities).
  * `result-colname` - Name of new (or existing) column.
  * `map-fn` - function to map over columns. Same rules as
    `tech.v3.datatype/emap`.
  * `res-dtype-or-opts` - If not given result is scanned to infer missing and
    datatype. If using an option map, options are described below.
  * `filter-fn-or-ds` - A dataset, a sequence of columns, or a
    `tech.v3.datasets/column-filters` column filter function. Defaults to all
    the columns of the existing dataset.

  Returns a new dataset with a new or updated column.

  Options:

  * `:datatype` - Set the datatype of the result column. If not given result
    is scanned to infer result datatype and missing set.
  * `:missing-fn` - if given, columns are first passed to missing-fn as a
    sequence and this dictates the missing set. Else the missing set is found
    by scanning the results during the inference process. See
    `tech.v3.dataset.column/union-missing-sets` and
    `tech.v3.dataset.column/intersect-missing-sets` for example functions to
    pass in here.

  The examples below are written against the `ds/` dataset API, where the
  dataset is passed explicitly as the first argument.

  Examples:

  ```clojure
  ;;From the tests --

  (let [testds (ds/->dataset [{:a 1.0 :b 2.0} {:a 3.0 :b 5.0} {:a 4.0 :b nil}])]
    ;;result scanned for both datatype and missing set
    (is (= (vec [3.0 6.0 nil])
           (:b2 (ds/column-map testds :b2 #(when % (inc %)) [:b]))))
    ;;result scanned for missing set only. Result used in-place.
    (is (= (vec [3.0 6.0 nil])
           (:b2 (ds/column-map testds :b2 #(when % (inc %))
                               {:datatype :float64} [:b]))))
    ;;Nothing scanned at all.
    (is (= (vec [3.0 6.0 nil])
           (:b2 (ds/column-map testds :b2 #(inc %)
                               {:datatype :float64
                                :missing-fn ds-col/union-missing-sets} [:b]))))
    ;;Missing set scanning causes NPE at inc.
    (is (thrown? Throwable
                 (ds/column-map testds :b2 #(inc %)
                                {:datatype :float64}
                                [:b]))))

  ;;Ad-hoc repl --

  user> (require '[tech.v3.dataset :as ds])
  nil
  user> (def ds (ds/->dataset \"test/data/stocks.csv\"))
  #'user/ds
  user> (ds/head ds)
  test/data/stocks.csv [5 3]:

  | symbol | date | price |
  |--------|------------|-------|
  | MSFT | 2000-01-01 | 39.81 |
  | MSFT | 2000-02-01 | 36.35 |
  | MSFT | 2000-03-01 | 43.22 |
  | MSFT | 2000-04-01 | 28.37 |
  | MSFT | 2000-05-01 | 25.45 |
  user> (-> (ds/column-map ds \"price^2\" #(* % %) [\"price\"])
            (ds/head))
  test/data/stocks.csv [5 4]:

  | symbol | date | price | price^2 |
  |--------|------------|-------|-----------|
  | MSFT | 2000-01-01 | 39.81 | 1584.8361 |
  | MSFT | 2000-02-01 | 36.35 | 1321.3225 |
  | MSFT | 2000-03-01 | 43.22 | 1867.9684 |
  | MSFT | 2000-04-01 | 28.37 | 804.8569 |
  | MSFT | 2000-05-01 | 25.45 | 647.7025 |

  user> (def ds1 (ds/->dataset [{:a 1} {:b 2.0} {:a 2 :b 3.0}]))
  #'user/ds1
  user> ds1
  _unnamed [3 2]:

  | :b | :a |
  |----:|---:|
  | | 1 |
  | 2.0 | |
  | 3.0 | 2 |
  user> (ds/column-map ds1 :c (fn [a b]
                                (when (and a b)
                                  (+ (double a) (double b))))
                       [:a :b])
  _unnamed [3 3]:

  | :b | :a | :c |
  |----:|---:|----:|
  | | 1 | |
  | 2.0 | | |
  | 3.0 | 2 | 5.0 |
  user> (ds/missing (*1 :c))
  {0,1}
  ```

  Every arity forwards its arguments unchanged to
  `tech.v3.dataset.metamorph-api/column-map`."
  ([result-colname map-fn res-dtype-or-opts filter-fn-or-ds]
   (tech.v3.dataset.metamorph-api/column-map
    result-colname map-fn res-dtype-or-opts filter-fn-or-ds))
  ([result-colname map-fn filter-fn-or-ds]
   (tech.v3.dataset.metamorph-api/column-map result-colname map-fn filter-fn-or-ds))
  ([result-colname map-fn]
   (tech.v3.dataset.metamorph-api/column-map result-colname map-fn)))
(defn column-names
  "Column names as an in-order sequence.

  Delegates to `tech.v3.dataset.metamorph-api/column-names`."
  []
  (tech.v3.dataset.metamorph-api/column-names))
(defn column-values->categorical
  "Given a column encoded via either string->number or one-hot, reverse map
  to a sequence of the original string column values. For one-hot mappings,
  `src-column` must be the original column name from before the one-hot map.

  Delegates to `tech.v3.dataset.metamorph-api/column-values->categorical`."
  [src-column]
  (tech.v3.dataset.metamorph-api/column-values->categorical src-column))
(defn columns
  "Sequence of all columns in the dataset.

  Delegates to `tech.v3.dataset.metamorph-api/columns`."
  []
  (tech.v3.dataset.metamorph-api/columns))
(defn columns-with-missing-seq
  "Return a sequence of:

  ```clojure
  {:column-name column-name
   :missing-count missing-count
  }
  ```

  or nil if no columns are missing data.

  Delegates to `tech.v3.dataset.metamorph-api/columns-with-missing-seq`."
  []
  (tech.v3.dataset.metamorph-api/columns-with-missing-seq))
(defn columnwise-concat
  "Given a dataset and a list of columns, build a new dataset in which the
  listed columns are concatenated into one new column, alongside a `:column`
  column recording which column each value originally came from. Columns not
  mentioned in the list are duplicated.

  Example (written against the `ds/` dataset API, where the dataset is
  passed explicitly):

  ```clojure
  user> (-> [{:a 1 :b 2 :c 3 :d 1} {:a 4 :b 5 :c 6 :d 2}]
            (ds/->dataset)
            (ds/columnwise-concat [:c :a :b]))
  null [6 3]:

  | :column | :value | :d |
  |---------+--------+----|
  | :c | 3 | 1 |
  | :c | 6 | 2 |
  | :a | 1 | 1 |
  | :a | 4 | 2 |
  | :b | 2 | 1 |
  | :b | 5 | 2 |
  ```

  Options:

  * `value-column-name` - defaults to `:value`
  * `colname-column-name` - defaults to `:column`

  Both arities delegate to
  `tech.v3.dataset.metamorph-api/columnwise-concat`."
  ([colnames options]
   (tech.v3.dataset.metamorph-api/columnwise-concat colnames options))
  ([colnames]
   (tech.v3.dataset.metamorph-api/columnwise-concat colnames)))
(defn concat
  "Concatenate datasets in place using a copying-concatenation.
  See also `concat-inplace`, which may be more efficient for your use case
  when you have a small number (like less than 3) of datasets.

  The variadic arity applies `tech.v3.dataset.metamorph-api/concat` to
  `args`; the zero-argument arity calls it with no arguments."
  ([& args]
   (apply tech.v3.dataset.metamorph-api/concat args))
  ([]
   (tech.v3.dataset.metamorph-api/concat)))
(defn concat-copying
  "Concatenate datasets into a new dataset, copying the data. Missing values
  are respected. All datasets must have the same columns. Result column
  datatypes will be a widening cast of the datatypes.

  The variadic arity applies `tech.v3.dataset.metamorph-api/concat-copying`
  to `args`; the zero-argument arity calls it with no arguments."
  ([& args]
   (apply tech.v3.dataset.metamorph-api/concat-copying args))
  ([]
   (tech.v3.dataset.metamorph-api/concat-copying)))
(defn concat-inplace
  "Concatenate datasets in place. Missing values are respected. All datasets
  must have the same columns. Result column datatypes will be a widening
  cast of the datatypes.

  The variadic arity applies `tech.v3.dataset.metamorph-api/concat-inplace`
  to `args`; the zero-argument arity calls it with no arguments."
  ([& args]
   (apply tech.v3.dataset.metamorph-api/concat-inplace args))
  ([]
   (tech.v3.dataset.metamorph-api/concat-inplace)))
(defn data->dataset
  "Convert a data-ized dataset, created via `dataset->data`, back into a full
  dataset.

  Delegates to `tech.v3.dataset.metamorph-api/data->dataset`."
  []
  (tech.v3.dataset.metamorph-api/data->dataset))
(defn dataset->categorical-xforms
  "Given a dataset, return a map of column-name->xform information.

  Delegates to `tech.v3.dataset.metamorph-api/dataset->categorical-xforms`."
  []
  (tech.v3.dataset.metamorph-api/dataset->categorical-xforms))
(defn dataset->data
  "Convert a dataset to a pure clojure datastructure. The result is a map
  with two keys: `{:metadata :columns}`.

  `:columns` is a vector of column definitions appropriate for passing
  directly back into `new-dataset`. A column definition in this case is a
  map of `{:name :missing :data :metadata}`.

  Delegates to `tech.v3.dataset.metamorph-api/dataset->data`."
  []
  (tech.v3.dataset.metamorph-api/dataset->data))
(defn dataset-name
  "Zero-argument wrapper: calls `tech.v3.dataset.metamorph-api/dataset-name`
  and returns its result."
  []
  (tech.v3.dataset.metamorph-api/dataset-name))
(defn dataset?
  "Zero-argument wrapper: calls `tech.v3.dataset.metamorph-api/dataset?`
  and returns its result."
  []
  (tech.v3.dataset.metamorph-api/dataset?))
(defn descriptive-stats
  "Get descriptive statistics across the columns of the dataset.

  Options:

  * `:stat-names` - defaults to
    `(remove #{:values :num-distinct-values} (all-descriptive-stats-names))`
  * `:n-categorical-values` - Number of categorical values to report in the
    'values' field. Defaults to 21.

  Both arities delegate to
  `tech.v3.dataset.metamorph-api/descriptive-stats`, with or without
  `options`."
  ([]
   (tech.v3.dataset.metamorph-api/descriptive-stats))
  ([options]
   (tech.v3.dataset.metamorph-api/descriptive-stats options)))
(defn drop-columns
  "Same as `remove-columns`: remove the columns indexed by a column name seq
  or by a column filter function.

  For example (dataset API form, with the dataset passed explicitly):

  ```clojure
  (drop-columns DS [:A :B])
  (drop-columns DS cf/categorical)
  ```

  Delegates to `tech.v3.dataset.metamorph-api/drop-columns`."
  [colname-seq-or-fn]
  (tech.v3.dataset.metamorph-api/drop-columns colname-seq-or-fn))
(defn drop-missing
  "Remove missing entries by simply selecting out the missing indexes.

  Both arities delegate to `tech.v3.dataset.metamorph-api/drop-missing`,
  with or without `colname`."
  ([]
   (tech.v3.dataset.metamorph-api/drop-missing))
  ([colname]
   (tech.v3.dataset.metamorph-api/drop-missing colname)))
(defn drop-rows
  "Drop the rows at `row-indexes` from a dataset or column.

  Delegates to `tech.v3.dataset.metamorph-api/drop-rows`."
  [row-indexes]
  (tech.v3.dataset.metamorph-api/drop-rows row-indexes))
(defn empty-dataset
  "Zero-argument wrapper: calls `tech.v3.dataset.metamorph-api/empty-dataset`
  and returns its result."
  []
  (tech.v3.dataset.metamorph-api/empty-dataset))
(defn ensure-array-backed
  "Ensure the column data in the dataset is stored in pure java arrays. This
  is sometimes necessary for interop with other libraries, and the operation
  forces any lazy computations to complete. It also clears the missing set
  of each column and writes the missing values to the new arrays.

  Columns that are already array backed and that have no missing values are
  not changed and are returned as-is.

  The postcondition is that `dtype/->array` will return a java array in the
  appropriate datatype for each column.

  Options:

  * `:unpack?` - unpack packed datetime types. Defaults to true

  Both arities delegate to
  `tech.v3.dataset.metamorph-api/ensure-array-backed`, with or without
  `options`."
  ([options]
   (tech.v3.dataset.metamorph-api/ensure-array-backed options))
  ([]
   (tech.v3.dataset.metamorph-api/ensure-array-backed)))
(defn feature-ecount
  "Number of feature columns, i.e. columns that are not inference targets.

  Delegates to `tech.v3.dataset.metamorph-api/feature-ecount`."
  []
  (tech.v3.dataset.metamorph-api/feature-ecount))
(defn filter
  "dataset->dataset transformation. `predicate` is passed a map of
  colname->column-value.

  Delegates to `tech.v3.dataset.metamorph-api/filter`."
  [predicate]
  (tech.v3.dataset.metamorph-api/filter predicate))
(defn filter-column
  "Filter a given column by a predicate. The predicate is passed column
  values. If `predicate` is *not* an instance of Ifn it is treated as a value
  and will be used as if the predicate is `#(= value %)`.

  The upstream docstring states that the 2-arity form of the function reads
  the column as a boolean reader, so for instance numeric 0 values are false
  in that case, as are Double/NaN and Float/NaN; objects are only false if
  nil?. (Presumably that is the predicate-less arity of this wrapper -
  confirm against the api namespace.)

  Returns a dataset.

  Both arities delegate to `tech.v3.dataset.metamorph-api/filter-column`."
  ([colname predicate]
   (tech.v3.dataset.metamorph-api/filter-column colname predicate))
  ([colname]
   (tech.v3.dataset.metamorph-api/filter-column colname)))
(defn filter-dataset
  "Filter the columns of the dataset, returning a new dataset. This pathway
  is designed to work with the `tech.v3.dataset.column-filters` namespace.

  * If `filter-fn-or-ds` is a dataset, it is returned.
  * If `filter-fn-or-ds` is sequential, then select-columns is called.
  * If `filter-fn-or-ds` is `:all`, all columns are returned.
  * If `filter-fn-or-ds` is an instance of IFn, the dataset is passed into it.

  Delegates to `tech.v3.dataset.metamorph-api/filter-dataset`."
  [filter-fn-or-ds]
  (tech.v3.dataset.metamorph-api/filter-dataset filter-fn-or-ds))
(defn group-by
  "Produce a map of key-fn-value->dataset. `key-fn` receives a map of
  colname->column-value representing one row of the dataset. Each dataset in
  the resulting map contains all and only the rows that produce the same
  key-fn-value.

  Options - options are passed into dtype arggroup:

  * `:group-by-finalizer` - when provided this is run on each dataset
    immediately after the rows are selected. This can be used to immediately
    perform a reduction on each new dataset which is faster than doing it in
    a separate run.

  Both arities delegate to `tech.v3.dataset.metamorph-api/group-by`."
  ([key-fn options]
   (tech.v3.dataset.metamorph-api/group-by key-fn options))
  ([key-fn]
   (tech.v3.dataset.metamorph-api/group-by key-fn)))
(defn group-by->indexes
  "(Non-lazy) - Group a dataset and return a map of key-fn-value->indexes,
  where indexes is an in-order contiguous group of indexes.

  Both arities delegate to
  `tech.v3.dataset.metamorph-api/group-by->indexes`."
  ([key-fn options]
   (tech.v3.dataset.metamorph-api/group-by->indexes key-fn options))
  ([key-fn]
   (tech.v3.dataset.metamorph-api/group-by->indexes key-fn)))
(defn group-by-column
  "Return a map of column-value->dataset. Each dataset in the resulting map
  contains all and only the rows with the same value in the column.

  * `:group-by-finalizer` - when provided this is run on each dataset
    immediately after the rows are selected. This can be used to immediately
    perform a reduction on each new dataset which is faster than doing it in
    a separate run.

  Both arities delegate to `tech.v3.dataset.metamorph-api/group-by-column`."
  ([colname options]
   (tech.v3.dataset.metamorph-api/group-by-column colname options))
  ([colname]
   (tech.v3.dataset.metamorph-api/group-by-column colname)))
(defn group-by-column->indexes
  "(Non-lazy) - Group a dataset by a column and return a map of
  column-val->indexes, where indexes is an in-order contiguous group of
  indexes.

  Options are passed into dtype's arggroup method.

  Both arities delegate to
  `tech.v3.dataset.metamorph-api/group-by-column->indexes`."
  ([colname options]
   (tech.v3.dataset.metamorph-api/group-by-column->indexes colname options))
  ([colname]
   (tech.v3.dataset.metamorph-api/group-by-column->indexes colname)))
(defn group-by-column-consumer
  "Forward `cname` to
  `tech.v3.dataset.metamorph-api/group-by-column-consumer` and return the
  result of that call."
  [cname]
  (tech.v3.dataset.metamorph-api/group-by-column-consumer cname))
(defn has-column?
  "Forward `column-name` to `tech.v3.dataset.metamorph-api/has-column?` and
  return the result of that call."
  [column-name]
  (tech.v3.dataset.metamorph-api/has-column? column-name))
(defn head
  "Get the first n rows of a dataset. Equivalent to
  `(select-rows ds (range n))`. Arguments are reversed, however, so this can
  be used in ->> operators.

  Both arities delegate to `tech.v3.dataset.metamorph-api/head`, with or
  without `n`."
  ([n]
   (tech.v3.dataset.metamorph-api/head n))
  ([]
   (tech.v3.dataset.metamorph-api/head)))
(defn induction
  "Given a dataset and a function from dataset->row, produce a new dataset.
  The produced row is merged with the current row and then added to the
  dataset.

  Options are the same as the options used for [[->dataset]], so that the
  user can control the parsing of the return values of `induct-fn`.

  A new dataset is returned.

  Example (written against the `ds/` dataset API, where the dataset is
  passed explicitly):

  ```clojure
  user> (def ds (ds/->dataset {:a [0 1 2 3] :b [1 2 3 4]}))
  #'user/ds
  user> ds
  _unnamed [4 2]:

  | :a | :b |
  |---:|---:|
  | 0 | 1 |
  | 1 | 2 |
  | 2 | 3 |
  | 3 | 4 |

  user> (ds/induction ds (fn [ds]
                           {:sum-of-previous-row (dfn/sum (ds/rowvec-at ds -1))
                            :sum-a (dfn/sum (ds :a))
                            :sum-b (dfn/sum (ds :b))}))
  _unnamed [4 5]:

  | :a | :b | :sum-b | :sum-a | :sum-of-previous-row |
  |---:|---:|-------:|-------:|---------------------:|
  | 0 | 1 | 0.0 | 0.0 | 0.0 |
  | 1 | 2 | 1.0 | 0.0 | 1.0 |
  | 2 | 3 | 3.0 | 1.0 | 5.0 |
  | 3 | 4 | 6.0 | 3.0 | 14.0 |
  ```

  Applies `tech.v3.dataset.metamorph-api/induction` to `induct-fn` and any
  trailing `args`."
  [induct-fn & args]
  (apply tech.v3.dataset.metamorph-api/induction induct-fn args))
(defn inference-column?
  "Zero-argument wrapper: calls
  `tech.v3.dataset.metamorph-api/inference-column?` and returns its result."
  []
  (tech.v3.dataset.metamorph-api/inference-column?))
(defn inference-target-column-names
  "Names of the columns that are inference targets.

  Delegates to
  `tech.v3.dataset.metamorph-api/inference-target-column-names`."
  []
  (tech.v3.dataset.metamorph-api/inference-target-column-names))
(defn inference-target-ds
  "Given a dataset, return the reverse-mapped inference target columns, or
  nil when there are no inference targets.

  Delegates to `tech.v3.dataset.metamorph-api/inference-target-ds`."
  []
  (tech.v3.dataset.metamorph-api/inference-target-ds))
(defn inference-target-label-inverse-map
  "Given options generated during ETL operations and annotated with a
  `:label-columns` sequence containing 1 label column, generate a reverse
  map that maps from a dataset value back to the label that generated that
  value.

  Applies
  `tech.v3.dataset.metamorph-api/inference-target-label-inverse-map` to
  `args`."
  [& args]
  (apply tech.v3.dataset.metamorph-api/inference-target-label-inverse-map args))
(defn inference-target-label-map
  "Variadic wrapper: applies
  `tech.v3.dataset.metamorph-api/inference-target-label-map` to `args` and
  returns its result."
  [& args]
  (apply tech.v3.dataset.metamorph-api/inference-target-label-map args))
(defn k-fold-datasets
  "Given 1 dataset, prepare K datasets using the k-fold algorithm.
  Randomizing the dataset defaults to true, which will realize the entire
  dataset, so use with care if you have large datasets.

  Returns a sequence of `{:test-ds :train-ds}`.

  Options:

  * `:randomize-dataset?` - When true, shuffle the dataset. In that case
    'seed' may be provided. Defaults to true.
  * `:seed` - when `:randomize-dataset?` is true then this can either be an
    implementation of java.util.Random or an integer seed which will be used
    to construct java.util.Random.

  Both arities delegate to `tech.v3.dataset.metamorph-api/k-fold-datasets`."
  ([k options]
   (tech.v3.dataset.metamorph-api/k-fold-datasets k options))
  ([k]
   (tech.v3.dataset.metamorph-api/k-fold-datasets k)))
(defn labels
  "Return the labels. The labels sequence is the reverse mapped inference
  column. This returns a single column of data or errors out.

  Delegates to `tech.v3.dataset.metamorph-api/labels`."
  []
  (tech.v3.dataset.metamorph-api/labels))
(defn mapseq-reader
  "Return a reader that produces a map of column-name->column-value upon
  read.

  Both arities delegate to `tech.v3.dataset.metamorph-api/mapseq-reader`,
  with or without `options`."
  ([options]
   (tech.v3.dataset.metamorph-api/mapseq-reader options))
  ([]
   (tech.v3.dataset.metamorph-api/mapseq-reader)))
(defn min-n-by-column
  "Find the minimum N entries (unsorted) by column. The resulting data is
  indexed in original order; if you want a sorted order then sort the
  result.

  See options to [[sort-by-column]].

  Example (written against the `ds/` dataset API, where the dataset is
  passed explicitly):

  ```clojure
  user> (ds/min-n-by-column ds \"price\" 10 nil nil)
  test/data/stocks.csv [10 3]:

  | symbol | date | price |
  |--------|------------|------:|
  | AMZN | 2001-09-01 | 5.97 |
  | AMZN | 2001-10-01 | 6.98 |
  | AAPL | 2000-12-01 | 7.44 |
  | AAPL | 2002-08-01 | 7.38 |
  | AAPL | 2002-09-01 | 7.25 |
  | AAPL | 2002-12-01 | 7.16 |
  | AAPL | 2003-01-01 | 7.18 |
  | AAPL | 2003-02-01 | 7.51 |
  | AAPL | 2003-03-01 | 7.07 |
  | AAPL | 2003-04-01 | 7.11 |
  user> (ds/min-n-by-column ds \"price\" 10 > nil)
  test/data/stocks.csv [10 3]:

  | symbol | date | price |
  |--------|------------|-------:|
  | GOOG | 2007-09-01 | 567.27 |
  | GOOG | 2007-10-01 | 707.00 |
  | GOOG | 2007-11-01 | 693.00 |
  | GOOG | 2007-12-01 | 691.48 |
  | GOOG | 2008-01-01 | 564.30 |
  | GOOG | 2008-04-01 | 574.29 |
  | GOOG | 2008-05-01 | 585.80 |
  | GOOG | 2009-11-01 | 583.00 |
  | GOOG | 2009-12-01 | 619.98 |
  | GOOG | 2010-03-01 | 560.19 |
  ```

  Every arity forwards its arguments unchanged to
  `tech.v3.dataset.metamorph-api/min-n-by-column`."
  ([cname N comparator options]
   (tech.v3.dataset.metamorph-api/min-n-by-column cname N comparator options))
  ([cname N comparator]
   (tech.v3.dataset.metamorph-api/min-n-by-column cname N comparator))
  ([cname N]
   (tech.v3.dataset.metamorph-api/min-n-by-column cname N)))
(defn missing
  "Given a dataset or a column, return the missing set as a roaring bitmap.

  Delegates to `tech.v3.dataset.metamorph-api/missing`."
  []
  (tech.v3.dataset.metamorph-api/missing))
(defn model-type
  "Check the label column after dataset processing.

  Return either

  * `:regression`
  * `:classification`

  Applies `tech.v3.dataset.metamorph-api/model-type` to `args`."
  [& args]
  (apply tech.v3.dataset.metamorph-api/model-type args))
(defn new-column
  "Create a new column. Data will be scanned for missing values unless the
  full 4-argument pathway is used (upstream wording; the widest arity of
  this wrapper takes `data`, `metadata` and `missing`).

  Every arity forwards its arguments unchanged to
  `tech.v3.dataset.metamorph-api/new-column`."
  ([data]
   (tech.v3.dataset.metamorph-api/new-column data))
  ([data metadata]
   (tech.v3.dataset.metamorph-api/new-column data metadata))
  ([data metadata missing]
   (tech.v3.dataset.metamorph-api/new-column data metadata missing))
  ([]
   (tech.v3.dataset.metamorph-api/new-column)))
(defn new-dataset
  "Create a new dataset from a sequence of columns. Data will be converted
  into columns using `ds-col-proto/ensure-column-seq`. If the column seq is
  simply a collection of vectors, for instance, columns will be named
  ordinally.

  options map -

  * `:dataset-name` - Name of the dataset. Defaults to \"_unnamed\".
  * `:key-fn` - Key function used on all column names before insertion into
    dataset.

  The return value fulfills the dataset protocols.

  Every arity forwards its arguments unchanged to
  `tech.v3.dataset.metamorph-api/new-dataset`."
  ([ds-metadata column-seq]
   (tech.v3.dataset.metamorph-api/new-dataset ds-metadata column-seq))
  ([column-seq]
   (tech.v3.dataset.metamorph-api/new-dataset column-seq))
  ([]
   (tech.v3.dataset.metamorph-api/new-dataset)))
(defn num-inference-classes
  "Given a dataset and correctly built options from pipeline operations,
  return the number of classes used for the label. Error if the dataset is
  not a classification dataset.

  Delegates to `tech.v3.dataset.metamorph-api/num-inference-classes`."
  []
  (tech.v3.dataset.metamorph-api/num-inference-classes))
(defn order-column-names
  "Order a sequence of column names so that they match the order in the
  original dataset. Missing columns are placed last.

  Delegates to `tech.v3.dataset.metamorph-api/order-column-names`."
  [colname-seq]
  (tech.v3.dataset.metamorph-api/order-column-names colname-seq))
(defn pmap-ds
  "Parallelize mapping a function from dataset->dataset across a single
  dataset. Results are coalesced back into a single dataset. The original
  dataset is simply sliced into n-core results and map-fn is called n-core
  times. `ds-map-fn` must be a function from dataset->dataset although it
  may return nil.

  Options:

  * `:max-batch-size` - this is a default for
    `tech.v3.parallel.for/indexed-map-reduce`. You can control how many rows
    are processed in a given batch - the default is 64000. If your mapping
    pathway produces a large expansion in the size of the dataset then it
    may be good to reduce the max batch size and use `:as-seq` to produce a
    sequence of datasets.
  * `:result-type`
    - `:as-seq` - Return a sequence of datasets, one for each batch.
    - `:as-ds` - Return a single dataset with all results in memory
      (default option).

  Both arities delegate to `tech.v3.dataset.metamorph-api/pmap-ds`."
  ([ds-map-fn options]
   (tech.v3.dataset.metamorph-api/pmap-ds ds-map-fn options))
  ([ds-map-fn]
   (tech.v3.dataset.metamorph-api/pmap-ds ds-map-fn)))
(defn print-all
  "Helper equivalent to `(tech.v3.dataset.print/print-range ... :all)`.

  Delegates to `tech.v3.dataset.metamorph-api/print-all`."
  []
  (tech.v3.dataset.metamorph-api/print-all))
(defn probability-distributions->label-column
  "Given a dataset whose column names describe labels and whose rows describe
  a probability distribution, create a label column by taking the max value
  in each row and assigning the matching column to that row.

  Both arities delegate to
  `tech.v3.dataset.metamorph-api/probability-distributions->label-column`;
  `label-column-datatype` is passed along only by the 2-argument arity."
  ([dst-colname label-column-datatype]
   (tech.v3.dataset.metamorph-api/probability-distributions->label-column
    dst-colname label-column-datatype))
  ([dst-colname]
   (tech.v3.dataset.metamorph-api/probability-distributions->label-column
    dst-colname)))
(defn rand-nth
  "Return a random row from the dataset, in map format.

  Delegates to `tech.v3.dataset.metamorph-api/rand-nth`."
  []
  (tech.v3.dataset.metamorph-api/rand-nth))
(defn remove-column
  "Same as:

  ```clojure
  (dissoc dataset col-name)
  ```

  Delegates to `tech.v3.dataset.metamorph-api/remove-column`."
  [col-name]
  (tech.v3.dataset.metamorph-api/remove-column col-name))
(defn remove-columns
  "Remove the columns indexed by a column name seq or by a column filter
  function.

  For example (dataset API form, with the dataset passed explicitly):

  ```clojure
  (remove-columns DS [:A :B])
  (remove-columns DS cf/categorical)
  ```

  Delegates to `tech.v3.dataset.metamorph-api/remove-columns`."
  [colname-seq-or-fn]
  (tech.v3.dataset.metamorph-api/remove-columns colname-seq-or-fn))
(defn remove-rows
  "Same as `drop-rows`.

  Delegates to `tech.v3.dataset.metamorph-api/remove-rows`."
  [row-indexes]
  (tech.v3.dataset.metamorph-api/remove-rows row-indexes))
(defn rename-columns
  "Rename columns using a map or a vector of column names. Columns are not
  reordered; the rename is in-place for maps and positional for vectors.

  Delegates to `tech.v3.dataset.metamorph-api/rename-columns`."
  [colnames]
  (tech.v3.dataset.metamorph-api/rename-columns colnames))
(defn replace-missing
  "Replace missing values in some columns with a given strategy.

  The columns selector may be:

  - seq of any legal column names
  - or a column filter function, such as `numeric` and `categorical`

  Strategies may be:

  - `:down` - take value from previous non-missing row if possible else use
    provided value.
  - `:up` - take value from next non-missing row if possible else use
    provided value.
  - `:downup` - take value from previous if possible else use next.
  - `:updown` - take value from next if possible else use previous.
  - `:nearest` - Use nearest of next or previous values. `:mid` is an alias
    for `:nearest`.
  - `:midpoint` - Use midpoint of averaged values between previous and next
    nonmissing rows.
  - `:abb` - Impute missing with approximate bayesian bootstrap. See
    [r's ABB](https://search.r-project.org/CRAN/refmans/LaplacesDemon/html/ABB.html).
  - `:lerp` - Linearly interpolate values between previous and next
    nonmissing rows.
  - `:value` - Value will be provided - see below.

  A value may be provided, which will then be used. The value may be a
  function, in which case it is called on the column with missing values
  elided and its return is used as the filler.

  Every arity forwards its arguments unchanged to
  `tech.v3.dataset.metamorph-api/replace-missing`."
  ([]
   (tech.v3.dataset.metamorph-api/replace-missing))
  ([strategy]
   (tech.v3.dataset.metamorph-api/replace-missing strategy))
  ([columns-selector strategy]
   (tech.v3.dataset.metamorph-api/replace-missing columns-selector strategy))
  ([columns-selector strategy value]
   (tech.v3.dataset.metamorph-api/replace-missing columns-selector strategy value)))
(defn replace-missing-value
  "Both arities forward their arguments unchanged to
  `tech.v3.dataset.metamorph-api/replace-missing-value`; `filter-fn-or-ds`
  is passed along only by the 2-argument arity."
  ([filter-fn-or-ds scalar-value]
   (tech.v3.dataset.metamorph-api/replace-missing-value filter-fn-or-ds scalar-value))
  ([scalar-value]
   (tech.v3.dataset.metamorph-api/replace-missing-value scalar-value)))
(defn reverse-rows
  "Reverse the rows in the dataset or column.

  Delegates to `tech.v3.dataset.metamorph-api/reverse-rows`."
  []
  (tech.v3.dataset.metamorph-api/reverse-rows))
(defn row-at
  "Get the row at an individual index. Negative indexes address the dataset
  from the end.

  Example (written against the `ds/` dataset API, where the dataset is
  passed explicitly):

  ```clojure
  user> (ds/row-at stocks 1)
  {\"date\" #object[java.time.LocalDate 0x534cb03b \"2000-02-01\"],
   \"symbol\" \"MSFT\",
   \"price\" 36.35}
  user> (ds/row-at stocks -1)
  {\"date\" #object[java.time.LocalDate 0x6bf60ed5 \"2010-03-01\"],
   \"symbol\" \"AAPL\",
   \"price\" 223.02}
  ```

  Delegates to `tech.v3.dataset.metamorph-api/row-at`."
  [idx]
  (tech.v3.dataset.metamorph-api/row-at idx))
(defn row-count
  "Zero-argument wrapper: calls `tech.v3.dataset.metamorph-api/row-count`
  and returns its result."
  []
  (tech.v3.dataset.metamorph-api/row-count))
(defn row-map
  "Map a function across the rows of the dataset, producing a new dataset
  that is merged back into the original, potentially replacing existing
  columns. Options are passed into the [[->dataset]] function, so the
  resulting column types can be controlled with the usual dataset parsing
  options described there.

  Options:

  See options for [[pmap-ds]]. In particular, note that you can produce a
  sequence of datasets as opposed to a single large dataset.

  Speed demons should attempt both `{:copying? false}` and
  `{:copying? true}` in the options map, as that changes rather drastically
  how data is read from the datasets. If you are going to read all the data
  in the dataset, `{:copying? true}` will most likely be the faster of the
  two.

  Examples (written against the `ds/` dataset API, where the dataset is
  passed explicitly):

  ```clojure
  user> (def stocks (ds/->dataset \"test/data/stocks.csv\"))
  #'user/stocks
  user> (ds/head stocks)
  test/data/stocks.csv [5 3]:

  | symbol | date | price |
  |--------|------------|------:|
  | MSFT | 2000-01-01 | 39.81 |
  | MSFT | 2000-02-01 | 36.35 |
  | MSFT | 2000-03-01 | 43.22 |
  | MSFT | 2000-04-01 | 28.37 |
  | MSFT | 2000-05-01 | 25.45 |
  user> (ds/head (ds/row-map stocks (fn [row]
                                      {\"symbol\" (keyword (row \"symbol\"))
                                       :price2 (* (row \"price\")(row \"price\"))})))
  test/data/stocks.csv [5 4]:

  | symbol | date | price | :price2 |
  |--------|------------|------:|----------:|
  | :MSFT | 2000-01-01 | 39.81 | 1584.8361 |
  | :MSFT | 2000-02-01 | 36.35 | 1321.3225 |
  | :MSFT | 2000-03-01 | 43.22 | 1867.9684 |
  | :MSFT | 2000-04-01 | 28.37 | 804.8569 |
  | :MSFT | 2000-05-01 | 25.45 | 647.7025 |
  ```

  Both arities delegate to `tech.v3.dataset.metamorph-api/row-map`."
  ([map-fn options]
   (tech.v3.dataset.metamorph-api/row-map map-fn options))
  ([map-fn]
   (tech.v3.dataset.metamorph-api/row-map map-fn)))
(defn row-mapcat
"Map a function across the rows of the dataset. The function must produce a sequence of
maps and the original dataset rows will be duplicated and then merged into the result
of calling (->> (apply concat) (->>dataset options) on the result of `mapcat-fn`. Options
are the same as [[->dataset]].
The smaller the maps returned from mapcat-fn the better, perhaps consider using records.
In the case that a mapcat-fn result map has a key that overlaps a column name the
column will be replaced with the output of mapcat-fn. The returned map will have the
key `:_row-id` assoc'd onto it so for absolutely minimal gc usage include this
as a member variable in your map.
Options:
* See options for [[pmap-ds]]. Especially note `:max-batch-size` and `:result-type`.
In order to conserve memory it may be much more efficient to return a sequence of datasets
rather than one large dataset. If returning sequences of datasets perhaps consider
a transducing pathway across them or the [[tech.v3.dataset.reductions]] namespace.
Example:
```clojure
user> (def ds (ds/->dataset {:rid (range 10)
:data (repeatedly 10 #(rand-int 3))}))