/
string_table.clj
143 lines (133 loc) · 5.43 KB
/
string_table.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
(ns tech.ml.dataset.string-table
(:require [tech.v2.datatype :as dtype]
[tech.v2.datatype.protocols :as dtype-proto]
[tech.ml.dataset.dynamic-int-list :as int-list]
[tech.ml.dataset.parallel-unique :refer [parallel-unique]]
[tech.parallel.for :as parallel-for])
(:import [java.util List HashMap Map RandomAccess Iterator ArrayList]
[it.unimi.dsi.fastutil.ints IntArrayList IntList IntIterator]))
(defprotocol PStrTable
  "Implemented by string containers that can expose their lookup tables."
  (get-str-table [table]
    "Returns the lookup tables backing `table`."))
;; Forward declaration: StringTable's from-prototype calls make-string-table,
;; which is defined after the type.
(declare make-string-table)
;; A java.util.List of strings stored in deduplicated form.
;;   int->str : List mapping an integer index to its string.
;;   str->int : Map mapping a string back to its integer index.
;;   data     : IntList holding one index per element of the list.
;; Reading an element decodes data[idx] through int->str; writing an element
;; looks the string up in str->int, registering it in both tables when absent.
(deftype StringTable
[^List int->str
^Map str->int
^IntList data]
dtype-proto/PDatatype
;; The container always reports the :string datatype.
(get-datatype [this] :string)
dtype-proto/PClone
(clone [this]
;;We do not need to dedup any more; a java array is a more efficient
;;storage mechanism
(dtype/make-container :java-array :string this))
dtype-proto/PPrototype
;; Creates a new container with (apply * new-shape) elements. Non-string
;; datatypes get a plain :list container; for :string the call goes to
;; make-string-table, handing over the string at slot 0 as the missing value
;; together with this table's int->str and str->int (passed by reference, not
;; copied).
(from-prototype [this new-datatype new-shape]
(let [n-elems (long (apply * new-shape))]
(if-not (= new-datatype :string)
(dtype/make-container :list new-datatype n-elems)
(make-string-table n-elems (.get int->str 0) int->str str->int))))
tech.v2.datatype.Countable
;; Element count as a long, taken from the index list.
(lsize [this] (long (.size data)))
PStrTable
;; Exposes both lookup tables in a map keyed by :int->str and :str->int.
(get-str-table [this] {:int->str int->str
:str->int str->int})
List
(size [this] (.size data))
;; Append: delegates to the indexed add at position (.size data); returns true.
(add [this str-val]
(.add this (.size data) str-val)
true)
;; Indexed insert. Throws Exception when str-val is not a String. A string not
;; yet in str->int is assigned the next index (the current size of str->int)
;; and recorded in both tables; the resulting index is inserted into data at
;; idx. Unlike set, this path takes no lock.
(add [this idx str-val]
(when-not (instance? String str-val)
(throw (Exception. "Can only use strings")))
(let [item-idx (int (if-let [idx-val (.get str->int str-val)]
idx-val
(let [idx-val (.size str->int)]
(.put str->int str-val idx-val)
(.add int->str idx-val str-val)
idx-val)))]
(.add data idx item-idx)
true))
;; Decodes the index stored at idx through int->str.
(get [this idx] (.get int->str (.get data idx)))
;; Replaces the element at idx. The whole lookup/registration/write sequence
;; runs while holding the monitor of str->int. Throws Exception when str-val
;; is not a String. Returns the string decoded from the value that
;; (.set data ...) returned, which the code treats as the previous index.
(set [this idx str-val]
;;dtype/copy! calls set in parallel but will never call add
;;in parallel. This is unsafe really but add is called during parsing
;;a lot and it has a huge effect for some files.
(locking str->int
(when-not (instance? String str-val)
(throw (Exception. "Can only use strings")))
(let [item-idx (int (if-let [idx-val (.get str->int str-val)]
idx-val
(let [idx-val (.size str->int)]
(.put str->int str-val idx-val)
(.add int->str idx-val str-val)
idx-val)))
old-value (int (.set data idx item-idx))]
(.get int->str old-value))))
;; New StringTable over (.subList data start end); the lookup tables are the
;; same objects as this table's, not copies.
(subList [this start-offset end-offset]
(StringTable. int->str str->int (.subList data start-offset end-offset)))
;; Object array built from this list's (decoded) elements.
(toArray [this]
(object-array this))
RandomAccess
Iterable
;; Iterator that walks the index list and decodes each index through int->str
;; as it is requested.
(iterator [this]
(let [^IntIterator src-iter (.iterator data)]
(reify Iterator
(hasNext [iter] (.hasNext src-iter))
(next [iter] (.get int->str (.nextInt src-iter)))))))
(defn make-string-table
  "Creates a StringTable with `n-elems` entries.

  Arities:
    []                                      - zero elements, empty-string missing value.
    [n-elems]                               - empty-string missing value.
    [n-elems missing-val]                   - fresh lookup tables.
    [n-elems missing-val int->str str->int] - uses the supplied lookup tables.

  `missing-val` is coerced with `str` and registered at index 0 of the lookup
  tables. When the supplied tables already hold that value at index 0 (as
  happens when an existing StringTable hands over its own tables) they are left
  untouched: `(.add list 0 x)` on a java.util.List inserts and shifts, so
  re-seeding a populated table would move every existing string to a new index
  while the integer data referring to it stays the same.

  The backing index list is resized to `n-elems`; presumably the new slots are
  zero and therefore decode to the missing value - confirm against
  dynamic-int-list."
  (^List [n-elems missing-val ^List int->str ^HashMap str->int]
   (let [^IntList data (int-list/dynamic-int-list (long n-elems))
         missing-val (str missing-val)
         ;; True when slot 0 of both tables already describes missing-val.
         already-seeded? (and (not (.isEmpty int->str))
                              (= missing-val (.get int->str (int 0)))
                              (= 0 (.get str->int missing-val)))]
     ;; Tables whose slot 0 holds something else are still seeded exactly as
     ;; before (insert at 0), so existing callers see no change in that case.
     (when-not already-seeded?
       (.add int->str (int 0) missing-val)
       (.put str->int missing-val (int 0)))
     (.size data (int n-elems))
     (StringTable. int->str str->int data)))
  (^List [n-elems missing-val]
   (make-string-table n-elems missing-val (ArrayList.) (HashMap.)))
  (^List [n-elems]
   (make-string-table n-elems "" (ArrayList.) (HashMap.)))
  (^List []
   (make-string-table 0 "" (ArrayList.) (HashMap.))))
(defn string-table-from-strings
  "Builds a StringTable from `str-data`.

  When `str-data` can be converted with `dtype/->reader`, the unique strings
  are collected up front, the lookup tables are built once, and the index data
  is filled in parallel. The empty string always occupies index 0; every other
  unique string receives an index starting at 1. The backing array is a byte,
  short or int array depending on how the number of unique entries compares
  with Byte/MAX_VALUE and Short/MAX_VALUE.

  When no reader is available, the elements are added one at a time to an
  empty table created by `make-string-table`."
  [str-data]
  (if-let [str-reader (dtype/->reader str-data)]
    (let [unique-set (parallel-unique str-reader)
          ;; The empty string is registered explicitly at index 0 below.
          _ (.remove unique-set "")
          set-iter (.iterator unique-set)
          ;; +1 accounts for the empty string removed above.
          n-unique-elems (inc (.size unique-set))
          n-elems (dtype/ecount str-reader)
          str->int (HashMap. n-unique-elems)
          int->str (ArrayList. n-unique-elems)]
      (.put str->int "" (unchecked-int 0))
      (.add int->str (unchecked-int 0) "")
      ;; Start at 1: index 0 is already taken by the empty string. Starting at
      ;; 0 would map the first unique string to 0 as well and insert it in
      ;; front of the empty string, so empty entries would decode wrongly.
      (loop [continue? (.hasNext set-iter)
             idx (int 1)]
        (when continue?
          (let [str-entry (.next set-iter)
                idx (unchecked-int idx)]
            (.put str->int str-entry (unchecked-int idx))
            (.add int->str (unchecked-int idx) str-entry))
          (recur (.hasNext set-iter) (unchecked-inc idx))))
      ;; Pick the narrowest primitive array the index range allows.
      (cond
        (< n-unique-elems Byte/MAX_VALUE)
        (let [data (byte-array n-elems)]
          (parallel-for/parallel-for
           idx n-elems
           (aset data idx (unchecked-byte (.get str->int (str-reader idx)))))
          (StringTable. int->str str->int (int-list/make-from-container data)))
        (< n-unique-elems Short/MAX_VALUE)
        (let [data (short-array n-elems)]
          (parallel-for/parallel-for
           idx n-elems
           (aset data idx (unchecked-short (.get str->int (str-reader idx)))))
          (StringTable. int->str str->int (int-list/make-from-container data)))
        :else
        (let [data (int-array n-elems)]
          (parallel-for/parallel-for
           idx n-elems
           (aset data idx (unchecked-int (.get str->int (str-reader idx)))))
          (StringTable. int->str str->int (int-list/make-from-container data)))))
    (let [str-table (make-string-table 0)]
      (doseq [data str-data]
        (.add str-table data))
      str-table)))