1
+ /*
2
+ * sd.cpp - String Dictionary with prefix encoding
3
+ * Copyright (C) 2019 Anders Larsen <gislagard@gmail.com>
4
+ *
5
+ * backbit is free software: you can redistribute it and/or modify it
6
+ * under the terms of the GNU General Public License as published by the
7
+ * Free Software Foundation, either version 3 of the License, or
8
+ * (at your option) any later version.
9
+ *
10
+ * backbit is distributed in the hope that it will be useful, but
11
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13
+ * See the GNU General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU General Public License along
16
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
17
+ */
18
+
19
+ #include " sd.h"
20
+ #include < vector>
21
+ #include < string>
22
+ #include < fstream>
23
+
24
+ int32_t SD::add (const std::string& s)
25
+ {
26
+ // assert(s.length() < 32768);
27
+ // assert(s > previous);
28
+
29
+ // Time to add a new bucket? Or the very first bucket?
30
+ if ((count % BUCKET_SIZE) == 0 )
31
+ {
32
+ // Try to shrink the previous bucket, if any
33
+ if (count) buckets.back ().shrink_to_fit ();
34
+
35
+ // Store first dictionary string in bucket uncompressed
36
+ buckets.push_back (s);
37
+ buckets.back () += ' \0 ' ;
38
+ }
39
+ // Store s compressed, based on common prefix from previous?
40
+ else
41
+ {
42
+ int16_t lcp = 0 ; // Length Common Prefix
43
+
44
+ while (s[lcp] == previous[lcp])
45
+ lcp++;
46
+
47
+ // Get a reference to the current bucket, for convenience
48
+ std::string& current = buckets.back ();
49
+
50
+ // Store lcp as two bytes if lcp is 128-32767 bytes long
51
+ if (lcp > 127 )
52
+ {
53
+ // leftmost bit == 1
54
+ current += (unsigned char ) ((lcp & 127 ) | 0x80 );
55
+ current += (unsigned char ) (lcp >> 7 );
56
+ }
57
+ // Store lcp as one byte if lcp is 0-127 bytes long, leftmost bit == 0
58
+ else
59
+ current += (unsigned char ) (lcp);
60
+
61
+ current += s.substr (lcp);
62
+ current += ' \0 ' ; // Zero terminated dictionary string
63
+ }
64
+
65
+ previous = s;
66
+ count++;
67
+ return count - 1 ;
68
+ }
69
+
70
+ std::string SD::extract (int32_t i)
71
+ {
72
+ // assert(i < count);
73
+
74
+ int16_t lcp; // Length Common Prefix
75
+ auto it = buckets[i / BUCKET_SIZE].begin (); // Integer division and ...
76
+ std::string candidate = " " ;
77
+
78
+ candidate.reserve (PATH_MAX + NAME_MAX + 1 );
79
+
80
+ // ... modulo to fast forward i among buckets
81
+ i = i % BUCKET_SIZE;
82
+
83
+ // Get first uncompressed candidate dictionary string from bucket
84
+ while (*it != ' \0 ' )
85
+ {
86
+ candidate += *it;
87
+ it++;
88
+ }
89
+
90
+ it++;
91
+
92
+ // Decode bucket until correct string is candidate
93
+ for (int16_t j = 1 ; j < (i + 1 ); j++)
94
+ {
95
+ lcp = *it;
96
+
97
+ // Is lcp stored as two bytes? Leftmost bit == 1?
98
+ if (lcp & 0x80 )
99
+ {
100
+ it++;
101
+ lcp = (lcp & 127 ) + (*it << 7 ); // Yes it is
102
+ }
103
+
104
+ // Keep only the common prefix
105
+ candidate.resize (lcp);
106
+
107
+ it++;
108
+
109
+ // Get uncompressed part of dictionary string
110
+ while (*it != ' \0 ' )
111
+ {
112
+ candidate += *it;
113
+ it++;
114
+ }
115
+
116
+ it++;
117
+ }
118
+
119
+ return candidate;
120
+ }
121
+
122
+ int32_t SD::locate (const std::string& target)
123
+ {
124
+ // assert(target.length() < 32768; // Hard limit
125
+ // assert(target.length() < (PATH_MAX + NAME_MAX + 1)); // Soft limit
126
+
127
+ int32_t left = 0 ;
128
+ int32_t middle = 0 ;
129
+ int32_t right = buckets.size () - 1 ;
130
+ std::string candidate;
131
+
132
+ candidate.reserve (PATH_MAX + NAME_MAX + 1 ); // Avoid reallocs for speed
133
+
134
+ // Binary search for the correct bucket
135
+ while (left != right)
136
+ {
137
+ middle = (left + right) / 2 + ((left + right) % 2 != 0 ); // ceil x/2
138
+
139
+ auto it = buckets[middle].begin ();
140
+
141
+ candidate = " " ;
142
+
143
+ // Get first uncompressed candidate dictionary string from bucket
144
+ while (*it != ' \0 ' )
145
+ {
146
+ candidate += *it;
147
+ it++;
148
+ }
149
+
150
+ if (candidate > target)
151
+ right = middle - 1 ;
152
+ else
153
+ left = middle;
154
+ }
155
+
156
+ if (candidate > target)
157
+ middle--;
158
+
159
+ // Linear search for target in the middle bucket
160
+ auto it = buckets[middle].begin ();
161
+ const auto it_end = buckets[middle].end ();
162
+ int32_t candidate_index = middle * BUCKET_SIZE;
163
+
164
+ candidate = " \0 " ;
165
+
166
+ // Get first uncompressed candidate dictionary string from bucket
167
+ while (*it != ' \0 ' )
168
+ {
169
+ candidate += *it;
170
+ it++;
171
+ }
172
+
173
+ it++;
174
+
175
+ while (target > candidate && it != it_end)
176
+ {
177
+ int16_t lcp;
178
+ candidate_index++;
179
+
180
+ lcp = *it;
181
+
182
+ // Is lcp stored as two bytes?
183
+ if (lcp & 0x80 )
184
+ {
185
+ it++;
186
+ lcp = (lcp & 127 ) + (*it << 7 ); // Yes it is
187
+ }
188
+
189
+ // Truncate, keeping only the common prefix
190
+
191
+ candidate.resize (lcp); // marginally faster
192
+ // candidate.erase(lcp, std::string::npos);
193
+
194
+ it++;
195
+
196
+ // Get uncompressed part of dictionary string
197
+ while (*it != ' \0 ' )
198
+ {
199
+ candidate += *it;
200
+ it++;
201
+ }
202
+
203
+ it++;
204
+ }
205
+
206
+ if (target == candidate)
207
+ return candidate_index; // Found!
208
+ else
209
+ return -1 ; // Not found!
210
+ }
211
+
212
+ void SD::serialize (std::ostream &os)
213
+ {
214
+ if (os.good ())
215
+ {
216
+ size_t noOfBuckets = buckets.size ();
217
+
218
+ os.write (reinterpret_cast <const char *> (&noOfBuckets), sizeof (noOfBuckets));
219
+ os.write (reinterpret_cast <const char *> (&count), sizeof (count));
220
+ os.write (reinterpret_cast <const char *> (&BUCKET_SIZE), sizeof (BUCKET_SIZE));
221
+
222
+ for (auto i = buckets.begin (); i != buckets.end (); i++)
223
+ {
224
+ std::string& bucket = *i;
225
+ size_t sizeOfBucket = bucket.size ();
226
+ os.write (reinterpret_cast <const char *> (&sizeOfBucket), sizeof (sizeOfBucket));
227
+ os.write (reinterpret_cast <const char *> (&bucket[0 ]), sizeOfBucket);
228
+ }
229
+ }
230
+ if (os.fail ())
231
+ throw std::runtime_error (" failed to write String Dictionary" );
232
+ }
233
+
234
+ void SD::deserialize (std::istream &is)
235
+ {
236
+ size_t noOfBuckets;
237
+
238
+ buckets.clear ();
239
+ count = 0 ;
240
+
241
+ if (is.good ())
242
+ {
243
+ is.read (reinterpret_cast <char *> (&noOfBuckets), sizeof (noOfBuckets));
244
+ is.read (reinterpret_cast <char *> (&count), sizeof (count));
245
+ is.read (reinterpret_cast <char *> (&BUCKET_SIZE), sizeof (BUCKET_SIZE));
246
+
247
+ for (int i = 0 ; i != noOfBuckets; i++)
248
+ {
249
+ size_t sizeOfBucket;
250
+ std::string s;
251
+
252
+ is.read (reinterpret_cast <char *> (&sizeOfBucket), sizeof (sizeOfBucket));
253
+ s.resize (sizeOfBucket);
254
+ is.read (reinterpret_cast <char *> (&s[0 ]), sizeOfBucket);
255
+ buckets.push_back (s);
256
+ }
257
+ }
258
+ if (is.fail ())
259
+ throw std::runtime_error (" failed to read String Dictionary" );
260
+ }
0 commit comments