Skip to content

Commit 16bec1a

Browse files
authored
Add files via upload
1 parent 58dde59 commit 16bec1a

File tree

2 files changed

+317
-0
lines changed

2 files changed

+317
-0
lines changed

SD.cpp

Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
/*
2+
* sd.cpp - String Dictionary with prefix encoding
3+
* Copyright (C) 2019 Anders Larsen <gislagard@gmail.com>
4+
*
5+
* backbit is free software: you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License as published by the
7+
* Free Software Foundation, either version 3 of the License, or
8+
* (at your option) any later version.
9+
*
10+
* backbit is distributed in the hope that it will be useful, but
11+
* WITHOUT ANY WARRANTY; without even the implied warranty of
12+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13+
* See the GNU General Public License for more details.
14+
*
15+
* You should have received a copy of the GNU General Public License along
16+
* with this program. If not, see <http://www.gnu.org/licenses/>.
17+
*/
18+
19+
#include "sd.h"
20+
#include <vector>
21+
#include <string>
22+
#include <fstream>
23+
24+
int32_t SD::add(const std::string& s)
25+
{
26+
// assert(s.length() < 32768);
27+
// assert(s > previous);
28+
29+
// Time to add a new bucket? Or the very first bucket?
30+
if ((count % BUCKET_SIZE) == 0)
31+
{
32+
// Try to shrink the previous bucket, if any
33+
if (count) buckets.back().shrink_to_fit();
34+
35+
// Store first dictionary string in bucket uncompressed
36+
buckets.push_back(s);
37+
buckets.back() += '\0';
38+
}
39+
// Store s compressed, based on common prefix from previous?
40+
else
41+
{
42+
int16_t lcp = 0; // Length Common Prefix
43+
44+
while (s[lcp] == previous[lcp])
45+
lcp++;
46+
47+
// Get a reference to the current bucket, for convenience
48+
std::string& current = buckets.back();
49+
50+
// Store lcp as two bytes if lcp is 128-32767 bytes long
51+
if (lcp > 127)
52+
{
53+
// leftmost bit == 1
54+
current += (unsigned char) ((lcp & 127) | 0x80);
55+
current += (unsigned char) (lcp >> 7);
56+
}
57+
// Store lcp as one byte if lcp is 0-127 bytes long, leftmost bit == 0
58+
else
59+
current += (unsigned char) (lcp);
60+
61+
current += s.substr(lcp);
62+
current += '\0'; // Zero terminated dictionary string
63+
}
64+
65+
previous = s;
66+
count++;
67+
return count - 1;
68+
}
69+
70+
std::string SD::extract(int32_t i)
71+
{
72+
// assert(i < count);
73+
74+
int16_t lcp; // Length Common Prefix
75+
auto it = buckets[i / BUCKET_SIZE].begin(); // Integer division and ...
76+
std::string candidate = "";
77+
78+
candidate.reserve(PATH_MAX + NAME_MAX + 1);
79+
80+
// ... modulo to fast forward i among buckets
81+
i = i % BUCKET_SIZE;
82+
83+
// Get first uncompressed candidate dictionary string from bucket
84+
while (*it != '\0')
85+
{
86+
candidate += *it;
87+
it++;
88+
}
89+
90+
it++;
91+
92+
// Decode bucket until correct string is candidate
93+
for (int16_t j = 1; j < (i + 1); j++)
94+
{
95+
lcp = *it;
96+
97+
// Is lcp stored as two bytes? Leftmost bit == 1?
98+
if (lcp & 0x80)
99+
{
100+
it++;
101+
lcp = (lcp & 127) + (*it << 7); // Yes it is
102+
}
103+
104+
// Keep only the common prefix
105+
candidate.resize(lcp);
106+
107+
it++;
108+
109+
// Get uncompressed part of dictionary string
110+
while (*it != '\0')
111+
{
112+
candidate += *it;
113+
it++;
114+
}
115+
116+
it++;
117+
}
118+
119+
return candidate;
120+
}
121+
122+
int32_t SD::locate(const std::string& target)
123+
{
124+
// assert(target.length() < 32768; // Hard limit
125+
// assert(target.length() < (PATH_MAX + NAME_MAX + 1)); // Soft limit
126+
127+
int32_t left = 0;
128+
int32_t middle = 0;
129+
int32_t right = buckets.size() - 1;
130+
std::string candidate;
131+
132+
candidate.reserve(PATH_MAX + NAME_MAX + 1); // Avoid reallocs for speed
133+
134+
// Binary search for the correct bucket
135+
while (left != right)
136+
{
137+
middle = (left + right) / 2 + ((left + right) % 2 != 0); // ceil x/2
138+
139+
auto it = buckets[middle].begin();
140+
141+
candidate = "";
142+
143+
// Get first uncompressed candidate dictionary string from bucket
144+
while (*it != '\0')
145+
{
146+
candidate += *it;
147+
it++;
148+
}
149+
150+
if (candidate > target)
151+
right = middle - 1;
152+
else
153+
left = middle;
154+
}
155+
156+
if (candidate > target)
157+
middle--;
158+
159+
// Linear search for target in the middle bucket
160+
auto it = buckets[middle].begin();
161+
const auto it_end = buckets[middle].end();
162+
int32_t candidate_index = middle * BUCKET_SIZE;
163+
164+
candidate = "\0";
165+
166+
// Get first uncompressed candidate dictionary string from bucket
167+
while (*it != '\0')
168+
{
169+
candidate += *it;
170+
it++;
171+
}
172+
173+
it++;
174+
175+
while (target > candidate && it != it_end)
176+
{
177+
int16_t lcp;
178+
candidate_index++;
179+
180+
lcp = *it;
181+
182+
// Is lcp stored as two bytes?
183+
if (lcp & 0x80)
184+
{
185+
it++;
186+
lcp = (lcp & 127) + (*it << 7); // Yes it is
187+
}
188+
189+
// Truncate, keeping only the common prefix
190+
191+
candidate.resize(lcp); // marginally faster
192+
// candidate.erase(lcp, std::string::npos);
193+
194+
it++;
195+
196+
// Get uncompressed part of dictionary string
197+
while (*it != '\0')
198+
{
199+
candidate += *it;
200+
it++;
201+
}
202+
203+
it++;
204+
}
205+
206+
if (target == candidate)
207+
return candidate_index; // Found!
208+
else
209+
return -1; // Not found!
210+
}
211+
212+
void SD::serialize(std::ostream &os)
213+
{
214+
if (os.good())
215+
{
216+
size_t noOfBuckets = buckets.size();
217+
218+
os.write(reinterpret_cast<const char *> (&noOfBuckets), sizeof (noOfBuckets));
219+
os.write(reinterpret_cast<const char *> (&count), sizeof (count));
220+
os.write(reinterpret_cast<const char *> (&BUCKET_SIZE), sizeof (BUCKET_SIZE));
221+
222+
for (auto i = buckets.begin(); i != buckets.end(); i++)
223+
{
224+
std::string& bucket = *i;
225+
size_t sizeOfBucket = bucket.size();
226+
os.write(reinterpret_cast<const char *> (&sizeOfBucket), sizeof (sizeOfBucket));
227+
os.write(reinterpret_cast<const char *> (&bucket[0]), sizeOfBucket);
228+
}
229+
}
230+
if (os.fail())
231+
throw std::runtime_error("failed to write String Dictionary");
232+
}
233+
234+
void SD::deserialize(std::istream &is)
235+
{
236+
size_t noOfBuckets;
237+
238+
buckets.clear();
239+
count = 0;
240+
241+
if (is.good())
242+
{
243+
is.read(reinterpret_cast<char *> (&noOfBuckets), sizeof (noOfBuckets));
244+
is.read(reinterpret_cast<char *> (&count), sizeof (count));
245+
is.read(reinterpret_cast<char *> (&BUCKET_SIZE), sizeof (BUCKET_SIZE));
246+
247+
for (int i = 0; i != noOfBuckets; i++)
248+
{
249+
size_t sizeOfBucket;
250+
std::string s;
251+
252+
is.read(reinterpret_cast<char *> (&sizeOfBucket), sizeof (sizeOfBucket));
253+
s.resize(sizeOfBucket);
254+
is.read(reinterpret_cast<char *> (&s[0]), sizeOfBucket);
255+
buckets.push_back(s);
256+
}
257+
}
258+
if (is.fail())
259+
throw std::runtime_error("failed to read String Dictionary");
260+
}

SD.h

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/*
2+
* sd.h - Compressed String Dictionary with prefix encoding
3+
* Copyright (C) 2019 Anders Larsen <gislagard@gmail.com>
4+
*
5+
* SD is free software: you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License as published by the
7+
* Free Software Foundation, either version 3 of the License, or
8+
* (at your option) any later version.
9+
*
10+
* backbit is distributed in the hope that it will be useful, but
11+
* WITHOUT ANY WARRANTY; without even the implied warranty of
12+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13+
* See the GNU General Public License for more details.
14+
*
15+
* You should have received a copy of the GNU General Public License along
16+
* with this program. If not, see <http://www.gnu.org/licenses/>.
17+
*/
18+
19+
#ifndef _SD_H_
20+
#define _SD_H_
21+
22+
#include <string>
23+
#include <vector>
24+
#include <cstdint>
25+
#include <linux/limits.h> // PATH_MAX and NAME_MAX
26+
#include <fstream>
27+
28+
// SD - string dictionary with prefix (front) coding.
//
// Strings are appended with add() and receive consecutive indices starting
// at 0. They are grouped into buckets of BUCKET_SIZE entries: the first
// string of a bucket is stored verbatim, every following one as the length
// of the prefix shared with its predecessor plus the differing suffix.
// locate() maps a string to its index, extract() maps an index to its string.
class SD
{
public:

    SD()
    {
        previous.reserve(PATH_MAX + NAME_MAX + 1); // Avoid reallocs == faster
    }

    // Number of strings stored so far
    int32_t get_count(void) const
    {
        return count;
    }

    // Append s; returns the index assigned to it
    int32_t add(const std::string& s);

    // Index of target, or -1 if it is not stored
    int32_t locate(const std::string& target);

    // String stored at index i
    std::string extract(const int32_t i);

    // Binary (de)serialization; both throw std::runtime_error on stream failure
    void serialize(std::ostream &os);
    void deserialize(std::istream &is);

private:
    int32_t count = 0;                // Number of strings stored
    size_t BUCKET_SIZE = 128;         // Strings per bucket; overwritten by deserialize()
    std::string previous;             // Last string passed to add()
    std::vector<std::string> buckets; // Encoded buckets, one std::string each
};
56+
57+
#endif // _SD_H_

0 commit comments

Comments
 (0)