-
Notifications
You must be signed in to change notification settings - Fork 81
/
Copy pathBloomFilter.cpp
150 lines (122 loc) · 4.94 KB
/
BloomFilter.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#include <Interpreters/BloomFilter.h>
#include <city.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnLowCardinality.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeLowCardinality.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
static constexpr UInt64 SEED_GEN_A = 845897321;
static constexpr UInt64 SEED_GEN_B = 217728422;
static constexpr UInt64 MAX_BLOOM_FILTER_SIZE = 1 << 30;
BloomFilterParameters::BloomFilterParameters(size_t filter_size_, size_t filter_hashes_, size_t seed_)
: filter_size(filter_size_), filter_hashes(filter_hashes_), seed(seed_)
{
if (filter_size == 0)
throw Exception("The size of bloom filter cannot be zero", ErrorCodes::BAD_ARGUMENTS);
if (filter_hashes == 0)
throw Exception("The number of hash functions for bloom filter cannot be zero", ErrorCodes::BAD_ARGUMENTS);
if (filter_size > MAX_BLOOM_FILTER_SIZE)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of bloom filter cannot be more than {}", MAX_BLOOM_FILTER_SIZE);
}
BloomFilter::BloomFilter(const BloomFilterParameters & params)
: BloomFilter(params.filter_size, params.filter_hashes, params.seed)
{
}
BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_)
: size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), filter(words, 0)
{
assert(size != 0);
assert(hashes != 0);
}
bool BloomFilter::find(const char * data, size_t len)
{
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
for (size_t i = 0; i < hashes; ++i)
{
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))))
return false;
}
return true;
}
void BloomFilter::add(const char * data, size_t len)
{
size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
for (size_t i = 0; i < hashes; ++i)
{
size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
}
}
void BloomFilter::clear()
{
filter.assign(words, 0);
}
bool BloomFilter::contains(const BloomFilter & bf)
{
for (size_t i = 0; i < words; ++i)
{
if ((filter[i] & bf.filter[i]) != bf.filter[i])
return false;
}
return true;
}
UInt64 BloomFilter::isEmpty() const
{
for (size_t i = 0; i < words; ++i)
if (filter[i] != 0)
return false;
return true;
}
bool operator== (const BloomFilter & a, const BloomFilter & b)
{
for (size_t i = 0; i < a.words; ++i)
if (a.filter[i] != b.filter[i])
return false;
return true;
}
void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
{
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
}
bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed)
{
size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size);
return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))));
}
DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type)
{
if (const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get()))
{
if (!typeid_cast<const DataTypeArray *>(array_type->getNestedType().get()))
return getPrimitiveType(array_type->getNestedType());
else
throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index.", ErrorCodes::BAD_ARGUMENTS);
}
if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(data_type.get()))
return getPrimitiveType(nullable_type->getNestedType());
if (const auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(data_type.get()))
return getPrimitiveType(low_cardinality_type->getDictionaryType());
return data_type;
}
ColumnPtr BloomFilter::getPrimitiveColumn(const ColumnPtr & column)
{
if (const auto * array_col = typeid_cast<const ColumnArray *>(column.get()))
return getPrimitiveColumn(array_col->getDataPtr());
if (const auto * nullable_col = typeid_cast<const ColumnNullable *>(column.get()))
return getPrimitiveColumn(nullable_col->getNestedColumnPtr());
if (const auto * low_cardinality_col = typeid_cast<const ColumnLowCardinality *>(column.get()))
return getPrimitiveColumn(low_cardinality_col->convertToFullColumnIfLowCardinality());
return column;
}
}