diff --git a/Makefile b/Makefile index 255070ed5..b2ac4b717 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DATE ?= 2025-01-08 # Path to the code repo. VALKEY_ROOT ?= ../valkey - +VALKEY_BLOOM_ROOT ?= ../valkey-bloom # Where to install man pages INSTALL_MAN_DIR ?= /usr/local/share/man @@ -30,6 +30,10 @@ ifeq ("$(wildcard $(VALKEY_ROOT))","") $(error Please provide the VALKEY_ROOT variable pointing to the Valkey source code) endif +ifeq ("$(wildcard $(VALKEY_BLOOM_ROOT))","") + $(info Valkey bloom variable pointed to nothing, skipping bloom filter commands) +endif + ifeq ("$(shell which pandoc)","") $(error Please install pandoc) endif @@ -54,7 +58,9 @@ endif documented_commands = $(wildcard commands/*.md) commands_json_files = $(wildcard $(VALKEY_ROOT)/src/commands/*.json) -existing_commands = $(commands_json_files:$(VALKEY_ROOT)/src/commands/%.json=commands/%.md) +bloom_commands_json_files = $(wildcard $(VALKEY_BLOOM_ROOT)/src/commands/*.json) +existing_commands = $(commands_json_files:$(VALKEY_ROOT)/src/commands/%.json=commands/%.md) \ + $(bloom_commands_json_files:$(VALKEY_BLOOM_ROOT)/src/commands/%.json=commands/%.md) topics = $(wildcard topics/*) commands = $(filter $(existing_commands),$(documented_commands)) @@ -65,7 +71,9 @@ topics_pics = $(filter-out %.md,$(topics)) # ---- Temp files ---- # JSON files for the commands that have a .md file (excluding undocumented commands). -json_for_documented_commands = $(commands:commands/%.md=$(VALKEY_ROOT)/src/commands/%.json) +json_for_documented_commands = \ + $(patsubst commands/%.md,$(VALKEY_ROOT)/src/commands/%.json,$(filter $(commands_json_files:$(VALKEY_ROOT)/src/commands/%.json=commands/%.md),$(commands))) \ + $(patsubst commands/%.md,$(VALKEY_BLOOM_ROOT)/src/commands/%.json,$(filter $(bloom_commands_json_files:$(VALKEY_BLOOM_ROOT)/src/commands/%.json=commands/%.md),$(commands))) $(BUILD_DIR)/.commands-per-group.json: $(VALKEY_ROOT)/src/commands/. 
utils/build-command-groups.py | $(BUILD_DIR) utils/build-command-groups.py $(json_for_documented_commands) > $@~~ @@ -148,6 +156,9 @@ progs = valkey-cli valkey-server valkey-benchmark valkey-sentinel valkey-check-r programs = $(progs:valkey-%=topics/%.md) configs = topics/valkey.conf.md +# Define the base directories where valkey commands can come from +VALKEY_ROOTS := $(VALKEY_ROOT) $(VALKEY_BLOOM_ROOT) + man1_src = $(filter $(programs),$(topics_md)) man3_src = $(commands) man5_src = $(filter $(configs),$(topics_md)) @@ -175,12 +186,14 @@ $(MAN_DIR)/man1/valkey-%.1.gz: topics/%.md $(man_scripts) utils/preprocess-markdown.py --man --page-type program \ --version $(VERSION) --date $(DATE) \$< \ | utils/links-to-man.py - | $(to_man) > $@ -$(MAN_DIR)/man3/%.3valkey.gz: commands/%.md $(VALKEY_ROOT)/src/commands/%.json $(BUILD_DIR)/.commands-per-group.json $(man_scripts) - utils/preprocess-markdown.py --man --page-type command \ - --version $(VERSION) --date $(DATE) \ - --commands-per-group-json $(BUILD_DIR)/.commands-per-group.json \ - --valkey-root $(VALKEY_ROOT) $< \ - | utils/links-to-man.py - | $(to_man) > $@ +$(MAN_DIR)/man3/%.3valkey.gz: commands/%.md $(BUILD_DIR)/.commands-per-group.json $(man_scripts) + $(eval FINAL_ROOT := $(firstword $(foreach root,$(VALKEY_ROOTS),$(if $(wildcard $(root)/src/commands/$*.json),$(root))))) + $(if $(FINAL_ROOT), \ + utils/preprocess-markdown.py --man --page-type command \ + --version $(VERSION) --date $(DATE) \ + --commands-per-group-json $(BUILD_DIR)/.commands-per-group.json \ + --valkey-root $(FINAL_ROOT) $< \ + | utils/links-to-man.py - | $(to_man) > $@) $(MAN_DIR)/man5/%.5.gz: topics/%.md $(man_scripts) utils/preprocess-markdown.py --man --page-type config \ --version $(VERSION) --date $(DATE) $< \ diff --git a/README.md b/README.md index fe0383ab9..9baaf3104 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,13 @@ for generating content for the website and man pages. 
This repo comes with a Makefile to build and install man pages. - make VALKEY_ROOT=path/to/valkey + make VALKEY_ROOT=path/to/valkey VALKEY_BLOOM_ROOT=path/to/valkey-bloom sudo make install INSTALL_MAN_DIR=/usr/local/share/man Prerequisites: GNU Make, Python 3, Python 3 YAML (pyyaml), Pandoc. Additionally, the scripts need access to the valkey code repo, -where metadata files about the commands are stored. +where metadata files about the commands are stored. Additionally +access to the valkey-bloom repo is optional. The pages are generated under `_build/man/` by default. The default install location is `/usr/local/share/man` (in the appropriate subdirectories). diff --git a/commands/bf.add.md b/commands/bf.add.md new file mode 100644 index 000000000..88842339e --- /dev/null +++ b/commands/bf.add.md @@ -0,0 +1,14 @@ +Adds a single item to a bloom filter. If the specified bloom filter does not exist, a bloom filter is created with the provided name with default properties. + +To add multiple items to a bloom filter, you can use the `BF.MADD` or `BF.INSERT` commands. + +To create a bloom filter with non-default properties, use the `BF.INSERT` or `BF.RESERVE` command. + +## Examples + +``` +127.0.0.1:6379> BF.ADD key val +(integer) 1 +127.0.0.1:6379> BF.ADD key val +(integer) 0 +``` diff --git a/commands/bf.card.md b/commands/bf.card.md new file mode 100644 index 000000000..ac617ce55 --- /dev/null +++ b/commands/bf.card.md @@ -0,0 +1,12 @@ +Returns the cardinality of a bloom filter which is the number of items that have been successfully added to it. + +## Examples + +``` +127.0.0.1:6379> BF.ADD key val +(integer) 1 +127.0.0.1:6379> BF.CARD key +(integer) 1 +127.0.0.1:6379> BF.CARD nonexistentkey +(integer) 0 +``` diff --git a/commands/bf.exists.md b/commands/bf.exists.md new file mode 100644 index 000000000..670eae426 --- /dev/null +++ b/commands/bf.exists.md @@ -0,0 +1,18 @@ +Determines if an item has been added to the bloom filter previously. 
+ +A bloom filter has two possible responses when you check if an item exists: + +* 0 - The item definitely does not exist since with bloom filters, false negatives are not possible. + +* 1 - The item exists with a given false positive (`fp`) percentage. There is an `fp` rate % chance that the item does not exist. You can create bloom filters with a more strict false positive rate as needed. + +## Examples + +``` +127.0.0.1:6379> BF.ADD key val +(integer) 1 +127.0.0.1:6379> BF.EXISTS key val +(integer) 1 +127.0.0.1:6379> BF.EXISTS key nonexistent +(integer) 0 +``` diff --git a/commands/bf.info.md b/commands/bf.info.md new file mode 100644 index 000000000..48ccb5359 --- /dev/null +++ b/commands/bf.info.md @@ -0,0 +1,41 @@ +Returns usage information and properties of a specific bloom filter. + +## Info Fields + +* CAPACITY - The number of unique items that would need to be added before a scale out occurs or (non scaling) before it rejects addition of unique items. +* SIZE - The number of bytes allocated by this bloom filter. +* FILTERS - Returns the number of sub filters contained within the bloom filter. +* ITEMS - The number of unique items that have been added to the bloom filter. +* ERROR - The false positive rate of the bloom filter. +* EXPANSION - The expansion rate of the bloom filter. Non scaling filters will have an expansion rate of nil. +* TIGHTENING - The tightening ratio of the bloom filter. +* MAXSCALEDCAPACITY - The [maximum capacity](../topics/bloomfilters.md) that a scalable bloom filter can expand to and reach before a subsequent scale out will fail. + +For non-scaling filters, the `TIGHTENING` and `MAXSCALEDCAPACITY` fields are not applicable and will not be returned. +When no optional fields are specified, all available fields for the given filter type are returned.
+ +## Examples + +``` +127.0.0.1:6379> BF.ADD key val +(integer) 1 +127.0.0.1:6379> BF.INFO key + 1) Capacity + 2) (integer) 100 + 3) Size + 4) (integer) 384 + 5) Number of filters + 6) (integer) 1 + 7) Number of items inserted + 8) (integer) 1 + 9) Error rate +10) "0.01" +11) Expansion rate +12) (integer) 2 +13) Tightening ratio +14) "0.5" +15) Max scaled capacity +16) (integer) 26214300 +127.0.0.1:6379> BF.INFO key CAPACITY +(integer) 100 +``` \ No newline at end of file diff --git a/commands/bf.insert.md b/commands/bf.insert.md new file mode 100644 index 000000000..783c09247 --- /dev/null +++ b/commands/bf.insert.md @@ -0,0 +1,44 @@ +If the bloom filter does not exist under the specified name, a bloom filter is created with the specified parameters. Default properties will be used if the options below are not specified. + +When the `ITEMS` option is provided, the command attempts to add all of the provided items. + +## Insert Fields + +* CAPACITY *capacity* - The number of unique items that would need to be added before a scale out occurs or (non scaling) before it rejects addition of unique items. +* ERROR *fp_error* - The false positive rate of the bloom filter. +* EXPANSION *expansion* - This option will specify the bloom filter as scaling and controls the size of the sub filter that will be created upon scale out / expansion of the bloom filter. +* NOCREATE - Will not create the bloom filter and add items if the filter does not exist already. +* TIGHTENING *tightening_ratio* - The tightening ratio for the bloom filter. +* SEED *seed* - The 32 byte seed the bloom filter's hash functions will use. +* NONSCALING - This option will configure the bloom filter as non scaling; it cannot expand / scale beyond its specified capacity. +* VALIDATESCALETO *validatescaleto* - Validates if the filter can scale out and reach this capacity based on limits and, if not, returns an error without creating the bloom filter.
+* ITEMS *item* - One or more items to be added to the bloom filter. + +Due to the nature of `NONSCALING` and `VALIDATESCALETO` arguments, specifying `NONSCALING` and `VALIDATESCALETO` together is not allowed. + +## Examples + +``` +127.0.0.1:6379> BF.INSERT key ITEMS item1 item2 +1) (integer) 1 +2) (integer) 1 +# This does not update the capacity since the filter already exists. It only adds the provided items. +127.0.0.1:6379> BF.INSERT key CAPACITY 1000 ITEMS item2 item3 +1) (integer) 0 +2) (integer) 1 +127.0.0.1:6379> BF.INSERT key_new CAPACITY 1000 +[] +``` + +``` +127.0.0.1:6379> BF.INSERT key NONSCALING VALIDATESCALETO 100 +(error) ERR cannot use NONSCALING and VALIDATESCALETO options together +127.0.0.1:6379> BF.INSERT key CAPACITY 1000 VALIDATESCALETO 999999999999999999 ITEMS item2 item3 +(error) ERR provided VALIDATESCALETO causes bloom object to exceed memory limit +127.0.0.1:6379> BF.INSERT key VALIDATESCALETO 999999999999999999 EXPANSION 1 ITEMS item2 item3 +(error) ERR provided VALIDATESCALETO causes false positive to degrade to 0 +``` +``` +127.0.0.1:6379> BF.INSERT key NOCREATE ITEMS item1 item2 +(error) ERR not found +``` \ No newline at end of file diff --git a/commands/bf.load.md b/commands/bf.load.md new file mode 100644 index 000000000..fab456546 --- /dev/null +++ b/commands/bf.load.md @@ -0,0 +1 @@ +Restores a bloom filter from a dump of an existing bloom filter with all of its specific properties and the bit vector dump of its sub filter/s. This command is only generated during AOF rewrite to restore a bloom filter in the future. diff --git a/commands/bf.madd.md b/commands/bf.madd.md new file mode 100644 index 000000000..d44a8b4b9 --- /dev/null +++ b/commands/bf.madd.md @@ -0,0 +1,16 @@ +Adds one or more items to a bloom filter. If the specified bloom filter does not exist, a bloom filter is created with the provided name with default properties.
+ +If you want to create a bloom filter with non-default properties, use the `BF.INSERT` or `BF.RESERVE` command. + +## Examples + +``` +127.0.0.1:6379> BF.MADD key item1 item2 +1) (integer) 1 +2) (integer) 1 +127.0.0.1:6379> BF.MADD key item2 item3 +1) (integer) 0 +2) (integer) 1 +127.0.0.1:6379> BF.MADD key_new item1 +1) (integer) 1 +``` \ No newline at end of file diff --git a/commands/bf.mexists.md b/commands/bf.mexists.md new file mode 100644 index 000000000..5ee4744f2 --- /dev/null +++ b/commands/bf.mexists.md @@ -0,0 +1,21 @@ +Determines if the provided item/s have been added to a bloom filter previously. + +A Bloom filter has two possible responses when you check if an item exists: + +* 0 - The item definitely does not exist since with bloom filters, false negatives are not possible. + +* 1 - The item exists with a given false positive (`fp`) percentage. There is an `fp` rate % chance that the item does not exist. You can create bloom filters with a more strict false positive rate as needed. + +## Examples + +``` +127.0.0.1:6379> BF.MADD key item1 item2 +1) (integer) 1 +2) (integer) 1 +127.0.0.1:6379> BF.MEXISTS key item1 item2 item3 +1) (integer) 1 +2) (integer) 1 +3) (integer) 0 +127.0.0.1:6379> BF.MEXISTS key item1 +1) (integer) 1 +``` \ No newline at end of file diff --git a/commands/bf.reserve.md b/commands/bf.reserve.md new file mode 100644 index 000000000..8ced88f95 --- /dev/null +++ b/commands/bf.reserve.md @@ -0,0 +1,27 @@ +Creates an empty bloom filter with the specified capacity and false positive rate. By default, a scaling filter is created with the default expansion rate. + +To specify the scaling / non scaling nature of the bloom filter, use the options: `NONSCALING` or `SCALING `. It is invalid to provide both options together. 
+ +## Reserve fields + +* error_rate - The false positive rate of the bloom filter +* capacity - The number of unique items that would need to be added before a scale out occurs or (non scaling) before it rejects addition of unique items. +* EXPANSION expansion - This option will specify the bloom filter as scaling and controls the size of the sub filter that will be created upon scale out / expansion of the bloom filter. +* NONSCALING - This option will configure the bloom filter as non scaling; it cannot expand / scale beyond its specified capacity. + +## Examples + +``` +127.0.0.1:6379> BF.RESERVE key 0.01 1000 +OK +127.0.0.1:6379> BF.RESERVE key 0.1 1000000 +(error) ERR item exists +``` +``` +127.0.0.1:6379> BF.RESERVE bf_expansion 0.0001 5000 EXPANSION 3 +OK +``` +``` +127.0.0.1:6379> BF.RESERVE bf_nonscaling 0.0001 5000 NONSCALING +OK +``` diff --git a/groups.json b/groups.json index 46f69a508..297240200 100644 --- a/groups.json +++ b/groups.json @@ -3,6 +3,10 @@ "display": "Bitmap", "description": "Operations on the Bitmap data type" }, + "bloom": { + "display": "Bloom filter", + "description": "Operations on the Bloom filter data type" + }, "cluster": { "display": "Cluster", "description": "Valkey Cluster management" diff --git a/modules.json b/modules.json new file mode 100644 index 000000000..03ca3a683 --- /dev/null +++ b/modules.json @@ -0,0 +1,7 @@ +{ + "valkey_bloom": { + "name": "valkey-bloom", + "repo": "https://github.com/valkey-io/valkey-bloom", + "description": "Module that allows users to use the bloom filter data type" + } +} \ No newline at end of file diff --git a/resp2_replies.json b/resp2_replies.json index b678d630e..a78942746 100644 --- a/resp2_replies.json +++ b/resp2_replies.json @@ -62,6 +62,47 @@ "AUTH": [ "[Simple string reply](../topics/protocol.md#simple-strings): `OK`, or an error if the password, or username/password pair, is invalid." 
], + "BF.ADD": [ + "One of the following:", + "* [Integer reply](../topics/protocol.md#integers): `1` if the item was successfully added", + "* [Integer reply](../topics/protocol.md#integers): `0` if the item already existed in the bloom filter", + "", + "The command will be rejected if input is invalid, if a non bloom filter key with the same name already exists, if the bloom filter creation / scale out exceeds limits, or if an item is being added to a full non scaling filter." + ], + "BF.CARD": [ + "[Integer reply](../topics/protocol.md#integers): The number of items successfully added to the bloom filter, or 0 if the key does not exist" + ], + "BF.EXISTS": [ + "One of the following:", + "* [Integer reply](../topics/protocol.md#integers): `1` if the item exists in the bloom filter", + "* [Integer reply](../topics/protocol.md#integers): `0` if the bloom filter does not exist or the item has not been added to the bloom filter" + ], + "BF.INFO": [ + "When no optional arguments are provided:", + "* [Array reply](../topics/protocol.md#arrays): List of information about the bloom filter.", + "When an optional argument excluding ERROR is provided:", + "* [Integer reply](../topics/protocol.md#integers): argument value", + "When ERROR is provided as an optional argument:", + "* [String reply](../topics/protocol.md#simple-strings): argument value" + ], + "BF.INSERT": [ + "[Array reply](../topics/protocol.md#arrays): Array of ints (1’s and 0’s) - if filter already exists or if creation was successful. An empty array if no items are provided", + "", + "The command will be rejected if input is invalid, if a non bloom filter key with the same name already exists, if the bloom filter creation / scale out exceeds limits, or if an item is being added to a full non scaling filter." 
+ ], + "BF.MADD": [ + "[Array reply](../topics/protocol.md#arrays): Array of ints (1’s and 0’s)", + "", + "The command will be rejected if input is invalid, if a non bloom filter key with the same name already exists, if the bloom filter creation / scale out exceeds limits, or if an item is being added to a full non scaling filter." + ], + "BF.MEXISTS": [ + "[Array reply](../topics/protocol.md#arrays): Array of ints (1’s and 0’s)" + ], + "BF.RESERVE": [ + "[Simple string reply](../topics/protocol.md#simple-strings): `OK`.", + "", + "The command will be rejected if input is invalid, if a key with the same name already exists, or if the bloom filter creation exceeds limits." + ], "BGREWRITEAOF": [ "[Simple string reply](../topics/protocol.md#simple-strings): a simple string reply indicating that the rewriting started or is about to start ASAP when the call is executed with success.", "", diff --git a/resp3_replies.json b/resp3_replies.json index f3fc39943..78369bbab 100644 --- a/resp3_replies.json +++ b/resp3_replies.json @@ -62,6 +62,47 @@ "AUTH": [ "[Simple string reply](../topics/protocol.md#simple-strings): `OK`, or an error if the password, or username/password pair, is invalid." ], + "BF.ADD": [ + "One of the following:", + "* [Integer reply](../topics/protocol.md#integers): `1` if the item was successfully added", + "* [Integer reply](../topics/protocol.md#integers): `0` if the item already existed in the bloom filter", + "", + "The command will be rejected if input is invalid, if a non bloom filter key with the same name already exists, if the bloom filter creation / scale out exceeds limits, or if an item is being added to a full non scaling filter." 
+ ], + "BF.CARD": [ + "[Integer reply](../topics/protocol.md#integers): The number of items successfully added to the bloom filter, or 0 if the key does not exist" + ], + "BF.EXISTS": [ + "One of the following:", + "* [Integer reply](../topics/protocol.md#integers): `1` if the item exists in the bloom filter", + "* [Integer reply](../topics/protocol.md#integers): `0` if the bloom filter does not exist or the item has not been added to the bloom filter" + ], + "BF.INFO": [ + "When no optional arguments are provided:", + "* [Array reply](../topics/protocol.md#arrays): List of information about the bloom filter.", + "When an optional argument excluding ERROR is provided:", + "* [Integer reply](../topics/protocol.md#integers): argument value", + "When ERROR is provided as an optional argument:", + "* [String reply](../topics/protocol.md#simple-strings): argument value" + ], + "BF.INSERT": [ + "[Array reply](../topics/protocol.md#arrays): Array of ints (1’s and 0’s) - if filter already exists or if creation was successful. An empty array if no items are provided", + "", + "The command will be rejected if input is invalid, if a non bloom filter key with the same name already exists, if the bloom filter creation / scale out exceeds limits, or if an item is being added to a full non scaling filter." + ], + "BF.MADD": [ + "[Array reply](../topics/protocol.md#arrays): Array of ints (1’s and 0’s)", + "", + "The command will be rejected if input is invalid, if a non bloom filter key with the same name already exists, if the bloom filter creation / scale out exceeds limits, or if an item is being added to a full non scaling filter." + ], + "BF.MEXISTS": [ + "[Array reply](../topics/protocol.md#arrays): Array of ints (1’s and 0’s)" + ], + "BF.RESERVE": [ + "[Simple string reply](../topics/protocol.md#simple-strings): `OK`.", + "", + "The command will be rejected if input is invalid, if a key with the same name already exists, or if the bloom filter creation exceeds limits." 
+ ], "BGREWRITEAOF": [ "[Bulk string reply](../topics/protocol.md#bulk-strings): a simple string reply indicating that the rewriting started or is about to start ASAP when the call is executed with success.", "", diff --git a/topics/bloomfilters.md b/topics/bloomfilters.md new file mode 100644 index 000000000..7f45c6ff1 --- /dev/null +++ b/topics/bloomfilters.md @@ -0,0 +1,227 @@ +--- +title: "Bloom Filters" +description: > + Introduction to Bloom Filters +--- + +In Valkey, the bloom filter data type / commands are implemented in the [valkey-bloom module](https://github.com/valkey-io/valkey-bloom) which is an official valkey module compatible with versions 8.0 and above. Users will need to load this module onto their valkey server in order to use this feature. + +Bloom filters are a space efficient probabilistic data structure that allows adding elements and checking whether elements exist. False positives are possible where a filter incorrectly indicates that an element exists, even though it was not added. However, Bloom Filters guarantee that false negatives (incorrectly indicating that an element does not exist, even though it was added) do not occur. + +## Basic Bloom commands + +* `BF.ADD` adds an item to a bloom filter +* `BF.CARD` returns the cardinality of a bloom filter +* `BF.EXISTS` checks if an item has been added to a bloom filter +* `BF.INFO` returns information about a bloom filter + +See the [complete list of bloom filter commands](../commands/#bloom). + +## Common use cases for bloom filters + +### Advertisement / Campaign placement and deduplication + +Bloom filters can help e-commerce sites, streaming services, advertising networks, or marketing platforms answer the following questions: + +* Has an advertisement already been shown to a user? +* Has a promotional email or notification already been sent to a user? +* Has a product already been purchased by a user? 
+ +Example: For each user, use a Bloom filter to store all the products they have purchased. The recommendation engine can then suggest a new product and check if it is present in the user's Bloom filter. + +* If the product is not in the filter, the ad is shown to the user, and the product is added to the filter. +* If the product is already in the filter, it means the ad has already been shown to the user and the recommendation engine finds a different ad to show. + +### Fraud detection + +Bloom filters can be used to answer the question, "Has this card been flagged as stolen?". To do this, use a bloom filter that contains cards reported as stolen. When a card is used, check whether it is present in the bloom filter. If the card is not found, it means it is not marked as stolen. If the card is present in the filter, a check can be made against the main database, or the purchase can be denied. + +### Filtering Spam / Harmful Content +Bloom filters provide an efficient way to screen content for potential threats and harmful material. Here's how they can be effectively used: + +Example: Bloom filters can answer the question "is a URL malicious?". Any URL inputted would be checked against a malicious URL bloom filter. + +* If no, then we allow access to the site. +* If yes, then we can deny access or perform a full check of the URL. + +Example: Bloom filters can answer the question is this content harmful or spam. Create a bloom filter that contains spam email addresses or spam phone numbers. When an email or text is received then check if the number or email is present in the bloom filter. + +* If no, then the message can be displayed to the user. +* If yes, then we can send the message to the spam folder or perform a full check on the email or number. + +### Check if a username is taken + +Bloom filters can answer the question: Has this username/email/domain name/slug already been used? 
+ +In this username example, we can use a Bloom filter to track every username that has signed up. When a new user attempts to sign up with their desired username, the app checks if the username exists in the Bloom filter. + +* If no, the user is created and the username is added to the Bloom filter. +* If yes, the app can decide to either check the main database or reject the username. + +## Scaling and non scaling bloom filters + +The bloom filter data type can act either as a "scaling bloom filter" or "non scaling bloom filter" depending on user configuration. + +The difference between scaling and non scaling bloom filters is that scaling bloom filters do not have a fixed capacity, instead they can grow. Non-scaling bloom filters will have a fixed capacity, meaning only a fixed number of items can be inserted to it. Scaling bloom filters consist of a vector of "Sub filters" with length >= 1, while non scaling will only contain 1 sub filter. + +When a scaling bloom filter reaches its capacity, adding a new unique item will trigger a scale out and a new sub filter is created and added to the vector of sub filters. This new sub filter will have a larger capacity (previous bloom filter's capacity * expansion rate of the bloom object). + +After a non scaling bloom filter reaches its capacity, if a user tries to add a new unique item, an error will be returned. + +The expansion rate is the rate that a scaling bloom filter's capacity is increased by upon scale out. For example, we have a bloom filter with capacity 100 at creation with an expansion rate of 2. After adding 101 unique items, it will scale out and create a new sub filter with capacity 200. Then, after adding 200 more unique items (301 items total), a new sub filter of capacity 400 is added upon scale out and so on. + +### When should you use scaling vs non-scaling filters + +If the capacity (number of items we want to add) is known and fixed, using a non-scaling bloom filter is preferred.
Likewise the reverse case, if the capacity is unknown / dynamically calculated, using a scaling bloom filter is ideal. + +There are a few benefits for using non scaling filters. A non scaling filter will have better performance than a filter that has scaled out several times (e.g. > 100). Also, non scaling filters in general use less memory than a scaling filter that has scaled out several times to hold the same capacity. + +However, to ensure you do not hit any capacity related errors, and want use-as-you-go capacity, scaling is better. + +## Bloom properties + +* Capacity - The number of unique items that would need to be added before a scale out occurs or (non scaling) before it rejects addition of unique items. + +* False Positive Rate (Error rate) - The rate that controls the probability of bloom check/set operations being false positives. Example: An item addition returning 0 (or an item check returning 1) indicating that the item was already added even though it was not. + +* Expansion - This is a property of scalable bloom filters which controls the growth in overall capacity when a bloom filter scales out by determining the capacity of the new sub filter which gets created. This new capacity is equal to the previous filter's capacity * expansion rate. + +### Advanced Properties + +The following two properties can be specified in the `BF.INSERT` command: + +* Seed - This is the key with which hash functions are created for the bloom filter. In case of scalable bloom filters, the same seed is used across all sub filters. This property is only useful if you have a specific 32 byte seed that you want your bloom filter to use. By default every bloom filter will use a random seed. + +* Tightening Ratio - This is a property of scalable bloom filters which controls the overall correctness of the bloom filter as it scales out by keeping the actual false positive rate closer to the user requested false positive rate when the bloom filter was created.
This is done by using the tightening ratio to set a stricter false positive on the new sub filter which gets created during each scale out. We do not recommend fine tuning this unless there is a specific use case for lower memory usage with higher false positive or vice versa. + +### Default bloom properties + +These are the default bloom properties along with the commands and configs which allow customizing. + +| Property | Default Value | Command Name | Configuration name | +|----------|--------------|--------------|-------------------| +| Capacity | 100 | BF.INSERT, BF.RESERVE | BF.BLOOM-CAPACITY | +| False Positive Rate | 0.01 | BF.INSERT, BF.RESERVE | BF.BLOOM-FP-RATE | +| Scaling / Non Scaling | Scaling | BF.INSERT, BF.RESERVE | BF.BLOOM-EXPANSION | +| Expansion Rate | 2 | BF.INSERT, BF.RESERVE | BF.BLOOM-EXPANSION | +| Tightening Ratio | 0.5 | BF.INSERT | BF.BLOOM-TIGHTENING-RATIO | +| Seed | Random Seed | BF.INSERT | BF.BLOOM-USE-RANDOM-SEED | + + +Since bloom filters have a default expansion of 2, this means any default creation as a result of `BF.ADD`, `BF.MADD`, `BF.INSERT` will be a scalable bloom filter. Users can create a non scaling bloom filter using `BF.RESERVE NONSCALING` or by specifying `NONSCALING` in `BF.INSERT`. Additionally, the other default properties of a bloom filter creation can be seen in the table above and BF.INFO command response below. These default properties can be configured through configs on the bloom module. 
+ +Example of default bloom filter information: + +``` +127.0.0.1:6379> BF.ADD default_filter item +(integer) 1 +127.0.0.1:6379> BF.INFO default_filter + 1) Capacity + 2) (integer) 100 + 3) Size + 4) (integer) 384 + 5) Number of filters + 6) (integer) 1 + 7) Number of items inserted + 8) (integer) 1 + 9) Error rate +10) "0.01" +11) Expansion rate +12) (integer) 2 +13) Tightening ratio +14) "0.5" +15) Max scaled capacity +16) (integer) 26214300 +``` + +## Performance + +The bloom commands which involve adding items or checking the existence of items have a time complexity of O(N * K) where N is the number of hash functions used by the bloom filter and K is the number of elements being inserted. This means that BF.ADD and BF.EXISTS are both O(N) as they only operate on one item. + +In case of scalable bloom filters, with every scale out, we increase the number of checks (using hash functions of each sub filter) performed during any add / exists operation. For this reason, it is recommended that users choose a capacity and expansion rate after evaluating the use case / workload to avoid several scale outs and reduce the number of checks. + +The other bloom filter commands are O(1) time complexity: BF.CARD, BF.INFO, BF.RESERVE, and BF.INSERT (when no items are provided). + +## Monitoring + +To check the server's overall bloom filter metrics, you can use the `INFO BF` or the `INFO MODULES` command.
+ +Example of `INFO BF` calls in different scenarios: + +``` +127.0.0.1:6379> INFO BF +# bf_bloom_core_metrics +bf_bloom_total_memory_bytes:0 +bf_bloom_num_objects:0 +bf_bloom_num_filters_across_objects:0 +bf_bloom_num_items_across_objects:0 +bf_bloom_capacity_across_objects:0 + +# bf_bloom_defrag_metrics +bf_bloom_defrag_hits:0 +bf_bloom_defrag_misses:0 +127.0.0.1:6379> bf.add key value +(integer) 1 +127.0.0.1:6379> info bf +# bf_bloom_core_metrics +bf_bloom_total_memory_bytes:384 +bf_bloom_num_objects:1 +bf_bloom_num_filters_across_objects:1 +bf_bloom_num_items_across_objects:1 +bf_bloom_capacity_across_objects:100 + +# bf_bloom_defrag_metrics +bf_bloom_defrag_hits:0 +bf_bloom_defrag_misses:0 +``` + +### Bloom filter core metrics + +* `bf_bloom_total_memory_bytes`: Current total number of bytes used by all bloom filters. + +* `bf_bloom_num_objects`: Current total number of bloom filters. + +* `bf_bloom_num_filters_across_objects`: Current total number of sub filters across all bloom filters. + +* `bf_bloom_num_items_across_objects`: Current total number of items across all bloom filters. + +* `bf_bloom_capacity_across_objects`: Current total capacity across all bloom filters. + +### Bloom filter defrag metrics + +* `bf_bloom_defrag_hits`: Total number of defrag hits that have occurred on bloom filters. + +* `bf_bloom_defrag_misses`: Total number of defrag misses that have occurred on bloom filters. + +## Handling Large Bloom Filters + +There are two notable validations that bloom filters face. + +1. Memory Usage: + + The memory usage limit per bloom filter by default is defined by the `BF.BLOOM-MEMORY-USAGE-LIMIT` module configuration which has a default value of 128 MB. If a command results in a creation / scale out causing the overall memory usage to exceed this limit, the command is rejected. This config is modifiable and can be increased as needed. + +2.
Number of sub filters (in case of scalable bloom filters): + + When a bloom filter scales out, a new sub filter is added. The limit on the number of sub filters depends on the false positive rate and tightening ratio. Each sub filter has a stricter false positive rate, and this is controlled by the tightening ratio. If a command attempting a scale out results in the sub filter reaching a false positive rate of 0, the command is rejected. + + +You can use `VALIDATESCALETO` as an optional argument of `BF.INSERT` to help determine whether the bloom filter can scale out to reach the specified capacity without hitting either of the limits mentioned above. It will reject the command otherwise. + +As seen below, when trying to create a bloom filter with a capacity that cannot be achieved through scale outs (given the memory limits), the command is rejected. However, if the capacity can be achieved through scale out (even with the limits), then the creation of the bloom filter will succeed. + +Example: + +``` +127.0.0.1:6379> BF.INSERT validate_scale_fail VALIDATESCALETO 26214301 +(error) ERR provided VALIDATESCALETO causes bloom object to exceed memory limit +127.0.0.1:6379> BF.INSERT validate_scale_valid VALIDATESCALETO 26214300 +[] +``` + +The `BF.INFO` command's `MAXSCALEDCAPACITY` field can be used to find out the maximum capacity that the scalable bloom filter can expand to hold. 
+ +``` +127.0.0.1:6379> BF.INFO validate_scale_valid MAXSCALEDCAPACITY +(integer) 26214300 +``` diff --git a/topics/data-types.md b/topics/data-types.md index cea10c4d3..e1f06d893 100644 --- a/topics/data-types.md +++ b/topics/data-types.md @@ -92,6 +92,17 @@ The [HyperLogLog](hyperloglogs.md) data structures provide probabilistic estimat * [Overview of HyperLogLog](hyperloglogs.md) * [HyperLogLog command reference](../commands/#hyperloglog) +## Bloom Filter + +[Bloom filters](bloomfilters.md) are a space-efficient probabilistic data structure that allows adding elements and checking if items are definitely not present, or if there is a possibility they exist (with the configured false positive rate). + +The Bloom filter data type / command support is provided by the `valkey-bloom` module. +For more information, see: + +* [Overview of Bloom Filters](bloomfilters.md) +* [Bloom filter command reference](../commands/#bloom) +* [Valkey-bloom module on GitHub](https://github.com/valkey-io/valkey-bloom/) + ## Extensions To extend the features provided by the included data types, use one of these options: diff --git a/wordlist b/wordlist index 19841317c..9ca5bdeb8 100644 --- a/wordlist +++ b/wordlist @@ -81,6 +81,7 @@ behaviour benchmarked Benchmarking benchmarking +BF.BLOOM-FP-RATE big-endian BigNumber \w+:\w.* @@ -179,6 +180,7 @@ deauthenticate deauthenticated Deauthenticates deduplicated +deduplication Defrag defrag defragging @@ -277,6 +279,7 @@ FlameGraph fmt foo[0-9] formatter +fp_error france_location FreeBSD FreeString @@ -908,6 +911,7 @@ UTF-8 utf8 utils v[0-9\.]+ +validatescaleto [Vv]alkey [Vv]alkey-[\w+-]+ Valkey's