Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
44999fa
started moving to workspace
zommiommy Mar 28, 2025
976abf5
fixing workspace changes
zommiommy Mar 28, 2025
2e040ae
fixing workspace changes
zommiommy Mar 28, 2025
958867a
fmt
zommiommy Mar 28, 2025
192882b
moving deps
zommiommy Mar 28, 2025
18ef565
added unsafes that clippy suggested
zommiommy Mar 28, 2025
dc0c9a5
Added visit code
vigna Mar 31, 2025
beccde9
now build ef mmaps offsets so there is no need to pad the offsets
zommiommy Mar 31, 2025
a17f9ff
added back cnr-2000 data
zommiommy Mar 31, 2025
ab6c7a1
fixed cli tests
zommiommy Mar 31, 2025
c4134f8
Completed test support
vigna Mar 31, 2025
fcfb2af
Authors
vigna Mar 31, 2025
22cbddb
README
vigna Mar 31, 2025
81e7906
README
vigna Mar 31, 2025
c72d11b
README
vigna Mar 31, 2025
1ff8997
README
vigna Mar 31, 2025
3e5dfbc
README
vigna Mar 31, 2025
c8ef48e
README
vigna Mar 31, 2025
230f0b8
Fixed crate name
vigna Mar 31, 2025
84c8e5d
Use resolver 2 instead of 3
progval Mar 31, 2025
1c0e641
Merge pull request #120 from progval/resolver2
zommiommy Mar 31, 2025
ddd0ad8
Switch from edition 2024 to 2021
progval Mar 31, 2025
3ac8b8c
Merge pull request #121 from progval/edition2021
zommiommy Mar 31, 2025
4826fd7
made 'build ef' work with non-padded offset files and it writes first…
zommiommy Mar 31, 2025
0bf549c
reverted the buile ef tmp file as not compatibile with std::fs::rename
zommiommy Mar 31, 2025
cce307d
added a .gitignore to the data folder
zommiommy Mar 31, 2025
8bda4e7
Acyclicity and topological sort
vigna Mar 31, 2025
a4c2491
renamed webgraph_cli to webgraph-cli
zommiommy Mar 31, 2025
8612380
added transposed cnr-2000-t
zommiommy Mar 31, 2025
ae66648
added cnr-2000 sccs data
zommiommy Mar 31, 2025
4f77032
removed cnr-2000-hc and built offsets, ef, dcf for cnr-2000-t
zommiommy Mar 31, 2025
9be67d9
added offsets ef and dcf for real
zommiommy Mar 31, 2025
5efb8fe
Strongly connected components
vigna Mar 31, 2025
dfa8df8
ExactSumSweep
vigna Mar 31, 2025
b2caf96
Removed enum
vigna Mar 31, 2025
415734e
ExactSumSweep
vigna Mar 31, 2025
b690c2a
No more rev
vigna Mar 31, 2025
7895d6f
Renaming
vigna Mar 31, 2025
5720535
Renaming
vigna Mar 31, 2025
c78a6cb
Renaming
vigna Mar 31, 2025
9162f20
Iterators instead of slices
vigna Mar 31, 2025
514179e
Simplified asserts
vigna Mar 31, 2025
fc5f1ee
Docs
vigna Mar 31, 2025
706583c
refactored argmin and argmax into iterator extension traits and moved…
zommiommy Apr 1, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 19 additions & 76 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,84 +1,32 @@
[package]
name = "webgraph"
version = "0.2.1"
edition = "2021"
description = "A Rust port of the WebGraph framework (http://webgraph.di.unimi.it/)."
repository = "https://github.com/vigna/webgraph-rs/"
license = "Apache-2.0 OR LGPL-2.1-or-later"
readme = "README.md"
keywords = ["graph", "compression", "codes", "webgraph"]
categories = ["compression", "data-structures"]
[workspace]
resolver = "2"
members = [
"algo",
"cli",
"webgraph",
]

[workspace.dependencies]
webgraph = {path="./webgraph", version="0.2.1"}
webgraph-algo = {path="./algo", version="0.1.0"}

[features]
default = ["cli"]
cli = [
"dep:clap",
"dep:clap_complete",
"dep:env_logger",
] # Enable the compilation of the webgraph binary
slow_tests = [] # Test feature that enables long running tests
be_bins = [] # Enable read / write of only BE bvgraphs (to reduce code size)
le_bins = [] # Enable read / write of only LE bvgraphs (to reduce code size)
fuzz = [
"dep:arbitrary",
"dep:zip",
"dsi-bitstream/fuzz",
] # Expose the fuzzing harnesses
serde = ["dep:serde"]

[dependencies]
anyhow = { version = "1.0.79", features = ["backtrace"] }
java-properties = "2.0.0"
mmap-rs = "0.6.1"
num_cpus = "1.16.0"
epserde = "0.8.0"
sux = "0.7.2"
#dsi-bitstream = {git="https://github.com/vigna/dsi-bitstream-rs.git"}
#dsi-bitstream = { path="../dsi-bitstream-rs" }
dsi-bitstream = "0.5.0"
dsi-progress-logger = "0.8.1"
log = "0.4.22"
sux = "0.7.2"
common_traits = "0.11.2"
lender = "0.3.1"
rand = { version = "0.9.0", features = ["small_rng"] }
rayon = "1.10.0"
tempfile = "3.10.1"
libc = "0.2.155"
itertools = "0.14.0"
lender = "0.3.1"
common_traits = "0.11.2"
impl-tools = "0.10.0"
bitflags = "2.6.0"
dary_heap = "0.3.6"
rdst = { version = "0.20.14", features = ["multi-threaded"] }
sealed = "0.6.0"
serde = { version = "1.0.217", features = ["serde_derive"], optional = true }
crossbeam-channel = "0.5"

# Cli
clap = { version = "4.5.11", features = ["derive", "string"], optional = true }
clap_complete = { version = "4.4.11", optional = true }
env_logger = { version = "0.11.5", default-features = false, optional = true, features = ["auto-color", "regex"] }

# Fuzzing deps
arbitrary = { version = "1.3.2", features = ["derive"], optional = true }
zip = { version = "2.1.5", optional = true }
rayon = "1.10.0"
mmap-rs = "0.6.1"
predicates = "3.1.2"
sysinfo = "0.33.1"
sync-cell-slice = "0.9.9"
jiff = "0.2.5"

[dev-dependencies]
serde = "1.0.217"
serde_json = "1.0.137"
env_logger = { version = "0.11.5" }

[build-dependencies]
built = { version = "0.7", features = ["git2"] }
chrono = "0.4.39"

[[bin]]
name = "webgraph"
path = "src/main.rs"
required-features = ["cli"]
java-properties = "2.0.0"
env_logger = { version = "0.11.5", default-features = false, features = ["auto-color", "regex"] }
clap = { version = "4.5.11", features = ["derive", "string"] }

[profile.release]
opt-level = 3 # like --release
Expand All @@ -95,8 +43,3 @@ overflow-checks = false # Disable integer overflow checks.
debug = true # Include debug info.
debug-assertions = false # Disable debug assertions.
codegen-units = 1 # slower compile times, but maybe better perf


[lib]
name = "webgraph"
path = "src/lib.rs"
205 changes: 9 additions & 196 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,186 +1,21 @@
# WebGraph

[![downloads](https://img.shields.io/crates/d/webgraph)](https://crates.io/crates/webgraph)
[![dependents](https://img.shields.io/librariesio/dependents/cargo/webgraph)](https://crates.io/crates/webgraph/reverse_dependencies)
![GitHub CI](https://github.com/vigna/webgraph-rs/actions/workflows/rust.yml/badge.svg)
![license](https://img.shields.io/crates/l/webgraph)
[![Line count](https://tokei.rs/b1/github/vigna/webgraph-rs)](https://github.com/vigna/webgraph-rs)
[![Latest version](https://img.shields.io/crates/v/webgraph.svg)](https://crates.io/crates/webgraph)
[![Documentation](https://docs.rs/webgraph/badge.svg)](https://docs.rs/webgraph)
[![Coverage Status](https://coveralls.io/repos/github/vigna/webgraph-rs/badge.svg?branch=main)](https://coveralls.io/github/vigna/webgraph-rs?branch=main)

A Rust implementation of the [WebGraph framework] for graph compression.
A Rust implementation of the [WebGraph
framework](http://webgraph.di.unimi.it/) for graph compression.

WebGraph is a framework for graph compression aimed at studying web graphs, but
currently being applied to several other types of graphs. It
provides simple ways to manage very large graphs, exploiting modern compression
techniques. More precisely, it is currently made of:
## At a Glance

- A set of simple codes, called ζ _codes_, which are particularly suitable for
storing web graphs (or, in general, integers with a power-law distribution in a
certain exponent range).
- Compressed graph representation (start from here):
[`webgraph`](http://crates.io/crates/webgraph) ([repo](/webgraph))

- Algorithms for compressing web graphs that exploit gap compression and
differential compression (à la
[LINK](https://ieeexplore.ieee.org/document/999950)),
intervalization, and ζ codes to provide a high compression ratio (see [our
datasets](http://law.di.unimi.it/datasets.php)). The algorithms are controlled
by several parameters, which provide different tradeoffs between access speed
and compression ratio.
- Algorithms: [`webgraph-algo`](http://crates.io/crates/webgraph-algo)
([repo](/algo))

- Algorithms for accessing a compressed graph without actually decompressing
it, using lazy techniques that delay the decompression until it is actually
necessary.

- Algorithms for analyzing very large graphs, such as
[HyperBall](https://dl.acm.org/doi/10.5555/2606262.2606545), which has been
used to show that Facebook has just [four degrees of
separation](http://vigna.di.unimi.it/papers.php#BBRFDS).

- A [Java implementation](http://webgraph.di.unimi.it/) of the algorithms above,
now in maintenance mode.

- This crate, providing a complete, documented implementation of the algorithms
above in Rust. It is free software distributed under either the [GNU Lesser
General Public License
2.1+](https://www.gnu.org/licenses/old-licenses/lgpl-2.1.html) or the [Apache
Software License 2.0](https://www.apache.org/licenses/LICENSE-2.0).

- [Data sets](http://law.di.unimi.it/datasets.php) for large graphs (e.g.,
billions of links).

## Citation

You are welcome to use and improve WebGraph for your research work! If you find
our software useful for research, please cite the following papers in your own:

- “[WebGraph: The Next Generation (Is in
Rust)](http://vigna.di.unimi.it/papers.php#FVZWNG)”, by Tommaso Fontana,
Sebastiano Vigna, and Stefano Zacchiroli, in _WWW '24: Companion Proceedings
of the ACM on Web Conference 2024_, pages 686–689. [DOI
10.1145/3589335.3651581](https://dl.acm.org/doi/10.1145/3589335.3651581).

- “[The WebGraph Framework I: Compression
Techniques](http://vigna.di.unimi.it/papers.php#BoVWFI)”, by Paolo Boldi and
Sebastiano Vigna, in _Proc. of the 13th international conference on World
Wide Web_, WWW 2004, pages 595–602, ACM. [DOI
10.1145/988672.988752](https://dl.acm.org/doi/10.1145/988672.988752).

## Quick Setup

Assuming you have built all binaries, you will first need a graph in BV format,
for example downloading it from the [LAW website]. For a graph with basename
`BASENAME`, you will need the `BASENAME.graph` file (the bitstream containing a
compressed representation of the graph), the `BASENAME.properties` file
(metadata), and the `BASENAME.offsets` file (a bitstream containing pointers into
the graph bitstream).

As a first step, if you need random access to the successors of a node, you need
to build an [Elias–Fano] representation of the offsets (this part can be skipped
if you just need sequential access). There is a CLI command `webgraph` with many
subcommands, among which `build`, and `webgraph build ef BASENAME` will build
the representation for you, serializing it with [ε-serde] in a file
named `BASENAME.ef`.

Then, to load the graph you need to call

```ignore
let graph = BvGraph::with_basename("BASENAME").load()?;
```

The [`with_basename`] method returns a [`LoadConfig`] instance that can be
further customized, selecting endianness, type of memory access, and so on. By
default you will get big endianness, memory mapping for both the graph and the
offsets, and dynamic code dispatch.

Note that on Windows memory mapping requires that the length of the graph file
is a multiple of the size of the internal bit buffer. You can use the CLI command
`run pad u32` to ensure that your graph file is properly padded.

Once you load the graph, you can [retrieve the successors of a node] or
[iterate on the whole graph]. In particular, using the handy [`for_`] macro,
you can write an iteration on the graph as

```ignore
for_![(src, succ) in graph {
for dst in succ {
[do something with the arc src -> dst]
}
}];
```

## Mutable Graphs

A number of structures make it possible to create dynamically growing graphs:
[`BTreeGraph`], [`VecGraph`] and their labeled counterparts
[`LabeledBTreeGraph`] and [`LabeledVecGraph`]. These structures can also
be serialized with [serde](https://crates.io/crates/serde) using the feature
gate `serde`; [`VecGraph`]/[`LabeledVecGraph`] can also be serialized with
[ε-serde](https://crates.io/crates/epserde).

## Command–Line Interface

We provide a command-line interface to perform various operations on graphs. The
CLI is the main binary of the crate, so it can be executed with `cargo run`.

## More Options

- By starting from the [`BvGraphSeq`] struct you can obtain an instance that does
not need the `BASENAME.ef` file, but provides only [iteration].

- Graphs can be labeled by [zipping] them together with a [labeling]. In fact,
graphs are just labelings with `usize` labels.

## Operating on Graphs

There are many operations available on graphs, such as [transpose],
[simplify], and [permute].

## Compressing Graphs Given as List of Arcs

A simple way to compress a graph is to provide it as a list of arcs. The
`webgraph` CLI provides a command `from` with a subcommand `arcs` that reads a
TAB-separated list of arcs from standard input and writes a compressed
[`BvGraph`]. For example,

```bash
echo -e "0\t1\n1\t2\n2\t3" >3-cycle.tsv
cargo run --release from arcs --exact 3-cycle <3-cycle.tsv
```

will create a compressed graph with basename `3-cycle`. The `--exact` flag
is used to specify that the labels provided are exactly the node numbers,
numbered starting from zero: otherwise, a mapping from assigned node number to
labels will be created in RAM and stored in the `3-cycle.nodes` file.
The labels are stored in a `HashMap`, so, for very large graphs, the mapping
might not fit in RAM. For example,

```bash
echo -e "a\tb\nb\tc\nc\ta" > graph.tsv
# convert to bvgraph
cat graph.tsv | cargo run --release from arcs graph
```

The graph can be converted back to the arcs format using the `to arcs` command.
Passing the `.nodes` file to `--labels` will write the labels instead of the
node numbers.

```bash
# convert back to tsv
cargo run --release to arcs --labels=graph.nodes graph > back.tsv
```

Moreover, the `--separator` argument can be used in both `from arcs` and `to arcs`
to change the character that separates source and target to parse other formats
such as `csv`. For example,

```bash
echo -e "a,b\nb,c\nc,a" > graph.csv
# convert to bvgraph
cat graph.csv | cargo run --release from arcs --separator=',' graph
# convert back to csv
cargo run --release to arcs --separator=',' --labels=graph.nodes graph > back.csv
```
- CLI commands: [`webgraph-cli`](http://crates.io/crates/webgraph-cli)
([repo](/cli))

## Acknowledgments

Expand All @@ -190,25 +25,3 @@ grant ANR-20-CE23-0002 of the French Agence Nationale de la Recherche. Views and
opinions expressed are however those of the authors only and do not necessarily
reflect those of the European Union or the Italian MUR. Neither the European
Union nor the Italian MUR can be held responsible for them.

[transpose]: <https://docs.rs/webgraph/latest/webgraph/transform/fn.transpose.html>
[simplify]: <https://docs.rs/webgraph/latest/webgraph/transform/fn.simplify.html>
[permute]: <https://docs.rs/webgraph/latest/webgraph/transform/fn.permute.html>
[`with_basename`]: <https://docs.rs/webgraph/latest/webgraph/graphs/bvgraph/random_access/struct.BvGraph.html#method.with_basename>
[`BVGraphSeq`]: <https://docs.rs/webgraph/latest/webgraph/graphs/bvgraph/sequential/struct.BvGraphSeq.html>
[`BVGraph`]: <https://docs.rs/webgraph/latest/webgraph/graphs/bvgraph/random_access/struct.BvGraph.html>
[`LoadConfig`]: <https://docs.rs/webgraph/latest/webgraph/graphs/bvgraph/load/struct.LoadConfig.html>
[iterate on the whole graph]: <https://docs.rs/webgraph/latest/webgraph/traits/labels/trait.SequentialLabeling.html#method.iter>
[zipping]: <https://docs.rs/webgraph/latest/webgraph/labels/zip/struct.Zip.html>
[labeling]: <https://docs.rs/webgraph/latest/webgraph/traits/labels/trait.SequentialLabeling.html>
[iteration]: <https://docs.rs/webgraph/latest/webgraph/traits/labels/trait.SequentialLabeling.html#method.iter>
[retrieve the successors of a node]: <https://docs.rs/webgraph/latest/webgraph/traits/graph/trait.RandomAccessGraph.html#method.successors>
[LAW website]: <http://law.di.unimi.it/>
[Elias–Fano]: <sux::dict::EliasFano>
[WebGraph framework]: <https://webgraph.di.unimi.it/>
[ε-serde]: <https://crates.io/crates/epserde/>
[`for_`]: <https://docs.rs/lender/latest/lender/macro.for_.html>
[`VecGraph`]: <https://docs.rs/webgraph/latest/webgraph/graphs/vec_graph/struct.VecGraph.html>
[`LabeledVecGraph`]: <https://docs.rs/webgraph/latest/webgraph/graphs/vec_graph/struct.LabeledVecGraph.html>
[`BTreeGraph`]: <https://docs.rs/webgraph/latest/webgraph/graphs/btree_graph/struct.BTreeGraph.html>
[`LabeledBTreeGraph`]: <https://docs.rs/webgraph/latest/webgraph/graphs/btree_graph/struct.LabeledBTreeGraph.html>
19 changes: 19 additions & 0 deletions algo/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Change Log

## [0.1.1] - 2025-04-01

### New

* ExactSumSweep algorithm for eccentricities, radius, and diameter.

### Fixed

* Fixed crate name.

* Strongly connected components.

## [0.1.0] - 2025-03-31

### New

* First release.
32 changes: 32 additions & 0 deletions algo/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Package metadata. Key order: `name`, `version`, then the remaining keys
# alphabetically, with `description` last.
[package]
name = "webgraph-algo"
version = "0.1.1"
authors = [
    "Tommaso Fontana <tommaso.fontana.96@gmail.com>",
    "Matteo Dell'Acqua <dellacqua.matteo99@gmail.com>",
    "Sebastiano Vigna <sebastiano.vigna@unimi.it>",
]
categories = ["compression", "data-structures", "algorithms"]
edition = "2021"
keywords = ["graph", "compression", "codes", "webgraph"]
license = "Apache-2.0 OR LGPL-2.1-or-later"
readme = "README.md"
repository = "https://github.com/vigna/webgraph-rs/"
description = "Algorithms for the Rust port of the WebGraph framework (http://webgraph.di.unimi.it/)."

# Runtime dependencies, sorted alphabetically. Entries written as
# `<name>.workspace = true` inherit their specification from the
# `[workspace.dependencies]` table of the workspace root manifest; the
# others carry their own version requirement here.
[dependencies]
anyhow.workspace = true
dsi-progress-logger.workspace = true
lender = "0.3.1"
no-break = "0.1.2"
nonmax = "0.5.5"
parallel_frontier = "0.1.0"
rayon.workspace = true
sealed = "0.6.0"
sux.workspace = true
sync-cell-slice = "0.9.11"
thiserror = "2.0.12"
webgraph.workspace = true

# Needed only when building tests, examples, and benchmarks.
[dev-dependencies]
rand.workspace = true
Loading