In [24]:
include("Source/QuasiStableCardinalityEstimator.jl")
using Distributions
using DataStructures: counter, Dict, Set, Vector, inc!
using Plots

┌ Info: Precompiling QuasiStableColors [9c3856af-3e7c-4d34-a6af-a406867b22e4]
└ @ Base loading.jl:1664
[33m[1m│ [22m[39mThis may mean Graphs [86223c79-3864-5bf0-83f7-82e725a168b6] does not support precompilation but is imported by a module that does.
[33m[1m└ [22m[39m[90m@ Base loading.jl:1325[39m
┌ Info: Skipping precompilation since __precompile__(false). Importing QuasiStableColors [9c3856af-3e7c-4d34-a6af-a406867b22e4].
└ @ Base loading.jl:1341
┌ Info: Precompiling GraphsFlows [06909019-6f44-4949-96fc-b9d9aaa02889]
└ @ Base loading.jl:1664
[33m[1m│ [22m[39mThis may mean Graphs [86223c79-3864-5bf0-83f7-82e725a168b6] does not support precompilation but is imported by a module that does.
[33m[1m└ [22m[39m[90m@ Base loading.jl:1325[39m
┌ Info: Skipping precompilation since __precompile__(false). Importing GraphsFlows [06909019-6f44-4949-96fc-b9d9aaa02889].
└ @ Base loading.jl:1341
┌ Info: Precompiling JuMP [4076af6c-e467-56ae-b986-b466b2749572]
└ @ Base loading.jl:1

First, we generate a zipfian graph of moderate size.

In [25]:
n = 200000
numEdgeLabels = 100
numVertexLabels = 3
numVertices = 1000
d = DiscreteNonParametric(1:numVertices, [1.0/(i^.5) for i in 1:numVertices]/sum([1.0/(i^.5) for i in 1:numVertices]))
dEdgeLabels = DiscreteNonParametric(1:numEdgeLabels, [1.0/(i^.75) for i in 1:numEdgeLabels]/sum([1.0/(i^.75) for i in 1:numEdgeLabels]))
dVertexLabels = DiscreteNonParametric(1:numVertexLabels, [1.0/(i^.75) for i in 1:numVertexLabels]/sum([1.0/(i^.75) for i in 1:numVertexLabels]))
x1 = rand(d, n) .% numVertices .+ 1
x2 = rand(d, n) .% numVertices .+ 1
g = PropertyGraph(numVertices)
for x in hcat(x1, x2)
    add_labeled_node!(g, x, [only(rand(dVertexLabels, 1))])
end
for i in range(1, length(x1))
    add_labeled_edge!(g, (x1[i], x2[i]), only(rand(dEdgeLabels, 1)))
end

Then, we generate our lifted graph summary which includes cardinality and min/avg/max degree information about every edge between colors.

In [26]:
summary = generate_color_summary(g, 64)
nothing

In [27]:
get_color_summary_size(summary)

1521180

Lastly, we create a simple acyclic query graph composed of a star with a small leg and see how our bounds compare with the exact result.

In [28]:
query = PropertyGraph(4)
add_labeled_node!(query, 1, [1])
add_labeled_node!(query, 2, [1])
add_labeled_node!(query, 3, [1])
add_labeled_node!(query, 4, [1])
add_labeled_edge!(query, (1,2), 10)
add_labeled_edge!(query, (2,3), 5)
add_labeled_edge!(query, (3,4), 1)
add_labeled_edge!(query, (4,1), 6)

1-element Vector{Int64}:
 6

In [29]:
bounds_without_partial_agg = get_cardinality_bounds(query, summary; use_partial_sums = false, try_all_starting_nodes=false, verbose=false)

3-element Vector{Float64}:
  0.0
 31.444869513359386
  2.072297e7

In [30]:
bounds_with_partial_agg = get_cardinality_bounds(query, summary; use_partial_sums = true, try_all_starting_nodes=false, verbose=false)

3-element Vector{Float64}:
  0.0
 31.444869513359432
  2.072297e7

In [31]:
bounds_without_partial_agg ./ bounds_with_partial_agg # Check that the partial summation doesn't affect the output (beyond floating point issues)

3-element Vector{Float64}:
 NaN
   0.9999999999999986
   1.0

In [32]:
exact_size = only(get_exact_size(query, g; verbose=false))

685

In [33]:
println("Relative Error (Lower): ", only(bounds_with_partial_agg[1])/exact_size)
println("Relative Error (Avg): ", only(bounds_with_partial_agg[2])/exact_size)
println("Relative Error (Upper): ", only(bounds_with_partial_agg[3])/exact_size)

Relative Error (Lower): 0.0
Relative Error (Avg): 0.04590491899760501
Relative Error (Upper): 30252.51094890511


Next, we plot the error for our bounds on a variety of graph sizes.

In [34]:
query_graph = DiGraph(6)
add_edge!(query_graph, (1,2))
add_edge!(query_graph, (2,3))
add_edge!(query_graph, (2,6))
add_edge!(query_graph, (3,4))
add_edge!(query_graph, (3,5))

bounds = []
exact_sizes = []
graph_sizes = []
for i in range(1, 12)
    numVertices = 5*2^i
    push!(graph_sizes, numVertices)
    n = numVertices*5
    zipf = [1.0/(i^.5) for i in 1:numVertices]
    zipf = zipf ./ sum(zipf)
    d = DiscreteNonParametric(1:numVertices, zipf)
    x1 = rand(d, n) .% numVertices
    x2 = rand(d, n) .% numVertices
    g = DiGraph(numVertices)
    for i in range(1, length(x1))
        add_edge!(g, x1[i], x2[i])
    end
    summary = generate_color_summary(g, 64)
    bounds_with_partial_agg = get_cardinality_bounds(query_graph, summary; use_partial_sums = true, try_all_starting_nodes=true, verbose=false)
    push!(bounds, bounds_with_partial_agg)
    exact_size = only(get_exact_size(query_graph, g; verbose=false))
    push!(exact_sizes, exact_size)
end

LoadError: MethodError: no method matching generate_color_summary(::SimpleDiGraph{Int64}, ::Int64)
[0mClosest candidates are:
[0m  generate_color_summary([91m::PropertyGraph[39m, ::Int64; weighting) at ~/Cardinality-with-Colors/Source/QuasiStableCardinalityEstimator.jl:21

In [None]:
lower_bounds = max.(.1, [x[1] for x in bounds]./exact_sizes)
avg_bounds = [x[2] for x in bounds]./exact_sizes
upper_bounds = [x[3] for x in bounds]./exact_sizes
plot(graph_sizes, [lower_bounds avg_bounds upper_bounds], title="4 Chain Query", 
                label = ["Lower" "Avg" "Upper"], xlabel="Graph Size", ylabel = "Relative Error", 
                xscale=:log10, yscale=:log10, minorgrid=true)

In [None]:
using Graphs

function get_min_width_node_order(g::DiGraph)    
    partial_orders = [x for x in vertices(g)]
    while length(partial_orders[0]) < nv(g)
        
    end
    



    min_width = nv(g)
    min_order = []
    for starting_node in vertices(g)
        max_width = 0
        visited_nodes = [starting_node]
        while length(visited_nodes) < nv(g)
            new_width = nv(g)
            next_node = -1
            for potential_node in vertices(g)
                if potential_node in visited_nodes || !any([x in all_neighbors(g, potential_node) for x in visited_nodes])
                    continue
                end
                potential_visited_nodes = []
                copy!(potential_visited_nodes, visited_nodes)
                push!(potential_visited_nodes, potential_node)
                potential_num_active_nodes = 0
                for v in potential_visited_nodes
                    if ! all([x in potential_visited_nodes for x in all_neighbors(g, v)])
                        potential_num_active_nodes += 1
                    end
                end
                if potential_num_active_nodes <= new_width
                    next_node = potential_node
                    new_width = potential_num_active_nodes
                end
            end
            push!(visited_nodes, next_node)
            max_width = max(max_width, new_width)
        end
        if max_width <= min_width
            min_order = visited_nodes
            min_width = max_width
        end
    end
    return min_order
end