## Joins with Quasi-stable Coloring

In [4]:
using Distributions
using DataStructures: counter, Dict, Set, Vector, inc!

In [5]:
n = 200000
numVertices = 100000
zipf = [1.0/(i^.5) for i in 1:numVertices]
zipf = zipf ./ sum(zipf)
nothing #hide

Let's generate two arrays of size $n$ representing the edges in a graph G where the edges are generated according to a zipf distribution. Then makes two graph formats for convenience.

In [6]:
d = DiscreteNonParametric(1:numVertices, zipf)
x1 = rand(d, n) .% numVertices
x2 = rand(d, n) .% numVertices
EDict = Dict()
for x in Set(x1)
    EDict[x] = Set()
end
for i in 1:length(x1)
    push!(EDict[x1[i]], x2[i])
end

numEdges = sum(length(EDict[x]) for x in keys(EDict))
ETable = Array{Int64}(undef, numEdges, 2)
edgeCounter = 1
for x in keys(EDict)
    for y in EDict[x]
        ETable[edgeCounter,1] = x
        ETable[edgeCounter,2] = y
        edgeCounter += 1
    end
end

hash them then count the hash values:

In [7]:
numPartitions = 8
inDegCounters = [counter(Int32) for _ in 1:numPartitions]
outDegCounters = [counter(Int32) for _ in 1:numPartitions]
hashETable = (ETable .% numPartitions) .+ 1
for i in 1:size(ETable)[1]
    inc!(inDegCounters[hashETable[i,1]], ETable[i,1])
    inc!(outDegCounters[hashETable[i,2]], ETable[i,2])
end


hashCardinality::Dict{Int, Dict{Int, Int}} = Dict()
hashMaxDeg::Dict{Int, Dict{Int, Int}} = Dict()
for c1 in 1:numPartitions
    hashMaxDeg[c1] = Dict()
    for c2 in 1:numPartitions
        hashMaxDeg[c1][c2] = 0
        for x in keys(EDict)
            if x .% numPartitions + 1 == c1
                numEdgesToColor = 0
                for y in EDict[x]
                    if y .% numPartitions + 1 == c2
                        numEdgesToColor += 1
                    end
                end
                hashMaxDeg[c1][c2] = max(numEdgesToColor, hashMaxDeg[c1][c2])
                hashCardinality[c1][c2] += numEdgesToColor
            end
        end
    end
end

LoadError: syntax: incomplete: "for" at In[7]:13 requires end

by *Walter's method* we get an upper cardinality bound of the following for 3-hop paths in our graph: 

In [8]:
estimate_prior = 0
for x in keys(hashMaxDeg)
    if x in keys(hashMaxDeg)
        for y in keys(hashMaxDeg[x])
            if y in keys(hashMaxDeg)
                for z in keys(hashMaxDeg[y])
                    if z in keys(hashMaxDeg)
                        for u in keys(hashMaxDeg[z])
                            estimate_prior += min(hashCardinality[x]*hashMaxDeg[x][y]*hashMaxDeg[y][z]*hashMaxDeg[z][u],
                                                hashMaxDeg[y][x]*hashCardinality[y]*hashMaxDeg[y][z]*hashMaxDeg[z][u],
                                                hashMaxDeg[y][x]*hashMaxDeg[z][y]*hashCardinality[z]*hashMaxDeg[z][u],
                                                hashMaxDeg[y][x]*hashMaxDeg[z][y]*hashMaxDeg[u][z]*hashCardinality[u],
                                                )
                        end
                    end
                end
            end
        end
    end
end
estimate_prior

87381531694

## Graph Coloring
Now let's transform this into a graph coloring problem:

In [9]:
using Graphs
using QuasiStableColors
numPartitions = 32
g = Graph(numVertices)
for x in keys(EDict)
    for y in EDict[x]
        add_edge!(g, x, y)
    end
end

C = q_color(g, n_colors=numPartitions)
color_hash::Dict{Int, Int} = Dict()
for (color, nodes) in enumerate(C)
    for x in nodes
        color_hash[x - 1] = color
    end
end

In [10]:
colorColorCounter = Dict()
for x in keys(EDict)
    for y in EDict[x] 
        c1 = color_hash[x]
        c2 = color_hash[y]
        if !(c1 in keys(colorColorCounter))
           colorColorCounter[c1] = Dict()
        end
        if !(c2 in keys(colorColorCounter[c1]))
           colorColorCounter[c1][c2] = counter(Int)
        end
        inc!(colorColorCounter[c1][c2], x)
    end
end

colorEdgeCardinality::Dict{Int, Dict{Int, Int}} = Dict()
colorEdgeMaxDeg::Dict{Int, Dict{Int, Int}} = Dict()
for c1 in keys(colorColorCounter)
    colorEdgeMaxDeg[c1] = Dict()
    for c2 in keys(colorColorCounter[c1])
        colorEdgeMaxDeg[c1][c2] =  0 
        colorEdgeCardinality[c1][c2] = 0
        for v in values(colorColorCounter[c1][c2])
            colorEdgeMaxDeg[c1][c2] = max(v, colorEdgeMaxDeg[c1][c2])
            colorEdgeCardinality[c1][c2] += v
        end 
    end
end

In [249]:
colorEdgeMaxDeg

Dict{Int64, Dict{Int64, Int64}} with 32 entries:
  5  => Dict(5=>3, 16=>1, 20=>1, 30=>2, 28=>2, 24=>1, 17=>1, 8=>1, 1=>15, 19=>1…
  16 => Dict(5=>5, 16=>1, 30=>8, 8=>1, 1=>129, 19=>1, 22=>7, 6=>4, 31=>1, 3=>23…
  20 => Dict(5=>4, 24=>1, 30=>3, 8=>1, 17=>1, 28=>1, 1=>52, 19=>1, 22=>2, 32=>1…
  12 => Dict(5=>4, 20=>1, 30=>1, 28=>1, 23=>1, 1=>36, 22=>3, 19=>1, 3=>8, 29=>2…
  30 => Dict(5=>3, 16=>1, 20=>1, 12=>1, 30=>3, 28=>3, 24=>1, 8=>1, 17=>1, 1=>8…)
  28 => Dict(5=>2, 16=>1, 20=>2, 12=>1, 24=>1, 28=>2, 30=>1, 17=>1, 8=>1, 1=>16…
  24 => Dict(5=>3, 16=>1, 30=>2, 28=>1, 17=>1, 1=>21, 19=>1, 32=>1, 22=>1, 6=>1…
  17 => Dict(5=>4, 16=>1, 20=>1, 30=>4, 28=>3, 17=>1, 1=>50, 19=>2, 22=>3, 32=>…
  8  => Dict(5=>3, 20=>1, 30=>2, 24=>1, 28=>2, 8=>1, 1=>31, 19=>1, 22=>1, 23=>1…
  1  => Dict(5=>2, 16=>1, 20=>1, 12=>1, 30=>2, 28=>2, 24=>1, 17=>1, 8=>1, 1=>6…)
  19 => Dict(5=>3, 16=>1, 20=>1, 12=>1, 30=>2, 28=>2, 8=>1, 17=>1, 24=>1, 1=>16…
  22 => Dict(5=>3, 16=>1, 20=>1, 12=>1, 30=>2, 28=>2, 8=>1, 

In [12]:
estimate = 0
for x in keys(colorEdgeMaxDeg)
    for y in keys(colorEdgeMaxDeg[x])
        if y in keys(colorEdgeMaxDeg)
            for z in keys(colorEdgeMaxDeg[y])
                if z in keys(colorEdgeMaxDeg)
                    for u in keys(colorEdgeMaxDeg[z])
                        if u in keys(colorEdgeMaxDeg[z])
                            estimate += min(colorEdgeCardinality[x][y]*colorEdgeMaxDeg[y][z]*colorEdgeMaxDeg[z][u],
                                            colorEdgeMaxDeg[x][y]*colorEdgeCardinality[y][z]*colorEdgeMaxDeg[z][u],
                                            colorEdgeMaxDeg[x][y]*colorEdgeMaxDeg[y][z]*colorEdgeCardinality[z][u]
                                            )
                        end
                    end
                end
            end
        end
    end
end
estimate

2.1777812181572247e7

In [240]:
94091272833556/396669879387

237.20296831955434

In [241]:
colorCardinality

Dict{Int64, Int64} with 128 entries:
  5   => 164
  56  => 217
  35  => 2
  55  => 3
  110 => 6
  114 => 16
  123 => 9
  60  => 2
  30  => 440
  32  => 3
  6   => 20
  67  => 7
  45  => 2
  117 => 5
  73  => 3820
  115 => 2
  112 => 2
  64  => 22
  90  => 1190
  4   => 1516
  13  => 11
  54  => 56
  63  => 3
  86  => 4
  104 => 9
  ⋮   => ⋮

In [242]:
colorEdgeMaxDeg

Dict{Int64, Dict{Int64, Int64}} with 128 entries:
  5   => Dict(5=>1, 56=>2, 123=>1, 55=>1, 30=>2, 32=>1, 6=>1, 45=>1, 73=>3, 90=…
  56  => Dict(5=>1, 56=>1, 123=>1, 55=>1, 35=>1, 114=>1, 60=>1, 30=>2, 32=>1, 6…
  55  => Dict(5=>1, 56=>1, 81=>1, 72=>1, 30=>1, 1=>20, 22=>1, 43=>5, 69=>1, 99=…
  123 => Dict(56=>1, 16=>1, 79=>1, 20=>1, 72=>1, 30=>1, 75=>1, 37=>1, 111=>1, 1…
  110 => Dict(56=>1, 81=>1, 30=>1, 75=>1, 1=>11, 83=>1, 43=>2, 45=>1, 98=>1, 73…
  114 => Dict(5=>1, 56=>1, 81=>1, 30=>1, 1=>8, 53=>1, 22=>1, 43=>3, 69=>1, 68=>…
  35  => Dict(5=>1, 56=>1, 30=>1, 75=>1, 1=>15, 53=>1, 83=>1, 49=>1, 43=>3, 73=…
  60  => Dict(5=>2, 56=>1, 20=>1, 105=>1, 30=>1, 75=>1, 1=>26, 43=>4, 99=>1, 31…
  30  => Dict(5=>1, 56=>1, 123=>1, 114=>1, 55=>1, 60=>1, 30=>1, 6=>1, 67=>1, 45…
  32  => Dict(35=>1, 105=>1, 28=>1, 75=>1, 1=>19, 53=>1, 22=>1, 43=>4, 99=>1, 7…
  6   => Dict(5=>1, 56=>1, 30=>1, 117=>1, 45=>1, 73=>3, 115=>1, 64=>1, 90=>2, 4…
  67  => Dict(5=>1, 56=>1, 81=>1, 30=>1, 1=>17, 53=>1, 92=>

The actual cardinality is:

In [13]:
cardinality = 0
for x in keys(EDict)
    for y in EDict[x]
        if y in keys(EDict)
            for z in EDict[y]
                if z in keys(EDict)
                    cardinality += length(EDict[z])
                end
            end
        end
    end
end
cardinality

7141028

In [14]:
"Hashing relative error: $(estimate_prior / cardinality), color hashing relative error: $(estimate / cardinality)"

"Hashing relative error: 12236.547972364764, color hashing relative error: 3.049674666108612"

In [255]:
12000/99

121.21212121212122