In [None]:
# non-gcare datasets:
# dblp, eu2005, hprd, human, patents, wordnet, yeast, youtube
# from https://github.com/RapidsAtHKUST/SubgraphMatching

In [6]:
using Graphs: DiGraph
using StatsPlots
include("Source/datasets.jl")
include("Source/QuasiStableCardinalityEstimator.jl")

get_cardinality_bounds (generic function with 1 method)

In [7]:
dblp_data_file_path = "dataset/dblp/dblp.graph"
eu2005_data_file_path = "dataset/eu2005/eu2005.graph"
hprd_data_file_path = "dataset/hprd/hprd.graph"
patents_data_file_path = "dataset/patents/patents.graph"
wordnet_data_file_path = "dataset/wordnet/wordnet.graph"
yeast_data_file_path = "dataset/yeast/yeast.graph"
youtube_data_file_path = "dataset/youtube/youtube.graph"

# dblp_data = load_dataset(dblp_data_file_path)
# eu2005_data = load_dataset(eu2005_data_file_path)
# hprd_data = load_dataset(hprd_data_file_path)
# patents_data = load_dataset(patents_data_file_path)
# wordnet_data = load_dataset(wordnet_data_file_path)
yeast_data = load_dataset(yeast_data_file_path, subgraph_matching_data=true)
# youtube_data = load_dataset(youtube_data_file_path)

nothing

In [8]:

# datasets = Dict("dblp"=>dblp_data, "eu2005"=>eu2005_data, "hprd"=>hprd_data,
#                 "patents"=>patents_data, "wordnet"=>wordnet_data,
#                 "yeast"=>yeast_data, "youtube"=>youtube_data)
# dataset_names = ["dblp", "eu2005", "hprd", "patents", "wordnet", "yeast", "youtube"]
datasets = Dict("yeast"=>yeast_data)
dataset_names = ["yeast"]
num_sample_nodes = Dict("dblp"=>100, "eu2005"=>100, "hprd"=>100,
                        "patents"=>100, "wordnet"=>100, 
                        "yeast"=>100, "youtube"=>100)
max_cycle_size = Dict("dblp"=>4, "eu2005"=>4, "hprd"=>4,
                      "patents"=>4, "wordnet"=>4, 
                      "yeast"=>4, "youtube"=>4)

Dict{String, Int64} with 7 entries:
  "wordnet" => 4
  "yeast"   => 4
  "hprd"    => 4
  "eu2005"  => 4
  "patents" => 4
  "youtube" => 4
  "dblp"    => 4

In [9]:
build_time = Dict()
summary_size = Dict()
color_summaries = Dict()
for dataset in dataset_names
    results = @timed generate_color_summary(datasets[dataset], 8, verbose=true, max_size = max_cycle_size[dataset], num_sample_nodes=num_sample_nodes[dataset])
    build_time[dataset] = results[2]
    summary_size[dataset] = get_color_summary_size(results[1])
    color_summaries[dataset] = results[1]
end

Started coloring
Finished coloring
Generating Cycles of Size: 

2


Generating Cycles of Size: 3


Generating Cycles of Size: 4


Started bloom filters
Finished bloom filters
Started cardinality counts
Finished cardinality counts
Started tracking statistics


Finished tracking statistics


In [10]:
include("Source/QuasiStableCardinalityEstimator.jl")

yeast_query_directories = ["/queryset/yeast"]
yeast_query_paths = [readdir(pwd() * dir, join=true) for dir in yeast_query_directories]
yeast_query_paths = [(yeast_query_paths...)...]
yeast_exact_sizes = []
yeast_bounds = []
yeast_bounds_with_stats = []
yeast_relative_errors = []
yeast_relative_errors_with_stats = []
println("Summary Size: ", summary_size["yeast"])
println("Summary Build Time: ", build_time["yeast"])
for query_path in yeast_query_paths
    #println("Query: ", query_path)
    id_and_query = load_query(query_path, subgraph_matching_data=true)
    id = id_and_query[1]
    query = id_and_query[2]
    bound_results = @timed get_cardinality_bounds(query, color_summaries["yeast"], usingStoredStats=false)
    bound_results_with_stats = @timed get_cardinality_bounds(query, color_summaries["yeast"], usingStoredStats=true)
    exact_size_results = @timed get_exact_size(query, datasets["yeast"])
#    println("Bound Time: ", bound_results[2])
#    println("Exact Size: ", gcare_size)
#    println("Bound [Lower, Avg, Upper]: ", bound_results[1])
#    println("Relative Error [Lower, Avg, Upper]: ", bound_results[1]./gcare_size)
    bound_results[1][2] = max(1, bound_results[1][2])
    bound_results_with_stats[1][2] = max(1, bound_results_with_stats[1][2])
    push!(yeast_exact_sizes, exact_size_results[1])
    push!(yeast_bounds, bound_results[1])
    push!(yeast_bounds_with_stats, bound_results_with_stats[1])
    push!(yeast_relative_errors, bound_results[1] ./ exact_size_results)
    push!(yeast_relative_errors_with_stats, bound_results_with_stats[1] ./ gcare_size)
end

# need to update packages...

Summary Size: nothing
Summary Build Time: 7.9703741


ArgumentError: ArgumentError: broadcasting over dictionaries and `NamedTuple`s is reserved

In [None]:
using Statistics
println(maximum([log10(x[2]) for x in yeast_relative_errors]))
println(mean([abs(log10(x[2])) for x in yeast_relative_errors]))
println(minimum([log10(x[2]) for x in yeast_relative_errors]))

In [None]:
using Statistics
println(maximum([log10(x[2]) for x in yeast_relative_errors_with_stats]))
println(mean([abs(log10(x[2])) for x in yeast_relative_errors_with_stats]))
println(minimum([log10(x[2]) for x in yeast_relative_errors_with_stats]))

In [None]:
StatsPlots.boxplot(yeast_query_type, [x[2] for x in yeast_relative_errors_with_stats],  yscale=:log10, fillalpha=0.5, linewidth=2, label="With Cycle Stats", ylims=[10^-4, 10^7], yticks=[10^-4,10^-2, 1, 10^2, 10^4, 10^6])
StatsPlots.boxplot!(yeast_query_type, [x[2] for x in yeast_relative_errors],  yscale=:log10, fillalpha=0.5, linewidth=2, label="Without Cycle Stats")