In [1]:
using Graphs: DiGraph
using StatsPlots
include("Source/datasets.jl")
include("Source/QuasiStableCardinalityEstimator.jl")

aids_data_file_path = "dataset/aids/aids.txt"
human_data_file_path = "dataset/human/human.txt"
lubm80_data_file_path = "dataset/lubm80/lubm80.txt"
yago_data_file_path = "dataset/yago/yago.txt"
yeast_data_file_path = "dataset/yeast/yeast.graph"
hprd_data_file_path = "dataset/hprd/hprd.graph"
wordnet_data_file_path = "dataset/wordnet/wordnet.graph"
dblp_data_file_path = "dataset/dblp/dblp.graph"
youtube_data_file_path = "dataset/youtube/youtube.graph"

human_data = load_dataset(human_data_file_path)
aids_data = load_dataset(aids_data_file_path)
yeast_data = load_dataset(yeast_data_file_path, subgraph_matching_data=true)
hprd_data = load_dataset(hprd_data_file_path, subgraph_matching_data=true)
wordnet_data = load_dataset(wordnet_data_file_path, subgraph_matching_data=true)
dblp_data = load_dataset(dblp_data_file_path, subgraph_matching_data=true)
youtube_data = load_dataset(youtube_data_file_path, subgraph_matching_data=true)
#lubm80_data = load_dataset(lubm80_data_file_path)
#yago_data = load_dataset(yago_data_file_path)
nothing

In [2]:
#datasets = [aids_data, human_data, lubm80_data, yago_data]
#dataset_names = ["aids", "human", "lubm80", "yago"]
datasets = Dict("aids"=>aids_data, "human"=>human_data, "yeast"=>yeast_data, "hprd" => hprd_data, "wordnet" => wordnet_data)
#dataset_names = ["aids", "human", "yeast"]
dataset_names = ["wordnet"]
num_sample_nodes = Dict("aids"=>100, "human"=>100, "yeast"=>50, "hprd"=>100, "wordnet"=>10)
gcare_dataset = Dict("aids"=>true, "human"=>true, "yeast"=>false, "hprd"=>false, "wordnet"=>false)
max_cycle_size = Dict("aids"=>6, "human"=>-1, "yeast"=>6, "hprd"=>-1, "wordnet"=>3)

Dict{String, Int64} with 5 entries:
  "yeast"   => 6
  "wordnet" => 3
  "hprd"    => -1
  "aids"    => 6
  "human"   => -1

In [9]:
coloring_methods = ["Quasistable", "DirectedDegree"]
dataset_names = ["wordnet"]

# set up queries
query_directories = Dict()
query_directories["human"] = ["/queryset/human/Chain_3/",
                                "/queryset/human/Graph_3/",
                                "/queryset/human/Star_3/",
                                "/queryset/human/Tree_3/"]
query_directories["aids"] = ["/queryset/aids/Chain_3/",
                                "/queryset/aids/Chain_6/",
                                "/queryset/aids/Chain_9/",
                                "/queryset/aids/Chain_12/",
                                "/queryset/aids/Cycle_3/",
                                "/queryset/aids/Cycle_6/",
                                "/queryset/aids/Flower_6/",
                                "/queryset/aids/Flower_9/",
                                "/queryset/aids/Flower_12/",
                                "/queryset/aids/Graph_3/",
                                "/queryset/aids/Graph_6/",
                                "/queryset/aids/Graph_9/",
                                "/queryset/aids/Graph_12/",
                                "/queryset/aids/Petal_6/",
                                "/queryset/aids/Petal_9/",
                                "/queryset/aids/Petal_12/",
                                "/queryset/aids/Star_3/",
                                "/queryset/aids/Star_6/",
                                "/queryset/aids/Star_9/",
                                "/queryset/aids/Tree_3/",
                                "/queryset/aids/Tree_6/",
                                "/queryset/aids/Tree_9/",
                                "/queryset/aids/Tree_12/"]
query_directories["yeast"] = ["/queryset/yeast"]
query_directories["hprd"] = ["/queryset/hprd"]
query_directories["wordnet"] = ["/queryset/wordnet"]

query_paths = Dict()
for dataset in dataset_names
    query_paths[dataset] = [readdir(pwd() * dir, join=true) for dir in query_directories[dataset]]
    query_paths[dataset] = [(query_paths[dataset]...)...]
end

num_colors = 16
# Create a dictionary of method => color summary
summaries = Dict()
bounds = Dict(method => Dict(dataset => [] for dataset in dataset_names) for method in coloring_methods)
relative_errors = Dict()
exact_sizes = Dict(dataset => [] for dataset in dataset_names)
for method in coloring_methods
    summaries[method] = Dict()
    for dataset in dataset_names
        summaries[method][dataset] = generate_color_summary(datasets[dataset], num_colors, verbose=false, max_size = max_cycle_size[dataset], max_partial_paths=num_sample_nodes[dataset], partitioner="DirectedDegree") 
    end
    bounds[method] = Dict(dataset => [] for dataset in dataset_names)
end

query_types = Dict(dataset => [] for dataset in dataset_names)
count = 0
# now run the datasets
for dataset in dataset_names
    println("Running: ", dataset)

    for query_path in query_paths[dataset]
        occursin("dense_32", query_path) && continue
        occursin("dense_24", query_path) && continue
        occursin("dense_20", query_path) && continue
        # occursin("dense_16", query_path) && continue
        # occursin("dense_12", query_path) && continue
        # occursin("dense_8", query_path) && continue
        occursin("sparse_32", query_path) && continue
        occursin("sparse_24", query_path) && continue
        occursin("sparse_20", query_path) && continue
        occursin("sparse_16", query_path) && continue
        occursin("sparse_12", query_path) && continue
        occursin("sparse_8", query_path) && continue
        count += 1
        count % 10 != 0 && continue
        if gcare_dataset[dataset]
            query_type = match(r".*/queryset/.*\\(.*)_.*\.*", query_path).captures[1]
            exact_size = load_true_cardinality(replace(query_path, "queryset"=>"TrueCardinalities"))
            id_and_query = load_query(query_path)
            id = id_and_query[1]
            query = id_and_query[2]
        else
            thing = match(r".*/queryset/.*\\query_(.*)_.*", query_path)
            if (thing === nothing)
                println("missed")
                continue
            end
            query_type = match(r".*/queryset/.*\\query_(.*)_.*", query_path).captures[1]
            if isfile(replace(query_path, "queryset"=>"TrueCardinalities"))
                exact_size = load_true_cardinality(replace(query_path, "queryset"=>"TrueCardinalities"))
            else
                continue
            end
            id_and_query = load_query(query_path, subgraph_matching_data=true)
            id = id_and_query[1]
            query = id_and_query[2]
        end
        
        push!(exact_sizes[dataset], exact_size)
        for method in coloring_methods
            current_bounds = get_cardinality_bounds(query, summaries[method][dataset], usingStoredStats=false)
            current_bounds[2] = max(1, current_bounds[2])
            push!(bounds[method][dataset], current_bounds)
            push!(query_types[dataset], query_type)
        end
    end
    for method in coloring_methods
        relative_errors[method] = Dict(dataset => bounds[method][dataset] ./ exact_sizes[dataset] for dataset in dataset_names)
    end
end

current_dataset = "wordnet"
grouped_errors = vec([[x[2] for x in relative_errors[method][current_dataset]]... for method in coloring_methods])
grouped_query_types = vec([query_types[current_dataset]... for method in coloring_methods])
grouped_datasets = vec([[method for _ in 1:length(query_types[current_dataset])]... for method in coloring_methods])

groupedboxplot(grouped_query_types, grouped_errors, group = grouped_datasets, yscale =:log10,  ylims=(10^-5, 10^11), yticks=[10^-5, 1, 10^5, 10^10])


Running: wordnet


ErrorException: syntax: "..." expression outside call around c:\Users\diand\OneDrive\Desktop\Cardinality-with-Colors\g-care-diandre-new.ipynb:115