Skip to content

Commit 7237db0

Browse files
committed
Add sketch quality figure
1 parent b70f505 commit 7237db0

File tree

4 files changed

+69
-5
lines changed

4 files changed

+69
-5
lines changed

analysis/latex.R

+3-3
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ latex_table_best <- function(data) {
2727
format = "latex"
2828
),
2929
# more compact names
30-
algorithm = str_remove(algorithm, "LSH"),
31-
dataset = if_else(dataset == "Livejournal", "LJ", as.character(dataset))
30+
# algorithm = str_remove(algorithm, "LSH"),
31+
# dataset = if_else(dataset == "Livejournal", "LJ", as.character(dataset))
3232
) %>%
3333
ungroup() %>%
3434
select(dataset, threshold, algorithm, total_time, recall, k, sketch_bits) %>%
@@ -40,7 +40,7 @@ latex_table_best <- function(data) {
4040
) %>%
4141
kbl(
4242
format = "latex",
43-
align = "ll rrll rrll",
43+
align = c("l", "l", "r", "r", "l", "l", "r", "r", "l", "l"),
4444
escape = F,
4545
booktabs = T,
4646
linesep = c("", "", "", "\\addlinespace"),

analysis/plot_k_dep.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ plot_counters <- function(data, t, ylabels = FALSE) {
101101
}
102102

103103
(plot_counters(plotdata, 0.5, ylabels = TRUE) | plot_counters(plotdata, 0.7)) / guide_area() +
104-
plot_layout(guides = "collect", heights = c(3, 1))
104+
plot_layout(guides = "collect", heights = c(10, 1))
105105

106106
ggsave("imgs/counters.png", width = 8, height = 5)
107107

analysis/plot_sketches.R

+18-1
Original file line numberDiff line numberDiff line change
@@ -188,4 +188,21 @@ ggsave("imgs/sketches.png", width = 10, height = 5)
188188
# loss512 = (`0` - `512`) / `0`
189189
# ) %>%
190190
# select(dataset, threshold, loss64, loss128, loss256, loss512) %>%
191-
# summarise(across(loss64:loss512, ~max(.x) %>% scales::percent(accuracy=0.001)))
191+
# summarise(across(loss64:loss512, ~max(.x) %>% scales::percent(accuracy=0.001)))
192+
193+
# Lollipop chart of the fraction of pairs lost for each sketch size, with one
# panel per (dataset, threshold) combination. Runs without sketches
# (sketch_bits == 0) are excluded; the horizontal line sits at 1%.
ggplot(
  select(
    filter(table_sketch_quality(), sketch_bits > 0),
    dataset, threshold, sketch_bits, lost_pairs, lost_fraction
  ),
  aes(x = factor(sketch_bits), y = lost_fraction)
) +
  geom_point() +
  # Stem of each lollipop: from the point straight down to the x axis.
  geom_segment(aes(xend = factor(sketch_bits)), yend = 0) +
  geom_hline(yintercept = 0.01) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "sketch bits", y = "false negative rate") +
  facet_wrap(vars(dataset, threshold), ncol = 8) +
  theme_paper()

# Saves the most recently created plot.
ggsave("imgs/sketch_loss.png", width = 10, height = 2)

analysis/tables.R

+47
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,53 @@ recode_algorithms <- function(data) {
3131
)
3232
}
3333

34+
# Build the table used to assess how many result pairs are lost when
# sketches are enabled.
#
# Reads the `result_recent` table from `danny-results.sqlite`, keeps the runs
# that match the configuration filters below and whose recoded algorithm is
# "Cartesian", and pairs every run with the output size of the run on the same
# dataset and threshold that used no sketches (`sketch_bits == 0`).
#
# Returns the filtered runs with three additional columns:
#   base_output_size  output size of the matching sketch_bits == 0 run
#   lost_pairs        base_output_size - output_size
#   lost_fraction     lost_pairs / base_output_size
#
# Depends on `baseinfo`, `order_datasets()` and `recode_algorithms()`, which
# are defined elsewhere in the project.
table_sketch_quality <- function() {
  db <- DBI::dbConnect(RSQLite::SQLite(), "danny-results.sqlite")
  # Release the connection even when one of the steps below fails.
  on.exit(DBI::dbDisconnect(db), add = TRUE)

  all <- tbl(db, "result_recent") %>%
    collect() %>%
    mutate(
      timed_out = is.na(output_size) # && !is.na(total_time_ms)
    ) %>%
    filter(hosts == "10.1.1.1:2001__10.1.1.2:2001__10.1.1.3:2001__10.1.1.4:2001__10.1.1.5:2001") %>%
    filter(profile_frequency == 0) %>%
    filter(!str_detect(path, "sample-200000.bin")) %>%
    filter(required_recall == 0.8) %>%
    filter(threshold %in% c(0.5, 0.7)) %>%
    filter(!no_verify, !no_dedup) %>%
    # filter(algorithm != "two-level-lsh" | (repetition_batch >= 1000)) %>%
    mutate(dataset = basename(path)) %>%
    inner_join(baseinfo) %>%
    mutate(
      dry_run = as.logical(dry_run),
      total_time = set_units(total_time_ms, "ms"),
      # Collapse file names into display names for the datasets.
      dataset = case_when(
        str_detect(dataset, "sift") ~ "SIFT",
        str_detect(dataset, "Livejournal") ~ "Livejournal",
        str_detect(dataset, "[Gg]love") ~ "Glove",
        str_detect(dataset, "Orkut") ~ "Orkut"
      )
    ) %>%
    order_datasets() %>%
    recode_algorithms() %>%
    filter(algorithm == "Cartesian") %>%
    select(-total_time_ms)

  # Output size obtained without sketches, per dataset and threshold.
  baseline <- filter(all, sketch_bits == 0) %>%
    select(dataset, threshold, base_output_size = output_size)

  inner_join(all, baseline, by = c("dataset", "threshold")) %>%
    mutate(
      lost_pairs = base_output_size - output_size,
      # Normalise by the sketch-free output size: the lost pairs are a
      # fraction of the pairs that should have been reported, not of the
      # pairs that were.
      lost_fraction = lost_pairs / base_output_size
    )
}
79+
80+
3481
table_search_best <- function() {
3582
db <- DBI::dbConnect(RSQLite::SQLite(), "danny-results.sqlite")
3683
# The load is the maximum load among all the workers in a given experiment

0 commit comments

Comments
 (0)