Explore poor query performance #3091

hadley · 2017-09-08T21:19:48Z

library(dplyr)
library(RPostgreSQL) 
myRedshift <- src_postgres(Sys.getenv("DBNAME"), 
                           host = Sys.getenv("JDBCURL"),
                           port = Sys.getenv("PORT"),
                           user = Sys.getenv("USER"), 
                           password = Sys.getenv("PW"))
experiment_id <- '455145790a164522d1f4f47f2df40c76728c84dd'  
date_start <- '2017-06-08'  
date_end <- '2017-06-15' 
experiment_data <- tbl(myRedshift, sql("SELECT *, DATE_TRUNC('day', ts)::date AS dt FROM p_experiments WHERE name LIKE '%variation::select%'")) %>%
  dplyr::filter(experimentid == experiment_id &
                  ts >= date_start &
                  ts < date_end &
                  !is.na(userid) &
                  variation != 'unknown')
cohorts <- experiment_data %>%
  group_by(userid, variation) %>%
  summarise(enrolment_date = min(dt))
reading_time <- tbl(myRedshift, "etl_reading") %>%
  dplyr::filter(createdate >= date_start &
                  createdate < date_end &
                  time_spent >= 0) %>%
  inner_join(cohorts, by = "userid") %>%
  dplyr::filter(createdate >= enrolment_date) %>%
  group_by(variation, userid) %>%
  summarise(rt = sum(time_spent)) %>%
  ungroup()
reading_time_all_allocated <- cohorts %>%
  left_join(reading_time, by = c("userid", "variation")) %>%
  collect(n = Inf)

produces this SQL (via explain()):

SELECT "TBL_LEFT"."userid" AS "userid", "TBL_LEFT"."variation" AS "variation", "TBL_LEFT"."enrolment_date" AS "enrolment_date", "TBL_RIGHT"."rt" AS "rt"
  FROM (SELECT "userid", "variation", MIN("dt") AS "enrolment_date"
FROM (SELECT *
FROM (SELECT *, DATE_TRUNC('day', ts)::date AS dt FROM p_experiments WHERE name LIKE '%variation::select%') "ihznadkpcf"
WHERE ("experimentid" = '455145790a164522d1f4f47f2df40c76728c84dd' AND "ts" >= '2017-06-08' AND "ts" < '2017-06-15' AND NOT((("userid") IS NULL)) AND "variation" != 'unknown')) "biptkfpowo"
GROUP BY "userid", "variation") "TBL_LEFT"
  LEFT JOIN (SELECT "variation", "userid", SUM("time_spent") AS "rt"
FROM (SELECT *
FROM (SELECT "TBL_LEFT"."platform" AS "platform", "TBL_LEFT"."uuid" AS "uuid", "TBL_LEFT"."userid" AS "userid", "TBL_LEFT"."storyid" AS "storyid", "TBL_LEFT"."partid" AS "partid", "TBL_LEFT"."createdate" AS "createdate", "TBL_LEFT"."read_percent" AS "read_percent", "TBL_LEFT"."time_spent" AS "time_spent", "TBL_RIGHT"."variation" AS "variation", "TBL_RIGHT"."enrolment_date" AS "enrolment_date"
  FROM (SELECT *
FROM "etl_reading"
WHERE ("createdate" >= '2017-06-08' AND "createdate" < '2017-06-15' AND "time_spent" >= 0.0)) "TBL_LEFT"
  INNER JOIN (SELECT "userid", "variation", MIN("dt") AS "enrolment_date"
FROM (SELECT *
FROM (SELECT *, DATE_TRUNC('day', ts)::date AS dt FROM p_experiments WHERE name LIKE '%variation::select%') "bvqcfsxath"
WHERE ("experimentid" = '455145790a164522d1f4f47f2df40c76728c84dd' AND "ts" >= '2017-06-08' AND "ts" < '2017-06-15' AND NOT((("userid") IS NULL)) AND "variation" != 'unknown')) "nnwriawvlp"
GROUP BY "userid", "variation") "TBL_RIGHT"
  ON ("TBL_LEFT"."userid" = "TBL_RIGHT"."userid")
) "xmtbxcquyw"
WHERE ("createdate" >= "enrolment_date")) "ostngvabdq"
GROUP BY "variation", "userid") "TBL_RIGHT"
  ON ("TBL_LEFT"."userid" = "TBL_RIGHT"."userid" AND "TBL_LEFT"."variation" = "TBL_RIGHT"."variation")

Comparing this to the actual SQL I'd write to get these results:

reading_time_all_allocated <- dbGetQuery(con, glue(
"WITH cohorts AS (
  SELECT 
    p.variation, 
    p.userid,
    DATE(MIN(p.ts)) AS enrolment_date
  FROM p_experiments p
  WHERE 
    p.experimentid = '{experiment_id}' 
    AND p.name LIKE '%variation::select%'
    AND p.userid IS NOT NULL 
    AND p.ts BETWEEN '{date_start}' AND '{date_end}'
    AND p.variation != 'unknown'
  GROUP BY p.variation, p.userid),
reading_time AS (
  SELECT 
    c.userid, 
    c.variation,
    COALESCE(SUM(er.time_spent), 0) AS rt
  FROM etl_reading er
  JOIN cohorts c ON er.userid = c.userid
  WHERE 
    er.createdate >= c.enrolment_date
    AND er.createdate BETWEEN '{date_start}' AND ('{date_end}' - INTEGER '1')
    AND er.time_spent >= 0
  GROUP BY c.userid, c.variation)
SELECT 
  c.userid, 
  c.variation, 
  COALESCE(r.rt, 0) AS rt 
FROM cohorts c
  LEFT OUTER JOIN reading_time r ON c.userid = r.userid AND c.variation = r.variation",
    experiment_id = experiment_id,
    date_start = date_start,
    date_end = date_end))

Probably because of the extra subqueries inside the joins.

ghost · 2018-06-07T23:41:05Z

This issue was moved by hadley to tidyverse/dbplyr/issues/95.

lock · 2018-12-05T00:38:20Z

This old issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with reprex) and link to this issue. https://reprex.tidyverse.org/

hadley added database performance 🚀 labels Sep 8, 2017

hadley mentioned this issue Nov 6, 2017

dbplyr generates unnecessary subqueries #3190

Closed

ghost mentioned this issue Jun 7, 2018

Explore poor query performance tidyverse/dbplyr#95

Closed

ghost deleted a comment from hadley Jun 7, 2018

ghost closed this as completed Jun 7, 2018

lock bot locked and limited conversation to collaborators Dec 5, 2018

This issue was closed.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Explore poor query performance #3091

Explore poor query performance #3091

hadley commented Sep 8, 2017

ghost commented Jun 7, 2018

lock bot commented Dec 5, 2018

Explore poor query performance #3091

Explore poor query performance #3091

Comments

hadley commented Sep 8, 2017

ghost commented Jun 7, 2018

lock bot commented Dec 5, 2018