Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Joins gain sql_on argument (#146)
  • Loading branch information
krlmlr authored and hadley committed Feb 9, 2019
1 parent 8d38547 commit 4b5aab9
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 22 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Expand Up @@ -39,6 +39,9 @@

* New `tbl_memdb()` for creating reprexes (to match `tbl_lazy()`)

* All `..._join()` methods gain an `sql_on` argument that allows specifying
arbitrary join predicates in SQL code (#146, @krlmlr).

* ORACLE: New custom translation for `paste()` and `paste0()` (@cderv, #221)

* `sql_prefix()` no longer turns SQL functions into uppercase, allowing for correct translation of case-sensitive SQL functions (#181, @mtoto).
Expand Down
2 changes: 2 additions & 0 deletions R/query-join.R
Expand Up @@ -118,6 +118,8 @@ sql_join_tbls <- function(con, by) {
parens = TRUE,
con = con
)
} else if (length(by$on) > 0) {
on <- build_sql("(", by$on, ")", con = con)
}

on
Expand Down
71 changes: 58 additions & 13 deletions R/verb-joins.R
Expand Up @@ -6,9 +6,13 @@
#' @section Implementation notes:
#'
#' Semi-joins are implemented using `WHERE EXISTS`, and anti-joins with
#' `WHERE NOT EXISTS`. Support for semi-joins is somewhat partial: you
#' can only create semi joins where the `x` and `y` columns are
#' compared with `=` not with more general operators.
#' `WHERE NOT EXISTS`.
#'
#' All joins use column equality by default.
#' An arbitrary join predicate can be specified by passing
#' an SQL expression to the `sql_on` argument.
#' Use `LHS` and `RHS` to refer to the left-hand side or
#' right-hand side table, respectively.
#'
#' @inheritParams dplyr::join
#' @param copy If `x` and `y` are not from the same data source,
Expand All @@ -23,6 +27,9 @@
#' @param auto_index if `copy` is `TRUE`, automatically create
#' indices for the variables in `by`. This may speed up the join if
#' there are matching indexes in `x`.
#' @param sql_on A custom join predicate as an SQL expression. The SQL
#' can refer to the `LHS` and `RHS` aliases to disambiguate
#' column names.
#' @examples
#' \dontrun{
#' library(dplyr)
Expand Down Expand Up @@ -75,6 +82,19 @@
#'
#' # batters without person covariates
#' anti_join(batting, people)
#'
#' # Arbitrary predicates ------------------------------------------------------
#'
#' # Find all pairs of awards given to the same player
#' # with at least 18 years between the awards:
#' awards_players <- tbl(lahman_s, "AwardsPlayers")
#' inner_join(
#' awards_players, awards_players,
#' sql_on = paste0(
#' "(LHS.playerID = RHS.playerID) AND ",
#' "(LHS.yearID < RHS.yearID - 18)"
#' )
#' )
#' }
#' }
#' @name join.tbl_sql
Expand All @@ -84,11 +104,14 @@ NULL
#' @export
inner_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
suffix = c(".x", ".y"),
auto_index = FALSE, ...) {
auto_index = FALSE, ...,
sql_on = NULL) {

add_op_join(
x, y,
"inner",
by = by,
sql_on = sql_on,
copy = copy,
suffix = suffix,
auto_index = auto_index,
Expand All @@ -100,11 +123,14 @@ inner_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
#' @export
left_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
suffix = c(".x", ".y"),
auto_index = FALSE, ...) {
auto_index = FALSE, ...,
sql_on = NULL) {

add_op_join(
x, y,
"left",
by = by,
sql_on = sql_on,
copy = copy,
suffix = suffix,
auto_index = auto_index,
Expand All @@ -116,11 +142,14 @@ left_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
#' @export
right_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
suffix = c(".x", ".y"),
auto_index = FALSE, ...) {
auto_index = FALSE, ...,
sql_on = NULL) {

add_op_join(
x, y,
"right",
by = by,
sql_on = sql_on,
copy = copy,
suffix = suffix,
auto_index = auto_index,
Expand All @@ -132,11 +161,14 @@ right_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
#' @export
full_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
suffix = c(".x", ".y"),
auto_index = FALSE, ...) {
auto_index = FALSE, ...,
sql_on = NULL) {

add_op_join(
x, y,
"full",
by = by,
sql_on = sql_on,
copy = copy,
suffix = suffix,
auto_index = auto_index,
Expand All @@ -147,11 +179,14 @@ full_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
#' @rdname join.tbl_sql
#' @export
semi_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
auto_index = FALSE, ...) {
auto_index = FALSE, ...,
sql_on = NULL) {

add_op_semi_join(
x, y,
anti = FALSE,
by = by,
sql_on = sql_on,
copy = copy,
auto_index = auto_index,
...
Expand All @@ -161,23 +196,28 @@ semi_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
#' @rdname join.tbl_sql
#' @export
anti_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE,
auto_index = FALSE, ...) {
auto_index = FALSE, ...,
sql_on = NULL) {

add_op_semi_join(
x, y,
anti = TRUE,
by = by,
sql_on = sql_on,
copy = copy,
auto_index = auto_index,
...
)
}


add_op_join <- function(x, y, type, by = NULL, copy = FALSE,
add_op_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE,
suffix = c(".x", ".y"),
auto_index = FALSE, ...) {

if (identical(type, "full") && identical(by, character())) {
if (!is.null(sql_on)) {
by <- list(x = character(0), y = character(0), on = sql(sql_on))
} else if (identical(type, "full") && identical(by, character())) {
type <- "cross"
by <- list(x = character(0), y = character(0))
} else {
Expand All @@ -201,9 +241,14 @@ add_op_join <- function(x, y, type, by = NULL, copy = FALSE,
x
}

add_op_semi_join <- function(x, y, anti = FALSE, by = NULL, copy = FALSE,
add_op_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = FALSE,
auto_index = FALSE, ...) {
by <- common_by(by, x, y)
if (!is.null(sql_on)) {
by <- list(x = character(0), y = character(0), on = sql(sql_on))
} else {
by <- common_by(by, x, y)
}

y <- auto_copy(
x, y, copy,
indexes = if (auto_index) list(by$y)
Expand Down
39 changes: 30 additions & 9 deletions man/join.tbl_sql.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

32 changes: 32 additions & 0 deletions tests/testthat/sql/join-on.sql
@@ -0,0 +1,32 @@
$inner
<SQL>
SELECT `LHS`.`x` AS `x.x`, `LHS`.`y` AS `y`, `RHS`.`x` AS `x.y`, `RHS`.`z` AS `z`
FROM `df` AS `LHS`
INNER JOIN `df` AS `RHS`
ON (LHS.y < RHS.z)


$left
<SQL>
SELECT `LHS`.`x` AS `x.x`, `LHS`.`y` AS `y`, `RHS`.`x` AS `x.y`, `RHS`.`z` AS `z`
FROM `df` AS `LHS`
LEFT JOIN `df` AS `RHS`
ON (LHS.y < RHS.z)


$right
<SQL>
SELECT `LHS`.`x` AS `x.x`, `LHS`.`y` AS `y`, `RHS`.`x` AS `x.y`, `RHS`.`z` AS `z`
FROM `df` AS `LHS`
RIGHT JOIN `df` AS `RHS`
ON (LHS.y < RHS.z)


$full
<SQL>
SELECT `LHS`.`x` AS `x.x`, `LHS`.`y` AS `y`, `RHS`.`x` AS `x.y`, `RHS`.`z` AS `z`
FROM `df` AS `LHS`
FULL JOIN `df` AS `RHS`
ON (LHS.y < RHS.z)


12 changes: 12 additions & 0 deletions tests/testthat/test-query-join.R
Expand Up @@ -18,3 +18,15 @@ test_that("generated sql doesn't change unexpectedly", {
)
expect_known_output(print(reg), test_path("sql/join.sql"))
})

test_that("sql_on query doesn't change unexpectedly", {
lf1 <- lazy_frame(x = 1, y = 2)
lf2 <- lazy_frame(x = 1, z = 3)
reg <- list(
inner = inner_join(lf1, lf2, sql_on = "LHS.y < RHS.z"),
left = left_join(lf1, lf2, sql_on = "LHS.y < RHS.z"),
right = right_join(lf1, lf2, sql_on = "LHS.y < RHS.z"),
full = full_join(lf1, lf2, sql_on = "LHS.y < RHS.z")
)
expect_known_output(print(reg), test_path("sql/join-on.sql"))
})
14 changes: 14 additions & 0 deletions tests/testthat/test-verb-joins.R
Expand Up @@ -27,6 +27,20 @@ test_that("join with both same and different vars", {
expect_equal(names(j1), c("x", "y"))
})

test_that("joining over arbitrary predicates", {
j1 <- collect(left_join(df1, df2, sql_on = "LHS.x = RHS.b"))
j2 <- collect(left_join(df1, df2, by = c("x" = "b"))) %>% mutate(b = x)
expect_equal(j1, j2)

j1 <- collect(left_join(df1, df3, sql_on = "LHS.x = RHS.z"))
j2 <- collect(left_join(df1, df3, by = c("x" = "z"))) %>% mutate(z = x.x)
expect_equal(j1, j2)

j1 <- collect(left_join(df1, df3, sql_on = "LHS.x = RHS.x"))
j2 <- collect(left_join(df1, df3, by = "x")) %>% mutate(x.y = x) %>% rename(x.x = x)
expect_equal(j1, j2)
})

test_that("inner join doesn't result in duplicated columns ", {
expect_equal(colnames(inner_join(df1, df1)), c("x", "y"))
})
Expand Down

0 comments on commit 4b5aab9

Please sign in to comment.