New function to create `split` from dataframes #241

juliasilge · 2021-05-19T18:30:57Z

We find that folks sometimes have existing training/testing sets they want to use instead of starting from `initial_split()`.

It is already possible to make a split object from dataframes in whatever way you want:

library(rsample)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
data(cells, package = "modeldata")
ind <- list(analysis = which(cells$case == "Train"), 
            assessment = which(cells$case == "Test"))
manual_split <- make_splits(ind, cells)
manual_split
#> <Analysis/Assess/Total>
#> <1009/1010/2019>

training(manual_split)
#> # A tibble: 1,009 x 58
#>    case  class angle_ch_1 area_ch_1 avg_inten_ch_1 avg_inten_ch_2 avg_inten_ch_3
#>    <fct> <fct>      <dbl>     <int>          <dbl>          <dbl>          <dbl>
#>  1 Train PS         134.        819           31.9           207.           69.9
#>  2 Train WS         107.        431           28.0           116.           63.9
#>  3 Train PS          69.2       298           19.5           102.           28.2
#>  4 Train WS         109.        256           18.8           127.           13.6
#>  5 Train PS         104.        258           17.6           125.           22.5
#>  6 Train PS          78.0       358           42.3           218.           42.3
#>  7 Train PS          13.7       158           31.4           103.           41.5
#>  8 Train WS         107.        315          295.            493.          193. 
#>  9 Train WS          84.7       246          583.            245.           86.4
#> 10 Train WS         124.        223          375.            293.          112. 
#> # … with 999 more rows, and 51 more variables: avg_inten_ch_4 <dbl>,
#> #   convex_hull_area_ratio_ch_1 <dbl>, convex_hull_perim_ratio_ch_1 <dbl>,
#> #   diff_inten_density_ch_1 <dbl>, diff_inten_density_ch_3 <dbl>,
#> #   diff_inten_density_ch_4 <dbl>, entropy_inten_ch_1 <dbl>,
#> #   entropy_inten_ch_3 <dbl>, entropy_inten_ch_4 <dbl>,
#> #   eq_circ_diam_ch_1 <dbl>, eq_ellipse_lwr_ch_1 <dbl>,
#> #   eq_ellipse_oblate_vol_ch_1 <dbl>, eq_ellipse_prolate_vol_ch_1 <dbl>,
#> #   eq_sphere_area_ch_1 <dbl>, eq_sphere_vol_ch_1 <dbl>,
#> #   fiber_align_2_ch_3 <dbl>, fiber_align_2_ch_4 <dbl>,
#> #   fiber_length_ch_1 <dbl>, fiber_width_ch_1 <dbl>, inten_cooc_asm_ch_3 <dbl>,
#> #   inten_cooc_asm_ch_4 <dbl>, inten_cooc_contrast_ch_3 <dbl>,
#> #   inten_cooc_contrast_ch_4 <dbl>, inten_cooc_entropy_ch_3 <dbl>,
#> #   inten_cooc_entropy_ch_4 <dbl>, inten_cooc_max_ch_3 <dbl>,
#> #   inten_cooc_max_ch_4 <dbl>, kurt_inten_ch_1 <dbl>, kurt_inten_ch_3 <dbl>,
#> #   kurt_inten_ch_4 <dbl>, length_ch_1 <dbl>, neighbor_avg_dist_ch_1 <dbl>,
#> #   neighbor_min_dist_ch_1 <dbl>, neighbor_var_dist_ch_1 <dbl>,
#> #   perim_ch_1 <dbl>, shape_bfr_ch_1 <dbl>, shape_lwr_ch_1 <dbl>,
#> #   shape_p_2_a_ch_1 <dbl>, skew_inten_ch_1 <dbl>, skew_inten_ch_3 <dbl>,
#> #   skew_inten_ch_4 <dbl>, spot_fiber_count_ch_3 <int>,
#> #   spot_fiber_count_ch_4 <dbl>, total_inten_ch_1 <int>,
#> #   total_inten_ch_2 <dbl>, total_inten_ch_3 <int>, total_inten_ch_4 <int>,
#> #   var_inten_ch_1 <dbl>, var_inten_ch_3 <dbl>, var_inten_ch_4 <dbl>,
#> #   width_ch_1 <dbl>
testing(manual_split)
#> # A tibble: 1,010 x 58
#>    case  class angle_ch_1 area_ch_1 avg_inten_ch_1 avg_inten_ch_2 avg_inten_ch_3
#>    <fct> <fct>      <dbl>     <int>          <dbl>          <dbl>          <dbl>
#>  1 Test  PS        143.         185           15.7           4.95           9.55
#>  2 Test  PS          2.89       285           24.3         112.            20.5 
#>  3 Test  WS         40.7        172          326.          654.           129.  
#>  4 Test  WS        174.         177          260.          596.           124.  
#>  5 Test  PS        180.         251           18.3           5.73          17.2 
#>  6 Test  WS         18.9        495           16.1          89.5           13.7 
#>  7 Test  WS        153.         384           17.7          89.9           20.4 
#>  8 Test  WS         13.7        424          174.          389.            38.8 
#>  9 Test  PS         52.2        236           18.2           6.33          17.1 
#> 10 Test  PS         92.9        187           40.2         214.            44.2 
#> # … with 1,000 more rows, and 51 more variables: avg_inten_ch_4 <dbl>,
#> #   convex_hull_area_ratio_ch_1 <dbl>, convex_hull_perim_ratio_ch_1 <dbl>,
#> #   diff_inten_density_ch_1 <dbl>, diff_inten_density_ch_3 <dbl>,
#> #   diff_inten_density_ch_4 <dbl>, entropy_inten_ch_1 <dbl>,
#> #   entropy_inten_ch_3 <dbl>, entropy_inten_ch_4 <dbl>,
#> #   eq_circ_diam_ch_1 <dbl>, eq_ellipse_lwr_ch_1 <dbl>,
#> #   eq_ellipse_oblate_vol_ch_1 <dbl>, eq_ellipse_prolate_vol_ch_1 <dbl>,
#> #   eq_sphere_area_ch_1 <dbl>, eq_sphere_vol_ch_1 <dbl>,
#> #   fiber_align_2_ch_3 <dbl>, fiber_align_2_ch_4 <dbl>,
#> #   fiber_length_ch_1 <dbl>, fiber_width_ch_1 <dbl>, inten_cooc_asm_ch_3 <dbl>,
#> #   inten_cooc_asm_ch_4 <dbl>, inten_cooc_contrast_ch_3 <dbl>,
#> #   inten_cooc_contrast_ch_4 <dbl>, inten_cooc_entropy_ch_3 <dbl>,
#> #   inten_cooc_entropy_ch_4 <dbl>, inten_cooc_max_ch_3 <dbl>,
#> #   inten_cooc_max_ch_4 <dbl>, kurt_inten_ch_1 <dbl>, kurt_inten_ch_3 <dbl>,
#> #   kurt_inten_ch_4 <dbl>, length_ch_1 <dbl>, neighbor_avg_dist_ch_1 <dbl>,
#> #   neighbor_min_dist_ch_1 <dbl>, neighbor_var_dist_ch_1 <dbl>,
#> #   perim_ch_1 <dbl>, shape_bfr_ch_1 <dbl>, shape_lwr_ch_1 <dbl>,
#> #   shape_p_2_a_ch_1 <dbl>, skew_inten_ch_1 <dbl>, skew_inten_ch_3 <dbl>,
#> #   skew_inten_ch_4 <dbl>, spot_fiber_count_ch_3 <int>,
#> #   spot_fiber_count_ch_4 <dbl>, total_inten_ch_1 <int>,
#> #   total_inten_ch_2 <dbl>, total_inten_ch_3 <int>, total_inten_ch_4 <int>,
#> #   var_inten_ch_1 <dbl>, var_inten_ch_3 <dbl>, var_inten_ch_4 <dbl>,
#> #   width_ch_1 <dbl>

<sup>Created on 2021-05-19 by the reprex package (v2.0.0)</sup>

Let's consider making this easier still by building a function that takes two dataframes (training, testing) as input and creates a split as output.

The text was updated successfully, but these errors were encountered:

juliasilge · 2021-06-29T19:58:09Z

This functionality is now available through `make_splits()`, which accepts the training and testing dataframes directly:

library(rsample)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

data(cells, package = "modeldata")

make_splits(
  cells %>% filter(case == "Train"),
  cells %>% filter(case == "Test")
)
#> <Analysis/Assess/Total>
#> <1009/1010/2019>

<sup>Created on 2021-06-29 by the reprex package (v2.0.0)</sup>

github-actions · 2021-07-14T01:02:25Z

This issue has been automatically locked. If you believe you have found a related problem, please file a new issue (with a reprex: https://reprex.tidyverse.org) and link to this issue.

juliasilge added the `feature` label (a feature request or enhancement) on May 19, 2021

juliasilge mentioned this issue May 24, 2021

Tune XGBoost with tidymodels and #TidyTuesday beach volleyball | Julia Silge juliasilge/juliasilge.com#9

Open

liamblake mentioned this issue Jun 10, 2021

Function to create split from dataframes #246

Merged

juliasilge closed this as completed in #246 Jun 29, 2021

github-actions bot locked and limited conversation to collaborators Jul 14, 2021

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

New function to create `split` from dataframes #241

New function to create `split` from dataframes #241

juliasilge commented May 19, 2021

juliasilge commented Jun 29, 2021

github-actions bot commented Jul 14, 2021

New function to create split from dataframes #241

New function to create split from dataframes #241

Comments

juliasilge commented May 19, 2021

juliasilge commented Jun 29, 2021

github-actions bot commented Jul 14, 2021

New function to create `split` from dataframes #241

New function to create `split` from dataframes #241