Description
Hi everyone,
Thanks a lot for you work on broom it's a very useful package. I encountered a strange behavior today when working with coxph models with pspline. Using tidy()
, the term column was not containing the full variable names. This happen with very short variable names. Please see this small example that illustrate this truncation of the terms labels.
# Load library
library("survival")
library("broom")
# Build a basic coxph model with the lung dataset and a spline for age
df <- survival::lung
mod <- coxph(Surv(time, status) ~ pspline(age, df=3) + wt.loss, data=df)
tidy(mod)
#> # A tibble: 3 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 pspline(age, df = 3), lin 0.0220 0.00931 5.57 0.0182
#> 2 pspline(age, df = 3), non NA NA 2.01 0.383
#> 3 wt.loss 0.00102 0.00623 0.0266 0.870
summary(mod)
#> Call:
#> coxph(formula = Surv(time, status) ~ pspline(age, df = 3) + wt.loss,
#> data = df)
#>
#> n= 214, number of events= 152
#> (14 observations deleted due to missingness)
#>
#> coef se(coef) se2 Chisq DF p
#> pspline(age, df = 3), lin 0.021972 0.009306 0.009306 5.57 1.00 0.018
#> pspline(age, df = 3), non 2.01 2.08 0.380
#> wt.loss 0.001016 0.006225 0.006219 0.03 1.00 0.870
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> ps(age)3 1.339 0.74697 0.4109 4.362
#> ps(age)4 1.791 0.55829 0.2682 11.961
#> ps(age)5 2.276 0.43941 0.2580 20.077
#> ps(age)6 2.408 0.41528 0.2701 21.470
#> ps(age)7 2.294 0.43585 0.2698 19.513
#> ps(age)8 2.446 0.40891 0.2931 20.407
#> ps(age)9 3.022 0.33087 0.3616 25.265
#> ps(age)10 4.098 0.24401 0.4775 35.172
#> ps(age)11 6.442 0.15522 0.6351 65.353
#> ps(age)12 10.492 0.09531 0.6219 177.009
#> wt.loss 1.001 0.99898 0.9889 1.013
#>
#> Iterations: 4 outer, 12 Newton-Raphson
#> Theta= 0.8602308
#> Degrees of freedom for terms= 3.1 1.0
#> Concordance= 0.571 (se = 0.027 )
#> Likelihood ratio test= 7.88 on 4.08 df, p=0.1
This truncation occurs because, tidy.coxph rely on summary
,
which in turn call summary.coxph.penal
with a maxlabel
argument which have a default of 25.
# No truncation in this case
summary(mod, maxlabel = 500)
#> Call:
#> coxph(formula = Surv(time, status) ~ pspline(age, df = 3) + wt.loss,
#> data = df)
#>
#> n= 214, number of events= 152
#> (14 observations deleted due to missingness)
#>
#> coef se(coef) se2 Chisq DF p
#> pspline(age, df = 3), linear 0.021972 0.009306 0.009306 5.57 1.00 0.018
#> pspline(age, df = 3), nonlin 2.01 2.08 0.380
#> wt.loss 0.001016 0.006225 0.006219 0.03 1.00 0.870
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> ps(age)3 1.339 0.74697 0.4109 4.362
#> ps(age)4 1.791 0.55829 0.2682 11.961
#> ps(age)5 2.276 0.43941 0.2580 20.077
#> ps(age)6 2.408 0.41528 0.2701 21.470
#> ps(age)7 2.294 0.43585 0.2698 19.513
#> ps(age)8 2.446 0.40891 0.2931 20.407
#> ps(age)9 3.022 0.33087 0.3616 25.265
#> ps(age)10 4.098 0.24401 0.4775 35.172
#> ps(age)11 6.442 0.15522 0.6351 65.353
#> ps(age)12 10.492 0.09531 0.6219 177.009
#> wt.loss 1.001 0.99898 0.9889 1.013
#>
#> Iterations: 4 outer, 12 Newton-Raphson
#> Theta= 0.8602308
#> Degrees of freedom for terms= 3.1 1.0
#> Concordance= 0.571 (se = 0.027 )
#> Likelihood ratio test= 7.88 on 4.08 df, p=0.1
Truncation doesn’t not only occurs for the pspline term but for any term label of more than 25 characters
# Create a long variable name
df_long_variable <- df
df_long_variable$this_is_a_very_long_column_name_with_more_than_25_characters <- df$ph.ecog
mod_long_var <- coxph(Surv(time, status) ~ pspline(age, df=3) + wt.loss + this_is_a_very_long_column_name_with_more_than_25_characters, data = df_long_variable)
# Notice that the non spline term label is truncated both in tidy and summary.
tidy(mod_long_var)
#> # A tibble: 4 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 pspline(age, df = 3), lin 0.0136 0.00961 2.00 0.157
#> 2 pspline(age, df = 3), non NA NA 2.06 0.373
#> 3 wt.loss -0.00735 0.00661 1.24 0.266
#> 4 this_is_a_very_long_colum 0.478 0.130 13.5 0.000233
summary(mod_long_var)
#> Call:
#> coxph(formula = Surv(time, status) ~ pspline(age, df = 3) + wt.loss +
#> this_is_a_very_long_column_name_with_more_than_25_characters,
#> data = df_long_variable)
#>
#> n= 213, number of events= 151
#> (15 observations deleted due to missingness)
#>
#> coef se(coef) se2 Chisq DF p
#> pspline(age, df = 3), lin 0.013597 0.009606 0.009606 2.00 1.00 0.16000
#> pspline(age, df = 3), non 2.06 2.08 0.37000
#> wt.loss -0.007353 0.006615 0.006599 1.24 1.00 0.27000
#> this_is_a_very_long_colum 0.477850 0.129826 0.129150 13.55 1.00 0.00023
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> ps(age)3 1.3352 0.7490 0.3946 4.518
#> ps(age)4 1.7875 0.5594 0.2509 12.734
#> ps(age)5 2.3250 0.4301 0.2443 22.126
#> ps(age)6 2.5889 0.3863 0.2685 24.962
#> ps(age)7 2.4018 0.4163 0.2613 22.074
#> ps(age)8 2.2888 0.4369 0.2543 20.600
#> ps(age)9 2.6401 0.3788 0.2923 23.843
#> ps(age)10 3.4062 0.2936 0.3670 31.609
#> ps(age)11 5.2597 0.1901 0.4764 58.074
#> ps(age)12 8.4714 0.1180 0.4531 158.377
#> wt.loss 0.9927 1.0074 0.9799 1.006
#> this_is_a_very_long_colum 1.6126 0.6201 1.2503 2.080
#>
#> Iterations: 4 outer, 11 Newton-Raphson
#> Theta= 0.8532275
#> Degrees of freedom for terms= 3.1 1.0 1.0
#> Concordance= 0.613 (se = 0.026 )
#> Likelihood ratio test= 21.83 on 5.06 df, p=6e-04
# No truncation here
summary(mod_long_var, maxlabel = 500)
#> Call:
#> coxph(formula = Surv(time, status) ~ pspline(age, df = 3) + wt.loss +
#> this_is_a_very_long_column_name_with_more_than_25_characters,
#> data = df_long_variable)
#>
#> n= 213, number of events= 151
#> (15 observations deleted due to missingness)
#>
#> coef se(coef)
#> pspline(age, df = 3), linear 0.013597 0.009606
#> pspline(age, df = 3), nonlin
#> wt.loss -0.007353 0.006615
#> this_is_a_very_long_column_name_with_more_than_25_characters 0.477850 0.129826
#> se2 Chisq
#> pspline(age, df = 3), linear 0.009606 2.00
#> pspline(age, df = 3), nonlin 2.06
#> wt.loss 0.006599 1.24
#> this_is_a_very_long_column_name_with_more_than_25_characters 0.129150 13.55
#> DF p
#> pspline(age, df = 3), linear 1.00 0.16000
#> pspline(age, df = 3), nonlin 2.08 0.37000
#> wt.loss 1.00 0.27000
#> this_is_a_very_long_column_name_with_more_than_25_characters 1.00 0.00023
#>
#> exp(coef)
#> ps(age)3 1.3352
#> ps(age)4 1.7875
#> ps(age)5 2.3250
#> ps(age)6 2.5889
#> ps(age)7 2.4018
#> ps(age)8 2.2888
#> ps(age)9 2.6401
#> ps(age)10 3.4062
#> ps(age)11 5.2597
#> ps(age)12 8.4714
#> wt.loss 0.9927
#> this_is_a_very_long_column_name_with_more_than_25_characters 1.6126
#> exp(-coef)
#> ps(age)3 0.7490
#> ps(age)4 0.5594
#> ps(age)5 0.4301
#> ps(age)6 0.3863
#> ps(age)7 0.4163
#> ps(age)8 0.4369
#> ps(age)9 0.3788
#> ps(age)10 0.2936
#> ps(age)11 0.1901
#> ps(age)12 0.1180
#> wt.loss 1.0074
#> this_is_a_very_long_column_name_with_more_than_25_characters 0.6201
#> lower .95
#> ps(age)3 0.3946
#> ps(age)4 0.2509
#> ps(age)5 0.2443
#> ps(age)6 0.2685
#> ps(age)7 0.2613
#> ps(age)8 0.2543
#> ps(age)9 0.2923
#> ps(age)10 0.3670
#> ps(age)11 0.4764
#> ps(age)12 0.4531
#> wt.loss 0.9799
#> this_is_a_very_long_column_name_with_more_than_25_characters 1.2503
#> upper .95
#> ps(age)3 4.518
#> ps(age)4 12.734
#> ps(age)5 22.126
#> ps(age)6 24.962
#> ps(age)7 22.074
#> ps(age)8 20.600
#> ps(age)9 23.843
#> ps(age)10 31.609
#> ps(age)11 58.074
#> ps(age)12 158.377
#> wt.loss 1.006
#> this_is_a_very_long_column_name_with_more_than_25_characters 2.080
#>
#> Iterations: 4 outer, 11 Newton-Raphson
#> Theta= 0.8532275
#> Degrees of freedom for terms= 3.1 1.0 1.0
#> Concordance= 0.613 (se = 0.026 )
#> Likelihood ratio test= 21.83 on 5.06 df, p=6e-04
These truncation does not occur when the object if of class coxph
, as summary.coxph
does not have any maxlabel
argument
mod_nospline <- coxph(Surv(time, status) ~ age + wt.loss + this_is_a_very_long_column_name_with_more_than_25_characters, data = df_long_variable)
tidy(mod_long_var)
#> # A tibble: 4 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 pspline(age, df = 3), lin 0.0136 0.00961 2.00 0.157
#> 2 pspline(age, df = 3), non NA NA 2.06 0.373
#> 3 wt.loss -0.00735 0.00661 1.24 0.266
#> 4 this_is_a_very_long_colum 0.478 0.130 13.5 0.000233
As a workaround, I replace the term column from tidy by the row names extracted using coefficients
and summary
.
mod_tidy <- tidy(mod_long_var)
mod_tidy$term <- summary(mod_long_var, maxlabel = 3000) |>
coefficients() |>
row.names()
# Correct terms labels
mod_tidy
#> # A tibble: 4 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 pspline(age, df = 3), linear 0.0136 0.00961 2.00 1.57e-1
#> 2 pspline(age, df = 3), nonlin NA NA 2.06 3.73e-1
#> 3 wt.loss -0.00735 0.00661 1.24 2.66e-1
#> 4 this_is_a_very_long_column_name_with_mor… 0.478 0.130 13.5 2.33e-4
The inconsistency in the summary
methods is definitely on the survival
package side, where the truncation is actually a feature
in the console. But in broom::tidy
it’s seams unexpected to me,
and it might need some fixing as it makes filter
with variable names or regex fail.
Links to the functions:
broom/R/survival-coxph-tidiers.R
Line 69 in c0c2253
- https://github.com/therneau/survival/blob/master/R/print.summary.coxph.penal.R
Thanks a lot for your time,
Stéphane
Created on 2023-04-12 with reprex v2.0.2