## Data exploration

In [85]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)

In [86]:
# Build a density-scaled histogram of one column with a kernel density
# curve drawn on top.
#
# Args:
#   dataframe: data frame that contains the column to plot.
#   var_name:  name of the column to plot, as a single string.
#   title:     text used as the x-axis label.
#   n_bins:    number of histogram bins.
#
# Returns:
#   A ggplot object.
plot_distribution <- function(dataframe, var_name, title, n_bins) {
  # Fail at call time with a clear message rather than when the plot is
  # rendered.
  stopifnot(
    is.character(var_name),
    length(var_name) == 1L,
    var_name %in% names(dataframe)
  )

  # `.data[[var_name]]` only ever looks inside `dataframe`; `get(var_name)`
  # would fall back to any object of that name in an enclosing environment.
  ggplot(dataframe, aes(x = .data[[var_name]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      bins = n_bins,
      colour = "black"
    ) +
    geom_density(colour = "red", linewidth = 1.3) +
    labs(x = title) +
    theme(
      axis.text = element_text(size = 16),
      axis.title.y = element_blank(),
      axis.title.x = element_text(size = 20)
    )
}

In [87]:
# Load the raw data, normalise column names to lower case, fix the misspelt
# `pletelets` column name and convert the categorical indicators to factors.
categorical_columns <- c("gender", "smoking", "diabetes", "bp", "anaemia")

df0 <- read.csv("./data/S1Data.csv") %>%
  rename_with(tolower) %>%
  rename(platelets = pletelets) %>%
  # `across()` replaces the superseded `mutate_at()`; `all_of()` makes a
  # missing column an error instead of a silent no-op.
  mutate(across(all_of(categorical_columns), factor))

### Discrete variables 

In [88]:
# Categorical columns analysed in this section.
variables <- c(
  "gender",
  "smoking",
  "diabetes",
  "bp",
  "anaemia"
)

#### Cross-tab

In [89]:
# For each categorical variable, count patients per level and per value of
# `event` (0 is reported as censored, 1 as death), then add the share of
# each outcome within the level.
result <- lapply(variables, function(var) {
  counts <- as.data.frame(table(df0[[var]], df0[["event"]]))
  wide <- pivot_wider(counts, names_from = "Var2", values_from = "Freq")
  # The first column takes its name ("var") from the argument expression.
  cbind(var, wide)
})
df <- as.data.frame(do.call(rbind, result)) %>%
  rename(
    "level" = "Var1",
    "number_censored" = "0",
    "number_death" = "1"
  ) %>%
  mutate(
    percent_censored = round(
      number_censored / (number_censored + number_death) * 100,
      2
    ),
    percent_death = round(
      number_death / (number_censored + number_death) * 100,
      2
    )
  )
print(df)

        var level number_censored number_death percent_censored percent_death
1    gender     0              71           34            67.62         32.38
2    gender     1             132           62            68.04         31.96
3   smoking     0             137           66            67.49         32.51
4   smoking     1              66           30            68.75         31.25
5  diabetes     0             118           56            67.82         32.18
6  diabetes     1              85           40            68.00         32.00
7        bp     0             137           57            70.62         29.38
8        bp     1              66           39            62.86         37.14
9   anaemia     0             120           50            70.59         29.41
10  anaemia     1              83           46            64.34         35.66


In [90]:
# Chi-squared test of independence (without continuity correction) between
# each categorical variable and `event`, flagged at the 5% level.
result <- lapply(variables, function(var) {
  p <- chisq.test(
    table(df0[[var]], df0$event),
    correct = FALSE
  )$p.value
  list(Variable = var, P_value = p, Significant = p < 0.05)
})
# One single-row data frame per variable, stacked into the final table.
df <- do.call(rbind, lapply(result, as.data.frame))
print(df)

  Variable   P_value Significant
1   gender 0.9405034       FALSE
2  smoking 0.8272151       FALSE
3 diabetes 0.9731996       FALSE
4       bp 0.1700298       FALSE
5  anaemia 0.2518294       FALSE


* `gender`, `smoking`, `diabetes`, `bp` and `anaemia` show no statistically significant association with the event (all chi-squared p-values are above 0.05)

### Continuous variables

In [91]:
# Continuous columns analysed in this section.
variables <- c(
  "creatinine",
  "sodium",
  "cpk",
  "age",
  "platelets",
  "ejection.fraction"
)

#### Summary statistics

In [92]:
# Per-column summary of `time` and every column listed in `variables`.
summary(df0[c("time", variables)])

      time         creatinine        sodium           cpk        
 Min.   :  4.0   Min.   :0.500   Min.   :113.0   Min.   :  23.0  
 1st Qu.: 73.0   1st Qu.:0.900   1st Qu.:134.0   1st Qu.: 116.5  
 Median :115.0   Median :1.100   Median :137.0   Median : 250.0  
 Mean   :130.3   Mean   :1.394   Mean   :136.6   Mean   : 581.8  
 3rd Qu.:203.0   3rd Qu.:1.400   3rd Qu.:140.0   3rd Qu.: 582.0  
 Max.   :285.0   Max.   :9.400   Max.   :148.0   Max.   :7861.0  
      age          platelets      ejection.fraction
 Min.   :40.00   Min.   : 25100   Min.   :14.00    
 1st Qu.:51.00   1st Qu.:212500   1st Qu.:30.00    
 Median :60.00   Median :262000   Median :38.00    
 Mean   :60.83   Mean   :263358   Mean   :38.08    
 3rd Qu.:70.00   3rd Qu.:303500   3rd Qu.:45.00    
 Max.   :95.00   Max.   :850000   Max.   :80.00    

In [84]:
# Two-sample t-test (t.test defaults) of each continuous variable between the
# two `event` groups; reports both group means, the p-value rounded to four
# decimals and a 5% significance flag.
result <- lapply(variables, function(var) {
  # Build `<var> ~ event` from the column name.
  fit <- t.test(reformulate("event", response = var), data = df0)
  p <- fit$p.value
  data.frame(
    variables = var,
    mean_event0 = round(fit$estimate[1], 2),
    mean_event1 = round(fit$estimate[2], 2),
    p_value = round(p, 4),
    significant = p < 0.05
  )
})
df <- do.call(rbind, result)
# The named estimates leak into the row names; reset them to 1..n.
row.names(df) <- NULL
print(df)

          variables mean_event0 mean_event1 p_value significant
1        creatinine        1.18        1.84  0.0001        TRUE
2            sodium      137.22      135.38  0.0019        TRUE
3               cpk      540.05      670.20  0.3692       FALSE
4               age       58.76       65.22  0.0000        TRUE
5         platelets   266657.49   256381.04  0.3993       FALSE
6 ejection.fraction       40.27       33.47  0.0000        TRUE


* `cpk` and `platelets` show no statistically significant difference in mean values between deceased and censored patients
* `creatinine`, `sodium`, `age` and `ejection.fraction` exhibit a statistically significant difference in mean values (p < 0.05)