# In [48]:
#======== Libraries ========
library(data.table)
library(plotly)
library(ggplot2)
library(tidyr)
library(dplyr)
library(splines)
library(gnm)
library(Epi) 

#======== Data load ========
# Read the daily table and give its first 18 columns the names used by the
# rest of the script; any further columns keep the names read from the CSV.
environment <- read.csv(
  "/Users/gabrielazemencikova/Desktop/ami-catalonia/project/card_var_pol_stat_data_at.csv"
)
colnames(environment)[1:18] <- c(
  "at_code", "date",
  "max_hum", "min_hum", "max_temp", "min_temp",
  "events", "a_rolling_week", "expected",
  "a_asir_90", "b_asir_365", "pop_value",
  "daily_diff_temp", "daily_diff_hum",
  "maximum_temperature_change", "minimum_temperature_change",
  "maximum_relative_humidity_change", "minimum_relative_humidity_change"
)

# Parse the date once, then derive every calendar field from the parsed value.
environment <- environment %>%
  mutate(
    date = as.Date(date),
    dow = weekdays(date),                 # weekday name
    day = format(date, "%d"),             # day of month, "01".."31"
    month = format(date, "%m"),           # month, "01".."12"
    year = format(date, "%y"),            # two-digit year
    doy = as.numeric(format(date, "%j"))  # day of year as a number
  )

# Classify month codes into the two analysis seasons.
#
# Vectorised: accepts a whole column at once, and still works when called
# element by element.
#
# Args:
#   month: character vector of two-digit month codes, as produced by
#          format(date, "%m").
#
# Returns:
#   A character vector of the same length as `month`: "Summer" for the codes
#   "05" through "10", "Winter" for everything else (including NA).
categorize_season <- function(month) {
  summer_months <- c("05", "06", "07", "08", "09", "10")
  # Start from the default label and overwrite the summer positions; unlike
  # ifelse() this keeps a character(0) result for empty input.
  season <- rep("Winter", length(month))
  season[month %in% summer_months] <- "Summer"
  season
}

# Add a "season" column derived from each row's month code.
# vapply() pins the result to one string per row, so an empty table yields an
# empty character vector instead of the list() that sapply() would return.
environment$season <- vapply(
  environment$month, categorize_season, character(1)
)

#======== Lags and CAs ========
# Append lagged copies and cumulative (trailing) averages of one column.
#
# For variable = "x" and max_lag = k the following columns are added, in
# this order:
#   x_lag_0 ... x_lag_k    x shifted down by 0..k rows; the first rows of
#                          each lagged column are NA.
#   x_CA_2 ... x_CA_(k+1)  row-wise mean of x_lag_0 .. x_lag_(n-1). NA lags
#                          are ignored, so the first rows average fewer
#                          values than later rows.
#
# Lags follow the ROW ORDER of `df`: the caller is responsible for passing a
# single series sorted chronologically. Any grouping on `df` is ignored.
#
# Args:
#   df:       data frame holding the series.
#   variable: name (single string) of a numeric column of `df`.
#   max_lag:  largest lag to create (>= 0). With max_lag = 0 only the
#             lag-0 copy is added and no cumulative averages.
#
# Returns:
#   `df` with the lag and cumulative-average columns appended.
create_lags_and_cumulative_averages <- function(df, variable, max_lag = 5) {
  stopifnot(
    is.character(variable), length(variable) == 1L,
    is.numeric(max_lag), length(max_lag) == 1L, max_lag >= 0
  )
  if (!variable %in% names(df)) {
    stop("Column '", variable, "' not found in the data frame", call. = FALSE)
  }

  values <- df[[variable]]
  row_index <- seq_along(values)
  lag_steps <- 0:max_lag

  # Shift by indexing: positions before the start of the series become NA,
  # and indexing with NA yields NA of the column's own type.
  lagged <- lapply(lag_steps, function(k) {
    source_row <- row_index - k
    source_row[source_row < 1L] <- NA
    values[source_row]
  })

  lag_names <- paste(variable, "lag", lag_steps, sep = "_")
  for (i in seq_along(lag_names)) {
    df[[lag_names[i]]] <- lagged[[i]]
  }

  # seq_len() yields an empty sequence for max_lag = 0, where `2:(max_lag + 1)`
  # would count backwards and request a lag column that does not exist.
  for (n in seq_len(max_lag) + 1L) {
    window <- do.call(cbind, lagged[seq_len(n)])
    df[[paste(variable, "CA", n, sep = "_")]] <- rowMeans(window, na.rm = TRUE)
  }

  df
}

# Daily midpoints of the recorded extremes: (max + min) / 2 for temperature
# and for humidity.
environment$mean_temp <- (environment$max_temp + environment$min_temp) / 2
environment$mean_hum <- (environment$max_hum + environment$min_hum) / 2

# Columns that get lag_0..lag_5 and CA_2..CA_6 companions.
# NOTE(review): "max_PM10" is not among the 18 columns renamed after loading,
# so it must already exist under that exact name in the CSV - confirm.
variables <- c(
  "max_temp", "min_temp", "max_hum", "min_hum",
  "mean_hum", "mean_temp", "max_PM10"
)

# Per-region results, keyed by region code.
processed_data <- list()

# Lags must never cross from one region into another, so each at_code is
# processed as its own series.
for (at_code in unique(environment$at_code)) {
  df <- environment[environment$at_code == at_code, ]

  # Lags are positional (previous row = previous day), so the series has to
  # be in chronological order before shifting.
  df <- df[order(df$date), ]

  for (variable in variables) {
    df <- create_lags_and_cumulative_averages(df, variable)
  }

  # Lag 0-3 averages of temperature and relative humidity. Unlike the *_CA_4
  # columns these are NA whenever any of the four lags is missing.
  df$l03max_tmean <- (df$max_temp_lag_0 + df$max_temp_lag_1 +
                        df$max_temp_lag_2 + df$max_temp_lag_3) / 4
  df$l03min_tmean <- (df$min_temp_lag_0 + df$min_temp_lag_1 +
                        df$min_temp_lag_2 + df$min_temp_lag_3) / 4
  df$l03min_rh <- (df$min_hum_lag_0 + df$min_hum_lag_1 +
                     df$min_hum_lag_2 + df$min_hum_lag_3) / 4
  df$l03max_rh <- (df$max_hum_lag_0 + df$max_hum_lag_1 +
                     df$max_hum_lag_2 + df$max_hum_lag_3) / 4

  # Natural cubic spline bases of those averages, stored as matrix columns
  # and used as temperature / humidity adjustment terms in the models.
  # NOTE(review): the bases are built separately for each region, so their
  # knots are region-specific even though the rows are pooled afterwards -
  # confirm this is intended.
  df$ns.max_tmean <- ns(df$l03max_tmean, df = 6)
  df$ns.min_tmean <- ns(df$l03min_tmean, df = 6)
  df$ns.min_rh <- ns(df$l03min_rh, df = 5)
  df$ns.max_rh <- ns(df$l03max_rh, df = 5)

  # PM10 exposure: mean of lag 0 and lag 1, divided by 10 so that a one-unit
  # change in l01pm10 corresponds to a 10 ug/m3 increase.
  df$l01pm10 <- (df$max_PM10_lag_0 + df$max_PM10_lag_1) / 2
  df$l01pm10 <- df$l01pm10 / 10

  # Always key by name; a numeric code would otherwise index by position.
  processed_data[[as.character(at_code)]] <- df
}

# Stack the per-region tables back into one data frame.
data <- do.call(rbind, processed_data)

#======== Drop the first few values (because of the lags) ========
# Keep only rows dated outside the closed interval 2010-01-02 .. 2010-01-08.
# NOTE(review): rows dated before 2010-01-02 are kept - confirm that the
# series really starts on that day.
data <- data[
  data$date < as.Date("2010-01-02") | data$date > as.Date("2010-01-08"),
]

#======== Create different datasets for different seasons ========
# Full-period copy plus one subset per value of the season column.
data_entire <- data
data_summer <- data[data$season == "Summer", ]
data_winter <- data[data$season == "Winter", ]

#======== Generate time-stratified strata ========
# One stratum per year x month x day-of-week combination.
data_entire$month <- as.factor(data_entire$month)
data_entire$year <- as.factor(data_entire$year)
data_entire$dow <- as.factor(data_entire$dow)
data_entire$stratum <- with(data_entire, as.factor(year:month:dow))

# Per-row total of the model response within the row's stratum; the models
# use `ind > 0` to skip strata whose response is zero throughout.
# The indicator is built from a_asir_90 because that is the response fitted
# on this table (and what the two-region analysis uses for its own `ind`);
# na.rm keeps a single missing value from blanking an entire stratum.
data_entire$ind <- tapply(
  data_entire$a_asir_90, data_entire$stratum, sum, na.rm = TRUE
)[data_entire$stratum]

# Fit fixed-effects conditional quasi-Poisson regression: the stratum factor
# is passed to `eliminate`, and strata with a zero indicator are excluded.
model.cc <- gnm(a_asir_90 ~ ns.max_tmean + ns.min_rh + l01pm10,
                data = data_entire, family = quasipoisson,
                subset = ind > 0, eliminate = stratum)

# Get Relative Risk for PM10 (exponentiated coefficient with its CI).
Epi::ci.exp(model.cc, subset = "l01pm10")

# Deviance residuals against fitted values, to check for patterns.
# The x-axis is labelled "Fitted Values", so the fitted values are passed
# explicitly; plot(residuals) alone would plot against the row index.
residuals <- residuals(model.cc, type = "deviance")
plot(fitted(model.cc), residuals,
     xlab = "Fitted Values", ylab = "Residuals", main = "Residual Plot")
abline(h = 0, col = "red")


"shoving 'interior' knots matching boundary knots to inside"
"shoving 'interior' knots matching boundary knots to inside"
"shoving 'interior' knots matching boundary knots to inside"
"shoving 'interior' knots matching boundary knots to inside"
"shoving 'interior' knots matching boundary knots to inside"
"shoving 'interior' knots matching boundary knots to inside"
"shoving 'interior' knots matching boundary knots to inside"


# In [56]:
# # Ensure ind is calculated and added to data_entire
# data_entire$ind <- tapply(data_entire$events, data_entire$stratum, sum)[data_entire$stratum]

# # Subset the data based on ind > 0
# subset_data <- data_entire[data_entire$ind > 0, ]

# # Predict on the subset data
# predicted <- predict(model.cc, newdata = subset_data, type = "response")

# # Ensure the predicted values have the same length as the events in the subset data
# stopifnot(length(predicted) == length(subset_data$events))

# # Create a data frame for plotting
# plot_data <- data.frame(observed = subset_data$events, predicted = predicted)

# # Create the plot using ggplot2
# library(ggplot2)
# ggplot(plot_data, aes(x = predicted, y = observed)) +
#   geom_point() +
#   geom_abline(slope = 1, intercept = 0, color = "red") +
#   labs(title = "Predicted vs. Observed Values", x = "Predicted Values", y = "Observed Values") +
#   theme_minimal()

#======== Stratified analysis for AT01 and AT03 ========
# Work on the rows of the two regions only.
dat.2city <- data_entire[data_entire$at_code %in% c("AT01", "AT03"), ]

# Generate space-time-stratified strata: region x year x month x weekday.
dat.2city[c("at_code", "month", "year", "dow")] <- lapply(
  dat.2city[c("at_code", "month", "year", "dow")],
  as.factor
)
dat.2city$stratum <- as.factor(with(dat.2city, at_code:year:month:dow))
# Per-row stratum total of the response; models keep rows with ind > 0.
dat.2city$ind <- with(dat.2city, tapply(a_asir_90, stratum, sum)[stratum])

# Pooled conditional quasi-Poisson model on the space-time strata.
model.cc.adj.city <- gnm(
  a_asir_90 ~ ns.max_tmean + ns.min_rh + l01pm10,
  data = dat.2city,
  family = quasipoisson,
  subset = ind > 0,
  eliminate = stratum
)
Epi::ci.exp(model.cc.adj.city, subset = "l01pm10")

# Same model fitted to one region at a time.
# (model.cc.vlc is the AT01 fit, model.cc.ldn the AT03 fit.)
model.cc.vlc <- gnm(
  a_asir_90 ~ ns.max_tmean + ns.min_rh + l01pm10,
  data = subset(dat.2city, at_code == "AT01"),
  family = quasipoisson,
  subset = ind > 0,
  eliminate = stratum
)
Epi::ci.exp(model.cc.vlc, subset = "l01pm10")

model.cc.ldn <- gnm(
  a_asir_90 ~ ns.max_tmean + ns.min_rh + l01pm10,
  data = subset(dat.2city, at_code == "AT03"),
  family = quasipoisson,
  subset = ind > 0,
  eliminate = stratum
)
Epi::ci.exp(model.cc.ldn, subset = "l01pm10")

# Region-specific PM10 slopes via a PM10-by-region interaction.
model.cc.city.int <- gnm(
  a_asir_90 ~ ns.max_tmean + ns.min_rh + at_code + l01pm10:at_code,
  data = dat.2city,
  family = quasipoisson,
  subset = ind > 0,
  eliminate = stratum
)
Epi::ci.exp(model.cc.city.int, subset = "l01pm10")

# Effect modification by region: compare the common-slope model with the
# interaction model.
anova(model.cc.adj.city, model.cc.city.int, test = "LRT")

# In [58]:
#======== Male and Female / Years distribution of events ========
age_pop <- read.csv("dataset_thesis/population_age_incidence.csv")

# pivot_table <- age_pop %>%
#   group_by(date, at_code, sex, age_range) %>%
#   summarise(events_count = n()) %>%
#   pivot_wider(names_from = c(sex, age_range), values_from = events_count, values_fill = 0)

# # If you want to reset index to make date and at_code regular columns
# pivot_table <- pivot_table %>%
#   ungroup() %>%
#   mutate(across(c(date, at_code), as.character))

# pivot_table$date <- as.Date(pivot_table$date)

# long <- inner_join(data_entire, pivot_table, by = c("date", "at_code"))

# Number of age_pop rows per day, region and age range.
# The date is parsed before grouping so the join key already has the same
# type as data_entire$date, and .groups = "drop" returns a plain ungrouped
# tibble instead of one that is still grouped by date and at_code.
pivot_table <- age_pop %>%
  mutate(date = as.Date(date)) %>%
  group_by(date, at_code, age_range) %>%
  summarise(events_count = n(), .groups = "drop")

# Attach the daily exposure columns to every (date, region, age range) count.
# NOTE(review): an inner join keeps only combinations that occur in age_pop,
# so a day/region/age range with no rows gets no zero-count record here -
# confirm that this is what the age models below should be fitted on.
long_1 <- inner_join(data_entire, pivot_table, by = c("date", "at_code"))
# Conditional quasi-Poisson model on the time strata, with age range entered
# as an ordinary covariate.
model.cc.adj.age <- gnm(
  events_count ~ factor(age_range) + ns.max_tmean + ns.min_rh + l01pm10,
  data = long_1,
  family = quasipoisson,
  eliminate = stratum
)
Epi::ci.exp(model.cc.adj.age, subset = "l01pm10")

# Age-time strata: year x month x weekday x age range, plus the per-row
# stratum total of the counts used to skip all-zero strata.
long_1$stratum4 <- as.factor(
  paste(long_1$year, long_1$month, long_1$dow, long_1$age_range, sep = ":")
)
long_1$ind4 <- with(long_1, tapply(events_count, stratum4, sum)[stratum4])

# Conditional quasi-Poisson model with the age-time strata eliminated.
model.cc.str4 <- gnm(
  events_count ~ ns.max_tmean + ns.min_rh + l01pm10,
  data = long_1,
  family = quasipoisson,
  subset = ind4 > 0,
  eliminate = stratum4
)
Epi::ci.exp(model.cc.str4, subset = "l01pm10")

summary(model.cc.adj.age)
summary(model.cc.str4)

# Age-specific PM10 slopes via a PM10-by-age-range interaction.
model.cc.age.int <- gnm(
  events_count ~ ns.max_tmean + ns.min_rh + factor(age_range) +
    l01pm10:factor(age_range),
  data = long_1,
  family = quasipoisson,
  subset = ind4 > 0,
  eliminate = stratum4
)
Epi::ci.exp(model.cc.age.int, subset = "l01pm10")

# Likelihood-ratio test for the interaction (effect modification by age).
anova(model.cc.str4, model.cc.age.int, test = "LRT")
