# Bayesian stats

- work in progress
- data cleaning demo
- Bayesian analysis

This script puts the theory from the notes on [Bayesian statistics](https://github.com/tros01/project_notes/blob/main/bayesian_stats_r.ipynb) to work.

In [32]:
# Libraries
library(tidyverse)
library(readxl)

In [33]:
# Helpers
excol_to_index <- function(excel_col) {
    # Convert an Excel column code into its 1-based column index
    # ("A" -> 1, "Z" -> 26, "AA" -> 27).
    #
    # Args:
    #   excel_col: A single column code. It is coerced to character,
    #     trimmed and upper-cased before parsing, so " ab " works too.
    #
    # Returns:
    #   The column index as a number, or NA (with a warning) when the input
    #   is not exactly one non-empty string made up of the letters A-Z.

    # Check the length first: NULL / zero-length input would turn the scalar
    # condition below into NA, and a vector of codes would make `||` fail.
    if (length(excel_col) != 1) {
        warning("Input must be a non-empty character string.")
        return(NA)
    }

    code <- trimws(toupper(as.character(excel_col)))

    if (is.na(code) || !nzchar(code)) {
        warning("Input must be a non-empty character string.")
        return(NA)
    }

    # Map each character to its position in the alphabet (A = 1 ... Z = 26)
    letter_values <- utf8ToInt(code) - utf8ToInt("A") + 1

    # utf8ToInt() yields NA for invalid UTF-8; treat that as a non-letter
    if (anyNA(letter_values) || any(letter_values < 1 | letter_values > 26)) {
        warning("Input must contain only letters.")
        return(NA)
    }

    # Read the letters as digits of a base-26 number, most significant first
    Reduce(\(acc, digit) acc * 26 + digit, letter_values, 0)
}

In [34]:
# Raw data
# Folder that holds the Electoral Commission tracker workbook
root_folder <- r"(D:\data\electoral_commission_public_attitudes)"

# Read the "Counts" sheet as text, skipping the first 7 rows and treating no
# row as a header; the generated column names are then cleaned by janitor.
ec_ds_raw <- suppressMessages(
    janitor::clean_names(
        read_excel(
            path = file.path(
                root_folder,
                r"(Electoral Commission Winter Tracker 2018-2025 Historical Data v3.xlsx)"
            ),
            sheet = "Counts",
            col_names = FALSE,
            col_types = "text",
            skip = 7
        )
    )
)

# NOTE(review): `table` is a second handle on the same data and shares its
# name with base::table(); kept in case other cells read it - confirm.
table <- ec_ds_raw

## Survey data

### Reference data frame

In [35]:
# Extract a df of indexed questions
# Build the question index from the sheet already loaded into `ec_ds_raw`
# (same sheet, same `skip`, same cleaned names), so the workbook is not read
# a second time. A question row is one with text in the first column and
# nothing in the second; its block runs until the row before the next
# question row.
sheet_rows <- nrow(ec_ds_raw)

ec_qs <- ec_ds_raw |>
    select(1:2) |>
    mutate(start_index = row_number()) |>
    filter(!is.na(x1) & is.na(x2)) |>
    mutate(
        # The last question has no successor: close its range at the final
        # row of the sheet instead of leaving end_index as NA.
        end_index = lead(start_index, default = sheet_rows + 1L) - 1
    ) |>
    select(start_index, end_index, question = x1)

# NOTE(review): `table` mirrors ec_qs and shares its name with base::table();
# kept in case other cells read it - confirm.
table <- ec_qs

ec_qs |> head(n = 5)

start_index,end_index,question
<int>,<dbl>,<chr>
1,20,year. Year
21,21,How knowledgeable would you say you are about the following election types in the UK?
22,35,S7b_1. UK Parliament (General Elections)
36,49,S7b_2. Local elections
50,63,S7b_3. Senedd/Northern Ireland assembly/Scottish Parliament


### Dataset

In [36]:
# Construct a header
# Read the three rows that follow the first four rows of the "Counts" sheet;
# they carry the header information for the data block read with skip = 7.
ec_header_raw <- suppressMessages(
    janitor::clean_names(
        read_excel(
            path = file.path(
                root_folder,
                r"(Electoral Commission Winter Tracker 2018-2025 Historical Data v3.xlsx)"
            ),
            sheet = "Counts",
            col_names = FALSE,
            col_types = "text",
            skip = 4,
            n_max = 3
        )
    )
)

# NOTE(review): `table` mirrors ec_header_raw and shares its name with
# base::table(); kept in case other cells read it - confirm.
table <- ec_header_raw

# Top header (lower-cased labels)
row1 <- str_to_lower(as.character(ec_header_raw[1, ]))
# Years
row2 <- as.numeric(unlist(ec_header_raw[2, ]))
# Excel-style index
row3 <- as.character(ec_header_raw[3, ])

# Clean the header: drop spaces, "/" and "+", turn "-" and ":" into "_",
# then discard the empty (NA) cells
row1 <- row1 |>
    str_replace_all("[ /+]", "") |>
    str_replace_all("[-:]", "_")
row1 <- row1[!is.na(row1)]
print(row1)

 [1] "total"                  "year"                   "gender_male"           
 [4] "gender_female"          "age_16_24"              "age_25_34"             
 [7] "age_35_44"              "age_45_64"              "age_65_74"             
[10] "age_75"                 "socialgrade_ab"         "socialgrade_c1"        
[13] "socialgrade_c2"         "socialgrade_de"         "socialgrade_net_abc1"  
[16] "socialgrade_net_c2de"   "nation_england"         "nation_wales"          
[19] "nation_scotland"        "nation_northernireland" "nation_greatbritain"   


In [37]:
    # Construct a new header
# Every year covered by the header, computed once rather than per label
survey_years <- seq(min(row2, na.rm = TRUE), max(row2, na.rm = TRUE))

# Expand each breakdown label (everything in row1 except "total") into one
# "<label>_<year>" name per year. vapply() pins the result to one column of
# length(survey_years) names per label, where sapply() would silently change
# shape on unexpected input.
ec_cols <- vapply(
    row1[row1 != "total"],
    \(label) paste0(label, "_", survey_years),
    FUN.VALUE = character(length(survey_years))
)

# c() flattens ec_cols column by column, i.e. all years of the first label,
# then all years of the second, and so on
ec_header <- c("qa", "total", ec_cols)

# Dimension check
length(ec_header) == ncol(ec_ds_raw)

In [38]:
# Main dataset data frame
# Apply the constructed header to the raw sheet and convert every column
# except the `qa` label column from text to numeric. Cells that do not parse
# as numbers become NA (as.numeric() warns about the coercion).
ec_ds <- ec_ds_raw |>
    setNames(ec_header) |>
    mutate(across(!qa, as.numeric))

head(ec_ds, n = 10)

[1m[22m[36mℹ[39m In argument: `across(.cols = -qa, .fns = ~as.numeric(.))`.
[33m![39m NAs introduced by coercion


qa,total,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,year_2024,year_2025,⋯,nation_northernireland_2024,nation_northernireland_2025,nation_greatbritain_2018,nation_greatbritain_2019,nation_greatbritain_2020,nation_greatbritain_2021,nation_greatbritain_2022,nation_greatbritain_2023,nation_greatbritain_2024,nation_greatbritain_2025
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
year. Year,,,,,,,,,,⋯,,,,,,,,,,
Unweighted base,31304.0,1300.0,1731.0,3201.0,3418.0,5486.0,4375.0,5874.0,5919.0,⋯,1000.0,1036.0,1100.0,1530.0,2701.0,2906.0,4928.0,3865.0,4874.0,4883.0
Base,31076.98,1300.0,1730.98,3201.0,3417.0,5260.0,4375.0,5874.0,5919.0,⋯,166.85,169.77,1264.91,1687.23,3111.37,3311.39,5102.2,4249.7,5707.15,5749.23
Effective base,17219.04,1136.44,1297.81,2283.0,2001.55,2376.28,2735.17,3039.85,2924.72,⋯,805.0,787.61,1080.48,1238.27,2164.91,1888.27,2245.74,2594.98,2878.38,2767.81
2018,1300.0,1300.0,,,,,,,,⋯,,,1264.91,,,,,,,
,,,,,,,,,,⋯,,,,,,,,,,
2019,1730.98,,1730.98,,,,,,,⋯,,,,1687.23,,,,,,
,,,,,,,,,,⋯,,,,,,,,,,
2020,3201.0,,,3201.0,,,,,,⋯,,,,,3111.37,,,,,
,,,,,,,,,,⋯,,,,,,,,,,


In [39]:
# Dimensions of the main dataset df
# nrow() and ncol() return integers, so format them with %d; the earlier
# "% s" applied the space flag (meant for signed numbers) to a string
# conversion. The printed text is unchanged.
print(sprintf("%d rows and %d cols", nrow(ec_ds), ncol(ec_ds)))

[1] "6009 rows and 162 cols"


### Question-specific subsets

In [40]:
# Q9D2. How did you register to vote (2022)?
# Look the question up in the index and read off its row range
q9d2 <- "How did you register to vote"
q9d2_match <- ec_qs |>
    filter(str_detect(question, q9d2))

q9d2_start <- as.numeric(q9d2_match$start_index)
q9d2_end <- as.numeric(q9d2_match$end_index)

cat("Index range:")
c(q9d2_start, q9d2_end)

Index range:

In [41]:
# Q9D2 subset: the 2022 counts, with the answer labels normalised and
# collapsed into "online", "paper" and "neither"
q9d2_answers <- c("online", "paper", "neither")

q9d2_ds <- ec_ds |>
    slice(q9d2_start:q9d2_end) |>
    select(qa, year_2022) |>
    mutate(
        # Normalise the label: trim, lower-case, spaces -> "_", drop apostrophes
        qa = qa |>
            str_trim() |>
            str_to_lower() |>
            str_replace_all(" ", "_") |>
            str_replace_all("'", ""),
        # Any label containing "using" counts as paper; "can't remember"
        # is folded into "neither"
        qa = case_when(
            str_detect(qa, "using") ~ "paper",
            qa == "cant_remember" ~ "neither",
            .default = qa
        )
    ) |>
    filter(qa %in% q9d2_answers) |>
    # NOTE(review): as.integer() drops any fractional part of the counts -
    # confirm truncation (rather than rounding) is intended.
    mutate(across(!qa, as.integer))

q9d2_ds

qa,year_2022
<chr>,<int>
online,50
paper,7
neither,2


In [42]:
# Q23D. A deepfake video is a media in which a person in an existing video is
# replaced with someone else's likeness. Have you seen a deepfake video in
# the last year?
# Look the question up in the index and read off its row range
q23d <- "A deepfake video is a media in which a person"
q23d_match <- ec_qs |>
    filter(str_detect(question, q23d))

q23d_start <- as.numeric(q23d_match$start_index)
q23d_end <- as.numeric(q23d_match$end_index)

cat("Index range:")
c(q23d_start, q23d_end)

Index range:

In [43]:
# Q23D subset: the 2022 and 2023 counts, with the answer labels normalised
# and reduced to "yes", "no" and "neither"
q23d_answers <- c("yes", "no", "neither")

q23d_ds <- ec_ds |>
    slice(q23d_start:q23d_end) |>
    select(qa, year_2022, year_2023) |>
    mutate(
        # Normalise the label: trim, lower-case, spaces -> "_", drop apostrophes
        qa = qa |>
            str_trim() |>
            str_to_lower() |>
            str_replace_all(" ", "_") |>
            str_replace_all("'", ""),
        # "don't know" is folded into "neither"
        qa = if_else(qa %in% "dont_know", "neither", qa)
    ) |>
    filter(qa %in% q23d_answers) |>
    # NOTE(review): as.integer() drops any fractional part of the counts -
    # confirm truncation (rather than rounding) is intended.
    mutate(across(!qa, as.integer))

q23d_ds

qa,year_2022,year_2023
<chr>,<int>,<int>
yes,1054,856
no,2466,2062
neither,1739,1455


### Analysis