In [3]:
# Loading Packages
library(tidyverse)
library(easystats)

library(arrow) # For reading .parquet files, which are much smaller
library(data.table)
library(dtplyr)
library(emmeans)

library(ggplot2)
library(ggthemes)
library(patchwork)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mggplot2  [39m 4.0.1     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.4     [32m✔[39m [34mtidyr    [39m 1.3.1
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
[34m# Attaching packages: easystats 0.7.5[39m
[32m✔ [39m[37mbayestestR [39m [32m0.17.0[39m   [32m✔ [39m[37mcorrelation[39m [32m0.8.8 [39m
[32m✔ [39m[37mdatawizard [39m [32m1.3.0 [39m   [32m✔ 

In [3]:
context <- function(df, target_id, num=3) {
        df_idx <- df %>%
        mutate(sent_idx = consecutive_id(Sentence_ID))
    
    target <- df_idx %>%
        filter(Sentence_ID == target_id) %>% 
        slice(1) %>% 
        pull(sent_idx)

    if(length(target) == 0) return(NULL)

    df_idx %>% 
        filter(
            between(
                sent_idx,
                target - num,
                target
            )
        ) %>% 
        mutate(
            Phrase_Token = if_else(Sentence_ID != target_id, "", phr_tok),
            Phrase_Surprisal = if_else(Sentence_ID != target_id, NA_real_, surprisal)
        ) %>%
        select(-sent_idx)
}

In [4]:
context <- function(dt, target_id, num=3) {

    u_ids <- unique(dt$Sentence_ID)
    

    target_pos <- match(target_id, u_ids)
    
    if (is.na(target_pos)) return(NULL)
    

    start_idx <- max(1, target_pos - num)
    end_idx   <- target_pos - 1
    
    prev_ids <- if (target_pos > 1) u_ids[start_idx:end_idx] else integer(0)
    
    dt_context <- dt[.(prev_ids), on = "Sentence_ID", mult = "first"]
    dt_context[, Phrase_Token := ""]
    dt_context[, Phrase_Surprisal := NA]
    
    dt_target <- dt[Sentence_ID == target_id]
    
    result <- rbind(dt_context, dt_target)

    # result <- result %>% select(-grp_idx)

    as_tibble(result)
    }

    context_full <- function(dt, target_id, num=3) {
    dt[, grp_idx := rleid(Sentence_ID)]
    target_grp <- dt[Sentence_ID == target_id, grp_idx[1]]
    result <- dt[grp_idx >= (target_grp - num) & grp_idx <= target_grp]
    result <- result %>% select(-grp_idx)
    as_tibble(result)
}

In [5]:
# dt_full <- lazy_dt(read_parquet("Data/No BNC ID/No ID Full Data.parquet"))
df_sent <- read_parquet("Older Files/No BNC ID/No ID Filtered Sentences.parquet")
df_nps <- read_parquet("Older Files/No BNC ID/No ID Head Nouns.parquet")


In [None]:
dt_full <- dt_full %>% 
    lazy_dt()

In [None]:
context(dt_full, "A00_0007", 3)

In [None]:
# Global font metrics for ggPlot: 
    plotFont <- function(fontBase) { # Easy way to adjust font size for plots
        theme( # Add as a final ggplot object (no parentheses)
        plot.title = element_text(size = 14*fontBase),      # Title font size
        axis.title.x = element_text(size = 12*fontBase),    # X-axis title font size
        axis.title.y = element_text(size = 12*fontBase),    # Y-axis title font size
        axis.text.x = element_text(size = 10*fontBase),     # X-axis tick labels font size
        axis.text.y = element_text(size = 10*fontBase),     # Y-axis tick labels font size
        legend.text = element_text(size = 10*fontBase),     # Legend text size
        legend.title = element_text(size = 10*fontBase),    # Legend title size
        strip.text = element_text(size = 10*fontBase)
        )
    }

    custom_theme <-  theme(
                            plot.background = element_rect(fill = "white", color = NA),
                            panel.background = element_rect(fill = "white", color = NA),
                            legend.background = element_rect(fill = "white", color = NA),

                            axis.line = element_line(color = "black", linewidth = 0.75), # <--- The key fix
                            axis.ticks = element_line(color = "black"),
                            panel.grid.major = element_line(color = "grey85"),
                            
                            legend.position = "bottom", 
                            legend.direction = "horizontal",
                            legend.title = element_blank(), # Clean look (optional)

                            plot.margin = margin(5,5,5,5, "mm")

                            
                            ) 

# For httpgd plot view in VS Code
options(repr.plot.width = 12, repr.plot.height = 8)

In [None]:
ggplot(df_nps, aes(x = np_start_idx, fill = definiteness)) + 
    geom_bar() + 
    custom_theme

In [None]:
dt_nps %>% 
    count(definiteness, np_start_idx) %>% 
    collect() %>%
    group_by(np_start_idx) %>%
    mutate(prop = n/sum(n)) %>% 
    pivot_wider(
        names_from = definiteness,
        values_from = c(n, prop), 
        names_vary = "slowest",
    ) %>% 
    arrange(np_start_idx)

In [None]:
context(df_full, "AD1_0794")

In [None]:
df_nps %>% 
    filter() %>% 
    select(Sentence_Text, Phrase_Token, argPos, surprisal, Sentence_ID) %>%
    group_by(Sentence_ID)%>%
    mutate(max_surprisal = max(surprisal)) %>% 
    ungroup() %>% 
    arrange(max_surprisal)%>%
    head(20)

In [None]:
context(df_full, "J2U_0641", 3)

In [12]:
mod <- glm(data = df_nps,
    formula = surprisal ~ argPos * definiteness * np_start_idx,
    family = Gamma(link = 'log'))

In [17]:
library(emmeans)

eff_slopes_log <- emtrends(mod, ~ definiteness, var = "np_start_idx", at = list(argPos = "obj") )


slopes_final <- as.data.frame(eff_slopes) %>% 
    mutate(
        ratio = exp(np_start_idx.trend),
        pct_change = ((ratio-1) * 100)
        )

print(slopes_final)

NOTE: Results may be misleading due to involvement in interactions



  definiteness np_start_idx.trend         SE    df    lower.CL    upper.CL
1        indef        -0.08827350 0.02652147 56796 -0.14025573 -0.03629128
2          def         0.01219296 0.01496353 56796 -0.01713565  0.04152157
      ratio pct_change
1 0.9155104  -8.448955
2 1.0122676   1.226759


In [16]:
# Create a summary of the 6 key values
results_summary <- data.frame(
  Effect = c("Baseline (Bits)", "Definite Obj vs Indef Obj", "Indef Sbj vs Obj", 
             "Def Sbj vs Obj", "Slope: Indefinite", "Slope: Definite"),
  Value = c(
    exp(coef(mod)[1]),                                           # 4.248
    (exp(coef(mod)["definitenessdef"]) - 1) * 100,              # -33.6
    (exp(coef(mod)["argPossbj"]) - 1) * 100,                    # 46.4
    (exp(coef(mod)["argPossbj"] + 
         coef(mod)["argPossbj:definitenessdef"]) - 1) * 100,    # 54.9
    (exp(coef(mod)["np_start_idx"]) - 1) * 100,                  # -2.1
    (exp(coef(mod)["np_start_idx"] + 
         coef(mod)["definitenessdef:np_start_idx"]) - 1) * 100   # 0.4
  )
)

print(results_summary)

                     Effect      Value
1           Baseline (Bits)   4.247669
2 Definite Obj vs Indef Obj -33.617017
3          Indef Sbj vs Obj  46.393744
4            Def Sbj vs Obj  54.866176
5         Slope: Indefinite  -2.114858
6           Slope: Definite   0.430898


In [None]:
# Pulls the preceding sentences from the full dataset to view context window

df_full[, grp_idx := rleid(Sentence_ID)]
target_grp <- df_full[Sentence_ID == "EF4_0684", grp_idx[1]]
result <- df_full[grp_idx >= (target_grp - 3) & grp_idx <= target_grp]

as_tibble(result)