# QP 2 Analysis 
## 12-18-2025

In [1]:
# Packages

library(tidyverse)
library(easystats)

library(arrow)
library(data.table)
library(dtplyr)

library(ggplot2)
library(ggthemes)
library(patchwork)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Attaching packages: easystats 0.7.5
✔ bayestestR  0.17.0   ✔ correlation 0.8.8 
✔ [output truncated]

In [None]:
# Read from CSV ----
# Every CSV in the run directory is stacked into one tibble; `id = "file_name"`
# adds a column recording which file each row was read from.

files <- list.files(
    path = "D:/BNC Full Data/12-17 Run/CSV",
    pattern = "\\.csv$",
    full.names = TRUE
)

# Fail fast with a clear message if the path is wrong or the run produced no
# CSVs, instead of handing an empty vector to read_csv().
if (length(files) == 0) {
    stop("No CSV files found in the input directory.", call. = FALSE)
}

df_full <- read_csv(files, id = "file_name")

# Factor levels for `definiteness` and `argPos` are deliberately not set here;
# this file caches the raw, unfactored import.
write_parquet(df_full, "Data/Full Data Unfactored.parquet")
###



Rows: 111571140 Columns: 41
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (19): bnc_id, consecutive_id, filename, modality, s_text, w_tok, phr_tok...
dbl (17): s_verb_cnt, s_aux_cnt, s_tot_obj_cnt, s_dir_obj_cnt, s_ind_obj_cnt...
lgl  (4): s_trans, is_noun, is_bare_np, is_np_head

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


## Data Prep: 

In [None]:
# Convert the two annotation columns to factors with an explicit level order.
# The first level of each factor is the reference level under R's default
# treatment contrasts; values not listed in `levels` become NA.
#
# The result is stored back in `df_full` (the name every later cell reads).
# `summary()` is called separately so that the data, not its summary table,
# is what gets assigned and written to disk.
df_full <- df_full %>%
    mutate(
        definiteness = factor(definiteness, levels = c("indef", "def", "unmarked")),
        argPos = factor(argPos, levels = c("obj", "sbj", "non-arg"))
    )

summary(df_full)

write_parquet(df_full, "Data/Full Data.parquet")

**Data Attributes:**

- Total tokens: 
- Total sentences: 
- Transitivity: Transitive: ; Intransitive:


In [None]:
# Build the filtered sentence table `df_sent` (still one row per token) ----
#
# Pipeline stages, in order:
#   1. drop token/NP columns that are not needed downstream
#   2. remove duplicate sentences (same `s_text`), keeping the lowest sentence_id
#   3. keep only simple written transitive declaratives
#   4. keep only sentences whose NP tokens all carry a valid argPos/definiteness
#   5. tag each NP token with the index of the first token of its NP (`np_idx`)
#   6. drop sentences whose subject NP does not start at index 0
df_sent <- df_full %>%
    # NOTE(review): the read_csv column spec printed for the import cell lists
    # `s_verb_cnt`, `s_aux_cnt`, ... whereas the filter below uses `*_count`.
    # Normalise the suffix so the pipeline works with either spelling; this is
    # a no-op when no `s_*_cnt` column exists. TODO confirm the real schema.
    rename_with(
        function(nm) str_replace(nm, "_cnt$", "_count"),
        .cols = matches("^s_.*_cnt$")
    ) %>%
    select(
        -filename,
        -w_pos,
        -w_dep,
        -is_bare_np,
        -np_struct,
        -det_pos,
        -head_dep,
        -det_dep,
        -np_sum_surp,
        -np_mean_surp
    ) %>%
    #
    # Removes duplicate sentences: within each distinct sentence text, keep only
    # the rows of the first (lowest) sentence_id.
    arrange(sentence_id) %>%
    group_by(s_text) %>%
    filter(sentence_id == first(sentence_id)) %>%
    ungroup() %>%
    #
    # Filter by criteria
    filter(
        modality == "written",
        s_verb_count == 1,
        s_aux_count == 0,
        s_sbj_count == 1,
        s_tot_obj_count %in% 1,
        s_dir_obj_count == 1,
        s_ind_obj_count == 0,
        s_sub_conj_count == 0,
        s_coord_conj_count == 0,
        s_clausal_comp_count == 0,
        s_rel_clause_count == 0,
        s_adv_clause_count == 0,
        s_pp_count == 0,
        s_comma_count == 0,
        !str_detect(s_text, "\\?"), # Is there a question mark in the text
        s_trans == TRUE
    ) %>%
    # The filter columns are constant after filtering, so drop them.
    select(
        -modality,
        -s_verb_count,
        -s_aux_count,
        -s_sbj_count,
        -s_tot_obj_count,
        -s_dir_obj_count,
        -s_ind_obj_count,
        -s_sub_conj_count,
        -s_coord_conj_count,
        -s_clausal_comp_count,
        -s_rel_clause_count,
        -s_adv_clause_count,
        -s_pp_count,
        -s_comma_count
    ) %>%
    #
    # Removes sentences with invalid argPos or definiteness values: every token
    # flagged `is_noun` must be sbj/obj AND def/indef. `%in%` returns FALSE for
    # NA, so missing values are rejected as well. (The earlier form OR-ed two
    # negated tests, which let every non-NA value through.)
    # NOTE(review): the original had a third, empty condition here, presumably
    # for the "exactly one subject and one object" requirement; that check is
    # not implemented at this stage.
    group_by(sentence_id) %>%
    filter(
        all(
            !is_noun |
                (
                    argPos %in% c("sbj", "obj") &
                        definiteness %in% c("def", "indef")
                )
        )
    ) %>%
    ungroup() %>%
    #
    # Tagging the index of the first token of each NP
    # Flags the first token: a noun token whose predecessor in the sentence is
    # not a noun token or belongs to a different head.
    arrange(sentence_id, w_idx) %>%
    group_by(sentence_id) %>%
    mutate(
        prev_is_noun = lag(is_noun, default = FALSE),
        prev_np_head_text = lag(head_text),
        first_tok_np = is_noun & (!prev_is_noun | head_text != prev_np_head_text)
    ) %>%
    ungroup() %>%
    select(-prev_is_noun, -prev_np_head_text) %>%
    # Copies the smallest w_idx of each run of identical `phr_tok` values to
    # every noun row in that run. `dplyr::` is spelled out because the column
    # spec also lists a data column called `consecutive_id`.
    group_by(sentence_id, np_id = dplyr::consecutive_id(phr_tok)) %>%
    mutate(
        np_idx = ifelse(
            is.na(phr_tok) | is_noun == FALSE,
            NA,
            min(w_idx)
        )
    ) %>%
    ungroup() %>%
    select(-np_id) %>%
    #
    # Removes sentences where subjects are not the first word, i.e. utterance
    # inversion: every subject row must belong to an NP starting at index 0.
    group_by(sentence_id) %>%
    filter(
        all(
            (argPos == "sbj" & np_idx == 0) |
                (argPos != "sbj") |
                is.na(argPos)
        )
    ) %>%
    ungroup()

# Inspect separately so `df_sent` holds the data rather than its summary table.
summary(df_sent)

write_parquet(df_sent, "Data/Filtered Sentences.parquet")

**Filtered Data Description:**

- Total tokens: 
- Total Sentences:
- Total NPs: 
- Definiteness: Def: ; Indef: 


In [None]:
# Creates a table of just head nouns for analysis (one row per NP head).

df_nps <- df_sent %>%
    select(
        -head_text,
        -det_text
    ) %>%
    filter(
        is_noun == TRUE,
        is_np_head == TRUE,
        !is.na(surprisal)
    ) %>%
    # Keep sentences that contribute exactly two head nouns with two different
    # argument positions. n_distinct() counts unique values of a vector;
    # distinct() operates on data frames and cannot be compared to 2.
    group_by(sentence_id) %>%
    filter(n() == 2 & n_distinct(argPos) == 2) %>%
    ungroup()

write_parquet(df_nps, "Data/NPs Only.parquet")

In [None]:
# Reload the cached tables from parquet so the analysis cells below can be run
# without repeating the import and filtering steps.
df_full <- read_parquet(file = "Data/Full Data.parquet")
df_sent <- read_parquet(file = "Data/Filtered Sentences.parquet")
df_nps  <- read_parquet(file = "Data/NPs Only.parquet")

## Data Description: 

In [None]:
# Count of each NP type (argPos x definiteness), as counts and as shares of
# all head nouns:

print("Grand totals")
df_nps %>%
    count(argPos, definiteness) %>%
    pivot_wider(
        names_from = definiteness,
        values_from = n,
        values_fill = 0
    )

print("Grand props")
df_nps %>%
    count(argPos, definiteness) %>%
    mutate(prop = n / sum(n)) %>%
    # Drop the raw count: otherwise pivot_wider() treats `n` as an id column
    # and produces one sparse row per (argPos, n) pair.
    select(-n) %>%
    pivot_wider(
        names_from = definiteness,
        values_from = prop,
        values_fill = 0
    )

In [None]:
# Proportions by category:
# `.by` computes each proportion within its group and returns an ungrouped
# table; `values_fill = 0` shows an absent combination as 0 rather than NA.

# Share of each definiteness value within an argument position
print("Grouped by argPos")
df_nps %>%
    count(argPos, definiteness) %>%
    mutate(prop = n / sum(n), .by = argPos) %>%
    pivot_wider(
        names_from = definiteness,
        values_from = c(n, prop),
        names_vary = "slowest",
        values_fill = 0
    )

# Share of each argument position within a definiteness value
print("Grouped by definiteness")
df_nps %>%
    count(definiteness, argPos) %>%
    mutate(prop = n / sum(n), .by = definiteness) %>%
    pivot_wider(
        names_from = argPos,
        values_from = c(n, prop),
        names_vary = "slowest",
        values_fill = 0
    )

In [None]:
# NPs by start position and definiteness:
# one row per definiteness value, one n/prop column pair per start position.
# `prop` is the share of each definiteness value within a start position.

df_nps %>%
    count(definiteness, np_idx) %>%
    mutate(prop = n / sum(n), .by = np_idx) %>%
    # Sort before widening: once pivoted, `np_idx` is no longer a column, and
    # the wide columns appear in the order the positions are first seen.
    arrange(np_idx) %>%
    pivot_wider(
        names_from = np_idx,
        values_from = c(n, prop),
        names_vary = "slowest",
        values_fill = 0
    )

## Statistics

In [None]:
# Gamma GLM with a log link: head-noun surprisal modelled by argument
# position, definiteness, NP start index, and all of their interactions.
model <- glm(
    surprisal ~ argPos * definiteness * np_idx,
    family = Gamma(link = "log"),
    data = df_nps
)

In [None]:
# Coefficient table on the link (log) scale
summary(object = model)
# Same parameters with exponentiated coefficients
model_parameters(model = model, exponentiate = TRUE)

- Baseline est: 
- DefObj:
- IndefSbj:
- DefSbj:
- NP Idx x Indef:
- NP Idx x Def: 

## Plots

In [None]:
# Global font metrics for ggPlot: 
    # Font-size theme for ggplot2 plots, scaled by one multiplier.
    #
    # fontBase: numeric multiplier applied to every base size listed below
    #           (defaults to 1, i.e. the sizes exactly as written).
    # Returns:  a ggplot2 theme object. Add it to a plot by CALLING the
    #           function, e.g. `p + plotFont(1)`.
    plotFont <- function(fontBase = 1) {
        theme(
            plot.title = element_text(size = 14 * fontBase),    # Title
            axis.title.x = element_text(size = 12 * fontBase),  # X-axis title
            axis.title.y = element_text(size = 12 * fontBase),  # Y-axis title
            axis.text.x = element_text(size = 10 * fontBase),   # X-axis tick labels
            axis.text.y = element_text(size = 10 * fontBase),   # Y-axis tick labels
            legend.text = element_text(size = 10 * fontBase),   # Legend entries
            legend.title = element_text(size = 10 * fontBase),  # Legend title
            strip.text = element_text(size = 10 * fontBase)     # Facet strip labels
        )
    }

    # Shared look for all figures: white backgrounds, black axis lines and
    # ticks, light-grey major grid, untitled horizontal legend at the bottom.
    custom_theme <- theme(
        # Backgrounds
        plot.background = element_rect(fill = "white", color = NA),
        panel.background = element_rect(fill = "white", color = NA),
        legend.background = element_rect(fill = "white", color = NA),

        # Axes and grid
        axis.line = element_line(color = "black", linewidth = 0.75),
        axis.ticks = element_line(color = "black"),
        panel.grid.major = element_line(color = "grey85"),

        # Legend placement; the legend title is suppressed
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.title = element_blank(),

        # 5 mm margin on every side
        plot.margin = margin(5, 5, 5, 5, "mm")
    )

# Default size of plots rendered in the notebook (repr options);
# originally noted as being for the httpgd plot view in VS Code.
options(repr.plot.width = 12, repr.plot.height = 8)

In [None]:
# Box plot of surprisal by argument position, filled by definiteness

p_argDefSurp <- ggplot(
    df_nps,
    aes(x = argPos, y = surprisal, fill = definiteness)
) +
    # Outlier points are not drawn
    geom_boxplot(outlier.shape = NA) +
    # Zoom the y-axis without dropping any data from the box statistics
    coord_cartesian(ylim = c(-0.1, 13)) +
    scale_x_discrete(labels = c("sbj" = "Subject", "obj" = "Object")) +
    scale_fill_discrete(labels = c("indef" = "Indefinite", "def" = "Definite")) +
    labs(
        # title = "Argument Position, Definiteness, and Surprisal",
        x = "Argument Position",
        y = "Surprisal",
        fill = "Definiteness:"
    ) +
    custom_theme +
    plotFont(1)


        # ggsave(
        # "plots/p_argDefSurp.png",
        # p_argDefSurp,
        # width = 8,
        # height = 6,
        # dpi = 300)

In [None]:
# Plot of the NP start-position effect: smoothed surprisal by position,
# one curve per definiteness value.
# The start position column created by the sentence pipeline is `np_idx`
# (the same column the GLM uses), so it is referenced by that name here.

p_npIdxSurp <- ggplot(data = df_nps, aes(x = np_idx, y = surprisal)) +
    geom_smooth(se = FALSE, aes(color = definiteness)) +
    # Spell out `xlim` instead of relying on partial matching of `x`.
    # Start positions were noted to range from 0 to 16.
    coord_cartesian(xlim = c(-0.05, 17)) +
    labs(
        # title = "Surprisal and NP Start Position",
        x = "NP Start Position",
        y = "Surprisal",
        color = "Definiteness:"
    ) +
    scale_color_discrete(labels = c("indef" = "Indefinite", "def" = "Definite")) +
    custom_theme +
    plotFont(1)
    
# ggsave(
#         "plots/p_npIdxSurp.png",
#         p_npIdxSurp,
#         width = 10,
#         height = 5,
#         dpi = 300)


In [None]:
# Plot of position distribution: for each definiteness value, the proportion
# of its NPs that start at each position (bars dodged side by side).
# `group = definiteness` makes after_stat(prop) a within-definiteness share.
# The start position column created by the sentence pipeline is `np_idx`.

p_npIdxProp <- ggplot(
    data = df_nps,
    aes(
        x = np_idx,
        y = after_stat(prop),
        fill = definiteness,
        group = definiteness
    )
) +
    geom_bar(position = "dodge") +
    labs(
        # title = "Proportions of NP Start Positions",
        x = "NP Start Position",
        y = "Proportion",
        fill = "Definiteness:"
    ) +
    scale_fill_discrete(labels = c("indef" = "Indefinite", "def" = "Definite")) +
    custom_theme +
    plotFont(1)

    # ggsave(
    #     "plots/p_npIdxProp.png",
    #     p_npIdxProp,
    #     width = 8,
    #     height = 6,
    #     dpi = 300)

In [None]:
# Bar plot of definiteness and position: stacked counts of NPs per start
# position. The start position column created by the pipeline is `np_idx`.

p_defIdx <- ggplot(df_nps, aes(x = np_idx, fill = definiteness)) +
    geom_bar() +
    custom_theme

    # ggsave(
    #     "plots/p_defIdx.png",
    #     p_defIdx,
    #     width = 8,
    #     height = 6,
    #     dpi = 300)

In [None]:
# Distribution of surprisals: posterior-predictive style check of the fitted
# GLM, restyled to match the other figures.
# The fitted model object is called `model`; `mod` is never defined.

p_checkMod <- plot(check_predictions(model)) +
    # Spell out `xlim` instead of relying on partial matching of `x`
    coord_cartesian(xlim = c(0, 22)) +
    labs(
        title = NULL,
        subtitle = NULL,
        x = "Surprisal",
        y = "Density"
    ) +
    custom_theme +
    # Thicker legend keys so the line colours are readable
    guides(
        color = guide_legend(override.aes = list(linewidth = 1.5))
    ) +
    plotFont(1)

# ggsave(
#         "plots/p_checkMod.png",
#         p_checkMod,
#         width = 10,
#         height = 5,
#         dpi = 300)

In [None]:
# Side by side of the box plot for argument and definiteness and for np position
# (`+` on two ggplot objects combines them into one figure via patchwork,
# which is loaded in the packages cell)

p_comb_surp <- p_argDefSurp + p_npIdxSurp

# ggsave(
#         "plots/p_comb_surp.png",
#         p_comb_surp,
#         width = 10,
#         height = 5,
#         dpi = 300)