In [1]:
# Loading Packages
library(tidyverse)
library(easystats)

library(arrow) # For reading .parquet files, which are much smaller
library(data.table)
library(dtplyr)

library(ggplot2)
library(ggthemes)
library(patchwork)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 4.0.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.4     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
[34m# Attaching packages: easystats 0.7.5[39m
[32m✔ [39m[37mbayestestR [39m [32m0.17.0[39m   [32m✔ [39m[37mcorrelation[39m [32m0.8.8 [39m
[32m✔ [39m[37m

In [133]:
#' Show a target sentence together with its preceding context sentences
#'
#' @param dt A data.table containing at least the columns `Sentence_ID`,
#'   `Phrase_Token` and `Phrase_Surprisal`.
#' @param target_id The `Sentence_ID` value of the sentence to display.
#' @param num Maximum number of preceding sentences to include. Fewer are
#'   returned when the target is near the start of `dt`; `0` (or less)
#'   returns the target rows only.
#' @return A tibble holding one row per context sentence (with a blank
#'   `Phrase_Token` and a missing `Phrase_Surprisal`) followed by every row
#'   of the target sentence, or `NULL` when `target_id` is not in `dt`.
context <- function(dt, target_id, num = 3) {
  # Sentence order is the order in which each ID first appears in `dt`.
  u_ids <- unique(dt$Sentence_ID)

  target_pos <- match(target_id, u_ids)
  if (is.na(target_pos)) {
    return(NULL)
  }

  # How many predecessors can actually be shown. Indexing through seq_len()
  # avoids the descending `a:b` range that appeared when `num` was 0 (which
  # pulled in the target itself), and an empty selection keeps the type of
  # the ID column so the join below still lines up.
  n_prev <- max(0, min(num, target_pos - 1))
  prev_ids <- u_ids[seq_len(n_prev) + (target_pos - 1 - n_prev)]

  # First row of each context sentence; the join returns a new table, so the
  # in-place edits below do not touch `dt`.
  dt_context <- dt[.(prev_ids), on = "Sentence_ID", mult = "first"]
  dt_context[, Phrase_Token := ""]
  dt_context[, Phrase_Surprisal := NA]

  dt_target <- dt[Sentence_ID == target_id]

  as_tibble(rbind(dt_context, dt_target))
}

    #' Show every row of a target sentence and of the sentences before it
    #'
    #' Sentences are identified as runs of consecutive identical
    #' `Sentence_ID` values in `dt`.
    #'
    #' @param dt A data.table with a `Sentence_ID` column.
    #' @param target_id The `Sentence_ID` value of the sentence to display.
    #' @param num Number of preceding sentence runs to include.
    #' @return A tibble with all rows of the target run and of up to `num`
    #'   runs before it, without a `grp_idx` column. A zero-row tibble is
    #'   returned when `target_id` does not occur in `dt`.
    context_full <- function(dt, target_id, num = 3) {
      # Keep the run index in a local vector: writing it into `dt` with `:=`
      # would add a column to the caller's table by reference.
      grp_idx <- rleid(dt$Sentence_ID)

      # Run index of the first row of the target sentence (NA if absent).
      target_grp <- grp_idx[match(target_id, dt$Sentence_ID)]

      # which() drops NA comparisons, so a missing target selects no rows.
      rows <- which(grp_idx >= (target_grp - num) & grp_idx <= target_grp)
      result <- dt[rows]

      # The result never exposes a `grp_idx` column, even if `dt` has one.
      result <- result %>% select(-any_of("grp_idx"))
      as_tibble(result)
    }

In [2]:
# Read the "Filtered Sentences" and "Head Nouns" parquet files into memory.
df_sent <- collect(open_dataset("Data/Filtered Sentences.parquet"))
df_nps <- collect(open_dataset("Data/Head Nouns.parquet"))


In [None]:
 # Load only the four columns needed for context lookups from the full
 # dataset, then convert the collected table to a data.table.
 df_full <- setDT(
   collect(
     select(
       open_dataset("Data/Full Data.parquet"),
       Sentence_Text, Phrase_Token, Phrase_Surprisal, Sentence_ID
     )
   )
 )

In [163]:
# Target sentence AD1_0794 with the default number of context sentences.
context(df_full, target_id = "AD1_0794")

Sentence_Text,Phrase_Token,Phrase_Surprisal,Sentence_ID
<chr>,<chr>,<dbl>,<chr>
"‘ There , ’ Dorothea said , ‘ there , ’ and poured the tea .",,,AD1_0791
"‘ I have never let myself go like that , ’ Alida said , ‘ never .",,,AD1_0792
It proves to what a point I am brought . ’,,,AD1_0793
Dorothea reassured her .,Dorothea,0.6249965,AD1_0794
Dorothea reassured her .,reassured,5.1876678,AD1_0794
Dorothea reassured her .,her,0.1723633,AD1_0794
Dorothea reassured her .,.,1.6347656,AD1_0794


In [162]:
# Attach each sentence's maximum surprisal to its rows, order the rows by
# that maximum (ascending) and show the first 20.
df_nps %>%
  select(Sentence_Text, Phrase_Token, argPos, surprisal, Sentence_ID) %>%
  mutate(max_surprisal = max(surprisal), .by = Sentence_ID) %>%
  arrange(max_surprisal) %>%
  slice_head(n = 20)

Sentence_Text,Phrase_Token,argPos,surprisal,Sentence_ID,max_surprisal
<chr>,<chr>,<fct>,<dbl>,<chr>,<dbl>
A Brownie always tells the truth .,A Brownie,sbj,0.32424784,G24_0268,0.3242478
A Brownie always tells the truth .,the truth,obj,0.07298279,G24_0268,0.3242478
Tertiary alkanols have three alkyl groups .,Tertiary alkanols,sbj,0.32697952,HSB_0669,0.3269795
Tertiary alkanols have three alkyl groups .,three alkyl groups,obj,0.09336146,HSB_0669,0.3269795
Preobrazhensky argued that :,Preobrazhensky,sbj,0.49144783,BMA_1265,0.4914478
Preobrazhensky argued that :,that,obj,0.17614746,BMA_1265,0.4914478
Haavikko shook his head .,Haavikko,sbj,0.52361721,G04_2321,0.5236172
Haavikko shook his head .,his head,obj,0.05979919,G04_2321,0.5236172
who fears the Lord .,who,sbj,0.57226562,ARG_0069,0.5722656
who fears the Lord .,the Lord,obj,0.39587402,ARG_0069,0.5722656


In [151]:
# Target sentence J2U_0641 preceded by up to three context sentences.
context(df_full, target_id = "J2U_0641", num = 3)

Sentence_Text,Phrase_Token,Phrase_Surprisal,Sentence_ID
<chr>,<chr>,<dbl>,<chr>
Aluminium recycling in the UK rose to 9.5 last year .,,,J2U_0638
"The number of UK recycling centres has risen from 24 in 1988 to 320 , whilst 163 local authorities registered for aluminium can recycling programmes .",,,J2U_0639
Financial Times 8 March,,,J2U_0640
Robot eats plastic packaging,Robot,20.34375,J2U_0641
Robot eats plastic packaging,eats,9.546875,J2U_0641
Robot eats plastic packaging,plastic packaging,3.798828,J2U_0641
Robot eats plastic packaging,plastic packaging,3.798828,J2U_0641


Sentence_Text,Phrase_Token,Phrase_Surprisal,Sentence_ID
<chr>,<chr>,<dbl>,<chr>
‘ And so you try to make it shorter . ’,,,FNT_4186
She looked at him for a moment and burst out laughing .,,,FNT_4187
‘ Are you offering that as an interpretation ? ’,,,FNT_4188
Jacob shook his head sadly .,Jacob,9.21875,FNT_4189
Jacob shook his head sadly .,shook,3.44335938,FNT_4189
Jacob shook his head sadly .,his head,0.01050377,FNT_4189
Jacob shook his head sadly .,his head,0.01050377,FNT_4189
Jacob shook his head sadly .,sadly,4.9609375,FNT_4189
Jacob shook his head sadly .,.,0.51953125,FNT_4189


In [68]:
# Manual context-window lookup: keep the target sentence plus the three
# sentence runs that precede it in the full dataset.

# Number each run of consecutive identical Sentence_ID values. This adds the
# grp_idx column to df_full by reference.
df_full[, `:=`(grp_idx = rleid(Sentence_ID))]

# Run index of the first row belonging to the target sentence.
target_grp <- df_full$grp_idx[match("EF4_0684", df_full$Sentence_ID)]

result <- df_full[grp_idx <= target_grp & grp_idx >= (target_grp - 3)]

as_tibble(result)

Sentence_Text,Phrase_Token,Phrase_Surprisal,Sentence_ID,grp_idx
<chr>,<chr>,<dbl>,<chr>,<int>
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",And,3.6738281,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",since,5.0000000,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",that assumption,2.2856445,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",that assumption,2.2856445,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",is,0.4821777,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",implicit,6.7304688,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",in,0.2346191,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",the Webbs ' criticism,3.7994385,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",the Webbs ' criticism,3.7994385,EF4_0681,2178987
"And since that assumption is implicit in the Webbs ' criticism of Owen , it can not be understood too clearly that on their argument there can be no place for industrial co-operatives , properly so-called : no place , because the ownership , whether individual or collective , of an industrial co-operative rests in the people working in it .",the Webbs ' criticism,3.7994385,EF4_0681,2178987
