In [1]:
!pip install conllu
!pip install minicons



In [2]:
import requests
from conllu import parse
import pandas as pd

# URLs for the datasets
urls = {
    "dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-BOUN/refs/heads/master/tr_boun-ud-dev.conllu",
    "test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-BOUN/refs/heads/master/tr_boun-ud-test.conllu",
    "train": "https://raw.githubusercontent.com/UniversalDependencies/UD_Turkish-BOUN/refs/heads/master/tr_boun-ud-train.conllu",
    "dev2": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-GUM/refs/heads/master/en_gum-ud-dev.conllu",
    "test2": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-GUM/refs/heads/master/en_gum-ud-test.conllu",
    "train2": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-GUM/refs/heads/master/en_gum-ud-train.conllu",
    "dev3": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-LinES/refs/heads/master/en_lines-ud-dev.conllu",
    "test3": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-LinES/refs/heads/master/en_lines-ud-test.conllu",
    "train3": "https://raw.githubusercontent.com/UniversalDependencies/UD_English-LinES/refs/heads/master/en_lines-ud-train.conllu"
}

# Fetch and parse conllu data
def fetch_and_parse(url):
    response = requests.get(url)
    response.raise_for_status()
    return parse(response.text)

# Filter sentences
filtered = []
for name, url in urls.items():
    sentences = fetch_and_parse(url)
    for sent in sentences:
        nsubj_tokens = [tok for tok in sent if tok["deprel"] == "nsubj"]
        root_tokens = [tok for tok in sent if tok["deprel"] == "root"]
        if nsubj_tokens and root_tokens:
            filtered.append({
                "sentence": " ".join(tok["form"] for tok in sent),
                "nsubj": [tok["form"] for tok in nsubj_tokens],
                "root": [tok["form"] for tok in root_tokens],
                "split": name
            })

# Create dataframe
df = pd.DataFrame(filtered)
df.to_csv("nsubj_root_sentences.csv", index=False)


In [3]:
import pandas as pd
import torch
from minicons import cwe
from transformers import BertTokenizer

# Initialize model and tokenizer
model_name = "bert-base-uncased"
wrapper = cwe.CWE(model_name, model_type="bert")
tokenizer = BertTokenizer.from_pretrained(model_name)
model = wrapper.model
model.eval()

# Load data
df = pd.read_csv("nsubj_root_sentences.csv")

results = []

for i, row in df.iterrows():
    sentence = row["sentence"]
    nsubj = row["nsubj"].strip("[]'")
    root = row["root"].strip("[]'")

    # Tokenize with special tokens
    inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Try to find token indices
    try:
        nsubj_idx = tokens.index(nsubj)
        root_idx = tokens.index(root)
    except ValueError:
        continue

    if root_idx < nsubj_idx:
        continue  # optionally skip if direction matters

    # Forward pass with attention output
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)

    attentions = outputs.attentions  # tuple of [batch, heads, seq, seq]
    attn_tensor = torch.stack(attentions).squeeze(1)  # [layers, heads, seq, seq]

    attention_scores = attn_tensor[:, :, root_idx, nsubj_idx]  # [layers, heads]

    flat_idx = torch.argmax(attention_scores)
    best_layer = flat_idx.item() // attention_scores.shape[1]
    best_head = flat_idx.item() % attention_scores.shape[1]
    max_value = attention_scores[best_layer, best_head].item()

    results.append({
        "sentence": sentence,
        "nsubj": nsubj,
        "root": root,
        "nsubj_idx": nsubj_idx,
        "root_idx": root_idx,
        "best_layer": best_layer,
        "best_head": best_head,
        "attention_value": max_value
    })

# Save results
pd.DataFrame(results).to_csv("bert_attention_nsubj_root.csv", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [4]:
import pandas as pd
import torch
from minicons import cwe
from transformers import GPT2Tokenizer

# Load GPT-2 small via minicons wrapper
model_name = "gpt2"
wrapper = cwe.CWE(model_name, model_type="gpt2")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
wrapper.model.eval()

# Load data
df = pd.read_csv("nsubj_root_sentences.csv")

results = []

for i, row in df.iterrows():
    sentence = row["sentence"]
    nsubj = row["nsubj"].strip("[]'")
    root = row["root"].strip("[]'")

    # Tokenize
    inputs = tokenizer(sentence, return_tensors="pt", padding=True)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Try to find token indices
    try:
        root_idxs = [i for i, tok in enumerate(tokens) if root in tok]
        nsubj_idxs = [i for i, tok in enumerate(tokens) if nsubj in tok]
        if not root_idxs or not nsubj_idxs:
            continue
        root_idx = root_idxs[0]
        nsubj_idx = nsubj_idxs[0]
        if nsubj_idx >= root_idx:
            continue  # GPT-2 is left-to-right
    except Exception:
        continue

    # Forward pass to get attention
    with torch.no_grad():
        outputs = wrapper.model(**inputs, output_attentions=True)

    attentions = outputs.attentions  # tuple of [batch, heads, seq, seq]
    attn_tensor = torch.stack(attentions).squeeze(1)  # [layers, heads, seq, seq]

    attention_scores = attn_tensor[:, :, root_idx, nsubj_idx]  # [layers, heads]

    flat_idx = torch.argmax(attention_scores)
    best_layer = flat_idx.item() // attention_scores.shape[1]
    best_head = flat_idx.item() % attention_scores.shape[1]
    max_value = attention_scores[best_layer, best_head].item()

    results.append({
        "sentence": sentence,
        "nsubj": nsubj,
        "root": root,
        "root_idx": root_idx,
        "nsubj_idx": nsubj_idx,
        "best_layer": best_layer,
        "best_head": best_head,
        "attention_value": max_value
    })

# Save results
pd.DataFrame(results).to_csv("gpt2_attention_nsubj_root.csv", index=False)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [5]:
import pandas as pd

def summarize_attention_weight(csv_path):
    df = pd.read_csv(csv_path)
    total_sentences = len(df)

    weighted_scores = (
        df.groupby(["best_layer", "best_head"])["attention_value"]
        .sum()
        .reset_index()
        .rename(columns={"attention_value": "total_weighted_attention"})
    )

    frequency = (
        df.groupby(["best_layer", "best_head"])
        .size()
        .reset_index(name="count")
    )

    summary = weighted_scores.merge(frequency, on=["best_layer", "best_head"])
    summary["percentage"] = 100 * summary["count"] / total_sentences

    top5 = summary.sort_values("total_weighted_attention", ascending=False).head(5)
    return top5


def summarize_attention_frequency(csv_path):
    df = pd.read_csv(csv_path)
    total = len(df)

    freq_table = (
        df.groupby(["best_layer", "best_head"])
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )
    freq_table["percentage"] = 100 * freq_table["count"] / total

    return freq_table.head(5)

def get_top_layer_head(df, myrow="total_weighted_attention"):
    top_row = df.loc[df[myrow].idxmax()]
    LAYER = int(top_row["best_layer"])
    HEAD = int(top_row["best_head"])
    return LAYER, HEAD


In [14]:
print(summarize_attention_weight("gpt2_attention_nsubj_root.csv"))
print(summarize_attention_frequency("gpt2_attention_nsubj_root.csv"))
GPT2_LAYER, GPT2_HEAD = get_top_layer_head(summarize_attention_weight("gpt2_attention_nsubj_root.csv"))

    best_layer  best_head  total_weighted_attention  count  percentage
39           4         11               2037.840844   2039   36.371745
41           5          1               1119.533952   1121   19.996432
34           4          3                563.416102    860   15.340706
55           6          9                188.653575    189    3.371388
27           3          6                179.008392    292    5.208705
    best_layer  best_head  count  percentage
39           4         11   2039   36.371745
41           5          1   1121   19.996432
34           4          3    860   15.340706
27           3          6    292    5.208705
55           6          9    189    3.371388


In [15]:
print(summarize_attention_weight("bert_attention_nsubj_root.csv"))
print(summarize_attention_frequency("bert_attention_nsubj_root.csv"))
BERT_LAYER, BERT_HEAD = get_top_layer_head(summarize_attention_frequency("bert_attention_nsubj_root.csv"), myrow="percentage")

    best_layer  best_head  total_weighted_attention  count  percentage
32           3          5                257.689358    303   11.399549
56           6         11                249.803234    317   11.926260
58           7          4                177.491398    231    8.690745
68           8         10                168.082241    334   12.565839
42           4         10                102.070245    228    8.577878
    best_layer  best_head  count  percentage
68           8         10    334   12.565839
56           6         11    317   11.926260
32           3          5    303   11.399549
58           7          4    231    8.690745
42           4         10    228    8.577878


In [34]:
df_syn = pd.read_csv("eng_syn_stimuli.csv")
df_no_syn = pd.read_csv("eng_no_syn_stimuli.csv")

In [44]:
import torch
import pandas as pd
from minicons import cwe
from transformers import GPT2Tokenizer
import unicodedata

# === Config ===
model_name = "gpt2"

# === Load GPT-2 ===
wrapper = cwe.CWE(model_name, model_type="gpt2")
model = wrapper.model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
model.eval()

# === Normalize text ===
def normalize_text(text):
    text = unicodedata.normalize("NFKD", text)
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    return text

def find_best_matching_token(word, tokens):
    """Find token that overlaps best with the word (ignore Ġ)"""
    word = word.lower()
    best_idx = -1
    best_score = 0
    for i, tok in enumerate(tokens):
        clean_tok = tok.lstrip("Ġ").lower()
        overlap = len(set(word) & set(clean_tok))
        if overlap > best_score:
            best_score = overlap
            best_idx = i
    return best_idx

def add_gpt2_attention(df):
    attn_to_2nd = []
    attn_to_5th = []

    for idx, row in df.iterrows():
        sentence = row["Full_Sentence"]
        norm_sentence = normalize_text(sentence)

        inputs = tokenizer(norm_sentence, return_tensors="pt")
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        words = norm_sentence.strip(".").split()
        if len(words) < 5:
            attn_to_2nd.append(None)
            attn_to_5th.append(None)
            continue

        w2 = words[1]
        w5 = words[4]
        if w5.endswith("'s") or w5.endswith("’s"):
            w5 = w5[:-2]
        elif w5.endswith("s") and not w5.endswith("ss"):
            w5 = w5[:-1]

        aux_idx = -1
        for i, tok in enumerate(tokens):
            if tok in ["Ġis", "Ġare"]:
                aux_idx = i
                break

        if aux_idx == -1:
            print(f"[SKIP] No is/are: {norm_sentence}")
            print(tokens)
            attn_to_2nd.append(None)
            attn_to_5th.append(None)
            continue

        idx2 = find_best_matching_token(w2, tokens)
        idx5 = find_best_matching_token(w5, tokens)

        if idx2 == -1 or idx5 == -1:
            print(f"[SKIP] w2={w2} or w5={w5} not found: {tokens}")
            attn_to_2nd.append(None)
            attn_to_5th.append(None)
            continue

        if idx2 > aux_idx or idx5 > aux_idx:
            attn_to_2nd.append(None)
            attn_to_5th.append(None)
            continue

        with torch.no_grad():
            outputs = model(**inputs, output_attentions=True)
            attn = outputs.attentions[GPT2_LAYER][0, GPT2_HEAD]

        attn_to_2nd.append(attn[aux_idx, idx2].item())
        attn_to_5th.append(attn[aux_idx, idx5].item())

    df["attention_to_2nd_gpt"] = attn_to_2nd
    df["attention_to_5th_gpt"] = attn_to_5th

    return df


In [45]:
gatt_syn = add_gpt2_attention(df_syn)
gatt_syn

Unnamed: 0,Item,Condition,Stimulus,NP_Number,Auxiliary,Full_Sentence,attention_to_2nd_gpt,attention_to_5th_gpt
0,1,a,The cake at the senator’s celebration,singular,is,The cake at the senator’s celebration is delic...,1.452068e-10,7.354839e-06
1,1,b,The cake at the senator’s celebration,singular,are,The cake at the senator’s celebration are deli...,6.029218e-10,3.539259e-05
2,1,c,The cake at the senators’ celebration,plural,is,The cake at the senators’ celebration is delic...,5.489714e-11,5.400035e-07
3,1,d,The cake at the senators’ celebration,plural,are,The cake at the senators’ celebration are deli...,1.737449e-10,2.637350e-06
4,2,a,The call to the doctor’s secretary,singular,is,The call to the doctor’s secretary is urgent.,8.236045e-13,1.576609e-05
...,...,...,...,...,...,...,...,...
123,31,d,The truck with the firemans' hose,plural,are,The truck with the firemans' hose are leaving.,6.719176e-11,4.967463e-07
124,32,a,The response to the country's attack,singular,is,The response to the country's attack is immedi...,1.218679e-08,3.949295e-06
125,32,b,The response to the country's attack,singular,are,The response to the country's attack are immed...,2.489455e-08,1.217911e-05
126,32,c,The response to the countries' attack,plural,is,The response to the countries' attack is immed...,3.120982e-09,2.425458e-07


In [46]:
gatt_no_syn = add_gpt2_attention(df_no_syn)
gatt_no_syn

Unnamed: 0,Item,Condition,Stimulus,NP_Number,Auxiliary,Full_Sentence,attention_to_2nd_gpt,attention_to_5th_gpt
0,1,a,The slogan on the poster,singular,is,The slogan on the poster is designed to get at...,2.417451e-07,9.999981e-01
1,1,b,The slogan on the posters,plural,is,The slogan on the posters is designed to get a...,7.093639e-07,9.999888e-01
2,1,c,The slogan on the poster,singular,are,The slogan on the poster are designed to get a...,3.525625e-07,9.999980e-01
3,1,d,The slogan on the posters,plural,are,The slogan on the posters are designed to get ...,6.073408e-07,9.999913e-01
4,2,a,The picture on the flier,singular,is,The picture on the flier is of a village churc...,6.981644e-10,6.981644e-10
...,...,...,...,...,...,...,...,...
91,23,d,The message from the engineers,plural,are,The message from the engineers are about the r...,5.101764e-09,9.999638e-01
92,24,a,The gate to the pasture,singular,is,The gate to the pasture is falling down due to...,3.317614e-06,9.999671e-01
93,24,b,The gate to the pastures,plural,is,The gate to the pastures is falling down due t...,3.469449e-09,1.929548e-07
94,24,c,The gate to the pasture,singular,are,The gate to the pasture are falling down due t...,2.572294e-06,9.999768e-01


In [None]:
# group_a = [
#     "The key to the drawers are missing.",
#     "The lock to the doors are broken.",
#     "The guide to the books are outdated.",
#     "The path to the rooms are blocked.",
#     "The ladder to the attics are shaky.",
#     "The password to the emails are forgotten.",
#     "The entrance to the buildings are locked.",
#     "The solution to the puzzles are complex.",
#     "The cable to the screens are unplugged.",
#     "The tunnel to the bunkers are collapsed."
# ]

# group_b = [
#     "The statue in the orcs’ camp are dusty.",
#     "The warrior in the giants’ cave are injured.",
#     "The knight in the dragons’ lair are scared.",
#     "The poet in the kings’ court are nervous.",
#     "The child in the clowns’ tent are laughing.",
#     "The lantern in the witches’ hut are glowing.",
#     "The vase in the thieves’ den are cracked.",
#     "The priest in the rebels’ church are praying.",
#     "The book in the scholars’ library are old.",
#     "The scroll in the monks’ room are preserved."
# ]

In [None]:
# import torch
# import pandas as pd
# from minicons import cwe
# from transformers import GPT2Tokenizer

# # === Configuration ===
# model_name = "gpt2"

# # === Initialize model and tokenizer ===
# wrapper = cwe.CWE(model_name, model_type="gpt2")
# model = wrapper.model
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token
# model.config.pad_token_id = tokenizer.eos_token_id
# model.eval()

# # === Helper: match word to token ===
# def find_token_index(word, tokens):
#     word = word.lower().strip(".,’'")
#     for i, tok in enumerate(tokens):
#         clean_tok = tok.replace("Ġ", "").replace("##", "").lower()
#         if word in clean_tok:
#             return i
#     return -1

# # === Attention extractor ===
# def extract_gpt2_attention(sentences, group_label):
#     results = []
#     for sentence in sentences:
#         inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)
#         tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

#         words = sentence.strip(".").split()
#         if len(words) < 5:
#             continue
#         w2, w5 = words[1], words[4]

#         try:
#             are_idx = tokens.index("Ġare")
#         except ValueError:
#             print(f"[SKIP] 'are' not found in: {sentence}")
#             continue

#         idx2 = find_token_index(w2, tokens)
#         idx5 = find_token_index(w5, tokens)

#         if idx2 == -1 or idx5 == -1:
#             continue
#         if idx2 > are_idx or idx5 > are_idx:
#             continue  # GPT-2 can't attend to future tokens

#         with torch.no_grad():
#             outputs = model(**inputs, output_attentions=True)
#             attn = outputs.attentions[GPT2_LAYER][0, GPT2_HEAD]  # [seq, seq]

#         results.append({
#             "group": group_label,
#             "sentence": sentence,
#             "attention_to_2nd": attn[are_idx, idx2].item(),
#             "attention_to_5th": attn[are_idx, idx5].item()
#         })

#     return pd.DataFrame(results)

# gpt2_df = pd.concat([
#     extract_gpt2_attention(group_a, "A"),
#     extract_gpt2_attention(group_b, "B")
# ], ignore_index=True)

In [None]:
# gpt2_df

Unnamed: 0,group,sentence,attention_to_2nd,attention_to_5th
0,A,The lock to the doors are broken.,0.023662,0.020556
1,A,The guide to the books are outdated.,0.01381,0.091745
2,A,The path to the rooms are blocked.,0.01063,0.044339
3,A,The password to the emails are forgotten.,0.014835,0.122942
4,A,The entrance to the buildings are locked.,0.011862,0.02774
5,A,The solution to the puzzles are complex.,0.004367,0.019595
6,A,The cable to the screens are unplugged.,0.021826,0.023475
7,B,The statue in the orcs’ camp are dusty.,0.023529,0.019572
8,B,The warrior in the giants’ cave are injured.,0.009721,0.019342
9,B,The knight in the dragons’ lair are scared.,0.010478,0.010018


In [49]:
import pandas as pd
import torch
from minicons import cwe
from transformers import BertTokenizer
import unicodedata

# === Load BERT ===
wrapper = cwe.CWE("bert-base-uncased", model_type="bert")
model = wrapper.model
model.eval()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# === Normalize ===
def normalize_text(text):
    text = unicodedata.normalize("NFKD", text)
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    return text

# === One-token fuzzy match ===
def find_best_matching_token(word, tokens):
    word = word.lower()
    best_idx = -1
    best_score = 0
    for i, tok in enumerate(tokens):
        clean_tok = tok.replace("##", "").lower()
        overlap = len(set(word) & set(clean_tok))
        if overlap > best_score:
            best_score = overlap
            best_idx = i
    return best_idx

# === Main BERT attention function ===
def add_bert_attention(df):
    attn_to_2nd = []
    attn_to_5th = []

    for idx, row in df.iterrows():
        sentence = row["Full_Sentence"]
        norm_sentence = normalize_text(sentence)

        inputs = tokenizer(norm_sentence, return_tensors="pt")
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        # Find 'is' or 'are'
        aux_idx = -1
        for i, tok in enumerate(tokens):
            if tok == "is" or tok == "are":
                aux_idx = i
                break

        if aux_idx == -1:
            print(f"[SKIP] 'is' or 'are' not found: {norm_sentence}")
            print(tokens)
            attn_to_2nd.append(None)
            attn_to_5th.append(None)
            continue

        words = norm_sentence.strip(".").split()
        if len(words) < 5:
            attn_to_2nd.append(None)
            attn_to_5th.append(None)
            continue

        w2 = words[1]
        w5 = words[4]
        if w5.endswith("'s") or w5.endswith("’s"):
            w5 = w5[:-2]
        elif w5.endswith("s") and not w5.endswith("ss"):
            w5 = w5[:-1]

        idx2 = find_best_matching_token(w2, tokens)
        idx5 = find_best_matching_token(w5, tokens)

        if idx2 == -1 or idx5 == -1:
            print(f"[SKIP] w2={w2} or w5={w5} not found: {tokens}")
            attn_to_2nd.append(None)
            attn_to_5th.append(None)
            continue

        if idx2 > aux_idx or idx5 > aux_idx:
            attn_to_2nd.append(None)
            attn_to_5th.append(None)
            continue

        with torch.no_grad():
            outputs = model(**inputs, output_attentions=True)
            attn = outputs.attentions[BERT_LAYER][0, BERT_HEAD]

        attn_to_2nd.append(attn[aux_idx, idx2].item())
        attn_to_5th.append(attn[aux_idx, idx5].item())

    df["attention_to_2nd_bert"] = attn_to_2nd
    df["attention_to_5th_bert"] = attn_to_5th

    return df


In [50]:
batt_syn = add_bert_attention(gatt_syn)
batt_syn

Unnamed: 0,Item,Condition,Stimulus,NP_Number,Auxiliary,Full_Sentence,attention_to_2nd_gpt,attention_to_5th_gpt,attention_to_2nd_bert,attention_to_5th_bert
0,1,a,The cake at the senator’s celebration,singular,is,The cake at the senator’s celebration is delic...,1.452068e-10,7.354839e-06,0.182049,0.020422
1,1,b,The cake at the senator’s celebration,singular,are,The cake at the senator’s celebration are deli...,6.029218e-10,3.539259e-05,0.197339,0.005257
2,1,c,The cake at the senators’ celebration,plural,is,The cake at the senators’ celebration is delic...,5.489714e-11,5.400035e-07,0.207025,0.009875
3,1,d,The cake at the senators’ celebration,plural,are,The cake at the senators’ celebration are deli...,1.737449e-10,2.637350e-06,0.180334,0.133189
4,2,a,The call to the doctor’s secretary,singular,is,The call to the doctor’s secretary is urgent.,8.236045e-13,1.576609e-05,0.193536,0.033539
...,...,...,...,...,...,...,...,...,...,...
123,31,d,The truck with the firemans' hose,plural,are,The truck with the firemans' hose are leaving.,6.719176e-11,4.967463e-07,0.099796,0.022698
124,32,a,The response to the country's attack,singular,is,The response to the country's attack is immedi...,1.218679e-08,3.949295e-06,0.190015,0.017857
125,32,b,The response to the country's attack,singular,are,The response to the country's attack are immed...,2.489455e-08,1.217911e-05,0.726147,0.001932
126,32,c,The response to the countries' attack,plural,is,The response to the countries' attack is immed...,3.120982e-09,2.425458e-07,0.191680,0.032076


In [51]:
batt_no_syn = add_bert_attention(gatt_no_syn)
batt_no_syn

Unnamed: 0,Item,Condition,Stimulus,NP_Number,Auxiliary,Full_Sentence,attention_to_2nd_gpt,attention_to_5th_gpt,attention_to_2nd_bert,attention_to_5th_bert
0,1,a,The slogan on the poster,singular,is,The slogan on the poster is designed to get at...,2.417451e-07,9.999981e-01,0.280187,0.056911
1,1,b,The slogan on the posters,plural,is,The slogan on the posters is designed to get a...,7.093639e-07,9.999888e-01,0.337098,0.014280
2,1,c,The slogan on the poster,singular,are,The slogan on the poster are designed to get a...,3.525625e-07,9.999980e-01,0.248383,0.117720
3,1,d,The slogan on the posters,plural,are,The slogan on the posters are designed to get ...,6.073408e-07,9.999913e-01,0.160674,0.463609
4,2,a,The picture on the flier,singular,is,The picture on the flier is of a village churc...,6.981644e-10,6.981644e-10,0.457317,0.457317
...,...,...,...,...,...,...,...,...,...,...
91,23,d,The message from the engineers,plural,are,The message from the engineers are about the r...,5.101764e-09,9.999638e-01,0.223952,0.164540
92,24,a,The gate to the pasture,singular,is,The gate to the pasture is falling down due to...,3.317614e-06,9.999671e-01,0.153237,0.031520
93,24,b,The gate to the pastures,plural,is,The gate to the pastures is falling down due t...,3.469449e-09,1.929548e-07,0.194421,0.028613
94,24,c,The gate to the pasture,singular,are,The gate to the pasture are falling down due t...,2.572294e-06,9.999768e-01,0.283596,0.041382


In [None]:
# import pandas as pd
# import torch
# from minicons import cwe
# from transformers import BertTokenizer

# # === Load model and tokenizer using minicons ===
# wrapper = cwe.CWE("bert-base-uncased", model_type="bert")
# model = wrapper.model
# model.eval()
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# # === Find word token index helper ===
# def find_token_index(word, tokens):
#     word = word.lower().strip(".,’'")
#     for i, tok in enumerate(tokens):
#         if word in tok.replace("##", "").lower():
#             return i
#     return -1

# # === Extract attention values ===
# def extract_attention(sentences, group):
#     data = []
#     for sent in sentences:
#         inputs = tokenizer(sent, return_tensors="pt")
#         tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

#         try:
#             are_idx = tokens.index("are")
#         except ValueError:
#             continue

#         words = sent.strip(".").split()
#         if len(words) < 5:
#             continue

#         w2, w5 = words[1], words[4]
#         idx2 = find_token_index(w2, tokens)
#         idx5 = find_token_index(w5, tokens)

#         if idx2 == -1 or idx5 == -1:
#             continue

#         with torch.no_grad():
#             outputs = model(**inputs, output_attentions=True)
#             attn = outputs.attentions[BERT_LAYER][0, BERT_HEAD]  # [seq, seq]

#         data.append({
#             "group": group,
#             "sentence": sent,
#             "attention_to_2nd": attn[are_idx, idx2].item(),
#             "attention_to_5th": attn[are_idx, idx5].item()
#         })

#     return pd.DataFrame(data)


# df_a = extract_attention(group_a, "A")
# df_b = extract_attention(group_b, "B")
# df = pd.concat([df_a, df_b], ignore_index=True)
# print(df)
# df.to_csv("bert_are_attention.csv", index=False)

   group                                       sentence  attention_to_2nd  \
0      A            The key to the drawers are missing.          0.048634   
1      A              The lock to the doors are broken.          0.035907   
2      A           The guide to the books are outdated.          0.283187   
3      A             The path to the rooms are blocked.          0.049810   
4      A      The password to the emails are forgotten.          0.111492   
5      A      The entrance to the buildings are locked.          0.081310   
6      A       The solution to the puzzles are complex.          0.318987   
7      A        The cable to the screens are unplugged.          0.051427   
8      B   The warrior in the giants’ cave are injured.          0.203914   
9      B    The knight in the dragons’ lair are scared.          0.226680   
10     B      The poet in the kings’ court are nervous.          0.051517   
11     B   The lantern in the witches’ hut are glowing.          0.030141   

In [52]:
# Add a column indicating model type
batt_syn["is_syn"] = "yes"
batt_no_syn["is_syn"] = "no"


# Combine them into a single DataFrame
combined_df = pd.concat([batt_syn, batt_no_syn], ignore_index=True)

# Save or inspect
print(combined_df)
combined_df.to_csv("eng_attention.csv", index=False)

     Item Condition                               Stimulus NP_Number  \
0       1         a  The cake at the senator’s celebration  singular   
1       1         b  The cake at the senator’s celebration  singular   
2       1         c  The cake at the senators’ celebration    plural   
3       1         d  The cake at the senators’ celebration    plural   
4       2         a     The call to the doctor’s secretary  singular   
..    ...       ...                                    ...       ...   
219    23         d         The message from the engineers    plural   
220    24         a                The gate to the pasture  singular   
221    24         b               The gate to the pastures    plural   
222    24         c                The gate to the pasture  singular   
223    24         d               The gate to the pastures    plural   

    Auxiliary                                      Full_Sentence  \
0          is  The cake at the senator’s celebration is delic...   

In [None]:
# Surprisal

In [57]:
from minicons import scorer
import pandas as pd
import unicodedata

# === Initialize scorer ===
bert_scorer = scorer.MaskedLMScorer("bert-base-uncased")

# === Normalize helper (same as before)
def normalize_text(text):
    text = unicodedata.normalize("NFKD", text)
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    return text

# === Correct surprisal extractor ===
def add_bert_surprisal(df):
    surprisals = []

    for idx, row in df.iterrows():
        sentence = normalize_text(row["Full_Sentence"])
        words = sentence.strip().split()

        # Find 'is' or 'are'
        try:
            aux_idx = words.index("are")
            target = "are"
        except ValueError:
            try:
                aux_idx = words.index("is")
                target = "is"
            except ValueError:
                print(f"[SKIP] No is/are: {sentence}")
                surprisals.append(None)
                continue

        # Replace with [MASK]
        masked_words = words.copy()
        masked_words[aux_idx] = "[MASK]"
        masked_sent = " ".join(masked_words)

        try:
            score_tuples = bert_scorer.token_score(masked_sent, target)
            # score_tuples is a list of lists (1 list of tuples)
            if score_tuples and len(score_tuples) > 0:
                found = None
                for tok, s in score_tuples[0]:
                    if tok == "[MASK]":
                        found = s
                        break
                surprisals.append(found)
            else:
                surprisals.append(None)
        except Exception as e:
            print(f"[SKIP] {sentence} — {e}")
            surprisals.append(None)

    df["surprisal_bert"] = surprisals
    return df


In [59]:
bsurp_df_no_syn = add_bert_surprisal(df_no_syn)
bsurp_df_no_syn

Unnamed: 0,Item,Condition,Stimulus,NP_Number,Auxiliary,Full_Sentence,attention_to_2nd_gpt,attention_to_5th_gpt,attention_to_2nd_bert,attention_to_5th_bert,is_syn,surprisal_bert
0,1,a,The slogan on the poster,singular,is,The slogan on the poster is designed to get at...,2.417451e-07,9.999981e-01,0.280187,0.056911,no,21.273739
1,1,b,The slogan on the posters,plural,is,The slogan on the posters is designed to get a...,7.093639e-07,9.999888e-01,0.337098,0.014280,no,20.682386
2,1,c,The slogan on the poster,singular,are,The slogan on the poster are designed to get a...,3.525625e-07,9.999980e-01,0.248383,0.117720,no,21.273739
3,1,d,The slogan on the posters,plural,are,The slogan on the posters are designed to get ...,6.073408e-07,9.999913e-01,0.160674,0.463609,no,20.682386
4,2,a,The picture on the flier,singular,is,The picture on the flier is of a village churc...,6.981644e-10,6.981644e-10,0.457317,0.457317,no,18.294003
...,...,...,...,...,...,...,...,...,...,...,...,...
91,23,d,The message from the engineers,plural,are,The message from the engineers are about the r...,5.101764e-09,9.999638e-01,0.223952,0.164540,no,18.455061
92,24,a,The gate to the pasture,singular,is,The gate to the pasture is falling down due to...,3.317614e-06,9.999671e-01,0.153237,0.031520,no,19.385170
93,24,b,The gate to the pastures,plural,is,The gate to the pastures is falling down due t...,3.469449e-09,1.929548e-07,0.194421,0.028613,no,19.283329
94,24,c,The gate to the pasture,singular,are,The gate to the pasture are falling down due t...,2.572294e-06,9.999768e-01,0.283596,0.041382,no,19.385170


In [58]:
bsurp_df_syn = add_bert_surprisal(df_syn)
bsurp_df_syn

Unnamed: 0,Item,Condition,Stimulus,NP_Number,Auxiliary,Full_Sentence,attention_to_2nd_gpt,attention_to_5th_gpt,attention_to_2nd_bert,attention_to_5th_bert,is_syn,surprisal_bert
0,1,a,The cake at the senator’s celebration,singular,is,The cake at the senator’s celebration is delic...,1.452068e-10,7.354839e-06,0.182049,0.020422,yes,20.181545
1,1,b,The cake at the senator’s celebration,singular,are,The cake at the senator’s celebration are deli...,6.029218e-10,3.539259e-05,0.197339,0.005257,yes,20.181545
2,1,c,The cake at the senators’ celebration,plural,is,The cake at the senators’ celebration is delic...,5.489714e-11,5.400035e-07,0.207025,0.009875,yes,20.125511
3,1,d,The cake at the senators’ celebration,plural,are,The cake at the senators’ celebration are deli...,1.737449e-10,2.637350e-06,0.180334,0.133189,yes,20.125511
4,2,a,The call to the doctor’s secretary,singular,is,The call to the doctor’s secretary is urgent.,8.236045e-13,1.576609e-05,0.193536,0.033539,yes,21.191139
...,...,...,...,...,...,...,...,...,...,...,...,...
123,31,d,The truck with the firemans' hose,plural,are,The truck with the firemans' hose are leaving.,6.719176e-11,4.967463e-07,0.099796,0.022698,yes,18.290253
124,32,a,The response to the country's attack,singular,is,The response to the country's attack is immedi...,1.218679e-08,3.949295e-06,0.190015,0.017857,yes,22.407114
125,32,b,The response to the country's attack,singular,are,The response to the country's attack are immed...,2.489455e-08,1.217911e-05,0.726147,0.001932,yes,22.407114
126,32,c,The response to the countries' attack,plural,is,The response to the countries' attack is immed...,3.120982e-09,2.425458e-07,0.191680,0.032076,yes,22.018763


In [None]:
# Debug one sample
gpt2_scorer.token_score("The doors are open.")[0]


[('The', 0.0),
 ('Ġdoors', -9.860700607299805),
 ('Ġare', -2.6851882934570312),
 ('Ġopen', -2.2943038940429688),
 ('.', -2.1006317138671875)]

In [60]:
from minicons import scorer
import pandas as pd

# === Load your DataFrame ===
# df = pd.read_csv("your_file.csv")

# === Initialize GPT-2 scorer ===
gpt2_scorer = scorer.IncrementalLMScorer("gpt2")

# === Function: add GPT-2 surprisal ===
def add_gpt2_surprisal(df):
    surprisals = []

    for idx, row in df.iterrows():
        sentence = row["Full_Sentence"]

        try:
            # Get (token, surprisal) tuples for whole sentence
            score_tuples = gpt2_scorer.token_score(sentence)  # returns [ [(tok, value), ...] ]
        except Exception as e:
            print(f"[SKIP] {sentence} — {e}")
            surprisals.append(None)
            continue

        # Flatten: only the first list in the list of lists
        tok_surps = score_tuples[0]

        # Look for Ġare or Ġis (GPT-2 uses Ġ prefix for word-start)
        match = None
        for token, surprisal in tok_surps:
            if token in ["Ġare", "Ġis"]:
                match = surprisal
                break

        surprisals.append(match)

    df["surprisal_gpt2"] = surprisals
    return df


# from minicons import scorer
# import pandas as pd

# # Initialize scorer
# gpt2_scorer = scorer.IncrementalLMScorer("gpt2")

# gpt2_results = []

# for sent in all_sentences:
#     try:
#         score_tuples = gpt2_scorer.token_score(sent)
#     except Exception as e:
#         print(f"[SKIP] {sent} — {e}")
#         continue

#     for token, surprisal in score_tuples[0]:
#         if token == "Ġare":
#             gpt2_results.append({
#                 "sentence": sent,
#                 "token": "are",
#                 "surprisal": surprisal,
#                 "model": "gpt2"
#             })
#             break  # stop at first matching "are"
# gpt2_surprisal_df = pd.DataFrame(gpt2_results)
# gpt2_surprisal_df

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [61]:
gsurp_df_no_syn = add_gpt2_surprisal(bsurp_df_no_syn)
gsurp_df_no_syn

Unnamed: 0,Item,Condition,Stimulus,NP_Number,Auxiliary,Full_Sentence,attention_to_2nd_gpt,attention_to_5th_gpt,attention_to_2nd_bert,attention_to_5th_bert,is_syn,surprisal_bert,surprisal_gpt2
0,1,a,The slogan on the poster,singular,is,The slogan on the poster is designed to get at...,2.417451e-07,9.999981e-01,0.280187,0.056911,no,21.273739,-1.397896
1,1,b,The slogan on the posters,plural,is,The slogan on the posters is designed to get a...,7.093639e-07,9.999888e-01,0.337098,0.014280,no,20.682386,-1.286392
2,1,c,The slogan on the poster,singular,are,The slogan on the poster are designed to get a...,3.525625e-07,9.999980e-01,0.248383,0.117720,no,21.273739,-7.271469
3,1,d,The slogan on the posters,plural,are,The slogan on the posters are designed to get ...,6.073408e-07,9.999913e-01,0.160674,0.463609,no,20.682386,-4.627663
4,2,a,The picture on the flier,singular,is,The picture on the flier is of a village churc...,6.981644e-10,6.981644e-10,0.457317,0.457317,no,18.294003,-2.042534
...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,23,d,The message from the engineers,plural,are,The message from the engineers are about the r...,5.101764e-09,9.999638e-01,0.223952,0.164540,no,18.455061,-6.241112
92,24,a,The gate to the pasture,singular,is,The gate to the pasture is falling down due to...,3.317614e-06,9.999671e-01,0.153237,0.031520,no,19.385170,-1.747627
93,24,b,The gate to the pastures,plural,is,The gate to the pastures is falling down due t...,3.469449e-09,1.929548e-07,0.194421,0.028613,no,19.283329,-2.107246
94,24,c,The gate to the pasture,singular,are,The gate to the pasture are falling down due t...,2.572294e-06,9.999768e-01,0.283596,0.041382,no,19.385170,-6.789543


In [62]:
gsurp_df_syn = add_gpt2_surprisal(bsurp_df_syn)
gsurp_df_syn

Unnamed: 0,Item,Condition,Stimulus,NP_Number,Auxiliary,Full_Sentence,attention_to_2nd_gpt,attention_to_5th_gpt,attention_to_2nd_bert,attention_to_5th_bert,is_syn,surprisal_bert,surprisal_gpt2
0,1,a,The cake at the senator’s celebration,singular,is,The cake at the senator’s celebration is delic...,1.452068e-10,7.354839e-06,0.182049,0.020422,yes,20.181545,-2.553642
1,1,b,The cake at the senator’s celebration,singular,are,The cake at the senator’s celebration are deli...,6.029218e-10,3.539259e-05,0.197339,0.005257,yes,20.181545,-6.888351
2,1,c,The cake at the senators’ celebration,plural,is,The cake at the senators’ celebration is delic...,5.489714e-11,5.400035e-07,0.207025,0.009875,yes,20.125511,-2.789070
3,1,d,The cake at the senators’ celebration,plural,are,The cake at the senators’ celebration are deli...,1.737449e-10,2.637350e-06,0.180334,0.133189,yes,20.125511,-5.912453
4,2,a,The call to the doctor’s secretary,singular,is,The call to the doctor’s secretary is urgent.,8.236045e-13,1.576609e-05,0.193536,0.033539,yes,21.191139,-2.868225
...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,31,d,The truck with the firemans' hose,plural,are,The truck with the firemans' hose are leaving.,6.719176e-11,4.967463e-07,0.099796,0.022698,yes,18.290253,-7.516151
124,32,a,The response to the country's attack,singular,is,The response to the country's attack is immedi...,1.218679e-08,3.949295e-06,0.190015,0.017857,yes,22.407114,-3.034782
125,32,b,The response to the country's attack,singular,are,The response to the country's attack are immed...,2.489455e-08,1.217911e-05,0.726147,0.001932,yes,22.407114,-8.680504
126,32,c,The response to the countries' attack,plural,is,The response to the countries' attack is immed...,3.120982e-09,2.425458e-07,0.191680,0.032076,yes,22.018763,-2.582726


In [63]:
import numpy as np

gsurp_df_syn["surprisal_bert"] = gsurp_df_syn["surprisal_bert"] / np.log(2)
gsurp_df_no_syn["surprisal_bert"] = gsurp_df_no_syn["surprisal_bert"] / np.log(2)
gsurp_df_syn["surprisal_gpt2"] = -gsurp_df_syn["surprisal_gpt2"]
gsurp_df_no_syn["surprisal_gpt2"] = -gsurp_df_no_syn["surprisal_gpt2"]

surp_combined_df = pd.concat([gsurp_df_syn, gsurp_df_no_syn], ignore_index=True)
surp_combined_df.to_csv("eng_surp_att.csv", index=False)