In [None]:
import requests
from conllu import parse
import pandas as pd
import torch
from minicons import cwe
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModel


# Attention Head Probing

In [None]:
# CoNLL-U files used to build the probing corpus, keyed by "<treebank>_<split>".
# Every key must be unique: a dict literal silently keeps only the last value
# for a repeated key, so sharing "dev"/"test"/"traina"/... between the two
# treebanks would drop the Taiga files without any error.
urls = {
    "taiga_dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/refs/heads/master/ru_taiga-ud-dev.conllu",
    "taiga_test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/refs/heads/master/ru_taiga-ud-test.conllu",
    "taiga_traina": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/refs/heads/master/ru_taiga-ud-train-a.conllu",
    "taiga_trainb": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/refs/heads/master/ru_taiga-ud-train-b.conllu",
    "taiga_trainc": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/refs/heads/master/ru_taiga-ud-train-c.conllu",
    "taiga_traind": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/refs/heads/master/ru_taiga-ud-train-d.conllu",
    "taiga_traine": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-Taiga/refs/heads/master/ru_taiga-ud-train-e.conllu",
    "syntagrus_dev": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/refs/heads/master/ru_syntagrus-ud-dev.conllu",
    "syntagrus_test": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/refs/heads/master/ru_syntagrus-ud-test.conllu",
    "syntagrus_traina": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/refs/heads/master/ru_syntagrus-ud-train-a.conllu",
    "syntagrus_trainb": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/refs/heads/master/ru_syntagrus-ud-train-b.conllu",
    "syntagrus_trainc": "https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/refs/heads/master/ru_syntagrus-ud-train-c.conllu",
}

def fetch_and_parse(url, timeout=60):
    """Download a CoNLL-U file and parse it into sentences.

    Args:
        url: Address of the CoNLL-U file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        Whatever ``conllu.parse`` returns for the downloaded text (iterated
        as a sequence of sentences by the caller).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # CoNLL-U files are UTF-8; do not rely on the charset the server declares,
    # otherwise Cyrillic text can be decoded with a wrong fallback encoding.
    response.encoding = "utf-8"
    return parse(response.text)

# Collect every sentence that has at least one `nsubj` token and a `root`
# token, and store it (with the matching word forms) in a CSV file.
filtered = []
for name, url in urls.items():
    for sent in fetch_and_parse(url):
        # Keep only regular syntactic words. The conllu parser gives
        # multiword-token ranges and empty nodes a tuple id instead of an int;
        # including them would duplicate surface text or insert placeholders
        # into the reconstructed sentence.
        words = [tok for tok in sent if isinstance(tok["id"], int)]

        nsubj_tokens = [tok for tok in words if tok["deprel"] == "nsubj"]
        root_tokens = [tok for tok in words if tok["deprel"] == "root"]
        if not (nsubj_tokens and root_tokens):
            continue

        filtered.append({
            "sentence": " ".join(tok["form"] for tok in words),
            "nsubj": [tok["form"] for tok in nsubj_tokens],
            "root": [tok["form"] for tok in root_tokens],
            "split": name
        })

df = pd.DataFrame(filtered)
# The list-valued columns are written as their Python repr (e.g. "['a', 'b']").
df.to_csv("nsubj_root_sentences_ru.csv", index=False)


In [None]:
import ast

model_name = "deepvk/bert-base-uncased"
wrapper = cwe.CWE(model_name, model_type="bert")
tokenizer = BertTokenizer.from_pretrained(model_name)
model = wrapper.model
model.eval()

df = pd.read_csv("nsubj_root_sentences_ru.csv")


def _first_form(cell):
    """Return the first word form of a list column stored as a Python repr.

    The CSV holds the lists as text such as "['a', 'b']". Parsing them with
    ``ast.literal_eval`` avoids the stray quotes that manual strip/split
    leaves behind whenever the list has more than one element.
    Returns None when the cell cannot be parsed or the list is empty.
    """
    try:
        forms = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError):
        return None
    if isinstance(forms, (list, tuple)) and forms:
        return str(forms[0])
    return None


def _single_token_index(word, tokens):
    """Return the position of `word` in `tokens`, or None.

    The word is passed through the tokenizer first so that it receives the
    same normalisation as the sentence tokens. Words that are split into
    several pieces are not handled and yield None.
    """
    pieces = tokenizer.tokenize(word)
    if len(pieces) != 1 or pieces[0] not in tokens:
        return None
    return tokens.index(pieces[0])


results = []
# Never feed the model more positions than it has embeddings for.
max_len = getattr(model.config, "max_position_embeddings", 512)

for _, row in df.iterrows():
    sentence = row["sentence"]
    nsubj = _first_form(row["nsubj"])
    root = _first_form(row["root"])
    if nsubj is None or root is None:
        continue

    # Tokenize sentence
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        add_special_tokens=True,
        truncation=True,
        max_length=max_len,
    )
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    nsubj_idx = _single_token_index(nsubj, tokens)
    root_idx = _single_token_index(root, tokens)
    if nsubj_idx is None or root_idx is None:
        continue

    # Only sentences where the subject precedes the root are analysed.
    if root_idx < nsubj_idx:
        continue

    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)

    # Stack the per-layer attention maps and drop the batch axis (batch of 1),
    # then read the weight from the root position to the subject position
    # for every (layer, head) pair.
    attn_tensor = torch.stack(outputs.attentions).squeeze(1)
    attention_scores = attn_tensor[:, :, root_idx, nsubj_idx]

    # argmax indexes the flattened (layer, head) grid; unravel it.
    best_layer, best_head = divmod(
        torch.argmax(attention_scores).item(), attention_scores.shape[1]
    )
    max_value = attention_scores[best_layer, best_head].item()

    results.append({
        "sentence": sentence,
        "nsubj": nsubj,
        "root": root,
        "nsubj_idx": nsubj_idx,
        "root_idx": root_idx,
        "best_layer": best_layer,
        "best_head": best_head,
        "attention_value": max_value
    })

pd.DataFrame(results).to_csv("russian_bert_attention_nsubj_root.csv", index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/332 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/449k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/669 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/455M [00:00<?, ?B/s]

In [None]:
import pandas as pd

def summarize_attention_weight(csv_path):
    """Rank (layer, head) pairs by the attention mass they accumulated.

    Reads the probing CSV at ``csv_path`` and, for every
    (``best_layer``, ``best_head``) pair, computes the sum of
    ``attention_value``, the number of rows, and that number as a percentage
    of all rows in the file.

    Returns:
        The five pairs with the largest summed attention, as a DataFrame with
        columns ``best_layer``, ``best_head``, ``total_weighted_attention``,
        ``count`` and ``percentage``.
    """
    probe = pd.read_csv(csv_path)
    n_rows = len(probe)

    by_head = probe.groupby(["best_layer", "best_head"])
    totals = by_head["attention_value"].sum().rename("total_weighted_attention")
    counts = by_head.size().rename("count")

    # Both series share the same group index, so they line up column-wise.
    per_head = pd.concat([totals, counts], axis=1).reset_index()
    per_head["percentage"] = 100 * per_head["count"] / n_rows

    ranked = per_head.sort_values("total_weighted_attention", ascending=False)
    return ranked.head(5)


def summarize_attention_frequency(csv_path):
    """Rank (layer, head) pairs by how often they appear in the probing CSV.

    Reads the file at ``csv_path``, counts the rows per
    (``best_layer``, ``best_head``) pair and expresses each count as a
    percentage of all rows.

    Returns:
        The five most frequent pairs, as a DataFrame with columns
        ``best_layer``, ``best_head``, ``count`` and ``percentage``.
    """
    probe = pd.read_csv(csv_path)
    n_rows = len(probe)

    counts = probe.groupby(["best_layer", "best_head"]).size()
    ranked = counts.reset_index(name="count").sort_values("count", ascending=False)

    return ranked.assign(percentage=lambda table: 100 * table["count"] / n_rows).head(5)

def get_top_layer_head(df, myrow="total_weighted_attention"):
    """Return ``(layer, head)`` of the row with the largest value in ``myrow``.

    Args:
        df: Summary table with ``best_layer`` and ``best_head`` columns.
        myrow: Name of the column whose maximum selects the row.

    Returns:
        A tuple of two ints taken from ``best_layer`` and ``best_head``.
    """
    winner = df[myrow].idxmax()
    layer = df.at[winner, "best_layer"]
    head = df.at[winner, "best_head"]
    return int(layer), int(head)


In [None]:
# Print both top-5 rankings, then keep the (layer, head) pair that has the
# highest percentage in the frequency ranking for the later attention analysis.
for _summarize in (summarize_attention_weight, summarize_attention_frequency):
    print(_summarize("russian_bert_attention_nsubj_root.csv"))

BERT_LAYER, BERT_HEAD = get_top_layer_head(
    summarize_attention_frequency("russian_bert_attention_nsubj_root.csv"),
    myrow="percentage",
)

    best_layer  best_head  total_weighted_attention  count  percentage
16           1          5               4182.957143   4193   32.635430
53           5          1               1089.549613   1095    8.522727
30           2         11                704.172802    884    6.880448
70           8          0                676.589726   1058    8.234745
43           4          1                582.550897    856    6.662516
    best_layer  best_head  count  percentage
16           1          5   4193   32.635430
53           5          1   1095    8.522727
70           8          0   1058    8.234745
30           2         11    884    6.880448
43           4          1    856    6.662516


# Attention

In [None]:
# Load the tokenizer and the encoder; the model is configured to return its
# attention maps and is switched to evaluation mode.
model_name = "deepvk/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_attentions=True)
model.eval()

# The stimulus file has no header row, so the column names are supplied here;
# only the Russian items are kept.
df = pd.read_csv(
    "syn_llm.csv",
    header=None,
    names=[
        "language",
        "item",
        "syncretic",
        "sentence",
        "condition",
        "head_num",
        "attr_num",
        "verb_num",
    ],
).loc[lambda frame: frame["language"] == "russian"]

def get_token_span(word, tokens):
    """Find the consecutive tokens that spell out ``word``.

    The word and every token are compared in a folded form: lower-cased,
    with the ``##`` / leading ``▁`` sub-word markers removed from tokens,
    and with combining marks stripped after canonical decomposition. The
    last step makes "й"/"ё" in the word equal to "и"/"е" in tokens produced
    by a tokenizer that strips accents (and vice versa).

    Only a run of adjacent tokens whose folded texts concatenate to exactly
    the folded word is accepted; a partial or scattered match is rejected.

    Args:
        word: Surface word to look for; surrounding ``. , ’ '`` are ignored.
        tokens: Token strings of the sentence, in order.

    Returns:
        The list of token indices of the first full match, or ``[]`` when
        the word cannot be assembled from adjacent tokens.
    """
    import unicodedata

    def _fold(text):
        decomposed = unicodedata.normalize("NFD", text.lower())
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    target = _fold(word.lower().strip(".,’'"))
    if not target:
        return []

    pieces = [_fold(token.replace("##", "").lstrip("▁")) for token in tokens]

    for start in range(len(pieces)):
        pointer = 0
        span = []
        for i in range(start, len(pieces)):
            piece = pieces[i]
            # An empty piece would match everywhere without advancing.
            if not piece or not target.startswith(piece, pointer):
                break
            span.append(i)
            pointer += len(piece)
            if pointer == len(target):
                return span
    return []

def extract_bert_attention_from_df(df, layer, head):
    """Measure, for one attention head, how much the verb attends to two nouns.

    For every row of ``df`` the sentence is tokenized with the notebook-level
    ``tokenizer`` and run through the notebook-level ``model``. The attention
    map of ``layer``/``head`` is then summed from the verb tokens to the
    head-noun tokens and, separately, to the attractor tokens.

    Rows are skipped when the sentence has fewer than six words or when any
    of the three token spans cannot be located (a ``[SKIP]`` line is printed
    in the latter case).

    Args:
        df: Table with ``sentence``, ``item``, ``condition``, ``head_num``,
            ``attr_num`` and ``verb_num`` columns.
        layer: Index into the tuple of per-layer attention maps.
        head: Index of the attention head inside that layer.

    Returns:
        A DataFrame with one row per analysed sentence.
    """
    records = []

    for _, item in df.iterrows():
        sentence = item["sentence"]
        words = sentence.strip(".").split()
        if len(words) < 6:
            continue

        # NOTE(review): the roles are read from fixed word positions
        # (0 = head noun, 2 = attractor, 3 and 4 = verb) — confirm that every
        # stimulus sentence actually follows this layout.
        head_np, attractor_np = words[0], words[2]
        verb_1, verb_2 = words[3:5]

        # Tokenize and locate spans
        inputs = tokenizer(sentence, return_tensors="pt")
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

        head_span = get_token_span(head_np, tokens)
        attr_span = get_token_span(attractor_np, tokens)
        verb_span = get_token_span(verb_1, tokens) + get_token_span(verb_2, tokens)

        if not (head_span and attr_span and verb_span):
            print(f"[SKIP] No span found in: {sentence}")
            continue

        with torch.no_grad():
            attentions = model(**inputs).attentions

        # First batch element, requested head; rows are then restricted to
        # the verb positions.
        from_verb = attentions[layer][0, head][verb_span]
        attn_to_attr = from_verb[:, attr_span].sum().item()
        attn_to_head = from_verb[:, head_span].sum().item()

        record = {
            "item": item["item"],
            "sentence": sentence,
        }
        for column in ("condition", "head_num", "attr_num", "verb_num"):
            record[column] = item[column]
        record["attention_to_attractor"] = attn_to_attr
        record["attention_to_head"] = attn_to_head
        record["head_tokens"] = [tokens[i] for i in head_span]
        record["attractor_tokens"] = [tokens[i] for i in attr_span]
        record["verb_tokens"] = [tokens[i] for i in verb_span]
        records.append(record)

    return pd.DataFrame(records)

# Run the extraction with the head selected during probing, save the full
# table, and show only the sentence and the two attention sums.
result_df = extract_bert_attention_from_df(df, layer=BERT_LAYER, head=BERT_HEAD)
result_df.to_csv("rubert_attention_output.csv", index=False)
print(result_df.loc[:, ["sentence", "attention_to_head", "attention_to_attractor"]])


[SKIP] No span found in: Билет в музей во время уличной демонстрации был утерян
[SKIP] No span found in: Билет в музей во время уличной демонстрации были утеряны
[SKIP] No span found in: Билеты в музей во время уличной демонстрации был утерян
[SKIP] No span found in: Билеты в музей во время уличной демонстрации были утеряны
[SKIP] No span found in: Возражение на упрёк без излишне бурных эмоций было принято
[SKIP] No span found in: Возражение на упрёк без излишне бурных эмоций были приняты
[SKIP] No span found in: Возражения на упрёк без излишне бурных эмоций было принято
[SKIP] No span found in: Возражения на упрёк без излишне бурных эмоций были приняты
[SKIP] No span found in: История про убийство в присутствии маленьких детей была рассказана
[SKIP] No span found in: История про убийство в присутствии маленьких детей были рассказаны
[SKIP] No span found in: История про убийства в присутствии маленьких детей была рассказана
[SKIP] No span found in: История про убийства в присутствии ма

In [None]:
# Surprisal

# Surprisal

In [None]:
import pandas as pd
from minicons import scorer

sc = scorer.MaskedLMScorer("DeepPavlov/rubert-base-cased")

# The stimulus file has no header row; keep only the Russian items.
df = pd.read_csv(
    "syn_llm.csv",
    header=None,
    names=[
        "language",
        "item",
        "syncretic",
        "sentence",
        "condition",
        "head_num",
        "attr_num",
        "verb_num",
    ],
)
df = df[df["language"] == "russian"]

results = []

for _, row in df.iterrows():
    sentence = row["sentence"]
    words = sentence.strip(".").split()

    if len(words) < 3:
        print(f"[SKIP] Too short: {sentence}")
        continue

    # NOTE(review): the last word of the sentence is treated as the verb —
    # confirm that this holds for every stimulus.
    verb = words[-1]
    masked_sent = " ".join(words[:-1] + ["[MASK]"])

    try:
        # NOTE(review): `verb` is passed as the second positional argument of
        # token_score — confirm against the minicons API that this parameter
        # is meant to receive the target word.
        surprisals = sc.token_score(masked_sent, verb)

        well_formed = (
            isinstance(surprisals, list)
            and len(surprisals) > 0
            and isinstance(surprisals[0], list)
        )
        if not well_formed:
            print(f"[SKIP] Unexpected structure in: {masked_sent}")
            continue

        # The recorded value is the score attached to the literal "[MASK]"
        # token of the first (only) scored sentence.
        mask_score = next(
            (value for token, value in surprisals[0] if token == "[MASK]"),
            None,
        )
        if mask_score is None:
            print(f"[SKIP] [MASK] not found in: {masked_sent}")
            continue

        record = {"item": row["item"], "sentence": sentence}
        for column in ("condition", "head_num", "attr_num", "verb_num"):
            record[column] = row[column]
        record["verb"] = verb
        record["surprisal"] = mask_score
        record["model"] = "rubert"
        results.append(record)

    except Exception as e:
        print(f"[SKIP] {masked_sent} — {e}")
        continue

rubert_surprisal_df = pd.DataFrame(results)
print(rubert_surprisal_df)

rubert_surprisal_df.to_csv("rubert_verb_surprisal.csv", index=False)


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

    item                                           sentence condition  \
0      1  Абонемент на концерт был дорогим из-за участия...  SG-SG-SG   
1      1  Абонемент на концерт были дорогими из-за участ...  SG-SG-PL   
2      1  Абонемент на концерты был дорогим из-за участи...  SG-PL-SG   
3      1  Абонемент на концерты были дорогими из-за учас...  SG-PL-PL   
4      1  Абонементы на концерт был дорогим из-за участи...  PL-SG-SG   
..   ...                                                ...       ...   
635   80  Шрам после операций с помощью медицинского лаз...  SG-PL-PL   
636   80  Шрамы после операции с помощью медицинского ла...  PL-SG-SG   
637   80  Шрамы после операции с помощью медицинского ла...  PL-SG-PL   
638   80  Шрамы после операций с помощью медицинского ла...  PL-PL-SG   
639   80  Шрамы после операций с помощью медицинского ла...  PL-PL-PL   

    head_num attr_num verb_num      verb  surprisal   model  
0         SG       SG       SG  артистов  15.128869  rubert  

In [None]:
import numpy as np

# Rescale the stored scores by 1 / ln(2) (presumably natural-log units to
# bits — confirm the unit the scorer returns).
# NOTE(review): the column is overwritten, so re-running this cell rescales
# the values a second time.
rubert_surprisal_df["surprisal"] = rubert_surprisal_df["surprisal"].div(np.log(2))

# NOTE(review): `bert_surprisal_df` and `gpt2_surprisal_df` are not defined in
# the visible part of this notebook, `rubert_surprisal_df` is not part of the
# combined table, and the output file name says "turkish" — confirm which
# frames and file name are intended here.
surp_combined_df = pd.concat((bert_surprisal_df, gpt2_surprisal_df), ignore_index=True)
surp_combined_df.to_csv("turkish_bert_gpt2_combined_surprisal.csv", index=False)