In [1]:
# %%
import glob
import json
import os
from typing import Iterator

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
# import nltk
# import scipy

from attention_graph_util import compute_flows, create_attention_graph, get_adjmat
from helpers import get_edit_positions, get_dataset


# %%
# https://gist.github.com/jlherren/d97839b1276b9bd7faa930f74711a4b6

# Model identifier; overridable through the MODEL_NAME environment variable.
# A local checkpoint path such as "/media/data/models/bert-base-cased" can be
# supplied the same way.
MODEL_NAME = os.environ.get("MODEL_NAME") or "bert-base-cased"
# Name of the attention sub-directory to read; overridable through ATTENTION_TYPE.
ATTENTION_TYPE = os.environ.get("ATTENTION_TYPE") or "attentions"

# Set explicitly before the tokenizer is created (the tokenizers library asks
# for this when processes are forked later on).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME, return_tensors="pt")

In [14]:
def get_target_attentions(item):
    """Select the edited-token columns from a sample's attention tensors.

    The last axis of ``item.attentions_sentence`` is indexed with
    ``item.tokenized_edited_positions_in_unmasked``; the last axis of
    ``item.attentions_negated`` is indexed once with
    ``item.tokenized_edited_positions_in_negated`` and once with
    ``item.tokenized_edited_positions_in_negated_with_original``.

    Returns:
        Tuple ``(unmasked, negated, negated_with_original)`` of the three
        selections.

    Raises:
        IndexError: re-raised after printing the sample, its edit positions
            and the attention shapes for debugging.
    """
    sentence_attention = item.attentions_sentence
    try:
        from_sentence: torch.Tensor = sentence_attention[
            :, :, :, item.tokenized_edited_positions_in_unmasked
        ]
        from_negated: torch.Tensor = item.attentions_negated[
            :, :, :, item.tokenized_edited_positions_in_negated
        ]
        from_negated_plus_original: torch.Tensor = item.attentions_negated[
            :, :, :, item.tokenized_edited_positions_in_negated_with_original
        ]
    except IndexError:
        # Dump everything needed to see which position fell outside the tensor.
        print(item)
        print(item.tokenized_edited_positions_in_unmasked)
        print(item.tokenized_edited_positions_in_negated)
        print(item.attentions_sentence.shape)
        print(item.attentions_negated.shape)
        raise
    return from_sentence, from_negated, from_negated_plus_original


# %%
def load(dataset_path: str) -> Iterator[pd.DataFrame]:
    """Yield one DataFrame per stored attention dump found below ``dataset_path``.

    Every directory matching ``{dataset_path}/*/*/*`` whose path contains
    ``"Squad"`` and that holds ``variant_00/attentions.pt`` is loaded.
    ``variant_00`` supplies the ``*_sentence`` columns and ``variant_01`` the
    ``*_negated`` columns.

    Each yielded frame has the columns ``unmasked_sentences``,
    ``unmasked_negated``, ``attentions_sentence``, ``pooler_outputs_sentence``,
    ``attentions_negated``, ``pooler_outputs_negated`` and ``dataset_path``.

    A fresh DataFrame is built for every directory. Re-using a single frame
    across iterations would hand the same object to the caller several times
    and would align later datasets against the first one's index.

    Args:
        dataset_path: Directory that is searched three levels deep.

    Yields:
        One independent DataFrame per matching directory, in sorted path order.
    """
    # Sorted so that the order of the yielded frames is reproducible.
    for attention_type in sorted(glob.glob(f"{dataset_path}/*/*/*")):
        if "Squad" not in attention_type:
            continue

        if not os.path.exists(f"{attention_type}/variant_00/attentions.pt"):
            print(f"Skipping {attention_type}")
            continue

        print(f"Loading {attention_type}")

        df = pd.DataFrame()

        df["unmasked_sentences"] = pd.read_csv(
            f"{attention_type}/variant_00/samples.csv"
        )["Sentence"]
        df["unmasked_negated"] = pd.read_csv(
            f"{attention_type}/variant_01/samples.csv"
        )["Sentence"]

        # NOTE: torch.load unpickles the files; only point this at trusted data.
        # Iterating the loaded object splits it along its first axis, giving
        # one entry per sample row.
        df["attentions_sentence"] = list(
            torch.load(f"{attention_type}/variant_00/attentions.pt")
        )
        df["pooler_outputs_sentence"] = list(
            torch.load(f"{attention_type}/variant_00/pooler_outputs.pt")
        )
        df["attentions_negated"] = list(
            torch.load(f"{attention_type}/variant_01/attentions.pt")
        )
        df["pooler_outputs_negated"] = list(
            torch.load(f"{attention_type}/variant_01/pooler_outputs.pt")
        )
        df["dataset_path"] = attention_type

        print("attentions.shape", df["attentions_sentence"][0].shape)

        yield df


# Collect every frame that load() yields for each dataset directory of the
# configured model / attention type.
dfs = []
for dataset_path in glob.glob(
    f"/media/data/thielen/ba/negation_datasets/{os.path.basename(MODEL_NAME)}/{ATTENTION_TYPE}/*"
):
    for loaded_frame in load(dataset_path):
        dfs.append(loaded_frame)

# Frames that carry a "masked_sentences" column get their unmasked columns
# derived by substituting the object label for the [MASK] placeholder.
for frame in dfs:
    if "masked_sentences" not in frame.columns:
        continue
    frame["unmasked_sentences"] = frame.apply(
        lambda row: row["masked_sentences"].replace("[MASK]", str(row["obj_label"])),
        axis=1,
    )
    frame["unmasked_negated"] = frame.apply(
        lambda row: row["negated"].replace("[MASK]", str(row["obj_label"])),
        axis=1,
    )

print(f"Loaded {len(dfs)} dataframes")


def find_sub_list(sl, l):
    """Return the indices of ``l`` covered by every occurrence of ``sl``.

    Occurrences are found by comparing slices of ``l`` with ``sl``, so both
    should be of the same sequence type (e.g. two lists). The indices of all
    occurrences are concatenated in order of appearance; overlapping
    occurrences therefore contribute shared indices more than once.

    Args:
        sl: The sub-list to search for.
        l: The list to search in.

    Returns:
        A flat list of indices. Empty when ``sl`` is empty or never occurs.
    """
    width = len(sl)
    if width == 0:
        # Nothing to search for; previously this crashed on ``sl[0]``.
        return []

    first = sl[0]
    covered = []
    for start, element in enumerate(l):
        # Cheap first-element check before comparing the whole slice.
        if element == first and l[start : start + width] == sl:
            covered.extend(range(start, start + width))
    return covered


def maybe_add_original_positions(item):
    """Return the negated edit positions, optionally merged with the original ones.

    The positions of the unmasked sentence are merged in only when the first
    edited token of the negated sentence is "not" or "cannot" and the two
    position lists differ. In every other case the negated positions are
    returned unchanged.

    Returns:
        Either ``item.tokenized_edited_positions_in_negated`` itself or a
        sorted, de-duplicated list of both position lists combined.
    """
    negated_tokens = item.tokenized_edited_tokens_in_negated
    negated_positions = item.tokenized_edited_positions_in_negated

    if len(negated_tokens) == 0:
        return negated_positions
    if negated_tokens[0] not in ["not", "cannot"]:
        return negated_positions

    original_positions = item.tokenized_edited_positions_in_unmasked
    if negated_positions == original_positions:
        return negated_positions

    return sorted(set(negated_positions + original_positions))


def maybe_add_original_tokens(item):
    """Return the negated edit tokens, optionally followed by the original ones.

    The edited tokens of the unmasked sentence are appended only when the
    first edited token of the negated sentence is "not" or "cannot" and the
    edit positions of the two sentences differ. In every other case the
    negated tokens are returned unchanged.

    Returns:
        Either ``item.tokenized_edited_tokens_in_negated`` itself or the
        combined tokens with duplicates removed, keeping first-occurrence
        order.
    """
    negated_tokens = item.tokenized_edited_tokens_in_negated

    if len(negated_tokens) == 0:
        return negated_tokens
    if negated_tokens[0] not in ["not", "cannot"]:
        return negated_tokens
    if (
        item.tokenized_edited_positions_in_negated
        == item.tokenized_edited_positions_in_unmasked
    ):
        return negated_tokens

    combined = negated_tokens + item.tokenized_edited_tokens_in_unmasked
    # dict keys keep insertion order, so this drops repeats but preserves
    # the position of each token's first appearance.
    return list(dict.fromkeys(combined))


# %%

def tokenize(input):
    """Tokenize ``input`` with the module-level TOKENIZER.

    Special tokens are added and the result is padded / truncated to a fixed
    length of 32 tokens.

    Returns:
        Whatever ``TOKENIZER.tokenize`` returns for these options.
    """
    fixed_length_options = {
        "add_special_tokens": True,
        "padding": "max_length",
        "truncation": True,
        "max_length": 32,
    }
    return TOKENIZER.tokenize(input, **fixed_length_options)



Loading /media/data/thielen/ba/negation_datasets/bert-base-cased/attentions/LAMA_negated/Squad/train/32
attentions.shape torch.Size([12, 12, 32, 32])
Loaded 1 dataframes


In [15]:
# The special-token list is read once instead of on every membership test.
_SPECIAL_TOKENS = set(TOKENIZER.all_special_tokens)


def _strip_special(tokens):
    """Return ``tokens`` without the entries listed in TOKENIZER.all_special_tokens."""
    return [tok for tok in tokens if tok not in _SPECIAL_TOKENS]


def _edit_positions(row, source_column, target_column):
    """Edit positions from ``source_column`` to ``target_column``, computed on special-token-free tokens.

    The result of get_edit_positions is cut to at most as many entries as the
    last attention axis is long.
    """
    return get_edit_positions(
        _strip_special(row[source_column]),
        _strip_special(row[target_column]),
    )[: row["attentions_sentence"].shape[-1]]


def _tokens_at(row, token_column, position_column):
    """Tokens of the special-token-free ``token_column`` at the given positions; indices past its end are skipped."""
    plain_tokens = _strip_special(row[token_column])
    return [
        plain_tokens[idx]
        for idx in row[position_column]
        if idx < len(plain_tokens)
    ]


# NOTE(review): the positions below index the special-token-free token lists,
# while get_target_attentions applies them directly to the attention axes of
# sequences tokenized with special tokens. Confirm that get_edit_positions or
# a later step accounts for that offset.
for i, frame in enumerate(dfs):
    frame["tokenized_unmasked_sentences"] = frame.apply(
        lambda s: tokenize(s["unmasked_sentences"]), axis=1
    )
    frame["tokenized_unmasked_negated"] = frame.apply(
        lambda s: tokenize(s["unmasked_negated"]), axis=1
    )

    # Edits seen from the negated sentence's side.
    frame["tokenized_edited_positions_in_negated"] = frame.apply(
        lambda s: _edit_positions(
            s, "tokenized_unmasked_sentences", "tokenized_unmasked_negated"
        ),
        axis=1,
    )
    frame["tokenized_edited_tokens_in_negated"] = frame.apply(
        lambda s: _tokens_at(
            s, "tokenized_unmasked_negated", "tokenized_edited_positions_in_negated"
        ),
        axis=1,
    )

    # Edits seen from the original (unmasked) sentence's side.
    frame["tokenized_edited_positions_in_unmasked"] = frame.apply(
        lambda s: _edit_positions(
            s, "tokenized_unmasked_negated", "tokenized_unmasked_sentences"
        ),
        axis=1,
    )
    frame["tokenized_edited_tokens_in_unmasked"] = frame.apply(
        lambda s: _tokens_at(
            s, "tokenized_unmasked_sentences", "tokenized_edited_positions_in_unmasked"
        ),
        axis=1,
    )

    frame["tokenized_edited_tokens_in_negated_with_original"] = frame.apply(
        maybe_add_original_tokens, axis=1
    )
    frame["tokenized_edited_positions_in_negated_with_original"] = frame.apply(
        maybe_add_original_positions, axis=1
    )

    # Keep only rows that have edited tokens on both sides.
    frame = frame[
        frame["tokenized_edited_tokens_in_negated"].apply(len) > 0
    ].reset_index(drop=True)
    frame = frame[
        frame["tokenized_edited_tokens_in_unmasked"].apply(len) > 0
    ].reset_index(drop=True)

    frame["model_name"] = MODEL_NAME
    dfs[i] = frame

# %%
for i, frame in enumerate(dfs):
    # Remove one specific sample, identified by its exact sentence text.
    excluded = frame[
        frame.unmasked_sentences
        == "Three humans are in an airport. One is on the ground, another is next to the window, and the last one is vertical on his two feet with a bag next to him."
    ]
    if len(excluded):
        frame = frame.drop(excluded.index)

    # Remove rows whose largest edit position cannot index the last attention
    # axis. Valid indices run from 0 to size - 1, so a position equal to the
    # size is already out of range (hence ``>=`` rather than ``>``).
    out_of_range = frame[
        frame.tokenized_edited_positions_in_unmasked.map(max)
        >= frame.attentions_sentence.map(lambda x: x.shape[-1])
    ]
    if len(out_of_range):
        frame = frame.drop(out_of_range.index)

    dfs[i] = frame


In [23]:
def compute_flow(item, _type):
    """Compute attention-flow values for one sample.

    Args:
        item: Row holding the attention tensor under ``_type`` and the token
            lists ``tokenized_unmasked_sentences`` /
            ``tokenized_unmasked_negated``.
        _type: ``"attentions_sentence"`` or ``"attentions_negated"``; any
            value other than ``"attentions_sentence"`` selects the negated
            token list.

    Returns:
        The value returned by ``compute_flows`` for the attention graph.
    """
    device = "cpu" #torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Work on a detached copy so the tensor stored in the frame is untouched.
    attentions_mat_regular = (item[_type]).clone().detach().to(device)
    seq_len = attentions_mat_regular.shape[-1]

    # Average over axis 1 (the head axis, assuming a
    # (layers, heads, seq, seq) layout — TODO confirm).
    res_att_mat = attentions_mat_regular.sum(axis=1) / attentions_mat_regular.shape[1]
    # Add the identity matrix to every layer's averaged attention ...
    res_att_mat = res_att_mat + torch.eye(res_att_mat.shape[1])[None,...].to(device)
    # ... and re-normalize so that each row sums to 1 again.
    res_att_mat = res_att_mat / res_att_mat.sum(axis=-1)[...,None]

    tokens = item["tokenized_unmasked_sentences"] if _type == "attentions_sentence" else item["tokenized_unmasked_negated"]

    res_adj_mat, res_labels_to_index = get_adjmat(mat=res_att_mat, input_tokens=tokens)
    res_G = create_attention_graph(res_adj_mat)

    # Input nodes are the labels whose index lies within the first seq_len entries.
    input_nodes = [
        key for key in res_labels_to_index if res_labels_to_index[key] < seq_len
    ]
    flow_values = compute_flows(res_G, res_labels_to_index, input_nodes, length=seq_len)
    return flow_values


In [16]:
# Compute and print the flow for a single sample as a quick check.
idx = 0
item = dfs[0].iloc[idx]
print(item)

# Second argument is the attention column to use:
# "attentions_sentence" or "attentions_negated".
flow = compute_flow(item, "attentions_negated")
print(flow)

unmasked_sentences                                     To emphasize the 50th anniversary of the Super...
unmasked_negated                                       To emphasize the 50th anniversary of the Super...
attentions_sentence                                    [[[tensor([0.5185, 0.0464, 0.0057, 0.0171, 0.0...
pooler_outputs_sentence                                [[tensor(-0.6894), tensor(0.4977), tensor(0.99...
attentions_negated                                     [[[tensor([0.5304, 0.0475, 0.0059, 0.0175, 0.0...
pooler_outputs_negated                                 [[tensor(-0.6308), tensor(0.4733), tensor(0.99...
dataset_path                                           /media/data/thielen/ba/negation_datasets/bert-...
tokenized_unmasked_sentences                           [[CLS], To, emphasize, the, 50th, anniversary,...
tokenized_unmasked_negated                             [[CLS], To, emphasize, the, 50th, anniversary,...
tokenized_edited_positions_in_negated                  

In [16]:
# Attention tensor of the inspected sample; only its first and last dimension
# sizes are used in this cell.
attentions_mat = item["attentions_sentence"]
def convert_adjmat_tomats(adjmat, n_layers, l):
    """Cut a square adjacency matrix into one ``l`` x ``l`` block per layer.

    For layer ``i`` the block spans rows ``(i+1)*l`` to ``(i+2)*l`` and
    columns ``i*l`` to ``(i+1)*l`` of ``adjmat``.

    Args:
        adjmat: 2-D matrix with at least ``(n_layers + 1) * l`` rows and
            ``n_layers * l`` columns.
        n_layers: Number of blocks to extract.
        l: Side length of each block.

    Returns:
        Float array of shape ``(n_layers, l, l)``.
    """
    per_layer = np.zeros((n_layers, l, l))

    for layer in range(n_layers):
        col_start = layer * l
        row_start = col_start + l
        per_layer[layer] = adjmat[row_start : row_start + l, col_start : col_start + l]

    return per_layer

# Split the flow matrix into one block per layer; the layer count and block
# size come from the first and last dimension of the attention tensor.
flow_att_mat = convert_adjmat_tomats(flow, n_layers=attentions_mat.shape[0], l=attentions_mat.shape[-1])
# Notebook display of the resulting shape.
flow_att_mat.shape

NameError: name 'flow' is not defined

In [19]:
# Store the single sample's per-layer flows as "<idx, zero-padded to 3>.npy"
# in the dataset's "flows" directory.
# NOTE(review): process_item / load_flows use "<idx>_<type>.npy" names instead,
# so this file is not picked up by load_flows.
flows_dir = os.path.join(dfs[0].dataset_path.iloc[0], "flows")
file_path = os.path.join(flows_dir, str(idx).rjust(3, "0") + ".npy")
os.makedirs(flows_dir, exist_ok=True)
np.save(file_path, flow_att_mat)


In [29]:
from joblib import Parallel, delayed
from tqdm import tqdm

def process_item(item, _type):
    """Compute the per-layer flows of one sample and save them as ``.npy``.

    The file is written to ``<item.dataset_path>/flows/`` and named
    ``<item.name, zero-padded to 3>_<_type>.npy``.

    Args:
        item: Sample row; ``item.name`` is used as the numeric file prefix.
        _type: Attention column to process, e.g. ``"attentions_sentence"``.
    """
    raw_flow = compute_flow(item, _type)
    attention = item[_type]
    layer_flows = convert_adjmat_tomats(
        raw_flow,
        n_layers=attention.shape[0],
        l=attention.shape[-1],
    )

    target_dir = os.path.join(item.dataset_path, "flows")
    target_file = os.path.join(target_dir, f"{item.name:03}_{_type}.npy")
    os.makedirs(target_dir, exist_ok=True)
    np.save(target_file, layer_flows)

num_cores = 32
# Rows before start_idx are skipped.
# NOTE(review): presumably they were processed in an earlier run — load_flows
# later expects flow files for every row, so confirm before changing this.
start_idx = 9
tasks = [
    (item, _type)
    for _, item in dfs[0].iloc[start_idx:].iterrows()
    for _type in ["attentions_sentence", "attentions_negated"]
]

# Wrapping the task list in tqdm makes the bar advance as joblib pulls tasks
# for dispatch. The previous bar was created but never updated, so it stayed
# at 0%. The result (a list of None) is bound to a name so the notebook does
# not echo it.
_flow_job_results = Parallel(n_jobs=num_cores, prefer="processes", verbose=5)(
    delayed(process_item)(item, _type)
    for item, _type in tqdm(tasks, total=len(tasks))
)

  0%|          | 0/586 [00:00<?, ?it/s][Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current

In [17]:
def load_flows(item):
    """Load the saved flow arrays of one sample.

    Reads ``<item.dataset_path>/flows/<item.name, zero-padded to 3>_<type>.npy``
    for the types ``attentions_sentence`` and ``attentions_negated``.

    Returns:
        A two-element list: ``[sentence_flow, negated_flow]``.
    """
    flows_dir = os.path.join(item.dataset_path, "flows")
    return [
        np.load(os.path.join(flows_dir, f"{item.name:03}_{kind}.npy"))
        for kind in ("attentions_sentence", "attentions_negated")
    ]

# Load the stored flows of every row of the first frame ...
base_frame = dfs[0]
flow_columns = ["flow_sentence", "flow_negated"]

results = []
for _, item in tqdm(base_frame.iterrows(), total=len(base_frame)):
    results.append(load_flows(item))

# ... put them into a frame that shares the original index ...
df_flows = pd.DataFrame(results, index=base_frame.index, columns=flow_columns)

# ... and attach them as two additional columns.
dfs[0] = pd.concat([base_frame, df_flows], axis=1)

100%|██████████| 302/302 [00:00<00:00, 1805.14it/s]


In [19]:
# Notebook display: shape of the first row's flow array for the original sentence.
dfs[0].iloc[0].flow_sentence.shape

(12, 32, 32)

In [21]:
# from tqdm.notebook import tqdm
from tqdm import tqdm
from joblib import Parallel, delayed
import seaborn as sns
import matplotlib.pyplot as plt

def create_and_save_heatmap(att_mat_target, non_pad_tokens, dataset_path, name, attention_type, layer):
    """Render one layer's flow matrix as a heatmap and save it as a PNG.

    The image is written to
    ``<dataset_path>/png/flows/<name:03>_<attention_type>_layer_<layer:02>.png``;
    missing directories are created.

    Args:
        att_mat_target: 2-D matrix to plot.
        non_pad_tokens: Tick labels used on both axes.
        dataset_path: Base directory of the dataset.
        name: Numeric sample identifier used in the file name.
        attention_type: Attention column name used in the file name.
        layer: Layer number used in the file name.
    """
    filepath = os.path.join(dataset_path, "png", "flows", f"{name:03}_{attention_type}_layer_{layer:02}.png")
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    fig, ax = plt.subplots(figsize=(6,8))
    try:
        sns.heatmap(att_mat_target, cmap="YlOrRd", xticklabels=non_pad_tokens, yticklabels=non_pad_tokens, ax=ax)

        # Rotate the labels
        plt.xticks(rotation=90)
        plt.yticks(rotation=0)

        fig.savefig(filepath, bbox_inches='tight')
    finally:
        # Always release the figure, even when plotting or saving fails;
        # otherwise figures pile up in long-running workers.
        plt.close(fig)

# Tokens that are not used as axis labels.
_unlabelled_tokens = [TOKENIZER.sep_token, TOKENIZER.pad_token]

# One heatmap job per (row, attention type, layer).
# NOTE(review): the labels exclude separator/padding tokens while the matrix
# passed along is the full per-layer matrix — confirm whether the matrix was
# meant to be cut down to the labelled positions as well.
results = []
for _, item in tqdm(dfs[0].iterrows(), total=dfs[0].shape[0]):
    for attention_type in ["attentions_sentence", "attentions_negated"]:
        if attention_type == "attentions_sentence":
            flow_att_mat = item.flow_sentence
            src = item.tokenized_unmasked_sentences
        else:
            flow_att_mat = item.flow_negated
            src = item.tokenized_unmasked_negated

        non_pad_tokens = [tok for tok in src if tok not in _unlabelled_tokens]

        for layer in range(flow_att_mat.shape[0]):
            results.append(
                (flow_att_mat[layer], non_pad_tokens, item.dataset_path, item.name, attention_type, layer)
            )

# Parallelize the creation and saving of heatmaps. The returned list of None
# is bound to a name so the notebook does not print one line per job.
_heatmap_job_results = Parallel(n_jobs=32)(
    delayed(create_and_save_heatmap)(*result) for result in tqdm(results)
)

100%|██████████| 302/302 [00:00<00:00, 2534.02it/s]
100%|██████████| 7248/7248 [02:02<00:00, 59.34it/s]


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,