In [1]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from tqdm import tqdm

from sys import path
path.append('../utils/')

from utils import get_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATASETS = ["webkb", "20ng"]

In [3]:
pd_datasets = get_datasets(DATASETS, path="/home/welton/data/pd_datasets/__dset__.csv", sep=';')
pd_datasets["20ng"].head(5)

Unnamed: 0,spr,kpr,xtr,xfr,stmk,ltmk,lpr,str,ltr,lfr,...,bert,sfr,xtmk,xlnet_softmax,xpr,label,fold_id,docs,conc_size,hit_counts
0,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,"This is the story of Kent, the archetype Finn,...",18,18
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,In article <16BA7103C3.I3150101@dbstu1.rz.tu-b...,18,18
2,5,17,18,19,18,18,5,11,11,6,...,5,6,18,13,6,0,0,"A new alternative to Scouting for those ""unacc...",6,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,0,[reply to zazen@austin.ibm.com (E. H. Welbon)]...,17,17
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,In article <4949@eastman.UUCP> dps@nasa.kodak....,18,18


In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
def get_attention(dataset: str, fold: int, df: pd.DataFrame):
    
    # Loading the bert fine-tuned model and sending him to GPU.
    model_path = f"/home/welton/data/clfs_output/split_10/{dataset}/10_folds/rep_bert/{fold}/model"
    model = torch.load(model_path).to(device)
    # Setting bert to output attention weights.
    model.config.output_attentions = True
    # Loading bert tokenizer.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    # Transforming the documents in ids.
    tokens_ids = tokenizer(df.docs.values.tolist(), return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
    # Reverting ids to tokens.
    tokens = [ tokenizer.convert_ids_to_tokens(ids) for ids in tokens_ids.input_ids ]
    # Computing attention weights.
    output = model.bert(**tokens_ids)
    docs_att = output.attentions[-1]
    att_weights = []
    # Making the mean of the attention on the twelve heads of BERT's last layer.
    for doc in docs_att:
        mean_att = torch.mean(doc, axis=0)
        w = torch.sum(mean_att, axis=0).detach().cpu().numpy()
        att_weights.append(w)
    return tokens, att_weights

def set_word_attention(doc_tokens: list, att: np.ndarray):
    wo = []
    we = []
    for idx, token in enumerate(doc_tokens):
        if token not in ["[CLS]", "[SEP]", "[PAD]"]:
            wo.append(token)
            we.append(att[idx])
    return wo, we

In [6]:
#dataset = "20ng"
for dataset in DATASETS:
    docs_att = []
    print(f"[{dataset.upper()}]")
    for fold in tqdm(np.arange(10)):
        df = pd_datasets[dataset]
        tokens, att_weights = get_attention(dataset, fold, df[df.fold_id == fold])
        words = []; weights = []
        for dt, da in zip(tokens, att_weights):
            wo, we = set_word_attention(dt, da)
            words.append(wo)
            weights.append(we)
        torch.cuda.empty_cache()

    data = {"docs": words, "weights": weights}
    att_df = pd.DataFrame(data)
    output_dir = f"/home/welton/data/attention/{dataset}/"
    os.makedirs(output_dir, exist_ok=True)
    att_df.to_csv(f"{output_dir}/{dataset}.csv", sep=';', index=False)

[WEBKB]


  0%|          | 0/10 [00:04<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 2.41 GiB (GPU 0; 7.79 GiB total capacity; 4.64 GiB already allocated; 2.34 GiB free; 4.69 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
docs_att[0]

[['this', 0.5484288],
 ['is', 0.33609223],
 ['the', 0.31935322],
 ['story', 0.9013461],
 ['of', 0.37335452],
 ['kent', 3.7715073],
 [',', 0.5740509],
 ['the', 0.74913317],
 ['arch', 0.58199304],
 ['##ety', 0.9072833],
 ['##pe', 0.7144489],
 ['finn', 4.488489],
 [',', 10.46603],
 ['that', 0.7912102],
 ['lives', 0.6414691],
 ['in', 0.42168948],
 ['the', 8.845614],
 ['bay', 2.436592],
 ['area', 0.8016088],
 [',', 9.61158],
 ['and', 0.49231413],
 ['tried', 0.58744675],
 ['to', 0.5151708],
 ['purchase', 0.9629158],
 ['thomas', 1.3626342],
 ['pain', 5.010043],
 ['##e', 1.289026],
 ["'", 0.90068424],
 ['s', 0.8964683],
 ['"', 0.72473943],
 ['age', 1.1120569],
 ['of', 0.618949],
 ['reason', 3.6639862],
 ['"', 0.40983197],
 ['.', 12.495114],
 ['this', 0.8442351],
 ['man', 0.9850842],
 ['was', 0.811539],
 ['driving', 1.718617],
 ['around', 1.1216342],
 [',', 0.2406631],
 ['to', 0.6028936],
 ['stacey', 4.0122404],
 ['##s', 1.1932758],
 [',', 0.33007967],
 ['to', 0.93505406],
 ['books', 2.1772122]

In [None]:
dataset = "20ng"
tokens, att_weights = get_attention(dataset, 0, pd_datasets[dataset].head(20))

In [None]:
doc_att = set_word_attention(tokens[0], att_weights[0])
doc_att