# Attention

In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from transformers import BertTokenizer, BertModel, BertConfig

# Checkpoint name shared by the model and the tokenizer below.
model_version = 'sberbank-ai/ruBert-base' #ruBert-large
# The attention weights are later read from the last element of the model output.
# NOTE(review): num_attention_heads=12 overrides the checkpoint's own config; presumably it
# already equals 12 for the base checkpoint and would have to change for ruBert-large -- confirm.
model = BertModel.from_pretrained(model_version, output_attentions=True, num_attention_heads=12)
tokenizer = BertTokenizer.from_pretrained(model_version)


Downloading:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/716M [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruBert-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [6]:
from google.colab import files, drive
import pandas as pd
import re
from tqdm.auto import tqdm


# Mount Google Drive at /content/drive; the CSV paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Read the annotated sentences; bytes that are not valid UTF-8 are dropped while decoding.
roles_csv_path = './drive/My Drive/workshop/semantic_roles_data.csv'
with open(roles_csv_path, encoding='utf-8', errors='ignore') as csvfile:
    role_df = pd.read_csv(csvfile)

role_df.head(3)

Unnamed: 0,sentence,role,idx_target,raw_target,idx_head,raw_head
0,«Школа злословия» учит прикусить язык,agent,1-6,Школа,18-22,учит
1,Сохранится ли градус дискуссии в новом сезоне?,posessor,14-20,градус,0-10,Сохранится
2,Великолепная «Школа злословия» вернулась в эфи...,agent,14-19,Школа,31-40,вернулась


In [8]:
# Keep only the rows whose head span is not the literal marker 'oops'.
role_df = role_df.loc[role_df['idx_head'].ne('oops')]

In [9]:
# Number of remaining rows per semantic role.
role_df['role'].value_counts()

agent          818
patient        415
causator       249
theme           87
experiencer     83
addressee       58
posessor        54
place           38
time            35
benefactive     31
goal            26
Name: role, dtype: int64

In [10]:
def gaw(sentence, attention, agent_ind, verb_ind):
  """Collect one attention weight (row verb_ind, column agent_ind) per layer and head.

  Args:
    sentence: string whose first two space-separated words become the
      (word0, word1) key of every per-head entry.
    attention: per-layer sequence indexed as attention[layer][0][head][row][col];
      only index 0 of the second axis is read (presumably the batch axis -- confirm).
    agent_ind: column index of the weight to read.
    verb_ind: row index of the weight to read.

  Returns:
    {layer: {head: {(word0, word1): float weight}}}. A head maps to None when
    either index falls outside that head's matrix. The table is sized from
    `attention` itself, so inputs with more than 12 layers/heads are supported.
  """
  weights = {}
  pair = None  # built on first use so `sentence` is only split when a weight is found

  for l in range(len(attention)):
    heads = attention[l][0]
    weights[l] = {}
    for k in range(len(heads)):
      rows = heads[k]
      weights[l][k] = None
      # Direct lookup instead of scanning every row/column for the matching index.
      if 0 <= verb_ind < len(rows) and 0 <= agent_ind < len(rows[verb_ind]):
        if pair is None:
          words = sentence.split(' ')
          pair = (words[0], words[1])
        weights[l][k] = {pair: float(rows[verb_ind][agent_ind])}

  return weights

In [11]:
def get_max_weights(weights):
  """Element-wise maximum over a list of layer/head weight tables.

  Args:
    weights: non-empty list of tables shaped {layer: {head: {(a, v): value}}};
      all tables must share the layout and the (a, v) key of the first one.

  Returns:
    A new table with the same layout where every entry holds the largest value
    found at that layer/head across the input tables.
  """
  first = weights[0]
  first_key = list(first[0][0])[0]
  a, v = first_key[0], first_key[1]

  out_weights = {}
  # Layout is taken from the first table instead of a hard-coded 12 x 12 grid.
  for i in first:
    out_weights[i] = {}
    for j in first[i]:
      # A true maximum: no implicit floor of 0, so negative inputs are handled too.
      out_weights[i][j] = {(a, v): max(table[i][j][(a, v)] for table in weights)}

  return out_weights

In [12]:
def get_mean_weights(weights):
  """Element-wise arithmetic mean over a list of layer/head weight tables.

  Args:
    weights: non-empty list of tables shaped {layer: {head: {(a, v): value}}};
      all tables must share the layout and the (a, v) key of the first one.

  Returns:
    A new table with the same layout where every entry holds the mean of the
    values found at that layer/head across the input tables.
  """
  first = weights[0]
  first_key = list(first[0][0])[0]
  a, v = first_key[0], first_key[1]

  out_weights = {}
  # Layout is taken from the first table instead of a hard-coded 12 x 12 grid.
  for i in first:
    out_weights[i] = {}
    for j in first[i]:
      total = 0
      for table in weights:
        total += table[i][j][(a, v)]
      out_weights[i][j] = {(a, v): total/len(weights)}

  return out_weights

In [13]:
def get_all_attention_weights(sentence, attention, agent_inds, verb_inds):
  """Build the 3 x 3 grid of weight tables for a multi-token verb and agent.

  The outer key says how tables are combined across the verb's tokens and the
  inner key how they are combined across the agent's tokens: 'mean_*' uses
  get_mean_weights, 'max_*' uses get_max_weights and 'st_*' takes the first token.

  Args:
    sentence: label string forwarded unchanged to gaw.
    attention: attention structure forwarded unchanged to gaw.
    agent_inds: non-empty list of token indices of the agent/target word.
    verb_inds: non-empty list of token indices of the verb/head word.

  Returns:
    {'mean_bw_vtokens' | 'max_bw_vtokens' | 'st_bw_vtokens':
       {'mean_bw_atokens' | 'max_bw_atokens' | 'st_bw_atokens': weight table}}.
    With one verb token and one agent token all nine tables are the same
    object; they are meant to be read, not mutated.

  Raises:
    ValueError: if either index list is empty.
  """
  if len(agent_inds) == 0 or len(verb_inds) == 0:
    raise ValueError('agent_inds and verb_inds must each contain at least one token index')

  out = {'mean_bw_vtokens': None, 'max_bw_vtokens': None, 'st_bw_vtokens': None}

  if len(verb_inds) == 1 and len(agent_inds) == 1:
    # Nothing to aggregate: compute the table once and reuse it for every combination.
    weights = gaw(sentence, attention, agent_inds[0], verb_inds[0])
    for tp in out.keys():
      out[tp] = {'mean_bw_atokens': weights, 'max_bw_atokens': weights,
                 'st_bw_atokens': weights}

  elif len(verb_inds) == 1:
    # Only the agent is split: the verb aggregation is irrelevant.
    ag_weights = [gaw(sentence, attention, agent_ind, verb_inds[0])
                  for agent_ind in agent_inds]

    ag_out = {'mean_bw_atokens': get_mean_weights(ag_weights),
              'max_bw_atokens': get_max_weights(ag_weights),
              'st_bw_atokens': ag_weights[0]}

    for tp in out.keys():
      out[tp] = ag_out

  elif len(agent_inds) == 1:
    # Only the verb is split: the agent aggregation is irrelevant.
    vb_weights = [gaw(sentence, attention, agent_inds[0], verb_ind)
                  for verb_ind in verb_inds]

    mean_vb_weights = get_mean_weights(vb_weights)
    max_vb_weights = get_max_weights(vb_weights)

    out['mean_bw_vtokens'] = {'mean_bw_atokens': mean_vb_weights, 'max_bw_atokens': mean_vb_weights,
                              'st_bw_atokens': mean_vb_weights}
    out['max_bw_vtokens'] = {'mean_bw_atokens': max_vb_weights, 'max_bw_atokens': max_vb_weights,
                             'st_bw_atokens': max_vb_weights}
    out['st_bw_vtokens'] = {'mean_bw_atokens': vb_weights[0], 'max_bw_atokens': vb_weights[0],
                            'st_bw_atokens': vb_weights[0]}

  else:
    # Both are split: aggregate over agent tokens per verb token first, then over verb tokens.
    counted_ag_weights = []
    for verb_ind in verb_inds:
      verb_ag = [gaw(sentence, attention, agent_ind, verb_ind) for agent_ind in agent_inds]
      counted_ag_weights.append([get_mean_weights(verb_ag), get_max_weights(verb_ag), verb_ag[0]])

    by_mean = [per_verb[0] for per_verb in counted_ag_weights]
    by_max = [per_verb[1] for per_verb in counted_ag_weights]
    by_first = [per_verb[2] for per_verb in counted_ag_weights]

    out['mean_bw_vtokens'] = {'mean_bw_atokens': get_mean_weights(by_mean),
                              'max_bw_atokens': get_mean_weights(by_max),
                              'st_bw_atokens': get_mean_weights(by_first)}
    out['max_bw_vtokens'] = {'mean_bw_atokens': get_max_weights(by_mean),
                             'max_bw_atokens': get_max_weights(by_max),
                             'st_bw_atokens': get_max_weights(by_first)}
    out['st_bw_vtokens'] = {'mean_bw_atokens': by_mean[0],
                            'max_bw_atokens': by_max[0],
                            'st_bw_atokens': by_first[0]}

  return out

1. mean по глаголу mean по агенсу
2. mean по глаголу max по агенсу
3. mean по глаголу first по агенсу
4. first по глаголу mean по агенсу
5. first по глаголу max по агенсу
6. first по глаголу first по агенсу
7. max по глаголу mean по агенсу
8. max по глаголу max по агенсу
9. max по глаголу first по агенсу

* если всего 1 токен глагола и 1 токен агенса, будет 9 одинаковых таблиц значений
* если 1 токен глагола и несколько агенса (или наоборот), будет 3 тройки одинаковых таблиц значений
* если и агенс, и глагол разделились на несколько токенов, будет 9 разных таблиц значений


In [14]:
def find_indexes(sentence, w):
    """Map selected words of a sentence to their token positions.

    The lower-cased sentence is split with a regex into words, digit runs and
    single other characters; every piece except a plain space is tokenized with
    the module-level `tokenizer` and its tokens are numbered consecutively
    starting at 1 (position 0 is left for the leading special token).
    NOTE(review): this assumes tokenizing piece by piece yields the same token
    sequence as tokenizing the whole sentence -- confirm for this tokenizer.

    Args:
        sentence: raw sentence text.
        w: collection of lower-cased words whose positions are wanted.

    Returns:
        dict {word: [token positions]} for every piece that is in `w`. If a word
        occurs more than once, the positions of its last occurrence win. A word
        that produces no tokens maps to an empty list.
    """
    sent = sentence.lower()
    # Raw string: same pattern as before, without invalid "\-" string escapes.
    words = re.findall(r'[а-яё\-]+|[a-z\-]+|[^а-яёa-z0-9\-]|[0-9\-]+', sent)
    next_index = 1
    word_indexes = {}
    for word in words:
        if word == ' ':
            continue
        # Only the token count is needed, so tokenize without running the model.
        n_tokens = len(tokenizer.tokenize(word))
        if word in w:
            word_indexes[word] = list(range(next_index, next_index + n_tokens))
        next_index += n_tokens
    return word_indexes

In [24]:
def get_matrix(df):
    """Build the nine attention tables (verb aggregation x target aggregation).

    Every row of `df` is run through the module-level `tokenizer` and `model`.
    The target and head words are cut out of the sentence using the character
    spans in `idx_target` / `idx_head`, mapped to token positions with
    `find_indexes`, and passed to `get_all_attention_weights`.

    Args:
        df: DataFrame with columns 'sentence', 'idx_target', 'idx_head'
            (both formatted as 'start-end') and 'role'.

    Returns:
        dict mapping 'vmean_amean', 'vmean_amax', 'vmean_ast', 'vmax_amean',
        'vmax_amax', 'vmax_ast', 'vst_amean', 'vst_amax', 'vst_ast' to a
        DataFrame with one row per (sentence, layer) and the columns
        text, target, role, verb, layer, h_0 ... h_11.

    Side effects:
        The nine frames are also published as module-level variables under the
        same names, for cells that refer to them directly. The intermediate
        accumulators are local and no longer overwrite globals such as
        `text`, `role` or `head`.
    """
    import torch  # local import: only needed to switch off gradient tracking

    verb_lists = ['mean_bw_vtokens', 'max_bw_vtokens', 'st_bw_vtokens']
    role_lists = ['mean_bw_atokens', 'max_bw_atokens', 'st_bw_atokens']
    base_columns = ['text', 'target', 'role', 'verb', 'layer']
    num_heads = 12  # number of h_<i> columns written to each frame

    # collected[v][a][column] -> list with one entry per (sentence, layer)
    collected = {v: {a: {name: [] for name in base_columns + ['head']} for a in role_lists}
                 for v in verb_lists}

    indexes = df.index

    for ind in tqdm(indexes, total=len(indexes)):
        sentence = df.loc[ind, 'sentence']
        inputs = tokenizer.encode_plus(sentence,  return_tensors='pt')
        with torch.no_grad():  # inference only; avoids building an autograd graph per sentence
            attention = model(inputs['input_ids'], token_type_ids=inputs['token_type_ids'])[-1]

        role_ind = df.loc[ind, 'idx_target'].split('-')
        verb_ind = df.loc[ind, 'idx_head'].split('-')

        role_token = sentence[int(role_ind[0]):int(role_ind[1])].lower()
        verb_token = sentence[int(verb_ind[0]):int(verb_ind[1])].lower()

        words_indexes = find_indexes(sentence, [role_token, verb_token])
        role_indexes = words_indexes[role_token]
        verb_indexes = words_indexes[verb_token]

        role_name = df.loc[ind, 'role']

        all_types_of_weights = get_all_attention_weights(' '.join([role_token, verb_token]), attention, role_indexes, verb_indexes)

        for v in verb_lists:
            for a in role_lists:
                table = all_types_of_weights[v][a]
                bucket = collected[v][a]
                for l in list(table):
                    head_weights = [list(table[l][h].values())[0] for h in list(table[l])]
                    # The (target, verb) label is the key of every per-head entry.
                    pair = list(table[0][0].keys())[0]

                    bucket['text'].append(sentence)
                    bucket['target'].append(pair[0])
                    bucket['verb'].append(pair[1])
                    bucket['layer'].append(l)
                    bucket['head'].append(head_weights)
                    bucket['role'].append(role_name)

    output_dfs = {}
    for v in verb_lists:
        for a in role_lists:
            bucket = collected[v][a]
            dt = {col: bucket[col] for col in base_columns}
            for i in range(num_heads):
                dt['h_'+str(i)] = [h[i] for h in bucket['head']]
            frame = pd.DataFrame.from_dict(dt, orient='index').transpose()
            output_dfs['v'+v.split('_')[0]+'_a'+a.split('_')[0]] = frame

    # Backward compatibility: earlier versions exposed each frame as a module global.
    globals().update(output_dfs)

    return output_dfs

In [25]:
# Build the nine weight tables from role_df. Despite its name, weighted_df_list
# is a dict keyed by table name, so .keys() lists those names.
weighted_df_list = get_matrix(role_df)
weighted_df_list.keys()

  0%|          | 0/111 [00:00<?, ?it/s]

9


In [None]:
# the tables are named: vmean_amean, vmean_amax, vmean_ast, vmax_amean, vmax_amax,
# vmax_ast, vst_amean, vst_amax, vst_ast
# (v -- how values are combined across the verb's tokens, a -- how across the target's tokens)

# Среднее по столбцам и головам

In [125]:
from google.colab import files, drive
import pandas as pd
import re
from tqdm.auto import tqdm


# Repeats the Drive mount done earlier in the notebook, so this section can be run on its own.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [126]:
from statistics import mean

In [127]:
# Role labels; each one is also used below as a directory name and as the prefix of a <role>_df variable.
roles = ['agent', 'patient', 'causator', 'experiencer', 'addressee', 'theme', 'place', 'time', 'posessor', 'benefactive', 'goal']

In [128]:
# only vmean_amean is shown below
dfs_names = ['vmean_amean', 'vmean_amax', 'vmean_ast', 
       'vmax_amean', 'vmax_amax', 'vmax_ast', 
       'vst_amean', 'vst_amax', 'vst_ast']

In [129]:
# Load the vmean_amean table of every role. The frames are kept in an explicit dict and
# also published as <role>_df variables (e.g. agent_df) through globals() rather than by
# writing into locals().
role_dfs = {}
for role in roles:
  with open('./drive/My Drive/workshop/'+role+'/vmean_amean.csv', encoding='utf-8', errors='ignore') as csvfile:
    role_dfs[role] = pd.read_csv(csvfile, sep='\t')
  globals()[role+'_df'] = role_dfs[role]


In [130]:
# Preview of the table loaded for the 'agent' role.
agent_df.head(3)

Unnamed: 0,text,role,verb,layer,h_0,h_1,h_2,h_3,h_4,h_5,h_6,h_7,h_8,h_9,h_10,h_11
0,«Школа злословия» учит прикусить язык,школа,учит,0,0.294945,0.194346,0.065336,0.029518,0.024246,0.083016,0.1283298,0.013612,0.281211,0.196036,0.011122,0.002125
1,«Школа злословия» учит прикусить язык,школа,учит,1,0.106238,0.010475,0.022317,0.031149,0.00899,0.000449,1.641261e-23,6.9e-05,0.002085,0.304993,0.004542,6e-05
2,«Школа злословия» учит прикусить язык,школа,учит,2,0.007494,0.454758,0.004607,0.025094,0.079971,0.089657,0.003836624,0.028086,0.000386,0.002589,0.001305,0.51053


In [131]:
def get_mean_df(df):
  """Average every head column per layer.

  Args:
    df: DataFrame with a 'layer' column and the columns 'h_0' ... 'h_11'.

  Returns:
    DataFrame with one row per layer 0..11 and the columns 'h_0' ... 'h_11'
    followed by 'layer'; each head cell is the mean of that column over the
    rows of `df` belonging to the layer.

  Raises:
    statistics.StatisticsError: if a layer in 0..11 has no rows in `df`.
  """
  head_cols = ['h_'+str(j) for j in range(12)]
  data = {col_name: [] for col_name in head_cols}
  data['layer'] = []

  for i in range(12): # layer
    layer_df = df[df['layer']==i]  # filtered once per layer, not once per head
    for col_name in head_cols: # head
      data[col_name].append(mean(layer_df[col_name]))
    data['layer'].append(i)
  return pd.DataFrame(data)

In [132]:
get_mean_df(agent_df) # mean vmean_amean values for the agent role

Unnamed: 0,h_0,h_1,h_2,h_3,h_4,h_5,h_6,h_7,h_8,h_9,h_10,h_11,layer
0,0.047319,0.048895,0.045843,0.036678,0.054891,0.022235,0.069766,0.032892,0.044472,0.254368,0.039785,0.035032,0
1,0.017645,0.021468,0.032753,0.045571,0.069057,0.022084,0.254743,0.001133,0.034846,0.110258,0.028279,0.014232,1
2,0.136747,0.22101,0.020686,0.04305,0.036177,0.057384,0.046406,0.002624,0.01974,0.045443,0.018019,0.028472,2
3,0.076799,0.042025,0.019346,0.004551,0.021582,0.040796,0.017527,0.041998,0.042419,0.041,0.042268,0.003176,3
4,0.174535,0.068397,0.003139,0.057167,0.028035,0.036152,0.134186,0.011021,0.042285,0.030683,0.032476,0.040502,4
5,0.025268,0.030201,0.030754,0.036888,0.012263,0.019086,0.121729,0.048164,0.019573,0.014198,0.298031,0.001141,5
6,0.022245,0.023952,0.008749,0.060213,0.014328,0.067186,0.057389,0.208774,0.024442,0.04368,0.035162,0.017654,6
7,0.010092,0.139206,0.04219,0.009427,0.041035,0.018738,0.084379,0.251632,0.027286,0.039223,0.091543,0.019085,7
8,0.034236,0.043002,0.057029,0.065193,0.022048,0.053745,0.069754,0.004848,0.181014,0.045214,0.091212,0.096026,8
9,0.004576,0.040481,0.036437,0.033145,0.032113,0.030067,0.018057,0.044663,0.025099,0.051844,0.040112,0.258148,9


In [135]:
# Average every role's table once up front; recomputing it for each
# (layer, head, role) combination repeats identical work 12 * 12 times per role.
mean_dfs = {role_name: get_mean_df(globals()[role_name+'_df']) for role_name in roles}

# scores_layer_head[layer][head] = [highest mean score, role that reached it]
scores_layer_head = {}
for i in tqdm(range(12)):  # layer
  scores_layer_head[i] = {}
  for j in range(12):  # head
    # Row i is layer i and column j is h_j in the frame returned by get_mean_df.
    scores = [mean_dfs[role_name].iloc[i, j] for role_name in roles]
    max_score = max(scores)
    role_with_max_score = roles[scores.index(max_score)]
    scores_layer_head[i][j] = [max_score, role_with_max_score]

  0%|          | 0/12 [00:00<?, ?it/s]

In [152]:
# One column per head; each column lists that head's best score for layers 0..11.
data_scores = {
    'head_'+str(head_idx): [scores_layer_head[layer_idx][head_idx][0] for layer_idx in range(12)]
    for head_idx in range(12)
}

In [153]:
# One column per head; each column lists the best-scoring role for layers 0..11.
data_roles = {
    'head_'+str(head_idx): [scores_layer_head[layer_idx][head_idx][1] for layer_idx in range(12)]
    for head_idx in range(12)
}

In [155]:
# Rows follow the order of the per-head lists (layers), columns are heads; values are the winning scores.
all_scores_df = pd.DataFrame(data_scores)

In [156]:
# Same layout as the scores frame, holding the name of the winning role instead of its score.
all_roles_df = pd.DataFrame(data_roles)

In [169]:
import pandas as pd
import matplotlib.pyplot as plt  
from matplotlib import colors

def b_g(s, cmap='PuBu', low=0, high=0):
    """Styler callback: colour one column by the matching column of all_scores_df.

    Args:
        s: the column (Series) being styled; only its name is used, to pick the
            column with the same name from the module-level all_scores_df.
        cmap: name of the Matplotlib colormap.
        low: fraction of the value range by which the lower colour bound is extended.
        high: fraction of the value range by which the upper colour bound is extended.

    Returns:
        list of 'background-color: #rrggbb' strings, one per row.
    """
    a = all_scores_df.loc[:,s.name].copy()
    rng = a.max() - a.min()
    norm = colors.Normalize(a.min() - (rng * low),
                        a.max() + (rng * high))
    normed = norm(a.values)
    # plt.get_cmap replaces plt.cm.get_cmap, which newer Matplotlib releases removed.
    c = [colors.rgb2hex(x) for x in plt.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

# Show role names, shaded by the corresponding scores.
colored_df = all_roles_df.style.apply(b_g,cmap='PuBu')

In [170]:
# Render the styled table of winning roles.
colored_df

Unnamed: 0,head_0,head_1,head_2,head_3,head_4,head_5,head_6,head_7,head_8,head_9,head_10,head_11
0,goal,place,causator,causator,patient,goal,benefactive,causator,place,experiencer,place,posessor
1,experiencer,benefactive,place,agent,patient,posessor,agent,theme,experiencer,experiencer,place,experiencer
2,experiencer,experiencer,agent,time,experiencer,time,patient,experiencer,posessor,goal,experiencer,goal
3,experiencer,experiencer,patient,goal,patient,causator,agent,goal,causator,posessor,benefactive,theme
4,experiencer,patient,theme,experiencer,causator,posessor,experiencer,posessor,causator,time,experiencer,experiencer
5,posessor,posessor,posessor,experiencer,theme,causator,experiencer,patient,goal,experiencer,experiencer,theme
6,agent,agent,posessor,patient,posessor,patient,posessor,experiencer,experiencer,experiencer,place,patient
7,experiencer,experiencer,experiencer,experiencer,patient,agent,agent,agent,experiencer,experiencer,experiencer,goal
8,addressee,experiencer,addressee,patient,experiencer,patient,experiencer,posessor,experiencer,posessor,experiencer,patient
9,posessor,experiencer,experiencer,causator,experiencer,posessor,posessor,patient,experiencer,patient,experiencer,experiencer


In [172]:
# Write the styled table to an Excel file in the current working directory.
colored_df.to_excel('colored_role_df.xlsx')