<a href="https://colab.research.google.com/github/yashgupta-7/689_project/blob/main/winobias_attention_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
%cd /content/gdrive/MyDrive/cs689_project
!ls

/content/gdrive/MyDrive/cs689_project
professions_data  results  winobias_data  winobias.py  winogender_data


In [5]:
!pip install transformers==3.0.2

Collecting transformers==3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[K     |████████████████████████████████| 769 kB 5.5 MB/s 
Collecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 29.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.9 MB/s 
Collecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 39.8 MB/s 
Installing collected packages: tokenizers, sentencepiece, sacremoses, transformers
Successfully installed sacremoses-0.0.46 sentencepiece-0.1.96 tokenizers-0.8.1rc1 transformers-3.0.2


In [6]:
model_name = "gpt2"
device = "cuda"
out_dir = "."
random_weights = False

In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm import tqdm
from functools import partial

class Model():
  def __init__(self,
               device='cpu',
               output_attentions=False,
               random_weights=False,
               gpt2_version='gpt2'):
        super()
        self.device = device
        self.model = GPT2LMHeadModel.from_pretrained(gpt2_version, output_attentions=output_attentions)
        self.model.eval()
        self.model.to(device)
        if random_weights:
            print('Randomizing weights')
            self.model.init_weights()
        tokenizer = GPT2Tokenizer.from_pretrained(gpt2_version)

        self.attention_layer = lambda layer: self.model.transformer.h[layer].attn
        self.word_emb_layer = self.model.transformer.wte
        self.neuron_layer = lambda layer: self.model.transformer.h[layer].mlp

        self.num_layers = self.model.config.num_hidden_layers
        self.num_neurons = self.model.config.hidden_size
        self.num_heads = self.model.config.num_attention_heads

  def get_probabilities_for_examples_multitoken(self, context, candidates):
        """
        Return probability of multi-token candidates given context.
        Prob of each candidate is normalized by number of tokens.
        Args:
            context: Tensor of token ids in context
            candidates: list of list of token ids in each candidate
        Returns: list containing probability for each candidate
        """
        # TODO: Combine into single batch
        mean_probs = []
        context = context.tolist()
        for candidate in candidates:
            token_log_probs = []
            combined = context + candidate
            # Exclude last token position when predicting next token
            batch = torch.tensor(combined[:-1]).unsqueeze(dim=0).to(self.device)
            # Shape (batch_size, seq_len, vocab_size)
            logits = self.model(batch)[0]
            # Shape (seq_len, vocab_size)
            log_probs = F.log_softmax(logits[-1, :, :], dim=-1)
            context_end_pos = len(context) - 1
            continuation_end_pos = context_end_pos + len(candidate)
            # TODO: Vectorize this
            # Up to but not including last token position
            for i in range(context_end_pos, continuation_end_pos):
                next_token_id = combined[i+1]
                next_token_log_prob = log_probs[i][next_token_id].item()
                token_log_probs.append(next_token_log_prob)
            mean_token_log_prob = statistics.mean(token_log_probs)
            mean_token_prob = math.exp(mean_token_log_prob)
            mean_probs.append(mean_token_prob)
        return mean_probs

  def attention_intervention_experiment(self, intervention, effect):
        """
        Run one full attention intervention experiment
        measuring indirect or direct effect.
        """
        # E.g. The doctor asked the nurse a question. He
        x = intervention.base_strings_tok[0]
        # E.g. The doctor asked the nurse a question. She
        x_alt = intervention.base_strings_tok[1]

        if effect == 'indirect':
            input = x_alt  # Get attention for x_alt
        elif effect == 'direct':
            input = x  # Get attention for x
        else:
            raise ValueError(f"Invalid effect: {effect}")
        batch = input.clone().detach().unsqueeze(0).to(self.device)
        attention_override = self.model(batch)[-1]

        batch_size = 1
        seq_len = len(x)
        seq_len_alt = len(x_alt)
        assert seq_len == seq_len_alt

        with torch.no_grad():

            candidate1_probs_head = torch.zeros((self.num_layers, self.num_heads))
            candidate2_probs_head = torch.zeros((self.num_layers, self.num_heads))
            candidate1_probs_layer = torch.zeros(self.num_layers)
            candidate2_probs_layer = torch.zeros(self.num_layers)

            if effect == 'indirect':
                context = x
            else:
                context = x_alt

            # Intervene at every layer and head by overlaying attention induced by x_alt
            model_attn_override_data = [] # Save layer interventions for model-level intervention later
            for layer in range(self.num_layers):
                layer_attention_override = attention_override[layer]
                attention_override_mask = torch.ones_like(layer_attention_override, dtype=torch.uint8)
                layer_attn_override_data = [{
                    'layer': layer,
                    'attention_override': layer_attention_override,
                    'attention_override_mask': attention_override_mask
                }]
                candidate1_probs_layer[layer], candidate2_probs_layer[layer] = self.attention_intervention(
                    context=context,
                    outputs=intervention.candidates_tok,
                    attn_override_data = layer_attn_override_data)
                model_attn_override_data.extend(layer_attn_override_data)
                for head in range(self.num_heads):
                    attention_override_mask = torch.zeros_like(layer_attention_override, dtype=torch.uint8)
                    attention_override_mask[0][head] = 1 # Set mask to 1 for single head only
                    head_attn_override_data = [{
                        'layer': layer,
                        'attention_override': layer_attention_override,
                        'attention_override_mask': attention_override_mask
                    }]
                    candidate1_probs_head[layer][head], candidate2_probs_head[layer][head] = self.attention_intervention(
                        context=context,
                        outputs=intervention.candidates_tok,
                        attn_override_data=head_attn_override_data)

            # Intervene on entire model by overlaying attention induced by x_alt
            candidate1_probs_model, candidate2_probs_model = self.attention_intervention(
                context=context,
                outputs=intervention.candidates_tok,
                attn_override_data=model_attn_override_data)

        return candidate1_probs_head, candidate2_probs_head, candidate1_probs_layer, candidate2_probs_layer,\
            candidate1_probs_model, candidate2_probs_model

  def attention_intervention(self,
                               context,
                               outputs,
                               attn_override_data):
        """ Override attention values in specified layer
        Args:
            context: context text
            outputs: candidate outputs
            attn_override_data: list of dicts of form:
                {
                    'layer': <index of layer on which to intervene>,
                    'attention_override': <values to override the computed attention weights.
                           Shape is [batch_size, num_heads, seq_len, seq_len]>,
                    'attention_override_mask': <indicates which attention weights to override.
                                Shape is [batch_size, num_heads, seq_len, seq_len]>
                }
        """

        def intervention_hook(module, input, outputs, attn_override, attn_override_mask):
            attention_override_module = (AttentionOverride)(module, attn_override, attn_override_mask)
            return attention_override_module(*input)

        with torch.no_grad():
            hooks = []
            for d in attn_override_data:
                attn_override = d['attention_override']
                attn_override_mask = d['attention_override_mask']
                layer = d['layer']
                hooks.append(self.attention_layer(layer).register_forward_hook(
                    partial(intervention_hook,
                            attn_override=attn_override,
                            attn_override_mask=attn_override_mask)))

            new_probabilities = self.get_probabilities_for_examples_multitoken(
                context,
                outputs)

            for hook in hooks:
                hook.remove()

            return new_probabilities

In [8]:
class Intervention():
       def __init__(self,
                  tokenizer,
                  base_string: str,
                  substitutes: list,
                  candidates: list,
                  device='cpu'):
          super()
          self.device = device
          self.enc = tokenizer

          # First item should be neutral, others tainted
          self.base_strings = [base_string.format(s)
                              for s in substitutes]
          # Tokenized bases
          self.base_strings_tok = [
              self.enc.encode(s,
                              add_special_tokens=False)
              for s in self.base_strings
          ]
          # print(self.base_strings, self.base_strings_tok)
          self.base_strings_tok = torch.LongTensor(self.base_strings_tok)\
                                      .to(device)
          self.position = base_string.split().index('{}')
          self.candidates = []
          for c in candidates:
              # 'a ' added to input so that tokenizer understand that first word follows a space.
              tokens = self.enc.tokenize(
                  'a ' + c)[1:]
              self.candidates.append(tokens)

          self.candidates_tok = [self.enc.convert_tokens_to_ids(tokens)
                                for tokens in self.candidates]

def construct_interventions(base_sent, professions, tokenizer, DEVICE):
    interventions = {}
    unused = 0
    for p in professions:
      try:
        interventions[p] = Intervention(tokenizer, base_sent, [p, "man", "woman"], ["he", "she"], device=DEVICE)
      except:
        unused += 1
    print("Unused Professions", unused)
    return interventions

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm import tqdm
from functools import partial

model = Model(output_attentions=True, gpt2_version=model_name, device=device, random_weights=random_weights)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [10]:
import os, re
class WinobiasExample():

    def __init__(self, base_string, female_pronoun, male_pronoun, female_occupation, male_occupation,
                 female_occupation_continuation, male_occupation_continuation):
        self.base_string = base_string
        self.female_pronoun = female_pronoun
        self.male_pronoun = male_pronoun
        self.female_occupation = female_occupation
        self.male_occupation = male_occupation
        self.female_occupation_continuation = female_occupation_continuation
        self.male_occupation_continuation = male_occupation_continuation

    def to_intervention(self, tokenizer):
        return Intervention(
            tokenizer=tokenizer,
            base_string=self.base_string,
            substitutes=[self.female_pronoun, self.male_pronoun],
            candidates=[self.female_occupation_continuation, self.male_occupation_continuation]
        )

    def __str__(self):
        return inspect.cleandoc(f"""
            base_string: {self.base_string}
            female_pronoun: {self.female_pronoun}
            male_pronoun: {self.male_pronoun}
            female_occupation: {self.female_occupation}
            male_occupation: {self.male_occupation}
            female_occupation_continuation: {self.female_occupation_continuation}
            male_occupation_continuation: {self.male_occupation_continuation}
        """)

    def __repr__(self):
        return str(self).replace('\n', ' ')

def _parse_row(row, occupations):
    _, sentence = row.strip().split(' ', 1)
    occupation = None
    for occ in occupations:
        if f'[the {occ.lower()}]' in sentence.lower():
            assert occupation is None
            occupation = occ.lower()
    assert occupation is not None

    pronoun_groups = [ # First element is female, second is male
        ('she', 'he'), # nominative
        ('her', 'his') # possessive
    ]

    num_matches = 0
    substitutes = None
    for pronouns in pronoun_groups:
        pattern = '|'.join(r'\[' + p + r'\]' for p in pronouns) # matches '[he]', '[she]', etc.
        pronoun_matches = re.findall(pattern, sentence)
        assert len(pronoun_matches) <= 1
        if pronoun_matches:
            num_matches += 1
            pronoun_match = pronoun_matches[0]
            context, continuation = sentence.split(pronoun_match)
            context = context.replace('[', '').replace(']', '')
            context = context.strip()
            assert '[' not in continuation
            continuation = continuation.strip()
            substitutes = pronouns
    assert num_matches == 1
    base_string = context + ' {}'

    return base_string, substitutes, continuation, occupation

def load_examples(path, verbose=False):
    with open(os.path.join(path, 'female_occupations.txt')) as f:
        female_occupations = [row.lower().strip() for row in f]
    with open(os.path.join(path, 'male_occupations.txt')) as f:
        male_occupations = [row.lower().strip() for row in f]
    occupations = female_occupations + male_occupations

    fname = f'pro_stereotyped_type1.txt.dev'

    with open(os.path.join(path, fname)) as f:
        examples = []
        row_pair = []
        skip_count = 0
        for row in f:
            row_pair.append(row)
            if len(row_pair) == 2:
                skip = False
                if row_pair[0].count('[') != 2 or row_pair[1].count('[') != 2: # Multiple pronouns
                    skip = True
                elif '[him]' in row_pair[0] + row_pair[1]: # Objective pronoun, almost always at end of sentence
                    skip = True
                else:
                    base_string1, substitutes1, continuation1, occupation1 = _parse_row(row_pair[0], occupations)
                    base_string2, substitutes2, continuation2, occupation2 = _parse_row(row_pair[1], occupations)
                    if base_string1 != base_string2 or substitutes1 != substitutes2:
                        skip = True
                if skip:
                    if verbose:
                        print('Skipping: ', row_pair)
                    skip_count += 1
                    row_pair = []
                    continue
                base_string = base_string1
                assert substitutes1 == substitutes2
                female_pronoun, male_pronoun = substitutes1
                assert len(continuation1) > 0 and len(continuation2) > 0 and continuation1 != continuation2
                assert len(occupation1) > 0 and len(occupation2) > 0 and occupation1 != occupation2
                if occupation1 in female_occupations:
                    female_occupation = occupation1
                    female_occupation_continuation = continuation1
                    male_occupation = occupation2
                    male_occupation_continuation = continuation2
                    assert occupation2 in male_occupations
                else:
                    male_occupation = occupation1
                    male_occupation_continuation = continuation1
                    female_occupation = occupation2
                    female_occupation_continuation = continuation2
                    assert occupation1 in male_occupations
                    assert occupation2 in female_occupations
                examples.append(WinobiasExample(base_string, female_pronoun, male_pronoun, female_occupation, male_occupation,
                 female_occupation_continuation, male_occupation_continuation))
                row_pair = []
        assert row_pair == []
    print(f'Loaded {len(examples)} pairs. Skipped {skip_count} pairs.')
    return examples

import torch
examples = load_examples('winobias_data/')
json_data = {'model_version': model_name,
            'do_filter': False,
            'split': 'dev',
            'num_examples_loaded': len(examples)}
json_data['num_examples_analyzed'] = len(examples)
interventions = [ex.to_intervention(tokenizer) for ex in examples]

Loaded 160 pairs. Skipped 38 pairs.


In [11]:
def perform_intervention(intervention, model, effect_types=('indirect', 'direct')):
    """Perform intervention and return results for specified effects"""
    x = intervention.base_strings_tok[0]  # E.g. The doctor asked the nurse a question. She
    x_alt = intervention.base_strings_tok[1]  # E.g. The doctor asked the nurse a question. He

    with torch.no_grad():
        candidate1_base_prob, candidate2_base_prob = model.get_probabilities_for_examples_multitoken(
            x,
            intervention.candidates_tok)
        candidate1_alt_prob, candidate2_alt_prob = model.get_probabilities_for_examples_multitoken(
            x_alt,
            intervention.candidates_tok)

    candidate1 = ' '.join(intervention.candidates[0]).replace('Ġ', '')
    candidate2 = ' '.join(intervention.candidates[1]).replace('Ġ', '')

    odds_base = candidate2_base_prob / candidate1_base_prob
    odds_alt = candidate2_alt_prob / candidate1_alt_prob
    total_effect = (odds_alt - odds_base) / odds_base

    results = {
        'base_string1': intervention.base_strings[0],
        'base_string2': intervention.base_strings[1],
        'candidate1': candidate1,
        'candidate2': candidate2,
        'candidate1_base_prob': candidate1_base_prob,
        'candidate2_base_prob': candidate2_base_prob,
        'odds_base': odds_base,
        'candidate1_alt_prob': candidate1_alt_prob,
        'candidate2_alt_prob': candidate2_alt_prob,
        'odds_alt': odds_alt,
        'total_effect': total_effect,
    }

    for effect_type in effect_types:
        candidate1_probs_head, candidate2_probs_head, candidate1_probs_layer, candidate2_probs_layer,\
            candidate1_probs_model, candidate2_probs_model = model.attention_intervention_experiment(
            intervention, effect_type)
        odds_intervention_head = candidate2_probs_head / candidate1_probs_head
        odds_intervention_layer = candidate2_probs_layer / candidate1_probs_layer
        odds_intervention_model = candidate2_probs_model / candidate1_probs_model
        effect_head = (odds_intervention_head - odds_base) / odds_base
        effect_layer = (odds_intervention_layer - odds_base) / odds_base
        effect_model = (odds_intervention_model - odds_base) / odds_base

        results[effect_type + "_odds_head"] = odds_intervention_head.tolist()
        results[effect_type + "_effect_head"] = effect_head.tolist()
        results[effect_type + "_effect_layer"] = effect_layer.tolist()
        results[effect_type + "_effect_model"] = effect_model

    return results

def perform_interventions(interventions, model, effect_types=('indirect', 'direct')):
    """Perform multiple interventions"""
    results_list = []
    for intervention in tqdm(interventions):
        results = perform_intervention(intervention, model, effect_types)
        results_list.append(results)
    return results_list

import torch
import torch.nn.functional as F
import numpy as np
from datetime import datetime
import os
from copy import deepcopy
import statistics
import math
import torch.nn as nn

class AttentionOverride(nn.Module):
    """A copy of `modeling_gpt2.Attention` class, but with overridden attention values"""

    def __init__(self, attention, attn_override, attn_override_mask):
        """
        Args:
            attention: instance of modeling_gpt2.Attention from which variables will be
                       copied.
            attn_override: values to override the computed attention weights.
                           Shape is [num_heads, seq_len, seq_len]
            attn_override_mask: indicates which attention weights to override.
                                Shape is [num_heads, seq_len, seq_len]
        """
        super(AttentionOverride, self).__init__()
        # Copy values from attention
        self.output_attentions = True #attention.output_attentions
        self.register_buffer("bias", attention._buffers["bias"])
        self.n_head = attention.n_head
        self.split_size = attention.split_size
        self.scale = attention.scale
        self.c_attn = attention.c_attn
        self.c_proj = attention.c_proj
        self.attn_dropout = attention.attn_dropout
        self.resid_dropout = attention.resid_dropout
        # Set attention override values
        self.attn_override = attn_override
        self.attn_override_mask = attn_override_mask

    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        nd, ns = w.size(-2), w.size(-1)
        b = self.bias[:, :, ns - nd : ns, :ns]
        w = w * b - 1e4 * (1 - b)

        if attention_mask is not None:
            # Apply the attention mask
            w = w + attention_mask

        w = nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)

        # Mask heads if we want to
        if head_mask is not None:
            w = w * head_mask

        # attn_override and attn_override_mask are of shape
        # (batch_size, num_heads, override_seq_len, override_seq_len)
        # where override_seq_len is the length of subsequence for which attention is
        # being overridden.
        override_seq_len = self.attn_override_mask.shape[-1]
        w[:, :, :override_seq_len, :override_seq_len] = torch.where(
            self.attn_override_mask,
            self.attn_override,
            w[:, :, :override_seq_len, :override_seq_len],
        )

        outputs = [torch.matmul(w, v)]
        if self.output_attentions:
            outputs.append(w)
        return outputs

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
        else:
            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        if layer_past is not None:
            past_key, past_value = (
                layer_past[0].transpose(-2, -1),
                layer_past[1],
            )  # transpose back cf below
            key = torch.cat((past_key, key), dim=-1)
            value = torch.cat((past_value, value), dim=-2)
        present = torch.stack(
            (key.transpose(-2, -1), value)
        )  # transpose to have same shapes for stacking

        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
        a = attn_outputs[0]

        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)

        outputs = [a, present] + attn_outputs[1:]
        return outputs  # a, present, (attentions)

results = perform_interventions(interventions, model)

100%|██████████| 160/160 [33:04<00:00, 12.40s/it]


In [12]:
from pandas import DataFrame

json_data['mean_total_effect'] = DataFrame(results).total_effect.mean()
json_data['mean_model_indirect_effect'] = DataFrame(results).indirect_effect_model.mean()
json_data['mean_model_direct_effect'] = DataFrame(results).direct_effect_model.mean()
filter_name = 'unfiltered'
if random_weights:
    gpt2_version += '_random'
fname = f"winobias_data/attention_intervention_{model_name}_{filter_name}_{split}.json"
json_data['results'] = results
with open(fname, 'w') as f:
    json.dump(json_data, f)

NameError: ignored