In [13]:
import pandas as pd
import random
import numpy as np
from tqdm import tqdm
import ipdb
import re
from collections import Counter

import matplotlib.pyplot as plt
# import mplcursors
import seaborn as sns
%matplotlib inline
sns.set(style='darkgrid', context='notebook', rc={'figure.figsize':(14,10)}, font_scale=2)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('chained_assignment',None)

# Set random seeds for reproducibility on a specific machine
random.seed(1)
np.random.seed(1)
np.random.RandomState(1)

from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    set_seed
)
import torch

Control has the predictions of the best performing model at the task.

Compare and contrast that with the model's performance I guess

In [2]:
all_df = pd.read_csv('../../data/cong_data.tsv', sep='\t')

What is the specificity of the training distribution?

In [3]:
np.round(np.mean(all_df[all_df['Split']=='train']['Specificity']), 2), np.round(np.std(all_df[all_df['Split']=='train']['Specificity']), 2)

(3.49, 0.53)

In [4]:
control = pd.read_csv('qual/predictions-control.tsv', sep='\t')

In [5]:
df = pd.read_csv('qual/predictions-dev-neg48.tsv', sep='\t')

In [6]:
df['control'] = control['pred']
df['pred'] = df['pred'].astype(int)
df['control'] = df['control'].astype(int)

# Where does prediction change?

In [7]:
df[(df['pred']==0) & (df['control']==1)].loc[:, ['tweet', 'group', 'Specificity', 'affect', 'spec', 'pred', 'control']]

Unnamed: 0,tweet,group,Specificity,affect,spec,pred,control
3,"With the 2020 Census fast approaching, I joined @SenKamalaHarris and @senatorcarper in introducing legislation to ensure the approximately 10 million Americans who identify as LGBTQ are properly counted and represented in Census data collection efforts.",1,4.325275,1,1,0,1
4,"Rest In Peace, @senjohnmccain. Thank you for serving our country with dignity and honor, as a Navy Pilot, a POW, elected official and a presidential candidate. You were a patriot and a true American hero. My deepest condolences and prayers to his family and loved ones.",0,4.142287,1,1,0,1
8,"1\/3 Well said, @senatorlankford. We are $21 trillion in debt. Since 1974, we have passed all of our spending bills only 4 times! And, in the past 20 years, we have only passed a budget 11 times. We need to fix our broken budget and approps process.",1,3.420717,1,100,0,1
9,"Today the House approved @sendougjones' & my legislation to require the review, declassification & release of government records related to unsolved criminal civil rights cases. -->",0,4.088381,1,1,0,1
20,"Selma, AL has a special place in my heart. I was born here, and it’s also where my friend & civil rights hero @repjohnlewis & other activists crossed the Edmund Pettus Bridge while facing brutal opposition to their peaceful pursuit for civil rights.",1,4.396554,1,1,0,1
26,"Thank you, .@senschumer for thinking of including the Northern Marianas in your amendment to the disaster recovery appropriation bill. Our sincere gratitude for your help.",1,3.85178,1,100,0,1
28,"Senator @chriscoons just asked Attorney General Barr: ""What if a foreign adversary offers a presidential candidate dirt on a competitor in 2020. Do you agree with me the campaign should immediately contact the FBI?"" Barr sat silent, unable to give a prompt response.",1,4.053232,0,1,0,1
33,"Thank you to @senatortester for introducing our bill, the Veteran HOUSE Act, in the Senate. Together, we can ensure veterans with OTH discharges, who are more likely to experience homelessness, can access the wraparound services they need.",1,4.195439,1,1,0,1
38,"Incredibly honored to host @speakerpelosi in Colorado today, along with CO Insurance Commissioner @mike_conway14 for a robust policy discussion on efforts being done at the state level and in the House of Representatives to #ProtectOurCare",1,4.444076,1,1,0,1
41,"As @speakerpelosi says, the times have found each and every one of us to Defend our Democracy For The People. Worth reading every line.",1,3.217054,1,100,0,1


In [8]:
Counter(df['spec'])

Counter({1: 56, 100: 212, 0: 38})

In [9]:
def specificity_calc(x):
   '''
   0 if specificity is less than 3, and 1 if specificity is greater than 4
   '''

   if (x-3.0) < 0.00001:
      return 0
   elif (x-4.0) > 0.00001:
      return 1
   else:
      return 100

all_df['affect'] = all_df.apply(lambda x: 1 if (x['Feeling']=='warm' or x['Behavior']=='app') else 0, axis=1)

all_df['spec'] = all_df['Specificity'].apply(lambda x: specificity_calc(x))
Counter(all_df['spec'])

Counter({100: 1971, 1: 579, 0: 483})

In [10]:
Counter(all_df[all_df['spec']==100]['affect']), Counter(all_df[all_df['spec']==1]['affect']), Counter(all_df[all_df['spec']==0]['affect'])

(Counter({1: 1451, 0: 520}),
 Counter({1: 477, 0: 102}),
 Counter({1: 353, 0: 130}))

In [14]:
# Total Intervention hook function
def intervention(h_out, P, ws, alpha):
    '''
    Perform positive or negative intervention on all tokens
    '''
    # Take out the cls token
    h_tokens = h_out[0][:,1:,:]

    # AlterRep code starts here
    signs = torch.sign(h_tokens@ws.T).long()

    # h_r component
    proj = (h_tokens@ws.T)
    if alpha>=0:
        proj = proj * signs
    else:
        proj = proj * (-signs)
    h_r = proj@ws*np.abs(alpha)

    # Get vector only in the direction of perpendicular to decision boundary
    h_n = h_tokens@P

    # Now pushing it either in positive or negative intervention direction
    h_alter = h_n + h_r

    # Return h_alter concatenated with the cls token
    return (torch.cat((h_out[0][:,:1,:], h_alter), dim=1),)

from inlp.debias import debias_by_specific_directions

with open("../reps_affect/Ws.layer=11.seed=1.npy", "rb") as f:
    Ws = np.load(f)

        
# Reduce Ws to number of classifiers you want to set it to
Ws = Ws[:32:]

# Now derive P from Ws
list_of_ws = [np.array([Ws[i, :]]) for i in range(Ws.shape[0])]
P = debias_by_specific_directions(directions=list_of_ws, input_dim=Ws.shape[1])

Ws_aff = torch.tensor(Ws/np.linalg.norm(Ws, keepdims = True, axis = 1)).to(torch.float32)
P_aff = torch.tensor(P).to(torch.float32)

# Insert newaxis for 1 classifier edge case
if len(Ws_aff.shape) == 1:
    Ws_aff = Ws_aff[np.newaxis,:]

## What does the intervention do?

Here I want to apply the intervention on individual tokens, and see if it makes them more of a certain affect, or more general

In [15]:
text = "Happy <mask> @corybooker! I got you a new bill: #IIOA"

In [16]:
set_seed(1)
tokenizer = AutoTokenizer.from_pretrained('../saved_models/bertweet-base_proto_0_seed_1_epoch_3/', use_fast=True)
model = AutoModelForMaskedLM.from_pretrained('../saved_models/bertweet-base_proto_0_seed_1_epoch_3/')
input_dict=tokenizer(text, return_tensors='pt')
logits = torch.log_softmax(model(**input_dict)['logits'], dim=2)
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

Some weights of the model checkpoint at ../saved_models/bertweet-base_proto_0_seed_1_epoch_3/ were not used when initializing RobertaForMaskedLM: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ../saved_models/bertweet-base_proto_0_seed_1_epoch_3/ and are newly initialized: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.

['/status/@@',
 'ousem@@',
 'sworn',
 'LYR@@',
 'Timo@@',
 'giar@@',
 'opport@@',
 'Narr@@',
 '#tvtime',
 'Eup@@']

## Vanilla model use here

In [17]:
set_seed(1)
tokenizer2 = AutoTokenizer.from_pretrained('vinai/bertweet-base', use_fast=True)
model2 = AutoModelForMaskedLM.from_pretrained('vinai/bertweet-base')
input_dict=tokenizer2(text, return_tensors='pt')
logits = torch.log_softmax(model2(**input_dict)['logits'], dim=2)
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer2.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer2.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['birthday',
 'Birthday',
 'new',
 '#@@',
 'New',
 'anniversary',
 '4th',
 'Anniversary',
 '21st',
 'belated']

What happens when you push in negative affect direction?

In [18]:
set_seed(1)
hook = model2.roberta.encoder.layer[11].register_forward_hook(lambda m, h_in, h_out: intervention(h_out=h_out, P=P, ws=Ws, alpha=-4))
input_dict=tokenizer2(text, return_tensors='pt')
logits = torch.log_softmax(model2(**input_dict)['logits'], dim=2)
hook.remove()
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer2.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer2.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

What about positive affect direction?

In [19]:
hook = model2.roberta.encoder.layer[11].register_forward_hook(lambda m, h_in, h_out: intervention(h_out=h_out, P=P, ws=Ws, alpha=4))
input_dict=tokenizer2(text, return_tensors='pt')
logits = torch.log_softmax(model2(**input_dict)['logits'], dim=2)
hook.remove()
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer2.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer2.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

What about pushing to be more specific?

In [20]:
with open("../reps_spec/Ws.layer=11.seed=1.npy", "rb") as f:
    Ws = np.load(f)

        
# Reduce Ws to number of classifiers you want to set it to
Ws = Ws[:32:]

# Now derive P from Ws
list_of_ws = [np.array([Ws[i, :]]) for i in range(Ws.shape[0])]
P = debias_by_specific_directions(directions=list_of_ws, input_dim=Ws.shape[1])

Ws_sp = torch.tensor(Ws/np.linalg.norm(Ws, keepdims = True, axis = 1)).to(torch.float32)
P_sp = torch.tensor(P).to(torch.float32)

# Insert newaxis for 1 classifier edge case
if len(Ws_sp.shape) == 1:
    Ws_sp = Ws_sp[np.newaxis,:]

In [21]:
hook = model2.roberta.encoder.layer[11].register_forward_hook(lambda m, h_in, h_out: intervention(h_out=h_out, P=P_sp, ws=Ws_sp, alpha=-4))
input_dict=tokenizer2(text, return_tensors='pt')
logits = torch.log_softmax(model2(**input_dict)['logits'], dim=2)
hook.remove()
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer2.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer2.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [22]:
hook = model2.roberta.encoder.layer[11].register_forward_hook(lambda m, h_in, h_out: intervention(h_out=h_out, P=P, ws=Ws, alpha=4))
input_dict=tokenizer2(text, return_tensors='pt')
logits = torch.log_softmax(model2(**input_dict)['logits'], dim=2)
hook.remove()
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer2.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer2.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

What about the finetuned model?

In [23]:
hook = model.roberta.encoder.layer[11].register_forward_hook(lambda m, h_in, h_out: intervention(h_out=h_out, P=P_aff, ws=Ws_aff, alpha=-4))
input_dict=tokenizer(text, return_tensors='pt')
logits = torch.log_softmax(model(**input_dict)['logits'], dim=2)
hook.remove()
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

['sworn',
 '/status/@@',
 'Agg@@',
 'LYR@@',
 'bib',
 '#forex@@',
 '#call@@',
 'JAP@@',
 'ODY',
 'Shim@@']

In [24]:
hook = model.roberta.encoder.layer[11].register_forward_hook(lambda m, h_in, h_out: intervention(h_out=h_out, P=P_aff, ws=Ws_aff, alpha=4))
input_dict=tokenizer(text, return_tensors='pt')
logits = torch.log_softmax(model(**input_dict)['logits'], dim=2)
hook.remove()
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

['ousem@@',
 'essm@@',
 'Humph@@',
 'ask@@',
 'Esca@@',
 'enth@@',
 'Memo@@',
 '-Le@@',
 'enan@@',
 '#tvtime']

In [25]:
hook = model.roberta.encoder.layer[11].register_forward_hook(lambda m, h_in, h_out: intervention(h_out=h_out, P=P_sp, ws=Ws_sp, alpha=-4))
input_dict=tokenizer(text, return_tensors='pt')
logits = torch.log_softmax(model(**input_dict)['logits'], dim=2)
hook.remove()
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

['opport@@',
 'COMM@@',
 'ask@@',
 '#Fitstats_en_@@',
 'mug@@',
 'Humph@@',
 'Narr@@',
 'Fah@@',
 'ˈ@@',
 '\x93']

In [26]:
hook = model.roberta.encoder.layer[11].register_forward_hook(lambda m, h_in, h_out: intervention(h_out=h_out, P=P_sp, ws=Ws_sp, alpha=4))
input_dict=tokenizer(text, return_tensors='pt')
logits = torch.log_softmax(model(**input_dict)['logits'], dim=2)
hook.remove()
mask_token_index = torch.where(input_dict['input_ids'][0]==tokenizer.mask_token_id)[0].detach().cpu().tolist()[0]
tokenizer.convert_ids_to_tokens(torch.topk(logits[0,mask_token_index,:], 10)[1])

['chain@@',
 '#AMNDBots',
 '#Nar@@',
 'Nuke',
 'Fail@@',
 '#iHeartAwards',
 'Ends',
 '#Smule',
 'Spoken',
 '#TheyreTheOne']