In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive; the checkpoints and CSV files read below live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [None]:
# https://aclanthology.org/P18-1032.pdf

In [None]:
# Saliency: gradient of the predicted-class logit with respect to each input token's embedding

In [None]:
# then take the L2 norm of that gradient over the embedding dimension to get one score per token

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Tokenizer and classifier weights are stored in the same checkpoint directory.
CLASSIFIER_DIR = "/content/drive/My Drive/LIN371/classification_only_model"

tokenizer = BertTokenizer.from_pretrained(CLASSIFIER_DIR)
model = BertForSequenceClassification.from_pretrained(CLASSIFIER_DIR)
# inference mode: compute_gradients below does not switch modes itself
model.eval()

def compute_gradients(model, tokenizer, text):
    """Return per-token gradient saliency for the model's predicted class.

    The saliency of a token is the L2 norm (over the embedding dimension) of
    the gradient of the predicted-class logit with respect to that token's
    word embedding.

    Args:
        model: a sequence-classification model that accepts ``inputs_embeds``
            and exposes ``get_input_embeddings()`` (e.g.
            ``BertForSequenceClassification``). It is used in whatever mode it
            is currently in; the notebook calls ``model.eval()`` after loading.
        tokenizer: the matching tokenizer; called as
            ``tokenizer(text, return_tensors="pt")``.
        text: a single input string.

    Returns:
        A list of ``(token, score)`` pairs, one per token (special tokens
        included), where ``score`` is a 0-dim CPU tensor.

    Notes:
        Only the *word* embeddings are passed as ``inputs_embeds``. The model's
        own embedding layer adds position and token-type embeddings to
        ``inputs_embeds``, so adding them here as well would count them twice
        and evaluate the model on an input it never sees during normal
        inference.
    """
    # run on whatever device the model lives on
    device = next(model.parameters()).device

    # tokenize input text
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    model_kwargs = {"attention_mask": inputs["attention_mask"].to(device)}
    # forward segment ids only when the tokenizer produces them
    if "token_type_ids" in inputs:
        model_kwargs["token_type_ids"] = inputs["token_type_ids"].to(device)

    # Word embeddings only, detached into a leaf tensor so the gradient is
    # taken with respect to this tensor and nothing upstream of it.
    word_embedding_layer = model.get_input_embeddings()
    input_embeddings = word_embedding_layer(input_ids).detach().requires_grad_(True)

    # forward pass; position/segment embeddings are added inside the model
    logits = model(inputs_embeds=input_embeddings, **model_kwargs).logits
    predicted_class = logits[0].argmax().item()
    pred_logit = logits[0, predicted_class]  # 0-dim, so no grad_outputs needed

    # torch.autograd.grad returns the gradient directly and leaves the
    # .grad fields of the model parameters untouched
    (gradients,) = torch.autograd.grad(pred_logit, input_embeddings)

    # convert ids to tokens
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    # saliency score = L2 norm of each token's gradient vector
    saliency_scores = gradients[0].norm(p=2, dim=-1).cpu()

    return list(zip(tokens, saliency_scores))

# sanity check on a single sentence: one output line per token
text = "Your ass is big, I love it"
scores = compute_gradients(model, tokenizer, text)
report_lines = [f"Token: {token}, L2 gradient: {score}" for token, score in scores]
for line in report_lines:
    print(line)


Token: [CLS], L2 gradient: 0.4622496962547302
Token: your, L2 gradient: 1.4340214729309082
Token: ass, L2 gradient: 1.357755422592163
Token: is, L2 gradient: 0.9130953550338745
Token: big, L2 gradient: 1.3689920902252197
Token: ,, L2 gradient: 0.5416591763496399
Token: i, L2 gradient: 0.6006713509559631
Token: love, L2 gradient: 1.0875906944274902
Token: it, L2 gradient: 0.5941902995109558
Token: [SEP], L2 gradient: 0.7570545077323914


In [None]:
# Load the saved predictions table from Drive.
# NOTE(review): `predicitons` is misspelled, but later cells reference this exact name.
PREDICTIONS_CSV = "/content/drive/My Drive/LIN371/predictions.csv"
predicitons = pd.read_csv(PREDICTIONS_CSV)
predicitons.head()

Unnamed: 0.1,Unnamed: 0,text,label,new_label,prediction
0,0,Those pussy lips need more cleaning with my to...,1,explicit_source_has_explicit_words,1
1,1,"I have choices for you. Choice seating, at that",1,explicit_source_no_explicit_words,1
2,2,I want to finish.,1,explicit_source_no_explicit_words,0
3,3,"Oh it Will, one way or an other 😉",1,explicit_source_no_explicit_words,1
4,4,"No need to thank me, thank you so much for sha...",1,explicit_source_no_explicit_words,0


In [None]:
# Draw five rows whose predicted label is 1; the fixed seed keeps the sample reproducible.
is_pred_1 = predicitons['prediction'] == 1
pred_1 = predicitons[is_pred_1].sample(n=5, random_state=5)
pred_1

Unnamed: 0.1,Unnamed: 0,text,label,new_label,prediction
654,654,Looks wet and ready,1,explicit_source_no_explicit_words,1
365,365,Come here you cuddly little rascal,0,control_source_no_explicit_words,1
2283,2283,Exactly what I need after a long day at work.,1,explicit_source_no_explicit_words,1
2544,2544,man i love a Bush! u are stunning,1,explicit_source_no_explicit_words,1
128,128,Mmmmm mmmm mmm\n,1,explicit_source_no_explicit_words,1


In [None]:
# the sampled sentences as a plain Python list, in sample order
texts = list(pred_1['text'])
texts

['Looks wet and ready',
 'Come here you cuddly little rascal',
 'Exactly what I need after a long day at work.',
 'man i love a Bush! u are stunning',
 'Mmmmm mmmm mmm\n']

In [None]:
# saliency scores from the first classifier for each of the five sampled texts
text0_scores, text1_scores, text2_scores, text3_scores, text4_scores = (
    compute_gradients(model, tokenizer, texts[i]) for i in range(5)
)

In [None]:
# print each text's (token, score) list under a numbered header
for i, score in enumerate((
    text0_scores,
    text1_scores,
    text2_scores,
    text3_scores,
    text4_scores,
)):
    print(f'score {i}', score, sep='\n')


score 0
[('[CLS]', tensor(0.6672)), ('looks', tensor(3.2478)), ('wet', tensor(2.7304)), ('and', tensor(1.0789)), ('ready', tensor(2.4219)), ('[SEP]', tensor(1.3652))]
score 1
[('[CLS]', tensor(4.8384)), ('come', tensor(11.4452)), ('here', tensor(8.4636)), ('you', tensor(6.1941)), ('cu', tensor(11.1416)), ('##dd', tensor(16.4019)), ('##ly', tensor(6.8540)), ('little', tensor(8.9964)), ('ras', tensor(19.2967)), ('##cal', tensor(20.3418)), ('[SEP]', tensor(8.6093))]
score 2
[('[CLS]', tensor(1.2309)), ('exactly', tensor(7.6262)), ('what', tensor(3.6593)), ('i', tensor(1.9471)), ('need', tensor(3.7886)), ('after', tensor(2.3352)), ('a', tensor(1.0885)), ('long', tensor(1.9685)), ('day', tensor(2.3491)), ('at', tensor(1.2458)), ('work', tensor(1.9233)), ('.', tensor(1.3054)), ('[SEP]', tensor(1.5767))]
score 3
[('[CLS]', tensor(0.1088)), ('man', tensor(0.2832)), ('i', tensor(0.1579)), ('love', tensor(0.2243)), ('a', tensor(0.2265)), ('bush', tensor(0.6714)), ('!', tensor(0.2760)), ('u', ten

In [None]:
# Load the second classifier checkpoint (and its tokenizer) so the same
# saliency analysis can be repeated and compared against the first model.
MLM_CLASSIFIER_DIR = "/content/drive/My Drive/LIN371/bert-base-uncased-mlm-classifier"

mlm_tokenizer = BertTokenizer.from_pretrained(MLM_CLASSIFIER_DIR)
mlm_model = BertForSequenceClassification.from_pretrained(MLM_CLASSIFIER_DIR)
mlm_model.eval()
# presumably here so the cell's last expression displays nothing but a blank line
print()




In [None]:
# saliency scores from the second classifier for the same five texts
mlm0_scores, mlm1_scores, mlm2_scores, mlm3_scores, mlm4_scores = (
    compute_gradients(mlm_model, mlm_tokenizer, texts[i]) for i in range(5)
)

In [None]:
# print each text's (token, score) list under a numbered header
for i, score in enumerate((
    mlm0_scores,
    mlm1_scores,
    mlm2_scores,
    mlm3_scores,
    mlm4_scores,
)):
    print(f'score {i}', score, sep='\n')


score 0
[('[CLS]', tensor(0.9131)), ('looks', tensor(4.3247)), ('wet', tensor(4.9842)), ('and', tensor(1.5723)), ('ready', tensor(3.6555)), ('[SEP]', tensor(2.1108))]
score 1
[('[CLS]', tensor(0.9447)), ('come', tensor(1.9453)), ('here', tensor(1.4286)), ('you', tensor(0.9562)), ('cu', tensor(1.5854)), ('##dd', tensor(2.4634)), ('##ly', tensor(1.3113)), ('little', tensor(1.7543)), ('ras', tensor(3.6329)), ('##cal', tensor(3.1300)), ('[SEP]', tensor(1.3050))]
score 2
[('[CLS]', tensor(0.6070)), ('exactly', tensor(2.8047)), ('what', tensor(1.2008)), ('i', tensor(0.7139)), ('need', tensor(1.4274)), ('after', tensor(0.9830)), ('a', tensor(0.4741)), ('long', tensor(0.9711)), ('day', tensor(0.8651)), ('at', tensor(0.5935)), ('work', tensor(1.0031)), ('.', tensor(0.5722)), ('[SEP]', tensor(0.7130))]
score 3
[('[CLS]', tensor(0.2605)), ('man', tensor(0.5814)), ('i', tensor(0.2711)), ('love', tensor(0.4240)), ('a', tensor(0.2761)), ('bush', tensor(0.9231)), ('!', tensor(0.5610)), ('u', tensor(1