In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# https://pair-code.github.io/lit/tutorials/text-salience/

In [None]:
# saliency is calcualated as the gradient of the model's prediction with respect to each input token
# (or input embedding)

# gradient of class logits with respect to input tokens

In [None]:
# then take the l2 norm of the gradient

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("/content/drive/My Drive/LIN371/classification_only_model")
tokenizer = BertTokenizer.from_pretrained("/content/drive/My Drive/LIN371/classification_only_model")
model.eval()

def compute_gradients(model, tokenizer, text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    token_type_ids = inputs['token_type_ids']

    # Get embeddings directly
    embedding_layer = model.bert.embeddings
    token_embeddings = embedding_layer.word_embeddings(input_ids)  # Shape: (batch_size, seq_len, hidden_size)
    position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)  # Generate position indices
    position_embeddings = embedding_layer.position_embeddings(position_ids)
    segment_embeddings = embedding_layer.token_type_embeddings(token_type_ids)

    # Sum token and position embeddings to get input embeddings
    input_embeddings = token_embeddings + position_embeddings + segment_embeddings
    input_embeddings.retain_grad()

    # Zero out any existing gradients
    model.zero_grad()

    # Forward pass with `inputs_embeds` to provide the custom embeddings
    outputs = model(inputs_embeds=input_embeddings, attention_mask=attention_mask, output_hidden_states=True)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1)

    # Get the logit of the predicted class and backpropagate
    pred_logit = logits[0, predicted_class]
    pred_logit.backward()

    # Extract gradients for the input embeddings
    gradients = input_embeddings.grad[0]

    # Convert input_ids to tokens for interpretation
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].numpy())

    # Compute saliency scores (gradient magnitudes)
    saliency_scores = gradients.pow(2).sum(dim=-1).sqrt().squeeze()  # L2 norm (magnitude of gradients)

    return list(zip(tokens, saliency_scores))

# Example input text
text = "Your ass is big, I love it"

# Compute the saliency scores for the input text
scores = compute_gradients(model, tokenizer, text)

# Display the saliency scores for each token
for token, score in scores:
    print(f"Token: {token}, L2 gradient: {score}")


Token: [CLS], L2 gradient: 0.4622496962547302
Token: your, L2 gradient: 1.4340214729309082
Token: ass, L2 gradient: 1.357755422592163
Token: is, L2 gradient: 0.9130953550338745
Token: big, L2 gradient: 1.3689920902252197
Token: ,, L2 gradient: 0.5416591763496399
Token: i, L2 gradient: 0.6006713509559631
Token: love, L2 gradient: 1.0875906944274902
Token: it, L2 gradient: 0.5941902995109558
Token: [SEP], L2 gradient: 0.7570545077323914


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F

model = BertForSequenceClassification.from_pretrained("/content/drive/My Drive/LIN371/classification_only_model")
tokenizer = BertTokenizer.from_pretrained("/content/drive/My Drive/LIN371/classification_only_model")
model.eval()

def compute_gradients_proba(model, tokenizer, text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    token_type_ids = inputs['token_type_ids']

    # Get embeddings directly
    embedding_layer = model.bert.embeddings
    token_embeddings = embedding_layer.word_embeddings(input_ids)  # Shape: (batch_size, seq_len, hidden_size)
    position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)  # Generate position indices
    position_embeddings = embedding_layer.position_embeddings(position_ids)
    segment_embeddings = embedding_layer.token_type_embeddings(token_type_ids)

    # Sum token and position embeddings to get input embeddings
    input_embeddings = token_embeddings + position_embeddings + segment_embeddings
    input_embeddings.retain_grad()

    # Zero out any existing gradients
    model.zero_grad()

    # Forward pass with `inputs_embeds` to provide the custom embeddings
    outputs = model(inputs_embeds=input_embeddings, attention_mask=attention_mask, output_hidden_states=True)
    logits = outputs.logits

    probabilities = F.softmax(logits, dim=-1)

    # Choose the probability of the predicted class (the one with the highest probability)
    predicted_class = torch.argmax(probabilities, dim=-1)
    pred_probability = probabilities[0, predicted_class]

    # Get the logit of the predicted class and backpropagate
    pred_probability.backward()

    # Extract gradients for the input embeddings
    gradients = input_embeddings.grad[0]

    # Convert input_ids to tokens for interpretation
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].numpy())

    # Compute saliency scores (gradient magnitudes)
    saliency_scores = gradients.pow(2).sum(dim=-1).sqrt().squeeze()  # L2 norm (magnitude of gradients)

    return list(zip(tokens, saliency_scores))

# Example input text
text = "Your ass is big, I love it"

# Compute the saliency scores for the input text
scores = compute_gradients_proba(model, tokenizer, text)

# Display the saliency scores for each token
for token, score in scores:
    print(f"Token: {token}, L2 gradient: {score}")


Token: [CLS], L2 gradient: 0.0011986501049250364
Token: your, L2 gradient: 0.003724822076037526
Token: ass, L2 gradient: 0.0035683128517121077
Token: is, L2 gradient: 0.002373127732425928
Token: big, L2 gradient: 0.00367000512778759
Token: ,, L2 gradient: 0.001428402727469802
Token: i, L2 gradient: 0.00159437523689121
Token: love, L2 gradient: 0.0028469939716160297
Token: it, L2 gradient: 0.0015406114980578423
Token: [SEP], L2 gradient: 0.0019966831896454096


In [None]:

# read predictions
predicitons = pd.read_csv("/content/drive/My Drive/LIN371/predictions.csv")
predicitons.head()

Unnamed: 0.1,Unnamed: 0,text,label,new_label,prediction
0,0,Those pussy lips need more cleaning with my to...,1,explicit_source_has_explicit_words,1
1,1,"I have choices for you. Choice seating, at that",1,explicit_source_no_explicit_words,1
2,2,I want to finish.,1,explicit_source_no_explicit_words,0
3,3,"Oh it Will, one way or an other 😉",1,explicit_source_no_explicit_words,1
4,4,"No need to thank me, thank you so much for sha...",1,explicit_source_no_explicit_words,0


In [None]:
# get some examples where pred is 1

pred_1 = predicitons[predicitons['prediction'] == 1].sample(5, random_state=5)
pred_1

Unnamed: 0.1,Unnamed: 0,text,label,new_label,prediction
654,654,Looks wet and ready,1,explicit_source_no_explicit_words,1
365,365,Come here you cuddly little rascal,0,control_source_no_explicit_words,1
2283,2283,Exactly what I need after a long day at work.,1,explicit_source_no_explicit_words,1
2544,2544,man i love a Bush! u are stunning,1,explicit_source_no_explicit_words,1
128,128,Mmmmm mmmm mmm\n,1,explicit_source_no_explicit_words,1


In [None]:
texts = pred_1['text'].tolist()
texts

['Looks wet and ready',
 'Come here you cuddly little rascal',
 'Exactly what I need after a long day at work.',
 'man i love a Bush! u are stunning',
 'Mmmmm mmmm mmm\n']

In [None]:
text0_scores = compute_gradients(model, tokenizer, texts[0])
text1_scores = compute_gradients(model, tokenizer, texts[1])
text2_scores = compute_gradients(model, tokenizer, texts[2])
text3_scores = compute_gradients(model, tokenizer, texts[3])
text4_scores = compute_gradients(model, tokenizer, texts[4])

In [None]:
# load the other model and do the same
mlm_model = BertForSequenceClassification.from_pretrained("/content/drive/My Drive/LIN371/transfer_learning")
mlm_tokenizer = BertTokenizer.from_pretrained("/content/drive/My Drive/LIN371/transfer_learning")
mlm_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
mlm0_scores = compute_gradients(mlm_model, mlm_tokenizer, texts[0])
mlm1_scores = compute_gradients(mlm_model, mlm_tokenizer, texts[1])
mlm2_scores = compute_gradients(mlm_model, mlm_tokenizer, texts[2])
mlm3_scores = compute_gradients(mlm_model, mlm_tokenizer, texts[3])
mlm4_scores = compute_gradients(mlm_model, mlm_tokenizer, texts[4])

In [None]:
text0_scores

[('[CLS]', tensor(0.6672)),
 ('looks', tensor(3.2478)),
 ('wet', tensor(2.7304)),
 ('and', tensor(1.0789)),
 ('ready', tensor(2.4219)),
 ('[SEP]', tensor(1.3652))]

In [None]:
mlm0_scores

[('[CLS]', tensor(0.9131)),
 ('looks', tensor(4.3247)),
 ('wet', tensor(4.9842)),
 ('and', tensor(1.5723)),
 ('ready', tensor(3.6555)),
 ('[SEP]', tensor(2.1108))]

In [None]:
text0_df = pd.DataFrame({'token': [token for token, _ in text0_scores],
                         'simple_l2': [score.item() for _, score in text0_scores],
                         'mlm_simple_l2': [score.item() for _, score in mlm0_scores]})
text0_df

Unnamed: 0,token,simple_l2,mlm_simple_l2
0,[CLS],0.667221,0.913147
1,looks,3.247836,4.324699
2,wet,2.730376,4.984249
3,and,1.07888,1.572279
4,ready,2.421854,3.655497
5,[SEP],1.365179,2.110789


In [None]:
text1_df = pd.DataFrame({'token': [token for token, _ in text1_scores],
                         'simple_l2': [score.item() for _, score in text1_scores],
                         'mlm_simple_l2': [score.item() for _, score in mlm1_scores]})
text2_df = pd.DataFrame({'token': [token for token, _ in text2_scores],
                         'simple_l2': [score.item() for _, score in text2_scores],
                         'mlm_simple_l2': [score.item() for _, score in mlm2_scores]})
text3_df = pd.DataFrame({'token': [token for token, _ in text3_scores],
                         'simple_l2': [score.item() for _, score in text3_scores],
                         'mlm_simple_l2': [score.item() for _, score in mlm3_scores]})
text4_df = pd.DataFrame({'token': [token for token, _ in text4_scores],
                         'simple_l2': [score.item() for _, score in text4_scores],
                         'mlm_simple_l2': [score.item() for _, score in mlm4_scores]})

In [None]:
text0_df.to_csv('/content/drive/My Drive/LIN371/example_sent/text0_df', index=False)
text1_df.to_csv('/content/drive/My Drive/LIN371/example_sent/text1_df', index=False)
text2_df.to_csv('/content/drive/My Drive/LIN371/example_sent/text2_df', index=False)
text3_df.to_csv('/content/drive/My Drive/LIN371/example_sent/text3_df', index=False)
text4_df.to_csv('/content/drive/My Drive/LIN371/example_sent/text4_df', index=False)

OSError: Cannot save file into a non-existent directory: '/content/drive/My Drive/LIN371/example_sent'