In [12]:
import pandas as pd
import numpy as np

In [1]:
import torch



In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [3]:
from transformers import DataCollatorForLanguageModeling


In [16]:
from datasets import Dataset

In [26]:
from transformers import Trainer, TrainingArguments

In [59]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

### Discrimination Analysis

In [5]:
import re

In [6]:

def fill_masks(sentence, targets):
        new_sentence = sentence
        for target in targets:
            new_sentence = re.sub('MASK',target,new_sentence, count=1)
        #print("FILLED MASK:", new_sentence)
        return new_sentence

In [75]:

def read_file(filename):
    df = pd.read_csv(filename)
    df['Target_Stereotypical'] = df['Target_Stereotypical'].apply(lambda x: x.replace("'", "").replace('[', '').replace(']','').split(','))
    df['Target_Anti-Stereotypical'] = df['Target_Anti-Stereotypical'].apply(lambda x: x.replace("'", "").replace('[', '').replace(']','').split(','))
    df['Stereotypical'] = df.apply(lambda x: fill_masks(x['Sentence'],x['Target_Stereotypical']),axis=1)
    df['Anti-Stereotypical'] = df.apply(lambda x: fill_masks(x['Sentence'],x['Target_Anti-Stereotypical']),axis=1)
    return df

def calculate_aul(model, sentence, log_softmax):
    '''
    Given token ids of a sequence, return the averaged log probability of
    unmasked sequence (AULA or AUL).
    '''
    tokens = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

    # Get the token IDs and attention mask
    input_ids = tokens['input_ids'].to('mps')
    output = model(input_ids, output_hidden_states=True)
    logits = output.logits.squeeze(0)
    log_probs = log_softmax(logits)
    input_ids = input_ids.view(-1, 1).detach()
    token_log_probs = log_probs.gather(1, input_ids)[1:-1]
    sentence_log_prob = torch.mean(token_log_probs)
    score = sentence_log_prob.item()

    hidden_states = output.hidden_states[-1][:,1:-1].cpu()
    hidden_state = torch.mean(hidden_states, 1).detach().numpy()

    return score, hidden_state




In [8]:

def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


In [9]:
log_softmax = torch.nn.LogSoftmax(dim=1)

In [13]:
caste_df = read_file(filename="../data/Caste.csv")

In [60]:
model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-uncased")

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Pretrain model on stereotypical statements

In [17]:
def tokenize_function(data):
    result = tokenizer(data)
    return result


In [18]:
def concat_chunk_dataset(data):
    chunk_size = 128
    
    # drop the last chunk if is smaller than the chunk size
    total_length = len(data["input_ids"])

    # split the concatenated sentences into chunks using the total length
    result = {k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in data.items()}
    

    return pd.DataFrame(result)



In [64]:
tokens = caste_df['Stereotypical'].apply(lambda x: tokenize_function(x))

In [65]:
processed_dataset = pd.concat(tokens.apply(lambda x : concat_chunk_dataset(x)).tolist()).reset_index(drop=True)

In [66]:
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm_probability = .15)

In [67]:
stereo_ds = Dataset.from_pandas(processed_dataset)

In [68]:
downsampled_dataset = stereo_ds.train_test_split(train_size=0.8, test_size=0.2, seed=42)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 84
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22
    })
})

In [69]:

training_args = TrainingArguments(
    "indic-bert-finetuned-stereo",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

In [70]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

In [71]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,1.282988
2,No log,2.223256
3,No log,1.59323


TrainOutput(global_step=33, training_loss=1.6709084944291548, metrics={'train_runtime': 44.372, 'train_samples_per_second': 5.679, 'train_steps_per_second': 0.744, 'total_flos': 1572385472352.0, 'train_loss': 1.6709084944291548, 'epoch': 3.0})

In [72]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 2.65


In [73]:
eval_results

{'eval_loss': 0.9746655821800232,
 'eval_runtime': 0.3336,
 'eval_samples_per_second': 65.956,
 'eval_steps_per_second': 8.994,
 'epoch': 3.0}

In [74]:
model=trainer.model

In [35]:
model = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-uncased")

In [76]:

stereo_inputs = [i for i in caste_df['Stereotypical']]
antistereo_inputs = [i for i in caste_df['Anti-Stereotypical']]

stereo_scores = []
antistereo_scores = []
stereo_embes = []
antistereo_embes = []

for i in stereo_inputs:
    stereo_score, stereo_hidden_state = calculate_aul(model, i, log_softmax)
    stereo_scores.append(stereo_score)
    stereo_embes.append(stereo_hidden_state)

for j in antistereo_inputs:
    antistereo_score, antistereo_hidden_state = calculate_aul(model, j, log_softmax)
    antistereo_scores.append(antistereo_score)
    antistereo_embes.append(antistereo_hidden_state)

stereo_scores = np.array(stereo_scores)
stereo_scores = stereo_scores.reshape([-1, 1])
antistereo_scores = np.array(antistereo_scores)
antistereo_scores = antistereo_scores.reshape([1, -1])
bias_scores = stereo_scores > antistereo_scores

stereo_embes = np.concatenate(stereo_embes)
antistereo_embes = np.concatenate(antistereo_embes)
weights = cos_sim(stereo_embes, antistereo_embes.T)

weighted_bias_scores = bias_scores * weights
bias_score = np.sum(weighted_bias_scores) / np.sum(weights)
print('bias score (emb):', round(bias_score * 100, 2))

bias score (emb): 55.95



mbert: 57.74
mbert - finetune on stereo: 55.95