In [5]:
import pandas as pd
import numpy as np

In [6]:
from sklearn.model_selection import train_test_split

In [1]:
import torch



In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [3]:
from transformers import DataCollatorForLanguageModeling


In [18]:
from datasets import Dataset

In [11]:
def tokenize_function(data):
    result = tokenizer(data)
    return result


In [12]:
def concat_chunk_dataset(data):
    chunk_size = 128
    
    # drop the last chunk if is smaller than the chunk size
    total_length = len(data["input_ids"])

    # split the concatenated sentences into chunks using the total length
    result = {k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in data.items()}
    

    return pd.DataFrame(result)



In [7]:
fir_df = pd.read_csv("13225_13225097_fir.csv")

In [8]:
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', return_tensors='pt', padding=True)


In [13]:
tokens = fir_df["fir_text"].apply(lambda x: tokenize_function(x))

Token indices sequence length is longer than the specified maximum sequence length for this model (753 > 512). Running this sequence through the model will result in indexing errors


In [14]:
processed_dataset = pd.concat(tokens.apply(lambda x : concat_chunk_dataset(x)).tolist()).drop_duplicates(subset="input_ids").reset_index(drop=True)

In [32]:
processed_dataset.shape

(2553, 2)

In [15]:
# fir_ds = Dataset.from_pandas(processed_dataset)

In [16]:
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm_probability = .15)

In [19]:
fir_ds = Dataset.from_pandas(processed_dataset)

In [20]:
train_size = 1000
test_size = int(0.2 * train_size)

downsampled_dataset = fir_ds.train_test_split(train_size=train_size, test_size=test_size, seed=42)


In [21]:
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [22]:
# we shall insert mask randomly in the sentence
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [23]:

# eval_dataset = downsampled_dataset["test"].map(
#     insert_random_mask,
#     batched=True, 
#     remove_columns = downsampled_dataset["test"].column_names)

# eval_dataset = eval_dataset.rename_columns({"masked_input_ids": "input_ids",
# "masked_attention_mask": "attention_mask","masked_labels": "labels"})

In [24]:
from transformers import Trainer, TrainingArguments

In [25]:
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:

training_args = TrainingArguments(
    "xlm-roberta-base-finetuned-bigger-chunks",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [28]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,2.181948
2,No log,1.979801
3,No log,1.879833


TrainOutput(global_step=375, training_loss=2.3959768880208334, metrics={'train_runtime': 2890.2427, 'train_samples_per_second': 1.038, 'train_steps_per_second': 0.13, 'total_flos': 197909291520000.0, 'train_loss': 2.3959768880208334, 'epoch': 3.0})

In [29]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 7.13


In [30]:
eval_results

{'eval_loss': 1.9640165567398071,
 'eval_runtime': 31.8601,
 'eval_samples_per_second': 6.277,
 'eval_steps_per_second': 0.785,
 'epoch': 3.0}

In [31]:
trainer.save_model("models/fir_bigger_chunks_ft")

### Discrimination Analysis

In [33]:
import re

In [38]:

def fill_masks(sentence, targets):
        new_sentence = sentence
        for target in targets:
            new_sentence = re.sub('MASK',target,new_sentence, count=1)
        #print("FILLED MASK:", new_sentence)
        return new_sentence

In [62]:

def read_file(filename):
    df = pd.read_csv(filename)
    df['Target_Stereotypical'] = df['Target_Stereotypical'].apply(lambda x: x.replace("'", "").replace('[', '').replace(']','').split(','))
    df['Target_Anti-Stereotypical'] = df['Target_Anti-Stereotypical'].apply(lambda x: x.replace("'", "").replace('[', '').replace(']','').split(','))
    df['Stereotypical'] = df.apply(lambda x: fill_masks(x['Sentence'],x['Target_Stereotypical']),axis=1)
    df['Anti-Stereotypical'] = df.apply(lambda x: fill_masks(x['Sentence'],x['Target_Anti-Stereotypical']),axis=1)
    return df

def calculate_aul(model, sentence, log_softmax):
    '''
    Given token ids of a sequence, return the averaged log probability of
    unmasked sequence (AULA or AUL).
    '''
    tokens = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

    # Get the token IDs and attention mask
    input_ids = tokens['input_ids']#.to('mps')
    output = model(input_ids, output_hidden_states=True)
    logits = output.logits.squeeze(0)
    log_probs = log_softmax(logits)
    input_ids = input_ids.view(-1, 1).detach()
    token_log_probs = log_probs.gather(1, input_ids)[1:-1]
    sentence_log_prob = torch.mean(token_log_probs)
    score = sentence_log_prob.item()

    hidden_states = output.hidden_states[-1][:,1:-1].cpu()
    hidden_state = torch.mean(hidden_states, 1).detach().numpy()

    return score, hidden_state


In [35]:

def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


In [36]:
log_softmax = torch.nn.LogSoftmax(dim=1)

In [59]:
caste_df = read_file(filename="../data/Caste.csv")

In [60]:
model = trainer.model

In [61]:

stereo_inputs = [i for i in caste_df['Stereotypical']]
antistereo_inputs = [i for i in caste_df['Anti-Stereotypical']]

stereo_scores = []
antistereo_scores = []
stereo_embes = []
antistereo_embes = []

for i in stereo_inputs:
    stereo_score, stereo_hidden_state = calculate_aul(model, i, log_softmax)
    stereo_scores.append(stereo_score)
    stereo_embes.append(stereo_hidden_state)

for j in antistereo_inputs:
    antistereo_score, antistereo_hidden_state = calculate_aul(model, j, log_softmax)
    antistereo_scores.append(antistereo_score)
    antistereo_embes.append(antistereo_hidden_state)

stereo_scores = np.array(stereo_scores)
stereo_scores = stereo_scores.reshape([-1, 1])
antistereo_scores = np.array(antistereo_scores)
antistereo_scores = antistereo_scores.reshape([1, -1])
bias_scores = stereo_scores > antistereo_scores

stereo_embes = np.concatenate(stereo_embes)
antistereo_embes = np.concatenate(antistereo_embes)
weights = cos_sim(stereo_embes, antistereo_embes.T)

weighted_bias_scores = bias_scores * weights
bias_score = np.sum(weighted_bias_scores) / np.sum(weights)
print('bias score (emb):', round(bias_score * 100, 2))

bias score (emb): 62.75


In [63]:
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [64]:

stereo_inputs = [i for i in caste_df['Stereotypical']]
antistereo_inputs = [i for i in caste_df['Anti-Stereotypical']]

stereo_scores = []
antistereo_scores = []
stereo_embes = []
antistereo_embes = []

for i in stereo_inputs:
    stereo_score, stereo_hidden_state = calculate_aul(model, i, log_softmax)
    stereo_scores.append(stereo_score)
    stereo_embes.append(stereo_hidden_state)

for j in antistereo_inputs:
    antistereo_score, antistereo_hidden_state = calculate_aul(model, j, log_softmax)
    antistereo_scores.append(antistereo_score)
    antistereo_embes.append(antistereo_hidden_state)

stereo_scores = np.array(stereo_scores)
stereo_scores = stereo_scores.reshape([-1, 1])
antistereo_scores = np.array(antistereo_scores)
antistereo_scores = antistereo_scores.reshape([1, -1])
bias_scores = stereo_scores > antistereo_scores

stereo_embes = np.concatenate(stereo_embes)
antistereo_embes = np.concatenate(antistereo_embes)
weights = cos_sim(stereo_embes, antistereo_embes.T)

weighted_bias_scores = bias_scores * weights
bias_score = np.sum(weighted_bias_scores) / np.sum(weights)
print('bias score (emb):', round(bias_score * 100, 2))

bias score (emb): 62.43
