In [None]:
import torch
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import numpy as np

# Transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForMaskedLM

from scipy import stats

import warnings
warnings.filterwarnings('ignore')

# Global Parameters

In [2]:
# Root folder under which every trained adapter checkpoint is stored
all_adapters_dir = '../../results/'
# Shared accumulator: one result record is appended per evaluated model
all_results = list()

# Utils

## Import helper functions

In [3]:
# Execute the project's helper scripts inside this kernel so that the names
# they define become available to the cells below.
# NOTE(review): findTargetTerms, findOppositeTargetTerm, createCombination,
# replaceTermsInSentence and t_test are used later in this notebook but are not
# defined in it -- presumably they come from these two scripts; confirm.
%run ../utils/helper_functions.py
%run ../utils/target_terms.py

## Function to prepare data frames

In [4]:
def prepare_df(annotation_frame_dir, target_term_dir):
    """Build the evaluation frame of biased sentences and their inversely biased counterparts.

    The annotation sheet is filtered to the rows flagged as biased, the
    sentences are lowercased and de-duplicated, and the helper functions
    (findTargetTerms, findOppositeTargetTerm, createCombination,
    replaceTermsInSentence) are applied to add the target-term columns and the
    counterfactually augmented sentences.

    Parameters
    ----------
    annotation_frame_dir : str
        Path to the annotated Excel sheet. It must contain the columns
        'Biased Sentence' (flag), 'Column1', 'ID' and 'newSent'.
    target_term_dir : str
        Path to the CSV with the target term pairs of the respective bias
        type. It must contain the columns 'T1' and 'T2'.

    Returns
    -------
    DataFrame
        One row per unique biased sentence with the columns 'Column1', 'ID',
        'Biased Sentence', 'tt_list', 'tt_opp_list',
        'Target Term Combination' and 'Opposing Sentence'.
    """
    # Load annotation data
    annotations = pd.read_excel(annotation_frame_dir)

    # Keep only the rows flagged as biased and the relevant columns. The
    # explicit copy makes the later column assignments operate on an
    # independent frame instead of on a slice of `annotations`.
    df = annotations.loc[annotations['Biased Sentence'] == 1,
                         ['Column1', 'ID', 'newSent']].copy()

    # Number of biased sentences before de-duplication
    print("Number of biased sentences per Attribute: " + str(len(df)))

    # From here on 'Biased Sentence' holds the sentence text, not the flag
    df = df.rename(columns={'newSent': 'Biased Sentence'})

    # Lowercase biased sentences (vectorised; no row-wise apply needed)
    df['Biased Sentence'] = df['Biased Sentence'].str.lower()

    # Drop duplicate sentences
    df = df.drop_duplicates(subset=['Biased Sentence'])

    # Number of biased sentences that remain after de-duplication
    print("Number of biased sentences without duplicates: " + str(len(df)))

    # Retrieve the target term pairs
    target_term_pairs = pd.read_csv(target_term_dir)

    # List of all target terms: unique T1 terms followed by unique T2 terms.
    # dict.fromkeys de-duplicates while keeping file order, so the list is
    # identical across kernel restarts (set() order depends on string hashing).
    target_terms = (list(dict.fromkeys(target_term_pairs['T1'].tolist()))
                    + list(dict.fromkeys(target_term_pairs['T2'].tolist())))

    # Find target terms in biased sentence
    df['tt_list'] = df.apply(lambda row: findTargetTerms(row['Biased Sentence'], target_terms), axis=1)

    # Find matching opposite target term
    df['tt_opp_list'] = df.apply(lambda row: findOppositeTargetTerm(row['tt_list'], target_term_pairs), axis=1)

    # Apply CDA
    # Create combination of target terms and their opposite terms
    df['Target Term Combination'] = df.apply(createCombination, axis=1)

    # Create all possible opposing sentences
    df['Opposing Sentence'] = df.apply(replaceTermsInSentence, axis=1)

    print("Dataframe prepared!")

    return df

## Function to calculate model perplexity

In [7]:
def _resolve_model_family(model_name):
    """Return (display name, objective) for a model name; objective is 'mlm' or 'clm'."""
    if 'bert' in model_name:
        return 'BERT', 'mlm'
    if 'gpt2' in model_name:
        return 'GPT-2', 'clm'
    raise ValueError('Model type is not recognized.')


def _resolve_strategy(adapter_dir1, adapter_dir2):
    """Derive the strategy label from the adapter directories that were passed."""
    # Stacking must be tested first: it is the only case with two adapters.
    if adapter_dir1 and adapter_dir2:
        return 'stacking'
    if adapter_dir2:
        raise ValueError('adapter_dir2 was given without adapter_dir1.')
    if adapter_dir1:
        if 'argsme' in adapter_dir1:
            return 'Args.me'
        if 'wiki' in adapter_dir1:
            return 'Wikipedia'
        raise ValueError("Adapter directory is not recognized "
                         "(expected 'argsme' or 'wiki' in its name): " + str(adapter_dir1))
    return 'original'


def _load_language_model(model_name, model_type, strategy, adapter_dir1, adapter_dir2):
    """Load the pre-trained model and activate the adapter(s) required by the strategy."""
    model_cls = AutoModelForMaskedLM if model_type == 'mlm' else AutoModelForCausalLM
    model = model_cls.from_pretrained(model_name)
    # Adapters are read from the sub-folder named after the training objective
    # ('/mlm' or '/clm') below the adapter directory.
    if strategy == 'stacking':
        # Local import: only the stacking set-up needs the composition module.
        import transformers.adapters.composition as ac
        model.load_adapter(all_adapters_dir + adapter_dir1 + '/' + model_type,
                           load_as='model_adapter1', with_head=False)
        model.load_adapter(all_adapters_dir + adapter_dir2 + '/' + model_type,
                           load_as='model_adapter2', with_head=False)
        model.active_adapters = ac.Stack('model_adapter1', 'model_adapter2')
    elif strategy in ('Args.me', 'Wikipedia'):
        model.load_adapter(all_adapters_dir + adapter_dir1 + '/' + model_type,
                           load_as='model_adapter')
        model.set_active_adapters('model_adapter')
    # Evaluation only: make sure every module (including freshly added adapter
    # layers) is in inference mode.
    model.eval()
    return model


def _sentence_perplexity(model, tokenizer, model_type, sentence):
    """Return exp(loss) of the model for one sentence wrapped in its special tokens."""
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
    if model_type == 'mlm':
        # [CLS] ... [SEP] for BERT; the tokenizer supplies the matching ids
        token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
    else:
        # end-of-text token on both sides (id 50256 for the GPT-2 vocabulary)
        token_ids = [tokenizer.eos_token_id] + token_ids + [tokenizer.eos_token_id]
    tensor_input = torch.tensor([token_ids])
    # No gradients are needed for scoring; this avoids building the autograd graph.
    with torch.no_grad():
        loss = model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.detach().numpy())


def calculatePerplexity(model_name,
                        test_df,
                        bias_type,
                        adapter_dir1 = None,
                        adapter_dir2 = None):
    """Compare model perplexity on biased vs. inversely biased sentences.

    The perplexity of every biased sentence and the mean perplexity of its
    inversely biased variants are computed with the given model (optionally
    with one adapter, or two stacked adapters) and passed to `t_test`. The
    result record is also appended to the notebook-level list `all_results`.

    Parameters
    ----------
    model_name : str
        The name of the pre-trained model that should be used.
        To reproduce the thesis one can decide between:
            - bert-base-uncased
            - roberta-base
            - gpt2
            - microsoft/DialoGPT-medium
        NOTE(review): the name must contain 'bert' or 'gpt2';
        'microsoft/DialoGPT-medium' does not match that check and is rejected.
    test_df : DataFrame
        Test dataframe with the columns 'Biased Sentence' (str) and
        'Opposing Sentence' (iterable of str per row).
    bias_type : str
        Bias type that should be evaluated. Choose between:
            - Islamophobia
            - Queerphobia
    adapter_dir1 : str, optional
        Directory (relative to `all_adapters_dir`) of the trained argumentative
        or debiasing adapter. Its name must contain 'argsme' or 'wiki' unless
        a second adapter is given.
    adapter_dir2 : str, optional
        Directory of a second adapter; when given together with
        `adapter_dir1` both adapters are stacked.

    Returns
    -------
    DataFrame
        A single row consisting of:
        - Model
        - Bias Type
        - Strategy
        - Mean PPL BS
        - Mean PPL OS
        - t-value
        - p-value

    Raises
    ------
    ValueError
        If the model name or the adapter directories are not recognized.
    """
    # Validate all arguments before any expensive loading starts
    language_model, model_type = _resolve_model_family(model_name)
    strategy = _resolve_strategy(adapter_dir1, adapter_dir2)

    print('Evaluate Results for ' + language_model + ' on ' + strategy + ' Corpora concerning ' + bias_type)
    # Load pre-trained model (weights)
    print('Load trained model.....')
    model = _load_language_model(model_name, model_type, strategy, adapter_dir1, adapter_dir2)
    print('Model loaded successfully!')

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Calculate perplexity score of biased sentences
    print('Calculate Perplexity Scores.....')
    biased_sentences = test_df['Biased Sentence'].tolist()
    inversely_biased_sentences = test_df['Opposing Sentence'].tolist()
    pps_biased_sents = [_sentence_perplexity(model, tokenizer, model_type, biased_sent)
                        for biased_sent in tqdm(biased_sentences)]

    # Calculate perplexity score of oppositely biased sentences: one score per
    # row, averaged over all variants generated for that row.
    # Assumes every row has at least one variant.
    pps_opp_biased_sents = []
    for opp_biased_sents in tqdm(inversely_biased_sentences):
        variant_scores = [_sentence_perplexity(model, tokenizer, model_type, opp_biased_sent)
                          for opp_biased_sent in opp_biased_sents]
        pps_opp_biased_sents.append(sum(variant_scores) / len(variant_scores))
    print('Perplexity Scores calculated successfully!')

    ttest, pval, num_samples, mean_bs, mean_os = t_test(pps_biased_sents, pps_opp_biased_sents)

    result = {"Model": language_model,
              "Bias Type": bias_type,
              "Strategy": strategy,
              "Mean PPL BS": mean_bs,
              "Mean PPL OS": mean_os,
              "t-value": ttest,
              "p-value": pval}

    # Collected across calls so that all runs can be tabulated at the end
    all_results.append(result)

    from tabulate import tabulate
    print("Paired t-test of " + language_model)
    print(tabulate([['Sample size', num_samples],
                    ['Mean PPL BS', mean_bs],
                    ['Mean PPL OS', mean_os],
                    ['Difference Mean', mean_bs-mean_os],
                    ['t-value', ttest],
                    ['P-value (two-tail)', pval],
                   ]))

    return pd.DataFrame([result])

# Prepare Data Frames

In [5]:
# Evaluation frame for the queerphobia annotations
df_queerphobia_bias = prepare_df(
    annotation_frame_dir='../../data/ABBA/abba_queerphobia_annotations.xlsx',
    target_term_dir='../../data/target_term_pairs/target_term_pairs_queerphobia_bias.csv',
)

# Evaluation frame for the islamophobia annotations
df_islamophobia_bias = prepare_df(
    annotation_frame_dir='../../data/ABBA/abba_islamophobia_annotations.xlsx',
    target_term_dir='../../data/target_term_pairs/target_term_pairs_islamophobia_bias.csv',
)

Number of biased sentences per Attribute: 358
Number of biased sentences without duplicates: 280
Dataframe prepared!
Number of biased sentences per Attribute: 648
Number of biased sentences without duplicates: 465
Dataframe prepared!


# Calculate Perplexities

In [33]:
# BERT with the Args.me adapter, evaluated on the queerphobia sentences
calculatePerplexity(
    model_name='bert-base-uncased',
    test_df=df_queerphobia_bias,
    bias_type='Queerphobia',
    adapter_dir1='/bert_argsme_adapter_cda_sb_all_lbl/checkpoint-46981',
)
                    

Evaluate Results for BERT on Args.me Corpora concerning Queerness Bias
Load trained model.....


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model loaded successfully!


  0%|                                                                                          | 0/280 [00:00<?, ?it/s]

Calculate Perplexity Scores.....


100%|████████████████████████████████████████████████████████████████████████████████| 280/280 [00:42<00:00,  6.54it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 280/280 [02:30<00:00,  1.86it/s]


Perplexity Scores calculated successfully!
Paired t-test of BERT
------------------  -----------
Sample size         273
Mean PPL BS           7.67311
Mean PPL OS           7.39309
Difference Mean       0.280018
t-value               2.30089
P-value (two-tail)    0.0221545
------------------  -----------


Unnamed: 0,Model,Bias Type,Strategy,Mean PPL BS,Mean PPL OS,t-value,p-value
0,BERT,Queerness Bias,Args.me,7.67311,7.393092,2.300889,0.022154


In [None]:
# Show all collected results, one row per evaluated model.
# all_results is already a list of result dicts, so it is passed directly;
# wrapping it in another list would yield a single row whose cells are dicts.
pd.DataFrame(all_results)