#### Interpret the Model with LIME

In [None]:
from __future__ import print_function
from eli5.lime import TextExplainer
from eli5.lime.samplers import MaskingTextSampler

import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForTokenClassification
from IPython.display import display
from torch import cuda
from tqdm import tqdm



In [None]:
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
# Load both dataset splits and keep their sentence / label columns as plain lists.
train = pd.read_csv('../../Dataset/train.csv')
train_sentences, train_labels = (
    train[column].to_list() for column in ('sentence', 'word_labels')
)
test = pd.read_csv('../../Dataset/test.csv')
test_sentences, test_labels = (
    test[column].to_list() for column in ('sentence', 'word_labels')
)

In [None]:
# Model configuration, label vocabulary, tokenizer and fine-tuned weights.
MAX_LEN = 128
MAX_GRAD_NORM = 10

# Set of labels in the dataset.
temp_labels = [
    'Generic She',
    'Behavioural Stereotypes',
    'O',
    'Generic He',
    'i-Behavioural Stereotypes',
    'i-Stereotyping Bias',
    'Societal Stereotypes',
    'i-Societal Stereotypes',
    'Explicit Marking of Sex',
]
# Map every label to its position in temp_labels, and back.
label2id = dict(zip(temp_labels, range(len(temp_labels))))
id2label = dict(enumerate(temp_labels))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# The weights in ./model have to be downloaded from Google Drive first:
# https://drive.google.com/drive/u/0/folders/1XhdQ1rH-p1CGNWDvKcsELc5tEhNQtYtB
model = BertForTokenClassification.from_pretrained('./model')

In [None]:
class NERExplainerGenerator:
    """Build per-word prediction functions for a token-classification model.

    The model yields one label distribution per token. Each function returned
    by ``get_predict_function`` reports, for every text it is given, only the
    distribution of one fixed word position, i.e. one row per text.
    """

    # Tokens inserted by the tokenizer that do not belong to any input word.
    _SPECIAL_TOKENS = frozenset({'[CLS]', '[SEP]', '[PAD]'})

    def __init__(self, model, tokenizer, device):
        """Store the collaborators and place the model on ``device``.

        Args:
            model: Token-classification model. It is called as
                ``model(ids, mask)`` and its first output is used as logits.
            tokenizer: Tokenizer used to encode texts and to map ids back to
                tokens.
            device: Torch device (e.g. ``'cpu'`` or ``'cuda'``) used for the
                model and for every input tensor.
        """
        # The input tensors are sent to `device`, so the model must be there
        # as well; otherwise the forward pass fails with a device mismatch.
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = device

    @classmethod
    def _word_level_rows(cls, rows, tokens):
        """Keep the rows whose token starts a word.

        Drops rows belonging to special tokens and to WordPiece continuation
        pieces (tokens starting with ``##``).
        """
        return [
            row
            for row, token in zip(rows, tokens)
            if not (token.startswith('##') or token in cls._SPECIAL_TOKENS)
        ]

    def get_predict_function(self, word_idx):
        """Return a function that predicts the label distribution of one word.

        Args:
            word_idx: Position of the word of interest among the word-level
                tokens (special tokens and ``##`` pieces are not counted).
                NOTE(review): the tokenizer may also split words at
                punctuation, so this only approximates whitespace word
                positions - confirm against the data.

        Returns:
            A callable that takes a sequence of texts and returns a float64
            array of shape ``(len(texts), len(temp_labels))``; each row is
            the label distribution of the word at ``word_idx``.

        Raises:
            IndexError: Raised by the returned callable when a text has no
                word-level token at ``word_idx``.
        """
        def predict_func(texts):
            probs = np.zeros(shape=(len(texts), len(temp_labels)))
            for idx, text in tqdm(enumerate(texts)):
                inputs = self.tokenizer(
                    text,
                    padding='max_length',
                    truncation=True,
                    max_length=MAX_LEN,
                    return_tensors='pt',
                )
                ids = inputs["input_ids"].to(self.device)
                mask = inputs["attention_mask"].to(self.device)

                # Inference only: no autograd graph is needed.
                with torch.no_grad():
                    logits = self.model(ids, mask)[0]

                # Softmax over the label axis, computed on the CPU in float64
                # and renormalised so that every row sums to 1 within float64
                # precision. Rounding to two decimals (as done previously)
                # breaks that property, which samplers such as numpy's
                # `choice(p=...)` reject.
                token_probs = torch.softmax(logits[0].cpu().double(), dim=-1).numpy()
                token_probs = token_probs / token_probs.sum(axis=1, keepdims=True)

                tokens = self.tokenizer.convert_ids_to_tokens(ids[0].tolist())
                word_probs = self._word_level_rows(token_probs, tokens)

                probs[idx] = word_probs[word_idx]
            return probs

        return predict_func

In [None]:
# Factory for the per-word prediction functions handed to the explainer.
explainer_generator = NERExplainerGenerator(model, tokenizer, device)

# Text sampler configuration used to generate the perturbed sentences.
sampler = MaskingTextSampler(replacement="fm", max_replace=0.7, token_pattern=None, bow=False)

# Explainer fitted once per sentence in the loop further down.
explainer = TextExplainer(sampler=sampler, position_dependent=True, random_state=42, n_samples=100)

In [None]:
# Draw 5 random training rows for each bias class (case-insensitive substring
# match on the label column).
_word_labels = train['word_labels']
generic_she_df = train[_word_labels.str.contains("Generic She", case=False)].sample(n=5)
generic_he_df = train[_word_labels.str.contains("Generic He", case=False)].sample(n=5)
bs_df = train[_word_labels.str.contains("Behavioural Stereotypes", case=False)].sample(n=5)
ss_df = train[_word_labels.str.contains("Societal Stereotypes", case=False)].sample(n=5)
ems_df = train[_word_labels.str.contains("Explicit Marking of Sex", case=False)].sample(n=5)

# Assemble the rows to explain; the original row index of `train` is kept.
# NOTE(review): only the "Explicit Marking of Sex" sample is included here;
# the other four samples are drawn but unused - confirm this is intended.
sample_df = pd.concat([ems_df])
print(sample_df.shape[0])


In [None]:
def find_word_idx(label):
    """Return the position of the first word whose label is not ``'O'``.

    Args:
        label: Comma-separated per-word labels of one sentence, e.g.
            ``'O,O,Generic He,O'``. Whitespace around individual labels is
            ignored.

    Returns:
        Zero-based index of the first labelled (non-``'O'``) word.

    Raises:
        IndexError: If every label is ``'O'``.
    """
    for position, word_label in enumerate(label.split(',')):
        # Strip so that a padded ' O' is not mistaken for a bias label.
        if word_label.strip() != 'O':
            return position
    raise IndexError(f'no labelled word found in {label!r}')


In [None]:
# Explain, for every sampled sentence, the prediction of its first labelled word.
for _, row in sample_df.iterrows():
    # Read the inputs before the try block so the failure report below can
    # always print them and never shows values left over from an earlier row.
    # The rows come from `train`, so these equal the entries of
    # train_sentences / train_labels for the same row.
    sentence = row['sentence']
    label = row['word_labels']
    try:
        words = sentence.split()

        word_idx = find_word_idx(label)
        predict_func = explainer_generator.get_predict_function(word_idx=word_idx)
        explainer.fit(sentence, predict_func)

        print('lime interpretation of the sentence: ')
        print(sentence)
        print('with label: ')
        print(label)
        print(f'for the word {words[word_idx]}')

        display(explainer.show_prediction(target_names=temp_labels))
        display(explainer.show_weights(target_names=temp_labels))
        print("prediction of lime model")
        print(explainer.y_proba_)
    except Exception as exc:
        # Best effort: report the failing sentence and carry on with the next
        # one. `Exception` (not a bare except) lets KeyboardInterrupt through,
        # and the actual error is printed so unrelated failures are not
        # mistaken for a probability issue.
        print("Probability problem")
        print(f'{type(exc).__name__}: {exc}')
        print(sentence)
        print(label)

    