In [1]:
from lime.lime_text import LimeTextExplainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from random import randrange
import utils.dataset_processors as dataset_processors
from sota_list import *

In [2]:
# Load the finetuned BERT model

config = AutoConfig.from_pretrained('fine-tuned-bert-personality-sentence-segmentation', output_hidden_states =True)
bert_model = AutoModelForSequenceClassification.from_pretrained('fine-tuned-bert-personality-sentence-segmentation', config=config)
tokenizer = AutoTokenizer.from_pretrained('fine-tuned-bert-personality-sentence-segmentation')

In [3]:
# Load the trained LSTM model
model_type = 'lstm'
train_type = 'segmented'
file_name = f"finetuned_saved_models/{model_type}_{train_type}.pth"

persona_model = LSTMNetwork(768,128,5)
persona_model.load_state_dict(torch.load(f"{file_name}"))
persona_model.eval()
print(persona_model)

LSTMNetwork(
  (lstm): LSTM(768, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)


# Essays

- Pick the ones with less than 512 tokens
- Check the gold labels
- Check the predicted labels
- Visualize

In [4]:
# Load the dataset and pre-process
datafile = "data/essays/essays.csv"
dataset = dataset_processors.load_essays_df(datafile)

# sorting all essays in ascending order of their length
for ind in dataset.index:
    tokens = tokenizer.tokenize(dataset["text"][ind])
    dataset.at[ind, "token_len"] = len(tokens)

dataset.sort_values(by=["token_len", "user"], inplace=True, ascending=True, ignore_index=True)
print(dataset.head(10))

Token indices sequence length is longer than the specified maximum sequence length for this model (800 > 512). Running this sequence through the model will result in indexing errors


EXT :  EXT
1    1275
0    1192
Name: count, dtype: int64
NEU :  NEU
1    1234
0    1233
Name: count, dtype: int64
AGR :  AGR
1    1309
0    1158
Name: count, dtype: int64
CON :  CON
1    1254
0    1213
Name: count, dtype: int64
OPN :  OPN
1    1271
0    1196
Name: count, dtype: int64



              user                                               text  \
0  1997_971112.txt  alright. what can I talk about. well the only ...   
1  1998_659134.txt  this job is going to drain me if I don't say s...   
2  1998_045186.txt  I want to go to Mexico and dance I have not da...   
3  1998_082105.txt  Is there a reason?  These questions have been ...   
4  1999_973407.txt  For the past few days and now I have been thin...   
5  2002_805223.txt       cold air quietness stress sad unreal comp...   
6  1997_491971.txt  I hope I can finish this assignment in time be...   
7  1998_494559.txt  I believe sometimes I think to much about what...   
8  1997_953451.txt  My roommate won't shut up. O. K. Be

In [5]:
# Filter those less than a limit
limit = 100
filtered_dataset = dataset[dataset['token_len'] <= limit]
num_rows = len(filtered_dataset)
num_rows

3

In [6]:
rand_num = randrange(num_rows)
sample = filtered_dataset['text'][rand_num]
print(sample)

filtered_dataset.iloc[[rand_num]]

I want to go to Mexico and dance I have not dance in a wile no time the is so much good live music here it because money though I am hopefully going to get a job I need to turn in that application tomorrow too I have a lot of stuff to do oh well I am going take it as it comes though bye. 


Unnamed: 0,user,text,token_len,EXT,NEU,AGR,CON,OPN
2,1998_045186.txt,I want to go to Mexico and dance I have not da...,68,0,1,1,1,1


In [7]:
class_names = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']

In [14]:
# Create the function to fetch the probability scores
def get_proba_scores(texts):

    # Tokenize
    # NOTE: This is for just one input.
    # Let's adapt for future ones
    tokenized = tokenizer(texts, 
                padding = True,
                return_tensors="pt",
                truncation=True)

    # Get the output logits and convert to scores
    bert_output = bert_model(**tokenized)
    cls_embedding = bert_output.hidden_states[-1][0,0,:]
    cls_embedding = cls_embedding.unsqueeze(0)
    
    probability = persona_model(cls_embedding)
    return probability.detach().numpy()

In [15]:
proba = get_proba_scores(sample)
print(proba)

[[0.06544005 0.9209875  0.65194625 0.82494074 0.90769947]]


In [17]:
# Create the explainer object
explainer = LimeTextExplainer(class_names=class_names)

# Explain the instance passed in
explain_bert_output = explainer.explain_instance(
    sample, get_proba_scores, 
    num_features = 10, num_samples = 25,
    top_labels = 5
)

# Show the output in notebook
explain_bert_output.show_in_notebook(text=sample)

ValueError: Found input variables with inconsistent numbers of samples: [25, 1]

# DailyDialog

- Load the dataset
- Pick conversation, then utterance
- Pre-process and guess

In [20]:
import json
import codecs

def load_dataset(portion_set):
     
    file_directory = f"dataset_erc/dailydialogue/{portion_set}.json"

    train_file = []

    with codecs.open(file_directory, "r", "utf-8") as f:
        train_file = json.load(f)

    return train_file

In [21]:
portion_set = "train"
dataset = load_dataset(portion_set)

In [25]:
num_conv = len(dataset)
conv = dataset[randrange(num_conv)]

num_utterances = len(conv)
sample = conv[randrange(num_utterances)]

sample

{'utterance': 'you are under arrest , sir . you have the right to remain silent . you better not pout , you better not cry . anything you say can and will be used against you . you have the right to an attorney . if you cannot afford one , the state will appoint one for you.do you understand the arrest to you ?',
 'sentiment': '0',
 'act': '1'}

In [27]:
proba = get_proba_scores(utterance['utterance'])
print(proba)
class_names = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']
class_names

[[0.33714816 0.24806674 0.9022168  0.6177298  0.67106724]]


['EXT', 'NEU', 'AGR', 'CON', 'OPN']