In [13]:
from lime.lime_text import LimeTextExplainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from random import randrange
import utils.dataset_processors as dataset_processors
from sota_list import *

In [2]:
# Load the fine-tuned BERT classifier together with its config and tokenizer.
# All three are resolved from the same checkpoint identifier, so it is named once.
BERT_CHECKPOINT = 'fine-tuned-bert-personality-sentence-segmentation'

# output_hidden_states=True is forwarded to the model through the config.
config = AutoConfig.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)
model = AutoModelForSequenceClassification.from_pretrained(BERT_CHECKPOINT, config=config)
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

# The name `model` is rebound to the LSTM network in the following cell, so keep
# a dedicated handle that still points at the BERT model afterwards.
bert_model = model

In [3]:
# Load the trained LSTM classifier from its saved state dict.
# NOTE(review): `torch` and `LSTMNetwork` are not imported explicitly in this
# notebook; presumably they arrive via `from sota_list import *` - confirm.
model_type = 'lstm'
train_type = 'segmented'
file_name = f"finetuned_saved_models/{model_type}_{train_type}.pth"

# The arguments correspond to the printed module below:
# LSTM(768, 128) followed by Linear(128, 5).
model = LSTMNetwork(768, 128, 5)

# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the freshly constructed network has not been moved anywhere.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load(file_name, map_location="cpu"))

# The network is only used for inference here, so switch off training-mode behaviour.
model.eval()
print(model)

LSTMNetwork(
  (lstm): LSTM(768, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)


# Essays

- Pick the essays with at most 512 tokens
- Check the gold labels
- Check the predicted labels
- Visualize

In [7]:
# Load the essays dataset and annotate every essay with its token count.
datafile = "data/essays/essays.csv"
dataset = dataset_processors.load_essays_df(datafile)

# Number of tokens the loaded tokenizer produces for each essay, computed for the
# whole column at once instead of one row at a time.
dataset["token_len"] = dataset["text"].map(lambda essay: len(tokenizer.tokenize(essay)))

# Order the essays from shortest to longest (ties broken by user) and renumber
# the index 0..n-1 so that row labels and row positions coincide.
dataset = dataset.sort_values(by=["token_len", "user"], ascending=True, ignore_index=True)
print(dataset.head(10))

EXT :  EXT
1    1275
0    1192
Name: count, dtype: int64
NEU :  NEU
1    1234
0    1233
Name: count, dtype: int64
AGR :  AGR
1    1309
0    1158
Name: count, dtype: int64
CON :  CON
1    1254
0    1213
Name: count, dtype: int64
OPN :  OPN
1    1271
0    1196
Name: count, dtype: int64



              user                                               text  \
0  1997_971112.txt  alright. what can I talk about. well the only ...   
1  1998_659134.txt  this job is going to drain me if I don't say s...   
2  1998_045186.txt  I want to go to Mexico and dance I have not da...   
3  1998_082105.txt  Is there a reason?  These questions have been ...   
4  1999_973407.txt  For the past few days and now I have been thin...   
5  2002_805223.txt       cold air quietness stress sad unreal comp...   
6  1997_491971.txt  I hope I can finish this assignment in time be...   
7  1998_494559.txt  I believe sometimes I think to much about what...   
8  1997_953451.txt  My roommate won't shut up. O. K. Be

In [14]:
# Keep only the essays whose token count is at most 512.
filtered_dataset = dataset.loc[dataset["token_len"].le(512)]
num_rows = filtered_dataset.shape[0]

# Show the row at position 5 as a one-row frame.
filtered_dataset.iloc[[5]]

Unnamed: 0,user,text,token_len,EXT,NEU,AGR,CON,OPN
5,2002_805223.txt,cold air quietness stress sad unreal comp...,167,1,0,1,0,0


In [17]:
# Pick a random row position among the filtered essays.
rand_num = randrange(num_rows)

# Use positional (.iloc) access for the text as well as for the displayed row,
# so both always refer to the same essay even if the frame's index labels are
# not exactly 0..num_rows-1.
text = filtered_dataset["text"].iloc[rand_num]
print(text)
filtered_dataset.iloc[[rand_num]]

Yesterday in class we talked about what different parts of the brain are in control with. One interesting part to me is the Occipital Lobe. Three years ago I was in a car accident, which I have fully recovered from, expect I have blurred vision in my right eye. Within a few months after the accident I went to many doctors to see what the problem was. One doctor concluded that my optic nerve had been jarred which can cause loss of eyesight. He said that after a year if my nerve turned white then that meant that this is what happened. He also said that I would never regain my eyesight in that eye. Now that it's three years later, there is still no sign that my optic nerve was injured and my eyesight has gradually come back but it is a very slow process. Another doctor could not reach a conclusion but he figured that it might have something to do with my brain. He told me that you could never tell if the brain can overcome such a catastrophe but the brain can do miraculous things. Sometim

Unnamed: 0,user,text,token_len,EXT,NEU,AGR,CON,OPN
65,2000_535643.txt,Yesterday in class we talked about what differ...,289,0,0,0,1,0


# DailyDialog

- Load the dataset
- Pick a conversation, then an utterance
- Pre-process and guess