In [1]:
from sota_list import LSTMNetwork
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

import torch
import codecs
import json
from pprint import pprint


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint whose state dict is loaded into the LSTMNetwork classifier below
model_weights_path = "finetuned_saved_models/bilstm-bert-finetuned-segmented-extracted.pth"

# PLM (pre-trained language model): identifier handed to `from_pretrained`
plm_name = "bert-finetuned-segmented"

In [3]:
def load_llm_parts(model_name):
    """Load a sequence-classification model and its tokenizer.

    The model is created from a config that has ``output_hidden_states=True``
    set, so hidden states can be requested from its outputs.

    Args:
        model_name: Name or path forwarded unchanged to every
            ``from_pretrained`` call.

    Returns:
        A two-element list ``[model, tokenizer]``.
    """
    llm_config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)

    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=llm_config,
    )
    text_tokenizer = AutoTokenizer.from_pretrained(model_name)

    return [classifier, text_tokenizer]



# Initialize the classifier and load its fine-tuned weights.
# NOTE(review): the positional arguments are presumably (input size, hidden
# size, number of outputs, bidirectional flag) — confirm against LSTMNetwork.
model = LSTMNetwork(768,128,5,True)

# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA. Nothing in the cells shown moves the model or its
# inputs to another device, so CPU tensors are what the rest of the code uses.
model.load_state_dict(torch.load(model_weights_path, map_location="cpu"))

# Switch to evaluation mode before running inference
model.eval()

# Load the fine-tuned language model and its tokenizer
llm_model, tokenizer = load_llm_parts(plm_name)

In [38]:
# Load the ERC datasets
def load_dataset(name, type):
    """Load one partition of an ERC dataset from its JSON file.

    Args:
        name: Dataset directory name under ``erc-datasets/``.
        type: Partition file stem; the file read is ``<type>.json``. The
            parameter shadows the ``type`` builtin, but the name is kept so
            existing keyword callers keep working.

    Returns:
        Whatever ``json.load`` decodes from
        ``erc-datasets/<name>/<type>.json`` (read as UTF-8).

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    file_name = f'erc-datasets/{name}/{type}.json'

    # The builtin open() with an explicit encoding replaces codecs.open().
    # The old trailing `return None` could never run and has been removed.
    with open(file_name, 'r', encoding='utf-8') as fr:
        return json.load(fr)

# Dataset and partition to annotate; both values are reused later to build
# the output file name.
name, partition = 'MELD', 'train'

# Conversations of the selected dataset partition
dataset = load_dataset(name, partition)

In [28]:
def perform_classification(model, llm_model, tokenizer, text):
    """Predict binary flags for a single text.

    The text is tokenized (truncated to 512 tokens), passed through
    ``llm_model``, and the last hidden layer's vector at the first token
    position of the first sequence is fed to ``model``.

    Args:
        model: Object exposing ``features_extraction`` and
            ``single_classification``.
        llm_model: Callable accepting the tokenizer output as keyword
            arguments and returning an object with ``hidden_states``.
        tokenizer: Callable that turns ``text`` into the inputs of
            ``llm_model``.
        text: The text to classify.

    Returns:
        A list of ints, ``1`` where the corresponding score is strictly
        greater than 0.5 and ``0`` otherwise. Scores are taken from the first
        row of the ``single_classification`` output.
        NOTE(review): the 0.5 threshold presumably assumes the scores are
        probabilities — confirm against LSTMNetwork.
    """
    # Tokenize
    token_ids = tokenizer(
        text,
        truncation=True,
        return_tensors='pt',
        max_length=512,
        add_special_tokens=True
    )

    # This is inference only. Without no_grad, every call would record an
    # autograd graph through both networks, wasting memory and time when the
    # function is called once per utterance.
    with torch.no_grad():
        llm_output = llm_model(**token_ids)

        # Extract CLS: last hidden layer, first sequence, first token, then
        # add the leading dimension back.
        cls_output = llm_output.hidden_states[-1][0, 0, :]
        cls_output = cls_output.unsqueeze(0)

        # Get the output
        output = model.features_extraction(cls_output)
        scores = model.single_classification(output)

    # Convert to labels
    scores = scores.detach().tolist()[0]

    return [
        int(score > 0.5)
        for score in scores
    ]


In [None]:
# Label names, indexed in the same order as the flags returned by
# perform_classification.
labels = ['extraverted', 'neurotic', 'agreeable', 'conscientious', 'open']

# Annotate every utterance of every conversation in place
for conversation in dataset:
    for utt_data in conversation:
        scores = perform_classification(model, llm_model, tokenizer, utt_data['utterance'])

        # A flag of 0 becomes "not <label>"; any other flag becomes the bare label
        utt_data['personality'] = [
            f'{labels[idx]}' if flag != 0 else f'not {labels[idx]}'
            for idx, flag in enumerate(scores)
        ]

In [42]:
# Write the annotated dataset next to the partition it was read from
new_file = f'erc-datasets/{name}/{partition}-personality.json'

# ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 output
with codecs.open(new_file, "w", "utf-8") as out_file:
    json.dump(
        dataset,
        out_file,
        ensure_ascii=False,
        indent=4,
    )