In [1]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets.dataset_dict import DatasetDict

import utils.dataset_processors as dataset_processors
import numpy as np

from datasets import Dataset

In [2]:
# Load the dataset
datafile = "data/essays/essays.csv"
dataset = dataset_processors.load_essays_df(datafile)

# Split the dataset (6:2:2)
train_data, temp_data = train_test_split(dataset, train_size=0.6, random_state=42)
validation_data, test_data = train_test_split(temp_data, train_size=0.2, random_state=42)

# Convert to DatasetDict
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)
valid_dataset = Dataset.from_dict(validation_data)

full_dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "valid": valid_dataset
})

print(full_dataset_dict)

EXT :  EXT
1    1275
0    1192
Name: count, dtype: int64
NEU :  NEU
1    1234
0    1233
Name: count, dtype: int64
AGR :  AGR
1    1309
0    1158
Name: count, dtype: int64
CON :  CON
1    1254
0    1213
Name: count, dtype: int64
OPN :  OPN
1    1271
0    1196
Name: count, dtype: int64



DatasetDict({
    train: Dataset({
        features: ['user', 'text', 'token_len', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 1480
    })
    test: Dataset({
        features: ['user', 'text', 'token_len', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 790
    })
    valid: Dataset({
        features: ['user', 'text', 'token_len', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 197
    })
})


In [3]:
# Get the Big 5 labels
column_names = list(train_data.columns)
labels = [label for label in column_names if label not in ['user','text','token_len']]

# Forward and backward mapping
id2label = {idx:label for idx,label in enumerate(labels)}
label2id = {label:idx for idx,label in enumerate(labels)}

In [4]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def preprocess_text(row):

    # Extract the text
    essays = row['text']

    # Clean up
    essays = [dataset_processors.preprocess_text(essay) for essay in essays]

    # Encode them using the tokenizer
    encoded_essay = tokenizer(essays, truncation = True)
    
    # Add the labels
    labels_batch = {key: row[key] for key in row.keys() if key in labels}

    # Create numpy array of batch and labels
    labels_matrix = np.zeros((len(essays), len(labels)))

    # Fill the array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    # Return the encoding
    encoded_essay["labels"] = labels_matrix.tolist()
    
    return encoded_essay

# Perform the preprocessing
full_dataset_dict = full_dataset_dict.map(
    preprocess_text, batched = True, 
    remove_columns = full_dataset_dict['train'].column_names
)

Map:   0%|          | 0/1480 [00:00<?, ? examples/s]

Map:   0%|          | 0/790 [00:00<?, ? examples/s]

Map:   0%|          | 0/197 [00:00<?, ? examples/s]

In [None]:
#print(full_dataset_dict['train'][0].keys())
#print(full_dataset_dict['train'][0])
#print(full_dataset_dict['train'][0]['labels'])
#tokenizer.decode(full_dataset_dict['train'][5]['input_ids'])
#[id2label[idx] for idx, label in enumerate(full_dataset_dict['train'][5]['labels']) if label == 1.0]

In [5]:
full_dataset_dict.set_format("torch")

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type = "multi_label_classification",
    num_labels = len(labels),
    id2label = id2label,
    label2id = label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import TrainingArguments, Trainer

batch_size = 16
learning_rate = 2e-5
epochs = 10
metric_name = "accuracy"

print(learning_rate)


2e-05


In [8]:
args = TrainingArguments(
    'bert-finetuned-personality-detection',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs = epochs,
    weight_decay = 0.01,
    load_best_model_at_end = True,
    metric_for_best_model = metric_name
)

In [9]:
print(labels)

['EXT', 'NEU', 'AGR', 'CON', 'OPN']


In [33]:
from sklearn.metrics import accuracy_score
import torch

def multi_label_metrics(pred_logits, gold_labels):

    # Our threshold
    threshold = 0.5

    # Apply sigmoid to logits
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(pred_logits))

    # Convert predictions to integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # Compute metrics
    y_true = gold_labels

    metrics = {
        f"{id2label[i]} - accuracy": accuracy_score(y_true[:, i], y_pred[:, i]) 
        for i in range(len(labels))
    }

    overall_accuracy = accuracy_score(y_true, y_pred)

    # Store and return as dictionary
    metrics['accuracy'] = overall_accuracy
    
    return metrics

def compute_metrics(p):

    # Get the type of predictions
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    # Compute the results
    results = multi_label_metrics(preds, p.label_ids)

    return results

In [14]:
#full_dataset_dict['train'][0]['labels'].type()
#full_dataset_dict['train']['input_ids'][0]

In [21]:
# forward pass (testing)
outputs = model(
    input_ids=full_dataset_dict['train']['input_ids'][0].unsqueeze(0), 
    labels=full_dataset_dict['train'][0]['labels'].unsqueeze(0)
)

outputs

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [35]:
# Initialize the trainer before training
trainer = Trainer(
    model,
    args,
    train_dataset=full_dataset_dict["train"],
    eval_dataset=full_dataset_dict["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Ext - accuracy,Neu - accuracy,Agr - accuracy,Con - accuracy,Opn - accuracy,Accuracy
1,No log,0.769261,0.526582,0.525316,0.510127,0.529114,0.593671,0.049367


In [None]:
trainer.save_model('fine-tuned-bert')

# Feature evaluation

In [None]:
import json
import codecs
import pickle

In [None]:
dataset = []

from pprint import pprint

with open('train_dataset_vad.pkl','rb') as file:
    dataset = pickle.load(file)

pprint(dataset[0][0])

In [None]:
file_directory = 'dataset_erc\dailydialogue\dev.json'

train_file = []

with codecs.open(file_directory, "r", "utf-8") as f:
    train_file = json.load(f)

In [None]:
print(train_file[0])

In [None]:
def get_personality():
    return 5

def iterate_conversation(conversation_list):
    conversation_with_personality = []

    for conversation in conversation_list:
        personality_features = get_personality()

        conversation['personality_features'] = personality_features
        conversation_with_personality.append(conversation)
        
    return conversation_with_personality
    

train_file_with_personality = []

for conversation_list in train_file:
    print(conversation_list)
    print('\n\n')
    conversation_with_personality = iterate_conversation(conversation_list)
    print(conversation_with_personality)
    break