In [6]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

import utils.dataset_processors as dataset_processors
import numpy as np
import pandas as pd
import re

In [17]:
# Load the dataset
datafile = "data/essays/essays.csv"
dataset = dataset_processors.load_essays_df(datafile)

# Split the dataset (6:2:2)
train_data, temp_data = train_test_split(dataset, train_size=0.6, random_state=42)
validation_data, test_data = train_test_split(temp_data, train_size=0.2, random_state=42)

EXT :  EXT
1    1275
0    1192
Name: count, dtype: int64
NEU :  NEU
1    1234
0    1233
Name: count, dtype: int64
AGR :  AGR
1    1309
0    1158
Name: count, dtype: int64
CON :  CON
1    1254
0    1213
Name: count, dtype: int64
OPN :  OPN
1    1271
0    1196
Name: count, dtype: int64





In [15]:
# Get the Big 5 labels
column_names = list(train_data.columns)
labels = [label for label in column_names if label not in ['user','text','token_len']]

# Forward and backward mapping
id2label = {idx:label for idx,label in enumerate(labels)}
label2id = {label:idx for idx,label in enumerate(labels)}

labels

['EXT', 'NEU', 'AGR', 'CON', 'OPN']

In [None]:
# Convert from essay to sentences
def split_text_with_labels(row):
    
    # Split sentences
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", row['text'])

    return [{
        'text': sentence,
        'EXT': row['EXT'],
        'NEU': row['NEU'],
        'AGR': row['AGR'],
        'CON': row['CON'],
        'OPN': row['OPN']
    }
        for sentence in sentences       
    ]

def transform_dataframe(old_dataframe):

    # Begin the split
    split_data = []
    
    for index, row in old_dataframe.iterrows():
        split_data.extend(split_text_with_labels(row))
        
    return pd.DataFrame(split_data)

train_data = transform_dataframe(train_data)
test_data = transform_dataframe(test_data)
validation_data = transform_dataframe(validation_data)

In [20]:
print(train_data.head(20))

                                                 text  EXT  NEU  AGR  CON  OPN
0                                     I am tired now.    1    0    1    0    1
1              I don't know what I should talk about.    1    0    1    0    1
2                             I like this assignment.    1    0    1    0    1
3                               Wonder when it's due?    1    0    1    0    1
4    Kristi Urey is the most beautiful person I know.    1    0    1    0    1
5                                         I love you.    1    0    1    0    1
6   I mean I love her with all of my heart, mind, ...    1    0    1    0    1
7                   I like psychology in high school.    1    0    1    0    1
8               It was very interesting and personal.    1    0    1    0    1
9                 That draws people in or so I think.    1    0    1    0    1
10                 Psychology is the food of the sea.    1    0    1    0    1
11                            My roommate's a weirdo

In [19]:
# Convert to DatasetDict
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)
valid_dataset = Dataset.from_dict(validation_data)

full_dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "valid": valid_dataset
})

print(full_dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 73537
    })
    test: Dataset({
        features: ['text', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 39898
    })
    valid: Dataset({
        features: ['text', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 9542
    })
})


In [21]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def preprocess_text(row):

    # Extract the text
    essays = row['text']

    # Clean up
    essays = [dataset_processors.preprocess_text(essay) for essay in essays]

    # Encode them using the tokenizer
    encoded_essay = tokenizer(essays, truncation = True)
    
    # Add the labels
    labels_batch = {key: row[key] for key in row.keys() if key in labels}

    # Create numpy array of batch and labels
    labels_matrix = np.zeros((len(essays), len(labels)))

    # Fill the array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    # Return the encoding
    encoded_essay["labels"] = labels_matrix.tolist()
    
    return encoded_essay

# Perform the preprocessing
full_dataset_dict = full_dataset_dict.map(
    preprocess_text, batched = True, 
    remove_columns = full_dataset_dict['train'].column_names
)

Map:   0%|          | 0/73537 [00:00<?, ? examples/s]

Map:   0%|          | 0/39898 [00:00<?, ? examples/s]

Map:   0%|          | 0/9542 [00:00<?, ? examples/s]

In [24]:
#print(full_dataset_dict['train'][0].keys())
#print(full_dataset_dict['train'][0])
#print(full_dataset_dict['train'][0]['labels'])
#tokenizer.decode(full_dataset_dict['train'][5]['input_ids'])
#[id2label[idx] for idx, label in enumerate(full_dataset_dict['train'][5]['labels']) if label == 1.0]

[1.0, 0.0, 1.0, 0.0, 1.0]


In [25]:
full_dataset_dict.set_format("torch")

In [26]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type = "multi_label_classification",
    num_labels = len(labels),
    id2label = id2label,
    label2id = label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from transformers import TrainingArguments, Trainer

batch_size = 16
learning_rate = 2e-5
epochs = 10
metric_name = "accuracy"

print(learning_rate)


2e-05


In [28]:
args = TrainingArguments(
    'bert-finetuned-personality-detection',
    evaluation_strategy = "epoch",
    save_strategy = "no",
    learning_rate = learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs = epochs,
    weight_decay = 0.01,
    metric_for_best_model = metric_name
)

In [29]:
print(labels)

['EXT', 'NEU', 'AGR', 'CON', 'OPN']


In [30]:
from sklearn.metrics import accuracy_score
import torch

def multi_label_metrics(pred_logits, gold_labels):

    # Our threshold
    threshold = 0.5

    # Apply sigmoid to logits
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(pred_logits))

    # Convert predictions to integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # Compute metrics
    y_true = gold_labels

    metrics = {
        f"{id2label[i]} - accuracy": accuracy_score(y_true[:, i], y_pred[:, i]) 
        for i in range(len(labels))
    }

    overall_accuracy = accuracy_score(y_true, y_pred)

    # Store and return as dictionary
    metrics['accuracy'] = overall_accuracy
    
    return metrics

def compute_metrics(p):

    # Get the type of predictions
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    # Compute the results
    results = multi_label_metrics(preds, p.label_ids)

    return results

In [None]:
#full_dataset_dict['train'][0]['labels'].type()
#full_dataset_dict['train']['input_ids'][0]

In [31]:
# forward pass (testing)
outputs = model(
    input_ids=full_dataset_dict['train']['input_ids'][0].unsqueeze(0), 
    labels=full_dataset_dict['train'][0]['labels'].unsqueeze(0)
)

outputs

SequenceClassifierOutput(loss=tensor(0.6925, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.6925, -0.7053, -0.4995,  0.1744, -0.3741]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [33]:
# Initialize the trainer before training
trainer = Trainer(
    model,
    args,
    train_dataset=full_dataset_dict["train"],
    eval_dataset=full_dataset_dict["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [None]:
trainer.save_model('fine-tuned-bert-personality')

# Feature evaluation

In [None]:
import json
import codecs
import pickle

In [None]:
dataset = []

from pprint import pprint

with open('train_dataset_vad.pkl','rb') as file:
    dataset = pickle.load(file)

pprint(dataset[0][0])

In [None]:
file_directory = 'dataset_erc\dailydialogue\dev.json'

train_file = []

with codecs.open(file_directory, "r", "utf-8") as f:
    train_file = json.load(f)

In [None]:
print(train_file[0])

In [None]:
def get_personality():
    return 5

def iterate_conversation(conversation_list):
    conversation_with_personality = []

    for conversation in conversation_list:
        personality_features = get_personality()

        conversation['personality_features'] = personality_features
        conversation_with_personality.append(conversation)
        
    return conversation_with_personality
    

train_file_with_personality = []

for conversation_list in train_file:
    print(conversation_list)
    print('\n\n')
    conversation_with_personality = iterate_conversation(conversation_list)
    print(conversation_with_personality)
    break