In [6]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets.dataset_dict import DatasetDict

import utils.dataset_processors as dataset_processors
import numpy as np

from datasets import Dataset

In [7]:
# Load the dataset
datafile = "data/essays/essays.csv"
dataset = dataset_processors.load_essays_df(datafile)

# Split the dataset (6:2:2)
train_data, temp_data = train_test_split(dataset, train_size=0.6, random_state=42)
validation_data, test_data = train_test_split(temp_data, train_size=0.2, random_state=42)

# Convert to DatasetDict
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)
valid_dataset = Dataset.from_dict(validation_data)

full_dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "valid": valid_dataset
})

print(full_dataset_dict)

EXT :  EXT
1    1275
0    1192
Name: count, dtype: int64
NEU :  NEU
1    1234
0    1233
Name: count, dtype: int64
AGR :  AGR
1    1309
0    1158
Name: count, dtype: int64
CON :  CON
1    1254
0    1213
Name: count, dtype: int64
OPN :  OPN
1    1271
0    1196
Name: count, dtype: int64



DatasetDict({
    train: Dataset({
        features: ['user', 'text', 'token_len', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 1480
    })
    test: Dataset({
        features: ['user', 'text', 'token_len', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 790
    })
    valid: Dataset({
        features: ['user', 'text', 'token_len', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 197
    })
})


In [15]:
# Get the Big 5 labels
column_names = list(train_data.columns)
labels = [label for label in column_names if label not in ['user','text','token_len']]

# Forward and backward mapping
id2label = {idx:label for idx,label in enumerate(labels)}
label2id = {label:idx for idx,label in enumerate(labels)}

In [18]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def preprocess_text(row):

    # Extract the text
    essays = row['text']

    # Clean up
    essays = [dataset_processors.preprocess_text(essay) for essay in essays]

    # Encode them using the tokenizer
    encoded_essay = tokenizer(essays, truncation = True)
    
    # Add the labels
    labels_batch = {key: row[key] for key in row.keys() if key in labels}

    # Create numpy array of batch and labels
    labels_matrix = np.zeros((len(essays), len(labels)))

    # Fill the array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    # Return the encoding
    encoded_essay["labels"] = labels_matrix.tolist()
    
    return encoded_essay

# Perform the preprocessing
full_dataset_dict = full_dataset_dict.map(
    preprocess_text, batched = True, 
    remove_columns = full_dataset_dict['train'].column_names
)

Map:   0%|          | 0/1480 [00:00<?, ? examples/s]

Map:   0%|          | 0/790 [00:00<?, ? examples/s]

Map:   0%|          | 0/197 [00:00<?, ? examples/s]

In [27]:
#print(full_dataset_dict['train'][0].keys())
#print(full_dataset_dict['train'][0])
#print(full_dataset_dict['train'][0]['labels'])
#tokenizer.decode(full_dataset_dict['train'][5]['input_ids'])
#[id2label[idx] for idx, label in enumerate(full_dataset_dict['train'][5]['labels']) if label == 1.0]

['CON']

# Feature evaluation

In [None]:
import json
import codecs
import pickle

In [None]:
dataset = []

from pprint import pprint

with open('train_dataset_vad.pkl','rb') as file:
    dataset = pickle.load(file)

pprint(dataset[0][0])

In [None]:
file_directory = 'dataset_erc\dailydialogue\dev.json'

train_file = []

with codecs.open(file_directory, "r", "utf-8") as f:
    train_file = json.load(f)

In [None]:
print(train_file[0])

In [None]:
def get_personality():
    return 5

def iterate_conversation(conversation_list):
    conversation_with_personality = []

    for conversation in conversation_list:
        personality_features = get_personality()

        conversation['personality_features'] = personality_features
        conversation_with_personality.append(conversation)
        
    return conversation_with_personality
    

train_file_with_personality = []

for conversation_list in train_file:
    print(conversation_list)
    print('\n\n')
    conversation_with_personality = iterate_conversation(conversation_list)
    print(conversation_with_personality)
    break