In [1]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

import utils.dataset_processors as dataset_processors
import numpy as np
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_model_names(model_name):
    """Map a short model alias to its Hugging Face Hub checkpoint identifier.

    Args:
        model_name: Short alias such as ``'bert'`` or ``'xlnet'``. Both
            ``'distilbert'`` and the legacy misspelling ``'distillbert'`` are
            accepted.

    Returns:
        The checkpoint name string associated with the alias.

    Raises:
        KeyError: If ``model_name`` is not a known alias; the message lists
            the supported aliases.
    """
    checkpoints = {
        'distilbert': 'distilbert-base-uncased', # 66M
        'distillbert': 'distilbert-base-uncased', # legacy misspelling, kept for existing callers
        'xlnet': 'xlnet-base-cased', # 110M
        'bert': 'bert-base-uncased', # 110M
        'roberta': 'roberta-base', # 125M
        'albert': 'albert-base-v2', # 11M
        'electra': 'google/electra-small-discriminator', # 14M
        'big-bird': 'google/bigbird-roberta-base', # 125M
        'longformer': 'allenai/longformer-base-4096' # 149M
    }

    try:
        return checkpoints[model_name]
    except KeyError:
        # Same exception type as a plain dict lookup, but with an actionable message
        raise KeyError(
            f"Unknown model alias {model_name!r}; expected one of {sorted(checkpoints)}"
        ) from None

In [3]:
# Short alias of the pretrained language model; it is resolved to the checkpoint
# name that is used for both the tokenizer and the model
plm = 'xlnet'
model_name = get_model_names(plm)

# Whether essays are split into sentences before tokenization
segment_sentences = True

# Name under which the fine-tuned model is saved later. The '-segmented' suffix
# is only added when sentence segmentation is enabled, so the name always
# reflects how the training data was prepared.
saved_model_file = f'{plm}-finetuned-segmented' if segment_sentences else f'{plm}-finetuned'

In [4]:
# Read the essays corpus through the project loader
datafile = "data/essays/essays.csv"
dataset = dataset_processors.load_essays_df(datafile)

# 6:2:2 split: carve off 60% for training, then halve the remaining 40%
# into validation and test. The same seed is used for both cuts.
split_seed = 42
train_data, temp_data = train_test_split(
    dataset, random_state=split_seed, train_size=0.6
)
validation_data, test_data = train_test_split(
    temp_data, random_state=split_seed, train_size=0.5
)

In [5]:
# Big 5 label columns: every column that is not metadata or the essay text
non_label_columns = ('user', 'text', 'token_len')
column_names = list(train_data.columns)
labels = [name for name in column_names if name not in non_label_columns]

# Index -> label and label -> index lookups
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

labels

['EXT', 'NEU', 'AGR', 'CON', 'OPN']

In [43]:
# Import the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# XLNet-specific config overrides that switch off the recurrence memory ("mems")
# for both training and evaluation. They are keyed on the short alias `plm`
# because `model_name` holds the full checkpoint id (e.g. 'xlnet-base-cased')
# and would never equal 'xlnet'. Other architectures get no extra kwargs, since
# they do not define these config fields.
xlnet_overrides = (
    {'use_mems_train': False, 'use_mems_eval': False} if plm == 'xlnet' else {}
)

# Multi-label head: one logit per Big 5 trait
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type = "multi_label_classification",
    num_labels = len(labels),
    id2label = id2label,
    label2id = label2id,
    **xlnet_overrides
)


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Convert from essay to sentences
def split_text_with_labels(row):
    
    # Split sentences
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", row['text'])

    return [{
        'text': sentence,
        'EXT': row['EXT'],
        'NEU': row['NEU'],
        'AGR': row['AGR'],
        'CON': row['CON'],
        'OPN': row['OPN']
    }
        for sentence in sentences       
    ]

def transform_dataframe(old_dataframe):
    """Expand an essay-level frame into a sentence-level frame.

    Each row of ``old_dataframe`` is passed to ``split_text_with_labels`` and
    the resulting records are concatenated, in row order, into a new DataFrame.
    """
    sentence_records = [
        record
        for _, essay_row in old_dataframe.iterrows()
        for record in split_text_with_labels(essay_row)
    ]

    return pd.DataFrame(sentence_records)

# Replace each essay-level split with its sentence-level expansion when enabled
if segment_sentences:
    train_data, test_data, validation_data = (
        transform_dataframe(frame)
        for frame in (train_data, test_data, validation_data)
    )

In [8]:
# Sanity check: show the first 20 rows of the training frame as plain text
print(train_data.head(20))

                                                 text  EXT  NEU  AGR  CON  OPN
0                                     I am tired now.    1    0    1    0    1
1              I don't know what I should talk about.    1    0    1    0    1
2                             I like this assignment.    1    0    1    0    1
3                               Wonder when it's due?    1    0    1    0    1
4    Kristi Urey is the most beautiful person I know.    1    0    1    0    1
5                                         I love you.    1    0    1    0    1
6   I mean I love her with all of my heart, mind, ...    1    0    1    0    1
7                   I like psychology in high school.    1    0    1    0    1
8               It was very interesting and personal.    1    0    1    0    1
9                 That draws people in or so I think.    1    0    1    0    1
10                 Psychology is the food of the sea.    1    0    1    0    1
11                            My roommate's a weirdo

In [9]:
# Wrap each pandas split in a `Dataset`, then bundle the three under named keys
train_dataset, test_dataset, valid_dataset = (
    Dataset.from_dict(frame)
    for frame in (train_data, test_data, validation_data)
)

splits = {
    "train": train_dataset,
    "test": test_dataset,
    "valid": valid_dataset,
}
full_dataset_dict = DatasetDict(splits)

print(full_dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 73537
    })
    test: Dataset({
        features: ['text', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 25629
    })
    valid: Dataset({
        features: ['text', 'EXT', 'NEU', 'AGR', 'CON', 'OPN'],
        num_rows: 23811
    })
})


In [None]:
def preprocess_text(row):
    """Clean, tokenize and attach label vectors for one batch of examples.

    ``row`` is a batch mapping whose ``'text'`` entry is iterated as a sequence
    of strings and which also holds one entry per name in the module-level
    ``labels`` list.

    Returns the tokenizer's encoding for the cleaned texts with an added
    ``"labels"`` entry: one list of floats per example, ordered like ``labels``.
    """
    # Run every text through the project's cleaning helper
    cleaned = list(map(dataset_processors.preprocess_text, row['text']))

    # Tokenize the batch, truncating to at most 256 tokens
    encoding = tokenizer(cleaned, truncation=True, max_length=256)

    # (examples x labels) float matrix, filled one label column at a time
    targets = np.zeros((len(cleaned), len(labels)))
    for column, trait in enumerate(labels):
        targets[:, column] = row[trait]

    encoding["labels"] = targets.tolist()

    return encoding

# Tokenize every split in batches and drop the raw columns, so that only the
# entries returned by `preprocess_text` remain
raw_columns = full_dataset_dict['train'].column_names
full_dataset_dict = full_dataset_dict.map(
    preprocess_text,
    batched = True,
    remove_columns = raw_columns,
)

Map: 100%|██████████| 73537/73537 [00:10<00:00, 7230.63 examples/s]
Map: 100%|██████████| 25629/25629 [00:03<00:00, 7800.78 examples/s]
Map: 100%|██████████| 23811/23811 [00:03<00:00, 6991.84 examples/s]


In [11]:
# Inspect the first encoded training example. The commented-out lines are
# alternative spot checks: keys only, labels only, decoded text of example 5,
# and the label names whose value is 1.0 for example 5.
#print(full_dataset_dict['train'][0].keys())
print(full_dataset_dict['train'][0])
#print(full_dataset_dict['train'][0]['labels'])
#tokenizer.decode(full_dataset_dict['train'][5]['input_ids'])
#[id2label[idx] for idx, label in enumerate(full_dataset_dict['train'][5]['labels']) if label == 1.0]

{'input_ids': [35, 569, 5020, 145, 9, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'labels': [1.0, 0.0, 1.0, 0.0, 1.0]}


In [44]:
import torch

# Use the first GPU when one is available and fall back to the CPU otherwise,
# so this cell does not fail on hosts without CUDA
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Return dataset items as torch tensors on the chosen device, and move the
# model to the same device
full_dataset_dict.set_format("torch", device=device)
model = model.to(device)

In [13]:
from transformers import TrainingArguments, Trainer

# Training hyperparameters consumed by the TrainingArguments cell
batch_size = 16  # used for both the per-device train and eval batch size
learning_rate = 2e-5
epochs = 10  # number of training epochs
metric_name = "accuracy"  # passed as metric_for_best_model

In [14]:
# Collect the training options first so they are easy to inspect or tweak
training_options = dict(
    output_dir = saved_model_file,
    # Evaluate after every epoch; do not write intermediate checkpoints
    evaluation_strategy = "epoch",
    save_strategy = "no",
    # Optimisation settings
    learning_rate = learning_rate,
    weight_decay = 0.01,
    num_train_epochs = epochs,
    # Same batch size for training and evaluation
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    metric_for_best_model = metric_name,
)

args = TrainingArguments(**training_options)



In [15]:
# Show the label order; the metric code indexes label columns by this order
print(labels)

['EXT', 'NEU', 'AGR', 'CON', 'OPN']


In [16]:
from sklearn.metrics import accuracy_score
import torch

def multi_label_metrics(pred_logits, gold_labels):
    """Compute per-label and overall accuracy for multi-label predictions.

    Logits are passed through a sigmoid and thresholded at 0.5 to obtain 0/1
    predictions. The result maps ``"<label> - accuracy"`` to the accuracy of
    each label column, plus ``"accuracy"`` to ``accuracy_score`` computed on
    the full prediction and gold matrices.
    """
    # Decision threshold applied to the sigmoid probabilities
    threshold = 0.5

    # Logits -> probabilities
    probs = torch.sigmoid(torch.Tensor(pred_logits))

    # Probabilities -> 0.0 / 1.0 predictions
    y_pred = np.where(probs.numpy() >= threshold, 1.0, 0.0)
    y_true = gold_labels

    # Accuracy of each label column on its own
    metrics = {}
    for column in range(len(labels)):
        metrics[f"{id2label[column]} - accuracy"] = accuracy_score(
            y_true[:, column], y_pred[:, column]
        )

    # Accuracy over the whole label matrix
    metrics['accuracy'] = accuracy_score(y_true, y_pred)

    return metrics

def compute_metrics(p):
    """Metric callback: unwrap the predictions and score them against the labels.

    When ``p.predictions`` is a tuple, only its first element is scored;
    otherwise ``p.predictions`` is used as-is. Returns the dictionary produced
    by ``multi_label_metrics``.
    """
    if isinstance(p.predictions, tuple):
        logits = p.predictions[0]
    else:
        logits = p.predictions

    return multi_label_metrics(logits, p.label_ids)

In [45]:
# forward pass (testing)

# Fetch the first training example once, row-first. Indexing the column first
# (dataset['input_ids'][0]) would format the whole column just to read one row.
sample = full_dataset_dict['train'][0]

# Add a leading batch dimension of 1 to every tensor. No XLNet-only `use_mems`
# kwarg is passed, so the same smoke test works for the other architectures;
# for XLNet, omitting it is equivalent to passing None.
outputs = model(
    input_ids=sample['input_ids'].unsqueeze(0),
    attention_mask=sample['attention_mask'].unsqueeze(0),
    labels=sample['labels'].unsqueeze(0)
)

outputs

XLNetForSequenceClassificationOutput(loss=tensor(0.7372, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0788,  0.0643, -0.0348,  0.3570, -0.0282]], device='cuda:0',
       grad_fn=<AddmmBackward0>), mems=None, hidden_states=None, attentions=None)

In [46]:
# Initialize the trainer before training.
# Per-epoch evaluation runs on the held-out "valid" split, so the "test" split
# stays untouched for a final, unbiased evaluation after training.
trainer = Trainer(
    model,
    args,
    train_dataset=full_dataset_dict["train"],
    eval_dataset=full_dataset_dict["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [47]:
# Run fine-tuning with the settings in `args`.
# NOTE(review): the recorded run below stopped with a CUDA out-of-memory error.
trainer.train()

  0%|          | 109/45970 [00:40<6:00:31,  2.12it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 360.00 MiB. GPU 

In [39]:
# Save the model under the name chosen in the configuration cell
trainer.save_model(saved_model_file)