In [1]:
import numpy as np
import os
import pandas as pd

from datasets import Dataset
from functools import reduce
from seqeval.metrics import recall_score, precision_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from functools import partial
from tqdm import tqdm

# add the parent directory to the path so we can import the dataloader module
import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split
from src.models.utils import get_fbeta_score, compute_metrics

## Trainign roberta on just the classification head

In [2]:
class CFG:
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    label2id['[PAD]'] = -100
    id2label = {i: label for label, i in label2id.items()}
    seed = 42

    # model checkpoint
    model_name = 'roberta-base'
    train_head_only = True

    # path to the directory where the model will be saved
    local_path = os.path.abspath(os.path.abspath(''))
    target_dir = os.path.join(local_path,'..','models', 'roberta-base')

    #training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer'), 
        evaluation_strategy="epoch"
        )
    model_save_path = os.path.join(target_dir, 'model')

tokenizer = AutoTokenizer.from_pretrained(CFG.model_name, add_prefix_space=True)

## Loading the data

In [3]:
keys_to_flatten = ['input_ids', 'attention_mask', 'org_word_ids', 'document']
data_path = os.path.join('..','data', 'raw', 'train.json')
data = get_dataset_from_path(data_path)
data = preprocess_data(data, tokenizer, label2id = CFG.label2id, keys_to_flatten=keys_to_flatten)
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'],
    num_rows: 6807
})
encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 5/5 [00:06<00:00,  1.20s/it]


## Training

In [4]:
def compute_metrics(p, id2label):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    #print(predictions[1])
    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    #print(true_predictions[1])
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    #print(true_labels[1])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    fbeta_score = get_fbeta_score(precision, recall)

    results = {
        'recall': recall,
        'precision': precision,
        'fbeta_score': fbeta_score
    }
        
    return results

In [5]:
model = AutoModelForTokenClassification.from_pretrained(CFG.model_name, num_labels=len(CFG.label2id), id2label=CFG.id2label, label2id=CFG.label2id)
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
training_args_head = CFG.training_args
training_args_head.output_dir = os.path.join(CFG.target_dir, 'head_only')
training_args_head.num_train_epochs = 10
trainer_head = Trainer(
    model=model,
    args=training_args_head,
    train_dataset=data_train,
    eval_dataset=data_eval,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, id2label=CFG.id2label),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [7]:
trainer_head.train()

  0%|          | 0/14300 [00:00<?, ?it/s]

{'loss': 1.476, 'learning_rate': 4.825174825174825e-05, 'epoch': 0.35}
{'loss': 0.2472, 'learning_rate': 4.6503496503496505e-05, 'epoch': 0.7}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.030649319291114807, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 50.263, 'eval_samples_per_second': 25.287, 'eval_steps_per_second': 3.163, 'epoch': 1.0}
{'loss': 0.0533, 'learning_rate': 4.475524475524476e-05, 'epoch': 1.05}
{'loss': 0.0299, 'learning_rate': 4.300699300699301e-05, 'epoch': 1.4}
{'loss': 0.0221, 'learning_rate': 4.125874125874126e-05, 'epoch': 1.75}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.01109398901462555, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 48.2611, 'eval_samples_per_second': 26.336, 'eval_steps_per_second': 3.295, 'epoch': 2.0}
{'loss': 0.0181, 'learning_rate': 3.9510489510489516e-05, 'epoch': 2.1}
{'loss': 0.0165, 'learning_rate': 3.776223776223776e-05, 'epoch': 2.45}
{'loss': 0.0124, 'learning_rate': 3.601398601398602e-05, 'epoch': 2.8}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.00758145097643137, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 48.1492, 'eval_samples_per_second': 26.397, 'eval_steps_per_second': 3.302, 'epoch': 3.0}
{'loss': 0.013, 'learning_rate': 3.4265734265734265e-05, 'epoch': 3.15}
{'loss': 0.011, 'learning_rate': 3.251748251748252e-05, 'epoch': 3.5}
{'loss': 0.0105, 'learning_rate': 3.0769230769230774e-05, 'epoch': 3.85}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.006165102124214172, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 46.3098, 'eval_samples_per_second': 27.446, 'eval_steps_per_second': 3.433, 'epoch': 4.0}
{'loss': 0.0097, 'learning_rate': 2.9020979020979022e-05, 'epoch': 4.2}
{'loss': 0.0097, 'learning_rate': 2.7272727272727273e-05, 'epoch': 4.55}
{'loss': 0.0101, 'learning_rate': 2.5524475524475528e-05, 'epoch': 4.9}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.005371148698031902, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 47.7529, 'eval_samples_per_second': 26.616, 'eval_steps_per_second': 3.33, 'epoch': 5.0}
{'loss': 0.0083, 'learning_rate': 2.377622377622378e-05, 'epoch': 5.24}
{'loss': 0.0097, 'learning_rate': 2.202797202797203e-05, 'epoch': 5.59}
{'loss': 0.0078, 'learning_rate': 2.027972027972028e-05, 'epoch': 5.94}


  0%|          | 0/159 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.0048636700958013535, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 49.59, 'eval_samples_per_second': 25.63, 'eval_steps_per_second': 3.206, 'epoch': 6.0}
{'loss': 0.0082, 'learning_rate': 1.8531468531468532e-05, 'epoch': 6.29}
{'loss': 0.0075, 'learning_rate': 1.6783216783216786e-05, 'epoch': 6.64}
{'loss': 0.0081, 'learning_rate': 1.5034965034965034e-05, 'epoch': 6.99}


  0%|          | 0/159 [00:00<?, ?it/s]

  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.0045405118726193905, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 49.5648, 'eval_samples_per_second': 25.643, 'eval_steps_per_second': 3.208, 'epoch': 7.0}
{'loss': 0.008, 'learning_rate': 1.3286713286713287e-05, 'epoch': 7.34}
{'loss': 0.0081, 'learning_rate': 1.153846153846154e-05, 'epoch': 7.69}


  0%|          | 0/159 [00:00<?, ?it/s]

  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.004338290076702833, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 49.6329, 'eval_samples_per_second': 25.608, 'eval_steps_per_second': 3.204, 'epoch': 8.0}
{'loss': 0.0064, 'learning_rate': 9.79020979020979e-06, 'epoch': 8.04}
{'loss': 0.0076, 'learning_rate': 8.041958041958042e-06, 'epoch': 8.39}
{'loss': 0.0071, 'learning_rate': 6.2937062937062944e-06, 'epoch': 8.74}


  0%|          | 0/159 [00:00<?, ?it/s]

  return (1 + b2) * ((precision * recall) / (b2 * precision + recall))


{'eval_loss': 0.004230338614434004, 'eval_recall': 0.0, 'eval_precision': 0.0, 'eval_fbeta_score': nan, 'eval_runtime': 50.5476, 'eval_samples_per_second': 25.145, 'eval_steps_per_second': 3.146, 'epoch': 9.0}
{'loss': 0.007, 'learning_rate': 4.5454545454545455e-06, 'epoch': 9.09}
{'loss': 0.0072, 'learning_rate': 2.7972027972027974e-06, 'epoch': 9.44}
{'loss': 0.0068, 'learning_rate': 1.0489510489510491e-06, 'epoch': 9.79}


  0%|          | 0/159 [00:00<?, ?it/s]

{'eval_loss': 0.004195818677544594, 'eval_recall': 0.006329113924050633, 'eval_precision': 0.4, 'eval_fbeta_score': 0.006578115117014548, 'eval_runtime': 47.3983, 'eval_samples_per_second': 26.815, 'eval_steps_per_second': 3.355, 'epoch': 10.0}
{'train_runtime': 5999.5517, 'train_samples_per_second': 19.066, 'train_steps_per_second': 2.384, 'train_loss': 0.07172457303200569, 'epoch': 10.0}


TrainOutput(global_step=14300, training_loss=0.07172457303200569, metrics={'train_runtime': 5999.5517, 'train_samples_per_second': 19.066, 'train_steps_per_second': 2.384, 'train_loss': 0.07172457303200569, 'epoch': 10.0})

In [8]:
model_head_save_path = os.path.join(CFG.target_dir, 'head_only', 'model')
trainer_head.save_model(model_head_save_path)

In [9]:
trainer_head.evaluate(data_test, metric_key_prefix='test')

  0%|          | 0/177 [00:00<?, ?it/s]

{'test_loss': 0.004614074714481831,
 'test_recall': 0.007653061224489796,
 'test_precision': 0.2727272727272727,
 'test_fbeta_score': 0.007950259912343289,
 'test_runtime': 52.2445,
 'test_samples_per_second': 27.046,
 'test_steps_per_second': 3.388,
 'epoch': 10.0}