In [1]:
import wandb
# Initialise Weights & Biases in "disabled" mode as the very first step.
# NOTE(review): presumably this keeps the Trainer created later from logging
# to W&B or prompting for an API key — confirm this is the intent.
wandb.init(mode="disabled")



# Install and Import Packages

In [2]:
!pip install seqeval datasets transformers accelerate

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m712.3 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=4e6f12e69223c3eb29b4e70cc4d475d47ea4f950f212f4145c6ff4bf6e0481c9
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [3]:
import os
import numpy as np
import torch

from datasets import Dataset
from functools import partial, reduce
from pandas import read_json, read_csv
from seqeval.metrics import recall_score, precision_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from tqdm import tqdm

2024-06-02 17:05:52.351094: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-02 17:05:52.351226: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-02 17:05:52.612031: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
from pii_dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split

/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv
/kaggle/input/pii-detection-removal-from-educational-data/train.json
/kaggle/input/pii-detection-removal-from-educational-data/test.json


# Evaluation

In [5]:
def get_fbeta_score(precision, recall, beta=5.0):
    """Combine precision and recall into an F-beta score.

    Computes ``(1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)``.

    Args:
        precision: Precision value.
        recall: Recall value.
        beta: Weight of recall relative to precision (default 5.0).

    Returns:
        The F-beta score, or ``0.0`` when the denominator is zero (for
        example when both precision and recall are 0), instead of raising
        ``ZeroDivisionError`` or producing NaN.
    """
    b2 = beta ** 2
    denominator = b2 * precision + recall
    # Degenerate case: nothing was predicted correctly, so the score is 0.
    if denominator == 0:
        return 0.0
    return (1 + b2) * ((precision * recall) / denominator)


def compute_metrics(p, labels_list):
    """Compute recall, precision and F-beta for a batch of token predictions.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is arg-maxed over
            axis 2 to obtain one class index per position; ``labels`` holds
            the reference class indices, with ``-100`` marking positions to
            ignore.
        labels_list: Sequence mapping a class index to its tag string.

    Returns:
        A dict with the keys ``'recall'``, ``'precision'`` and
        ``'fbeta_score'``.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        row_predictions = []
        row_labels = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            # Positions labelled -100 are ignored (special tokens).
            if gold_id == -100:
                continue
            row_predictions.append(labels_list[pred_id])
            row_labels.append(labels_list[gold_id])
        true_predictions.append(row_predictions)
        true_labels.append(row_labels)

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    return {
        'recall': recall,
        'precision': precision,
        'fbeta_score': get_fbeta_score(precision, recall),
    }


# Configurations

In [6]:
# Model configuration

class CFG:
    """Central configuration for the PII token-classification run.

    Every value is a class attribute evaluated once when the class body runs,
    so defining the class also sets the cuDNN benchmark flag and builds the
    ``TrainingArguments`` object.
    """

    # Tags the model predicts; the position in this list is the class id.
    LABELS_LIST = [
        'B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM',
        'B-URL_PERSONAL', 'B-STREET_ADDRESS',
        'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM',
        'I-URL_PERSONAL', 'I-STREET_ADDRESS',
        'O',
    ]

    # Tag -> class id, plus '[PAD]' -> -100 (the value compute_metrics filters
    # out as "ignored"). Kept as-is because it is handed to preprocess_data.
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    label2id['[PAD]'] = -100

    # Class id -> tag for real classes only. The -100 entry is not a class, so
    # it is excluded: otherwise len(id2label), which is used as num_labels,
    # would be one larger than LABELS_LIST and the model could emit an index
    # that LABELS_LIST cannot resolve.
    id2label = {i: label for label, i in label2id.items() if i != -100}

    seed = 42
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.backends.cudnn.benchmark = True

    # model checkpoint
    model_name = 'roberta-base'
    train_head_only = False

    # path to data
    data_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"

    # path to the directory where the model will be saved
    model_save_name = 'Roberta'
    target_dir = '/kaggle/working/model/trainer_'+model_save_name
    model_save_path = '/kaggle/working/model/model_'+model_save_name

    # training arguments; the configured seed is passed explicitly so that the
    # Trainer and the dataset split share a single source of truth.
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer_args'),
        evaluation_strategy="epoch",
        seed=seed)

# Training

In [7]:
# Tokenizer for the configured checkpoint.
# NOTE(review): add_prefix_space=True is presumably needed because
# preprocess_data feeds pre-split words to the tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained(
    CFG.model_name,
    add_prefix_space=True,
)

# Token-classification model for the same checkpoint, sized and labelled
# from the CFG label mappings.
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name,
    num_labels=len(CFG.id2label),
    id2label=CFG.id2label,
    label2id=CFG.label2id,
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load the dataset from CFG.data_path and preprocess it in one step. Per the
# recorded output, preprocessing encodes the labels, tokenizes/aligns them and
# flattens the result.
data = preprocess_data(
    get_dataset_from_path(CFG.data_path),
    tokenizer,
    label2id=CFG.label2id,
)

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'],
    num_rows: 6807
})
encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 5/5 [00:20<00:00,  4.13s/it]


In [9]:
# Show the preprocessed dataset summary (feature names and row count).
data 

Dataset({
    features: ['input_ids', 'attention_mask', 'org_word_ids', 'document', 'labels'],
    num_rows: 14123
})

In [10]:
# Split the preprocessed dataset into train / validation / test subsets,
# seeded with CFG.seed. The split logic lives in pii_dataloader and is not
# visible here.
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

In [11]:
# Build the Trainer: it trains on data_train, evaluates on data_eval, and
# scores predictions with compute_metrics bound to the configured label list.
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)


In [12]:
# Fine-tune the model. CFG.training_args sets evaluation_strategy="epoch", and
# the recorded output shows validation metrics reported once per epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Recall,Precision,Fbeta Score
1,0.0161,0.001415,0.816456,0.617225,0.806444
2,0.0015,0.001314,0.911392,0.60251,0.893769
3,0.0005,0.000634,0.908228,0.82,0.904485




TrainOutput(global_step=2145, training_loss=0.004492811810025524, metrics={'train_runtime': 2060.7394, 'train_samples_per_second': 16.653, 'train_steps_per_second': 1.041, 'total_flos': 8968055772266496.0, 'train_loss': 0.004492811810025524, 'epoch': 3.0})

In [13]:
# Score the fine-tuned model on the held-out test split; the returned metric
# keys carry the "test_" prefix (see recorded output).
trainer.evaluate(data_test, metric_key_prefix='test')

{'test_loss': 0.0007013895665295422,
 'test_recall': 0.9056122448979592,
 'test_precision': 0.8637469586374696,
 'test_fbeta_score': 0.9039271374008423,
 'test_runtime': 34.6786,
 'test_samples_per_second': 40.746,
 'test_steps_per_second': 2.566,
 'epoch': 3.0}