# This notebook contains the currently working code for the baseline model. The final and ready-to-use version will be in src/baseline.py (at some point)

### Libraries

In [1]:
import torch 
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import os
import numpy as np
from seqeval.metrics import recall_score, precision_score, accuracy_score
from functools import partial

# add the parent directory to the path so we can import the dataloader module
import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split
from src.models.utils import compute_weights, weight_to_tensor, get_fbeta_score, compute_metrics
from src.models.trainer import PIITrainer

In [2]:
# Model configuration

class CFG:
    """Static configuration for the baseline PII token-classification run.

    Everything is a class attribute evaluated once when the cell runs; the class
    is used as a namespace and never instantiated in this notebook.
    """

    # BIO tag set; the position of a tag in this list is its class id.
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    # -100 marks positions that the metric code in this notebook skips.
    # NOTE(review): this entry also ends up in id2label, so len(id2label) is
    # len(LABELS_LIST) + 1 — confirm that consumers of id2label expect it.
    label2id['[PAD]'] = -100
    id2label = {i: label for label, i in label2id.items()}
    seed = 42

    # Pick the best available accelerator. MPS is included because tensors created
    # on CFG.device (e.g. the loss weights) must live on the same device the
    # Trainer places the model on; a CPU/MPS mismatch raises
    # "Placeholder storage has not been allocated on MPS device!".
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    torch.backends.cudnn.benchmark = True

    # model checkpoint
    model_name = 'bert-base-uncased'
    # When True, the notebook freezes the encoder and trains only the head.
    train_head_only = False

    # path to the directory where the model will be saved
    # (abspath('') resolves to the current working directory)
    local_path = os.path.abspath('')
    target_dir = os.path.join(local_path,'..','models', 'baseline')

    # training arguments; the seed is passed explicitly so the run follows CFG.seed
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer'),
        evaluation_strategy="epoch",
        seed=seed,
        )

### Loading the model and data

In [4]:
# Load the tokenizer that matches the checkpoint named in CFG.model_name.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

In [5]:
# Read the raw training JSON and run the project preprocessing on it
# (the recorded log shows label encoding, tokenizing/aligning and flattening).
data_path = os.path.join('..', 'data', 'raw', 'train.json')
data = preprocess_data(
    get_dataset_from_path(data_path),
    tokenizer,
    label2id=CFG.label2id,
)

encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 6/6 [00:09<00:00,  1.56s/it]


In [10]:
# Display the processed dataset summary (feature names and row count).
data

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'org_word_ids', 'document', 'labels'],
    num_rows: 12812
})

### Calculate weights for the loss function

In [6]:
# Per-class loss weights derived from the label distribution, moved to CFG.device.
# NOTE(review): method="effective" with beta=0.99 presumably selects an
# effective-number-of-samples weighting — confirm in src/models/utils.
class_weights = compute_weights(data['labels'], CFG.id2label, method="effective", beta=0.99)
weights = weight_to_tensor(class_weights).to(CFG.device)

In [8]:
# Split the processed rows into train / validation / test sets, seeded from CFG.seed.
# NOTE(review): `data` has more rows than source examples (12812 vs 6807 in the
# recorded output), so rows look like document chunks — verify that the helper keeps
# all chunks of one document in the same split to avoid leakage.
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

## Training the model

In [10]:
# Build a token-classification model on top of the pretrained checkpoint.
# NOTE(review): CFG.id2label also holds the -100 -> '[PAD]' entry, so
# len(CFG.id2label) is 16 while LABELS_LIST has 15 tags. Confirm the extra output
# class is intended and consistent with the loss weights and the metric code,
# which indexes LABELS_LIST by predicted id.
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name, num_labels=len(CFG.id2label), id2label=CFG.id2label, label2id=CFG.label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Optionally freeze the encoder so that only the classification head is trained.
if not CFG.train_head_only:
    print('Training all layers')
else:
    print('Training head only')
    # Disable gradients for every parameter of the base (encoder) model.
    for encoder_param in model.base_model.parameters():
        encoder_param.requires_grad_(False)

Training all layers


In [15]:
# NOTE(review): the training code below is commented out; the recorded output of this
# cell ends in a RuntimeError on an MPS device. The later `trainer.save_model(...)`
# cell needs the `trainer` object created here, so re-enable this block (once the
# device mismatch is resolved) before running the notebook top to bottom.
# trainer = PIITrainer(
#     model=model,
#     args=CFG.training_args,
#     train_dataset=data_train,
#     eval_dataset=data_eval,
#     tokenizer=tokenizer,
#     compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
#     weights=weights
# )

# trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/3894 [00:00<?, ?it/s]

RuntimeError: Placeholder storage has not been allocated on MPS device!

In [14]:
# Directory under CFG.target_dir where the fine-tuned model is saved and later reloaded.
model_save_path = os.path.join(CFG.target_dir, 'model')

In [None]:
# Persist the model to model_save_path; requires a `trainer` object to exist
# (the training cell that creates it is currently commented out).
trainer.save_model(model_save_path)

In [17]:
# model = AutoModelForTokenClassification.from_pretrained(model_save_path)
# token = [put your token here]
# model.push_to_hub('zmilczarek/pii-detection-baseline-v0.2', token = token)

In [15]:
# Reload the model from model_save_path; from here on `model` refers to the saved
# checkpoint rather than the freshly initialised one created earlier.

model = AutoModelForTokenClassification.from_pretrained(model_save_path)

In [24]:
# Quick sanity check: run the model on the first 10 rows on CPU and score them.
# NOTE: these rows come from the full dataset, so they may include training rows.
model.to('cpu')
model.eval()  # make sure dropout is disabled regardless of where `model` came from

sample_input_ids = torch.tensor(data['input_ids'][:10])
# Pass the attention mask so padding positions are not attended to; without it the
# logits differ from those the Trainer produces for the same rows.
sample_attention_mask = torch.tensor(data['attention_mask'][:10])
sample_labels = torch.tensor(data['labels'][:10])

with torch.no_grad():
    p = model(
        input_ids=sample_input_ids,
        attention_mask=sample_attention_mask,
        return_dict=True,
    )
    # compute_metrics takes a (predictions, labels) pair.
    p = (p.logits, sample_labels)
    r = compute_metrics(p, CFG.LABELS_LIST)

In [25]:
# Show the metrics computed on the 10-row sample.
r

{'recall': 0.9411764705882353,
 'precision': 0.8888888888888888,
 'fbeta_score': 0.9390519187358916}

In [16]:
# Score the current `model` on the test split with the shared metric helper.
baseline_metrics_fn = partial(compute_metrics, labels_list=CFG.LABELS_LIST)
trainer = Trainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_test,
    compute_metrics=baseline_metrics_fn,
)

# Last expression of the cell, so the metrics dict is displayed.
trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.0023406886029988527,
 'eval_recall': 0.7954070981210856,
 'eval_precision': 0.8141025641025641,
 'eval_fbeta_score': 0.7961102627983605,
 'eval_runtime': 49.5266,
 'eval_samples_per_second': 25.885,
 'eval_steps_per_second': 3.251}

In [34]:
def compute_metrics_just_tags(p, labels_list):
    """Score predictions only at positions whose gold tag is a real entity tag.

    Positions with gold id -100 (ignored tokens) and positions whose gold tag is
    'O' are dropped from both the predictions and the references before scoring.
    As a consequence, predictions made on gold-'O' tokens never count against
    precision.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-class scores
            with the class dimension on axis 2; ``labels`` holds gold class ids
            aligned with the first two axes.
        labels_list: tag names indexed by class id.

    Returns:
        dict with 'recall', 'precision', 'fbeta_score' and 'accuracy'.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only (prediction, gold) pairs at positions carrying an entity tag.
        kept_pairs = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100 and labels_list[gold_id] != 'O'
        ]
        true_predictions.append([labels_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([labels_list[gold_id] for _, gold_id in kept_pairs])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    fbeta_score = get_fbeta_score(precision, recall)
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        'recall': recall,
        'precision': precision,
        'fbeta_score': fbeta_score,
        'accuracy': accuracy,
    }

In [35]:
# Re-evaluate the same model on the test split, this time scoring entity-tagged
# positions only.
tag_only_metrics_fn = partial(compute_metrics_just_tags, labels_list=CFG.LABELS_LIST)
trainer = Trainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_test,
    compute_metrics=tag_only_metrics_fn,
)

# Last expression of the cell, so the metrics dict is displayed.
trainer.evaluate()

  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.0023406886029988527,
 'eval_recall': 0.7974947807933194,
 'eval_precision': 0.9769820971867008,
 'eval_fbeta_score': 0.8031699822092835,
 'eval_accuracy': 0.801980198019802,
 'eval_runtime': 48.6449,
 'eval_samples_per_second': 26.354,
 'eval_steps_per_second': 3.31}

## Using the model

### Loading the model

In [None]:
# Load a locally stored checkpoint and keep it on the CPU.
model_dir = 'model/model_initial_preprocessing'
model_loaded = AutoModelForTokenClassification.from_pretrained(model_dir).to('cpu')

In [29]:
# Load the published baseline checkpoint by its repository id (the same id used in the
# commented-out push_to_hub call earlier in this notebook).
model_from_huggingface = AutoModelForTokenClassification.from_pretrained('zmilczarek/pii-detection-baseline-v0.2')

In [55]:
# Hold out 10% of the rows; seeded with CFG.seed so the split is the same on every run.
encoded_data_split = data.train_test_split(test_size=0.1, seed=CFG.seed)

In [60]:
# Display the dataset summary.
# NOTE(review): the recorded output here reports 5474 rows, while the earlier display
# of `data` reports 12812 — this output presumably comes from an older run; re-run
# the notebook from a fresh kernel to refresh it.
data

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'org_word_ids', 'document'],
    num_rows: 5474
})

In [33]:
# Evaluate the published checkpoint on the test split, scoring entity-tagged
# positions only.
hub_eval_kwargs = dict(
    model=model_from_huggingface,
    train_dataset=data_train,
    eval_dataset=data_test,
    compute_metrics=partial(compute_metrics_just_tags, labels_list=CFG.LABELS_LIST),
)
trainer = Trainer(**hub_eval_kwargs)

# Last expression of the cell, so the metrics dict is displayed.
trainer.evaluate()

  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.000548205862287432,
 'eval_recall': 0.964509394572025,
 'eval_precision': 0.9506172839506173,
 'eval_fbeta_score': 0.9639675788459995,
 'eval_accuracy': 0.9519094766619519,
 'eval_runtime': 50.3617,
 'eval_samples_per_second': 25.456,
 'eval_steps_per_second': 3.197}

### Inference

In [None]:
"""
The plan to make the model label test.csv correctly

1. Load the model
2. Prepare the dataset  (prepare input ids/ att mask in chunks)
3. Get the labels

"""