# This notebook contains the currently working code for the baseline model. The final and ready-to-use version will be in src/baseline.py (at some point)

### Libraries

In [2]:
import torch 
from transformers import AutoTokenizer, AutoModelForTokenClassification, EarlyStoppingCallback, DataCollatorForTokenClassification, Trainer, TrainingArguments
import os
import pandas as pd
import tensorflow as tf
from datasets import Dataset
import numpy as np
from seqeval.metrics import recall_score, precision_score, accuracy_score
from functools import partial

# add the parent directory to the path so we can import the dataloader module
import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data

In [3]:
MODEL_NAME = 'bert-base-uncased'
TRAIN_HEAD_ONLY = False

In [4]:
os.path.abspath('')

'/Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks'

In [5]:
#model configuration

class CFG:
    LABELS_LIST = ['B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM', 'B-URL_PERSONAL', 'B-STREET_ADDRESS', 'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM','I-URL_PERSONAL','I-STREET_ADDRESS', 'O']
    label2id = {label: i for i, label in enumerate(LABELS_LIST)}
    label2id['[PAD]'] = -100
    id2label = {i: label for label, i in label2id.items()}
    seed = 42

    # model checkpoint
    model_name = 'bert-base-uncased'

    # path to the directory where the model will be saved
    local_path = os.path.abspath(os.path.abspath(''))
    target_dir = os.path.join(local_path,'..','models', 'baseline')

    #training arguments
    training_args = TrainingArguments(
        output_dir=os.path.join(target_dir, 'trainer'), 
        evaluation_strategy="epoch"
        )

In [6]:
CFG.target_dir

'/Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/baseline'

### Loading the model and data

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [8]:
data_path = os.path.join('..','data', 'raw', 'train.json')
data = preprocess_data(data_path, tokenizer, CFG.label2id, keys_to_keep=['document'])

Dataset({
    features: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'],
    num_rows: 6807
})
encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

flattening the data...


100%|██████████| 6/6 [00:11<00:00,  1.99s/it]


In [15]:
data

Dataset({
    features: ['document', 'org_word_ids', 'attention_mask', 'token_type_ids', 'labels', 'input_ids'],
    num_rows: 12812
})

In [10]:
data['labels'][0][:20], tokenizer.decode(data['input_ids'][0][:20])

([-100, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 7, 7, 14, 14, 14],
 '[CLS] design thinking for innovation reflexion - avril 2021 - nathalie sylla challenge & selection')

In [16]:
data_train_test = data.train_test_split(test_size=0.1, seed=CFG.seed)
data_test = data_train_test['test']
data_train_eval = data_train_test['train'].train_test_split(test_size=0.1)

## Training the model

In [17]:
def get_fbeta_score(precision, recall, beta=5.0):
        b2 = beta ** 2
        return (1 + b2) * ((precision * recall) / (b2 * precision + recall))

def compute_metrics(p, labels_list):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [labels_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [labels_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        fbeta_score = get_fbeta_score(precision, recall)

        results = {
            'recall': recall,
            'precision': precision,
            'fbeta_score': fbeta_score
            }
        
        return results

In [18]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(CFG.id2label), id2label=CFG.id2label, label2id=CFG.label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
#freezing the BERT layers
if TRAIN_HEAD_ONLY:
    print('Training head only')
    for param in model.base_model.parameters():
        param.requires_grad = False
else:
    print('Training all layers')

Training all layers


In [20]:
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=data_train_eval["train"],
    eval_dataset=data_train_eval["test"],
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/3894 [00:00<?, ?it/s]

Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/baseline/trainer/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0243, 'learning_rate': 4.3579866461222396e-05, 'epoch': 0.39}


Checkpoint destination directory /Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/baseline/trainer/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.0035, 'learning_rate': 3.715973292244479e-05, 'epoch': 0.77}


  0%|          | 0/145 [00:00<?, ?it/s]

{'eval_loss': 0.0016380001325160265, 'eval_recall': 0.8444444444444444, 'eval_precision': 0.7139874739039666, 'eval_fbeta_score': 0.8385514900037722, 'eval_runtime': 48.5972, 'eval_samples_per_second': 23.726, 'eval_steps_per_second': 2.984, 'epoch': 1.0}
{'loss': 0.0025, 'learning_rate': 3.073959938366718e-05, 'epoch': 1.16}
{'loss': 0.0013, 'learning_rate': 2.4319465844889575e-05, 'epoch': 1.54}
{'loss': 0.0012, 'learning_rate': 1.789933230611197e-05, 'epoch': 1.93}


  0%|          | 0/145 [00:00<?, ?it/s]

{'eval_loss': 0.0010410952381789684, 'eval_recall': 0.8864197530864197, 'eval_precision': 0.8466981132075472, 'eval_fbeta_score': 0.8848232059910892, 'eval_runtime': 45.4898, 'eval_samples_per_second': 25.346, 'eval_steps_per_second': 3.188, 'epoch': 2.0}
{'loss': 0.0007, 'learning_rate': 1.147919876733436e-05, 'epoch': 2.31}
{'loss': 0.0004, 'learning_rate': 5.059065228556755e-06, 'epoch': 2.7}


  0%|          | 0/145 [00:00<?, ?it/s]

{'eval_loss': 0.0010464797960594296, 'eval_recall': 0.9061728395061729, 'eval_precision': 0.8615023474178404, 'eval_fbeta_score': 0.9043692540991376, 'eval_runtime': 44.4972, 'eval_samples_per_second': 25.912, 'eval_steps_per_second': 3.259, 'epoch': 3.0}
{'train_runtime': 4385.6561, 'train_samples_per_second': 7.098, 'train_steps_per_second': 0.888, 'train_loss': 0.004421819466380015, 'epoch': 3.0}


TrainOutput(global_step=3894, training_loss=0.004421819466380015, metrics={'train_runtime': 4385.6561, 'train_samples_per_second': 7.098, 'train_steps_per_second': 0.888, 'train_loss': 0.004421819466380015, 'epoch': 3.0})

In [21]:
model_save_path = os.path.join(CFG.target_dir, 'model')
trainer.save_model(model_save_path)

In [25]:
# model = AutoModelForTokenClassification.from_pretrained(model_save_path)
# token = [put your token here]
# model.push_to_hub('zmilczarek/pii-detection-baseline-v0.2', token = token)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/zmilczarek/pii-detection-baseline-v0.2/commit/69813013ed2ebe1117333b9d8b8a1fbe2e85cf93', commit_message='Upload BertForTokenClassification', commit_description='', oid='69813013ed2ebe1117333b9d8b8a1fbe2e85cf93', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# load model

model = AutoModelForTokenClassification.from_pretrained(model_save_path)

In [24]:
model.to('cpu')
with torch.no_grad():
    p = model(torch.tensor(data['input_ids'][:10]), return_dict=True)
    p = (p.logits, torch.tensor(data['labels'][:10]))
    r = compute_metrics(p, CFG.LABELS_LIST)

In [25]:
r

{'recall': 0.9411764705882353,
 'precision': 0.8888888888888888,
 'fbeta_score': 0.9390519187358916}

In [28]:
trainer = Trainer(
    model=model,
    train_dataset=data_train_eval["train"],
    eval_dataset=data_test,
    compute_metrics=partial(compute_metrics, labels_list = CFG.LABELS_LIST),
)

#eval
trainer.evaluate()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.0023406886029988527,
 'eval_recall': 0.7954070981210856,
 'eval_precision': 0.8141025641025641,
 'eval_fbeta_score': 0.7961102627983605,
 'eval_runtime': 50.3979,
 'eval_samples_per_second': 25.438,
 'eval_steps_per_second': 3.195}

In [34]:
def compute_metrics_just_tags(p, labels_list):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [labels_list[p] for (p, l) in zip(prediction, label) if l != -100 and labels_list[l] != 'O']
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [labels_list[l] for (p, l) in zip(prediction, label) if l != -100 and labels_list[l] != 'O']
            for prediction, label in zip(predictions, labels)
        ]

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        fbeta_score = get_fbeta_score(precision, recall)
        accuracy = accuracy_score(true_labels, true_predictions)

        results = {
            'recall': recall,
            'precision': precision,
            'fbeta_score': fbeta_score,
            'accuracy': accuracy
            }
        
        return results

In [35]:
trainer = Trainer(
    model=model,
    train_dataset=data_train_eval["train"],
    eval_dataset=data_test,
    compute_metrics=partial(compute_metrics_just_tags, labels_list = CFG.LABELS_LIST),
)

#eval
trainer.evaluate()

  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.0023406886029988527,
 'eval_recall': 0.7974947807933194,
 'eval_precision': 0.9769820971867008,
 'eval_fbeta_score': 0.8031699822092835,
 'eval_accuracy': 0.801980198019802,
 'eval_runtime': 48.6449,
 'eval_samples_per_second': 26.354,
 'eval_steps_per_second': 3.31}

## Using the model

### Loading the model

In [None]:
model_dir = 'model/model_initial_preprocessing'
model_loaded = AutoModelForTokenClassification.from_pretrained(model_dir)
model_loaded = model_loaded.to('cpu')

In [29]:
model_from_huggingface = AutoModelForTokenClassification.from_pretrained('zmilczarek/pii-detection-baseline-v0.2')

In [55]:
encoded_data_split = data.train_test_split(test_size=0.1)

In [60]:
data

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask', 'org_word_ids', 'document'],
    num_rows: 5474
})

In [33]:
trainer = Trainer(
    model=model_from_huggingface,
    train_dataset=data_train_eval["train"],
    eval_dataset=data_test,
    compute_metrics=partial(compute_metrics_just_tags, labels_list = CFG.LABELS_LIST),
)

#eval
trainer.evaluate()

  0%|          | 0/161 [00:00<?, ?it/s]

{'eval_loss': 0.000548205862287432,
 'eval_recall': 0.964509394572025,
 'eval_precision': 0.9506172839506173,
 'eval_fbeta_score': 0.9639675788459995,
 'eval_accuracy': 0.9519094766619519,
 'eval_runtime': 50.3617,
 'eval_samples_per_second': 25.456,
 'eval_steps_per_second': 3.197}

### Inference

In [None]:
"""
The plan to make the model label test.csv correctly

1. Load the model
2. Prepare the dataset  (prepare input ids/ att mask in chunks)
3. Get the labels

"""