# This notebook contains the current working code for the baseline model. The final, ready-to-use version will eventually live in src/baseline.py.

### Libraries

In [1]:
import torch 
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import os
import numpy as np
from seqeval.metrics import recall_score, precision_score, accuracy_score
from functools import partial

# add the parent directory to the path so we can import the dataloader module
import sys
sys.path.append('..')
from src.data.dataloader import preprocess_data, get_dataset_from_path, get_train_val_test_split
from src.models.utils import compute_weights, weight_to_tensor, get_fbeta_score, compute_metrics
from src.models.trainer import PIITrainer

# Clean Version

In [2]:
# Model configuration

class CFG:
    """Central settings for the baseline PII token-classification experiments.

    All values are class attributes, so the class acts as a plain namespace
    (``CFG.seed``, ``CFG.label2id``, ...) throughout this notebook.
    """

    # BIO tag inventory; the position of a tag in this list is its class id.
    LABELS_LIST = [
        'B-NAME_STUDENT', 'B-EMAIL', 'B-USERNAME', 'B-ID_NUM', 'B-PHONE_NUM',
        'B-URL_PERSONAL', 'B-STREET_ADDRESS',
        'I-NAME_STUDENT', 'I-EMAIL', 'I-USERNAME', 'I-ID_NUM', 'I-PHONE_NUM',
        'I-URL_PERSONAL', 'I-STREET_ADDRESS',
        'O',
    ]

    # Tag -> id mapping, extended with a '[PAD]' pseudo-label mapped to -100.
    # NOTE: because of this extra entry, len(label2id) == len(LABELS_LIST) + 1.
    label2id = dict(zip(LABELS_LIST, range(len(LABELS_LIST))))
    label2id['[PAD]'] = -100
    # Inverse mapping (id -> tag); it therefore also contains -100 -> '[PAD]'.
    id2label = {idx: tag for tag, idx in label2id.items()}

    seed = 42
    # Use the first CUDA device when one is available, otherwise the CPU.
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    # Global PyTorch switch; it is flipped as soon as this class body executes.
    torch.backends.cudnn.benchmark = True

    # Model checkpoint
    model_name = 'bert-base-uncased'
    train_head_only = False

    # Directory where models are saved, resolved relative to the current
    # working directory (os.path.abspath('') is the absolute cwd).
    local_path = os.path.abspath('')
    target_dir = os.path.join(local_path, '..', 'models', 'baseline')

    # Training arguments shared by the Trainer instances in this notebook.
    training_args = TrainingArguments(
        evaluation_strategy="epoch",
        output_dir=os.path.join(target_dir, 'trainer'),
    )

## Loading the model and data

In [3]:
# Tokenizer that belongs to the checkpoint named in CFG.model_name.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=CFG.model_name,
)

In [4]:
# Read the raw training file, then run the project preprocessing on it
# (the cell output reports label encoding followed by tokenizing/aligning).
data_path = os.path.join('..', 'data', 'raw', 'train.json')
data = preprocess_data(
    get_dataset_from_path(data_path),
    tokenizer,
    label2id=CFG.label2id,
)

encoding the labels...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

tokenizing and aligning...


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [None]:
# Sanity check: first 20 label ids of example 0 next to the decoded first 20 tokens.
(
    data['labels'][0][:20],
    tokenizer.decode(data['input_ids'][0][:20]),
)

([-100, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 7, 7, 14, 14, 14],
 '[CLS] design thinking for innovation reflexion - avril 2021 - nathalie sylla challenge & selection')

In [None]:
# Split the preprocessed dataset into three subsets (train / eval / test) with
# the project helper, seeded with CFG.seed so the split is repeatable.
data_train, data_eval, data_test = get_train_val_test_split(data, seed=CFG.seed)

## Training the model

In [None]:
# Fine-tune the whole network (encoder + token-classification head).
# num_labels comes from CFG.label2id, which also holds the '[PAD]' entry, so the
# head has len(CFG.LABELS_LIST) + 1 outputs.
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name,
    num_labels=len(CFG.label2id),
    id2label=CFG.id2label,
    label2id=CFG.label2id,
)
trainer = Trainer(
    model=model,
    args=CFG.training_args,
    train_dataset=data_train,
    eval_dataset=data_eval,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

# Actually run the fine-tuning. Without this call the following cells would
# save and evaluate a model whose classification head is still randomly
# initialised, which breaks a fresh "Restart & Run All".
trainer.train()

In [None]:
# Persist the model under <target_dir>/model; later cells reload it from this path.
model_save_path = os.path.join(CFG.target_dir, 'model')
trainer.save_model(output_dir=model_save_path)

In [None]:
# Score the model on the test split; the 'test' prefix is passed so the reported
# metrics are keyed separately from the per-epoch evaluation metrics.
trainer.evaluate(data_test, metric_key_prefix='test')

## Loading the model

### From local path

In [None]:
# Reload the model that was written to model_save_path earlier in the notebook.
model_from_disk = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=model_save_path,
)

### From huggingface

In [None]:
# Fetch the published v0.3 baseline from the Hugging Face Hub.
model_from_huggingface = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path="zmilczarek/pii-detection-baseline-v0.3",
)

Evaluating the huggingface model

In [None]:
# A Trainer is built here only to reuse its evaluation loop for the hub model;
# no TrainingArguments are passed, so the library defaults apply.
trainer = Trainer(
    model=model_from_huggingface,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
    eval_dataset=data_test,
    train_dataset=data_train,
)

# Evaluate on the test split
trainer.evaluate(data_test, metric_key_prefix='test')

-------------------------

## Training head only 

In [7]:
# Start again from the pretrained checkpoint with a fresh classification head.
model = AutoModelForTokenClassification.from_pretrained(
    CFG.model_name,
    num_labels=len(CFG.label2id),
    id2label=CFG.id2label,
    label2id=CFG.label2id,
)
# Freeze every parameter of the base model so that only the head is trained.
for param in model.base_model.parameters():
    param.requires_grad_(False)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Build a dedicated TrainingArguments object for the head-only run.
# The previous version aliased CFG.training_args and mutated it in place, which
# silently changed output_dir / num_train_epochs for every other Trainer that
# uses CFG.training_args (and made the result depend on cell execution order).
training_args_head = TrainingArguments(
    output_dir=os.path.join(CFG.target_dir, 'head_only'),
    evaluation_strategy="epoch",  # same evaluation schedule as CFG.training_args
    num_train_epochs=10,
)
trainer_head = Trainer(
    model=model,
    args=training_args_head,
    train_dataset=data_train,
    eval_dataset=data_eval,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [9]:
# Show where the head-only run writes its checkpoints.
training_args_head.output_dir

'/Users/zofia/Documents/UNI/UPARIS/sem2/pii-projet-tal/notebooks/../models/baseline/head_only'

The training in the cell below was resumed from a checkpoint.

In [10]:
# Continue a previous head-only run: with resume_from_checkpoint=True the Trainer
# picks up the latest checkpoint saved in its output_dir, so a checkpoint must
# already exist there (the logged output below starts late in epoch 6).
trainer_head.train(resume_from_checkpoint=True)

  0%|          | 0/12980 [00:00<?, ?it/s]

{'loss': 0.0053, 'learning_rate': 1.5331278890600924e-05, 'epoch': 6.93}


  0%|          | 0/145 [00:00<?, ?it/s]

{'eval_loss': 0.003764230292290449, 'eval_recall': 0.18256130790190736, 'eval_precision': 0.41358024691358025, 'eval_fbeta_score': 0.18656956195780228, 'eval_runtime': 44.1511, 'eval_samples_per_second': 26.115, 'eval_steps_per_second': 3.284, 'epoch': 7.0}
{'loss': 0.0053, 'learning_rate': 1.3405238828967643e-05, 'epoch': 7.32}
{'loss': 0.0055, 'learning_rate': 1.147919876733436e-05, 'epoch': 7.7}


  0%|          | 0/145 [00:00<?, ?it/s]

{'eval_loss': 0.0035935097839683294, 'eval_recall': 0.23978201634877383, 'eval_precision': 0.4888888888888889, 'eval_fbeta_score': 0.24457509353287016, 'eval_runtime': 45.7927, 'eval_samples_per_second': 25.179, 'eval_steps_per_second': 3.166, 'epoch': 8.0}
{'loss': 0.0049, 'learning_rate': 9.553158705701079e-06, 'epoch': 8.09}
{'loss': 0.0052, 'learning_rate': 7.627118644067798e-06, 'epoch': 8.47}
{'loss': 0.005, 'learning_rate': 5.701078582434515e-06, 'epoch': 8.86}


  0%|          | 0/145 [00:00<?, ?it/s]

{'eval_loss': 0.003504975698888302, 'eval_recall': 0.2615803814713896, 'eval_precision': 0.4948453608247423, 'eval_fbeta_score': 0.2664105027217419, 'eval_runtime': 43.0381, 'eval_samples_per_second': 26.79, 'eval_steps_per_second': 3.369, 'epoch': 9.0}
{'loss': 0.0054, 'learning_rate': 3.775038520801233e-06, 'epoch': 9.24}
{'loss': 0.005, 'learning_rate': 1.848998459167951e-06, 'epoch': 9.63}


  0%|          | 0/145 [00:00<?, ?it/s]

{'eval_loss': 0.0034799715504050255, 'eval_recall': 0.26430517711171664, 'eval_precision': 0.48743718592964824, 'eval_fbeta_score': 0.26904203114998937, 'eval_runtime': 46.1732, 'eval_samples_per_second': 24.971, 'eval_steps_per_second': 3.14, 'epoch': 10.0}
{'train_runtime': 1939.6339, 'train_samples_per_second': 53.5, 'train_steps_per_second': 6.692, 'train_loss': 0.001781421302830677, 'epoch': 10.0}


TrainOutput(global_step=12980, training_loss=0.001781421302830677, metrics={'train_runtime': 1939.6339, 'train_samples_per_second': 53.5, 'train_steps_per_second': 6.692, 'train_loss': 0.001781421302830677, 'epoch': 10.0})

In [11]:
# Store the head-only model under <target_dir>/head_only/model.
model_head_save_path = os.path.join(CFG.target_dir, 'head_only', 'model')
trainer_head.save_model(output_dir=model_head_save_path)

In [12]:
# Score the head-only model on the test split; metric names in the output
# carry the 'test_' prefix requested here.
trainer_head.evaluate(data_test, metric_key_prefix='test')

  0%|          | 0/161 [00:00<?, ?it/s]

{'test_loss': 0.005043829325586557,
 'test_recall': 0.2045929018789144,
 'test_precision': 0.37547892720306514,
 'test_fbeta_score': 0.20823798627002288,
 'test_runtime': 51.5587,
 'test_samples_per_second': 24.865,
 'test_steps_per_second': 3.123,
 'epoch': 10.0}

### Loading the model

In [22]:
# Fetch the published head-only baseline from the Hugging Face Hub.
model_just_head_from_huggingface = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path="zmilczarek/pii-detection-baseline-head-only-v0.1",
)

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

## Training on 10 epochs

# Other attempts (different compute metrics, different loss function etc)

## Using the model

### Loading the model

In [None]:
# model_dir = 'model/model_initial_preprocessing'
# model_loaded = AutoModelForTokenClassification.from_pretrained(model_dir)
# model_loaded = model_loaded.to('cpu')

In [22]:
# Load the locally saved baseline (path defined in the "Training the model" section).
model_loaded = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path=model_save_path,
)

In [23]:
# Evaluation-only Trainer around the locally loaded model (default arguments).
trainer = Trainer(
    model=model_loaded,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
    eval_dataset=data_test,
    train_dataset=data_train,
)

# Evaluate on the test split
trainer.evaluate(data_test, metric_key_prefix='test')

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/161 [00:00<?, ?it/s]

{'test_loss': 0.0023406886029988527,
 'test_recall': 0.7954070981210856,
 'test_precision': 0.8141025641025641,
 'test_fbeta_score': 0.7961102627983605,
 'test_runtime': 50.2165,
 'test_samples_per_second': 25.529,
 'test_steps_per_second': 3.206}

In [24]:
# Take the Hugging Face write token from the HF_TOKEN environment variable
# instead of pasting it into the notebook, where it could end up committed or
# shared. When the variable is unset, token is None and push_to_hub falls back
# to the credentials stored by a previous `huggingface-cli login`.
token = os.environ.get('HF_TOKEN')
model_loaded.push_to_hub('zmilczarek/pii-detection-baseline-v0.3', token=token)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/zmilczarek/pii-detection-baseline-v0.3/commit/77c400cec12e992133d9528279b271d49480754f', commit_message='Upload BertForTokenClassification', commit_description='', oid='77c400cec12e992133d9528279b271d49480754f', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
# Fetch the earlier v0.2 baseline from the Hugging Face Hub for comparison.
model_from_huggingface = AutoModelForTokenClassification.from_pretrained(
    pretrained_model_name_or_path='zmilczarek/pii-detection-baseline-v0.2',
)

In [13]:
# Evaluation-only Trainer around the v0.2 hub model (default arguments).
trainer = Trainer(
    model=model_from_huggingface,
    compute_metrics=partial(compute_metrics, labels_list=CFG.LABELS_LIST),
    eval_dataset=data_test,
    train_dataset=data_train,
)

# Evaluate on the test split
trainer.evaluate(data_test, metric_key_prefix='test')

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/161 [00:00<?, ?it/s]

{'test_loss': 0.000548205862287432,
 'test_recall': 0.964509394572025,
 'test_precision': 0.9295774647887324,
 'test_fbeta_score': 0.9631173829377806,
 'test_runtime': 50.3708,
 'test_samples_per_second': 25.451,
 'test_steps_per_second': 3.196}

## Training just the head

### Inference

In [None]:
"""
The plan to make the model label test.csv correctly

1. Load the model
2. Prepare the dataset  (prepare input ids/ att mask in chunks)
3. Get the labels

"""