In [2]:
import json
import numpy as np
from functools import partial

from itertools import chain
from datasets import Dataset

import torch
from transformers import set_seed
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
import evaluate

import pandas as pd

2024-03-06 21:15:50.476679: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-06 21:15:50.476823: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-06 21:15:50.597112: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Checkpoint name handed to both AutoTokenizer and AutoModelForTokenClassification.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
# Passed to the tokenizer as max_length (with truncation=True) inside `tokenize`.
TRAINING_MAX_LENGTH = 1024
# Passed to TrainingArguments as output_dir.
OUTPUT_DIR = "output"
# Single seed reused for the RNG seeding cell, the dataset shuffle and TrainingArguments.
SEED = 42

In [4]:
# Seed the random number generators used by this notebook so runs are repeatable.
seed_value = SEED
# PyTorch generator, plus the CUDA generator when a GPU is available.
torch.manual_seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)

# Set random seed for NumPy
np.random.seed(seed_value)

# Set random seed for Hugging Face
# NOTE(review): transformers.set_seed is documented to seed `random`, NumPy and
# torch as well, so the explicit calls above are probably redundant — confirm.
set_seed(seed_value)

In [5]:
def df_to_dict_list(df):
    """
    Export a Pandas DataFrame as row-wise records.

    Parameters:
        df (DataFrame): The frame whose rows should be exported.

    Returns:
        list: One dictionary per row, keyed by column name, in row order.
    """
    return df.to_dict(orient='records')

In [6]:
import ast
# Read the external PII dataset from CSV.
data_moth = pd.read_csv('/kaggle/input/pii-external-dataset/pii_dataset.csv')
# These three columns are parsed with ast.literal_eval, i.e. they are expected to
# hold Python-literal strings (presumably lists serialised into the CSV — confirm).
data_moth['tokens'] = data_moth['tokens'].apply(ast.literal_eval)
data_moth['trailing_whitespace'] = data_moth['trailing_whitespace'].apply(ast.literal_eval)
data_moth['labels'] = data_moth['labels'].apply(ast.literal_eval)

# Rename "text" to "full_text" so the column name matches the key read when the Dataset is built.
data_moth = data_moth.rename(columns={"text":"full_text"})
# Keep only the five fields used downstream and convert to a list of per-row dicts.
# NOTE(review): `data_moth` changes type here (DataFrame -> list of dicts), and in the
# cells visible in this notebook it is never added to `data` — confirm whether this
# dataset is meant to be part of the training set.
data_moth = df_to_dict_list(data_moth[['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels']])

In [7]:
# Load the two JSON training sources. The files are opened in `with` blocks so
# the handles are closed as soon as parsing finishes, rather than being left
# open until garbage collection as with a bare json.load(open(...)).
with open("/kaggle/input/pii-detection-removal-from-educational-data/train.json") as competition_file:
    data_competition = json.load(competition_file)
with open("/kaggle/input/pii-dd-mistral-generated/mixtral-8x7b-v1.json") as nicholas_file:
    data_nicholas = json.load(nicholas_file)

# Combined training records: competition data first, then the generated data.
# Both files are expected to hold lists, since they are concatenated with `+`.
data = data_competition + data_nicholas

In [8]:
# Build a Dataset with one row per record in `data`, column by column.
ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    # Document ids are cast to str (presumably because their type differs between sources — confirm).
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    # The raw "labels" field is stored as "provided_labels"; `tokenize` reads it
    # under that name and returns its own "labels" entry.
    "provided_labels": [x["labels"] for x in data],
})

# Shuffle the rows once, using the notebook-wide seed.
ds = ds.shuffle(seed=SEED)

In [9]:
# Label vocabulary: every distinct tag that occurs in the training records, sorted.
all_labels = sorted({tag for record in data for tag in record["labels"]})

# Two-way mapping between tag strings and their integer ids (id = position in all_labels).
label2id = dict(zip(all_labels, range(len(all_labels))))
id2label = dict(enumerate(all_labels))

# Explicit list of the PII tags (everything except "O").
target = [
    'B-EMAIL',
    'B-ID_NUM',
    'B-NAME_STUDENT',
    'B-PHONE_NUM',
    'B-STREET_ADDRESS',
    'B-URL_PERSONAL',
    'B-USERNAME',
    'I-ID_NUM',
    'I-NAME_STUDENT',
    'I-PHONE_NUM',
    'I-STREET_ADDRESS',
    'I-URL_PERSONAL',
]

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [10]:
def tokenize(example, tokenizer, label2id, max_length):
    """
    Tokenize one example and project its word-level labels onto the new tokens.

    The text is rebuilt from ``example["tokens"]`` (adding a single space after
    every token whose ``trailing_whitespace`` flag is set) while a parallel
    per-character label list is kept. Each token produced by ``tokenizer`` then
    takes the label of the first non-whitespace character it covers.

    Parameters:
        example (dict): Must provide "tokens", "provided_labels" and
            "trailing_whitespace" as parallel sequences.
        tokenizer (callable): Called with the rebuilt text; its result must
            expose ``offset_mapping`` and ``input_ids`` and be unpackable with ``**``.
        label2id (dict): Maps label strings (including "O") to integer ids.
        max_length (int): Forwarded to the tokenizer together with truncation=True.

    Returns:
        dict: Everything the tokenizer returned, plus "labels" (one id per
        token) and "length" (number of input ids).
    """
    pieces = []
    char_labels = []

    word_info = zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"])
    for word, tag, has_space in word_info:
        pieces.append(word)
        # every character of the word carries the word's label
        char_labels += [tag] * len(word)
        if has_space:
            # the separating space itself is never an entity
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)
    encoding = tokenizer(text, return_offsets_mapping=True, max_length=max_length, truncation=True)

    char_labels = np.array(char_labels)

    def label_id_for(span):
        """Return the label id for a single (start, end) character span."""
        start, end = span
        # a (0, 0) span marks a special token such as CLS
        if start == 0 and end == 0:
            return label2id["O"]
        # tokens that begin with whitespace are labelled by the character after it
        position = start + 1 if text[start].isspace() else start
        return label2id[char_labels[position]]

    token_labels = [label_id_for(span) for span in encoding.offset_mapping]

    return {**encoding, "labels": token_labels, "length": len(encoding.input_ids)}

In [11]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, all_labels):
    """
    Compute entity-level recall, precision and an F-beta score (beta = 5).

    Parameters:
        p (tuple): ``(predictions, labels)``. ``predictions`` holds per-token
            class scores and is reduced with argmax over axis 2; ``labels``
            holds the gold label ids, with -100 marking positions to ignore.
        all_labels (list): Label strings indexed by label id.

    Returns:
        dict: ``{'recall', 'precision', 'f1'}``. The value under ``'f1'`` is
        the F5 score; the key is kept as ``'f1'`` because the training
        configuration refers to the metric by that name. It is 0.0 when
        precision and recall are both zero.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (label id -100) and map the remaining ids to label strings.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predictions, labels):
        kept = [
            (pred_id, label_id)
            for pred_id, label_id in zip(pred_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([all_labels[pred_id] for pred_id, _ in kept])
        true_labels.append([all_labels[label_id] for _, label_id in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta with beta = 5 weights recall far more heavily than precision.
    beta_squared = 5 * 5
    denominator = beta_squared * precision + recall
    # Guard the degenerate case: with precision == recall == 0 the plain formula
    # divides by zero (ZeroDivisionError for Python floats, NaN for NumPy floats).
    if denominator:
        fbeta = (1 + beta_squared) * recall * precision / denominator
    else:
        fbeta = 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f1': fbeta,
    }


In [12]:
# Tokenizer for the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
# Run `tokenize` over every example (4 worker processes) to add token ids and aligned labels.
# NOTE: `ds` is rebound here, so re-running this cell on its own maps an already-tokenized dataset.
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": TRAINING_MAX_LENGTH}, num_proc=4)
# Batch collator for token classification; pad_to_multiple_of=16 is forwarded to it.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# Token-classification model built from the same checkpoint, with one class per
# entry in `all_labels` and the label <-> id maps stored in its config.
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
    )

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



     

#0:   0%|          | 0/2291 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/2291 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/2290 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/2290 [00:00<?, ?ex/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training configuration: 3 epochs, batch size 4, cosine LR schedule, no evaluation.
# NOTE(review): evaluation_strategy="no" and no eval_dataset is passed below, so
# metric_for_best_model / greater_is_better and compute_metrics are presumably
# never exercised in this run — confirm before relying on them.
# NOTE(review): fp16=True presumably requires a CUDA device — confirm for CPU-only runs.
args = TrainingArguments(
        output_dir=OUTPUT_DIR, 
        fp16=True,
        #warmup_steps=100,
        learning_rate=2e-5,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=1,
        report_to="none",
        evaluation_strategy="no",
        save_strategy="epoch",
        save_total_limit=1,
        overwrite_output_dir=True,
        #load_best_model_at_end=True,
        lr_scheduler_type='cosine',
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_steps=100,
        weight_decay=0.01,
        seed = SEED,
    )

# Trainer over the full tokenized dataset; the whole of `ds` is used for training
# (the eval_dataset argument is commented out).
trainer = Trainer(
        model=model, 
        args=args, 
        train_dataset=ds, 
        #eval_dataset=ds_splits["test"], 
        data_collator=collator, 
        tokenizer=tokenizer,
        # all_labels is bound up front so the callable only takes the prediction tuple
        compute_metrics=partial(compute_metrics, all_labels=all_labels),
    )

In [14]:
# Progress marker printed before the training cell is run.
print('start training')

start training


In [15]:
# Run training with the Trainer configured above; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss
100,0.2968
200,0.0202
300,0.0081
400,0.0056
500,0.0042
600,0.0033
700,0.0047
800,0.0019
900,0.002
1000,0.0015


TrainOutput(global_step=6873, training_loss=0.005631958662533489, metrics={'train_runtime': 5842.73, 'train_samples_per_second': 4.704, 'train_steps_per_second': 1.176, 'total_flos': 1.331643078340416e+16, 'train_loss': 0.005631958662533489, 'epoch': 3.0})

In [16]:
# Write the trained model and the tokenizer into the same directory.
final_model_dir = "deberta3base_nicholas_3epochs_bs4_acc1"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('deberta3base_nicholas_3epochs_bs4_acc1/tokenizer_config.json',
 'deberta3base_nicholas_3epochs_bs4_acc1/special_tokens_map.json',
 'deberta3base_nicholas_3epochs_bs4_acc1/spm.model',
 'deberta3base_nicholas_3epochs_bs4_acc1/added_tokens.json',
 'deberta3base_nicholas_3epochs_bs4_acc1/tokenizer.json')