# Libraries

In [None]:
import os
import gc
import json
import numpy as np
import pandas as pd
import codecs
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
import re

from torch.utils.data import DataLoader
from datasets import Dataset, load_from_disk
from sklearn.metrics import log_loss
from tqdm.auto import tqdm
from itertools import chain
from text_unidecode import unidecode
from typing import Dict, List, Tuple
from transformers import TrainingArguments, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from scipy.special import softmax
from spacy.lang.en import English

# Config

In [None]:
class config:
    """Notebook-wide settings, read as plain class attributes.

    The class is used as a namespace only: the visible code never
    instantiates it and also attaches extra attributes at runtime
    (``valid_stride``, ``data_length``, ``len_token``).
    """

    # torch only understands device strings such as 'cuda' or 'cpu'. The
    # previous value 'gpu' makes every `.to(config.device)` call raise.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    seed = 69
    train_dataset_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    test_dataset_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
    sample_submission_path = "/home/nischay/PID/Data/sample_submission.csv"

    # `temp_data_folder` is not defined in the visible part of this file; it
    # must already exist when this class body executes.
    save_dir = temp_data_folder + "1/"

    # Fraction of documents without any non-'O' label that downsample_df keeps.
    downsample = 0.48
    # Forwarded to the tokenizer call in tokenize_row.
    truncation = True
    padding = False
    max_length = 3700
    doc_stride = 512

    target_cols = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM',
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM',
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL','O']

    # None -> the tokenization cell builds the datasets and saves them to disk.
    load_from_disk = None
    learning_rate = 2e-5
    batch_size = 1
    epochs = 6
    NFOLDS = [0]
    # Fold whose dataset is loaded for inference when debugging on train data.
    trn_fold = 0
    # Checkpoint directory -> ensemble weight. Weights are normalised by their
    # sum when the per-model probabilities are averaged.
    model_paths = {
    '/kaggle/input/37vp4pjt': 10/10,
    '/kaggle/input/pii-deberta-models/cuerpo-de-piiranha': 2/10,
    '/kaggle/input/pii-deberta-models/cola del piinguuino' : 1/10,
    '/kaggle/input/pii-deberta-models/cabeza-del-piinguuino': 5/10,
    '/kaggle/input/pii-deberta-models/cabeza-de-piiranha': 3/10,
    '/kaggle/input/pii-deberta-models/cola-de-piiranha':1/10,
    '/kaggle/input/pii-models/piidd-org-sakura': 2/10,
    '/kaggle/input/pii-deberta-models/cabeza-de-piiranha-persuade_v0':1/10,
    }
    # Directory holding already-converted ONNX models, used when conversion
    # is skipped.
    converted_path = '/kaggle/input/toonnx2-converted-models'

In [None]:
# Scalar settings used by the inference / post-processing cells.
INFERENCE_MAX_LENGTH = 3500
threshold = 0.99

# Blank spaCy English pipeline; its tokenizer splits regex matches into
# tokens during post-processing.
nlp = English()

# --- Rule-based PII patterns -------------------------------------------------
# E-mail address: local part, '@', domain with at least one dot.
email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')

# Phone number in "(ddd)ddd-dddd" or "ddd.ddd.dddd" form, followed by one
# whitespace character; only the number itself is captured.
phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")

# http(s) URL whose host is a domain name, "localhost" or a dotted quad,
# with optional port and path/query. Matching is case-insensitive.
url_regex = re.compile(
    r'http[s]?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'
    r'localhost|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)',
    re.I,
)

# Street address: 1-4 digit house number, up to 20 word/space characters,
# then a street-type keyword. Matching is case-insensitive.
street_regex = re.compile(
    r'\d{1,4} [\w\s]{1,20}(?:street|apt|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|parkway|pkwy|circle|cir|boulevard|blvd)\W?(?=\s|$)',
    re.I,
)

# Preprocessing

In [None]:
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    """Locate every non-overlapping occurrence of `target` inside `document`.

    Parameters:
    - target: Token sequence to look for.
    - document: Token sequence to search in.

    Returns:
    - A list of spans, one per occurrence, each span being the list of
      consecutive `document` indices covered by that occurrence. Occurrences
      are found left to right and do not overlap. An empty `target` yields
      no spans.
    """
    width = len(target)
    if width == 0:
        # Nothing to search for; also avoids indexing into an empty target.
        return []

    spans = []
    start = 0
    last_start = len(document) - width

    # Compare a full window at every candidate start. The previous
    # token-by-token scan dropped the token that broke a partial match
    # instead of re-testing it as a new start, so e.g. ['a', 'b'] was not
    # found in ['a', 'a', 'b'].
    while start <= last_start:
        if document[start:start + width] == target:
            spans.append(list(range(start, start + width)))
            start += width  # resume after the match: spans never overlap
        else:
            start += 1

    return spans

In [None]:
# Load the raw competition files. Context managers close the file handles
# deterministically, and JSON is read as UTF-8 regardless of the locale.
with open(config.train_dataset_path, encoding="utf-8") as train_file:
    data = json.load(train_file)
with open(config.test_dataset_path, encoding="utf-8") as test_file:
    test_data = json.load(test_file)

print('num_samples:', len(data))
print(data[0].keys())

In [None]:
# Every distinct tag that occurs in the training documents, in sorted order.
all_labels = sorted({label for sample in data for label in sample["labels"]})

# Forward (tag -> class index) and reverse (class index -> tag) lookups.
label2id = {label: index for index, label in enumerate(all_labels)}
id2label = dict(enumerate(all_labels))

print(id2label)

In [None]:
# One tokenizer, loaded from the first checkpoint in config.model_paths, is
# used to tokenize the data fed to every model in the ensemble.
# NOTE(review): this presumes all checkpoints share a vocabulary — confirm.
first_model_path = list(config.model_paths)[0]
tokenizer = AutoTokenizer.from_pretrained(first_model_path)

In [None]:
# Training documents as a DataFrame, one row per entry of `data`.
df_train = pd.DataFrame(data)
# Notebook preview of the first rows; has no effect when run as a script.
df_train.head(5)

In [None]:
# Assign each document to one of four folds from its `document` value
# (remainder modulo 4).
df_train['fold'] = df_train['document'] % 4
# Notebook preview; has no effect when run as a script.
df_train.head(3)

In [None]:
# Test documents as a DataFrame, one row per entry of `test_data`.
df_test = pd.DataFrame(test_data)
# Notebook preview; has no effect when run as a script.
df_test.head(3)

In [None]:
def downsample_df(train_df, percent):
    """Keep every document with a PII label and a random share of the rest.

    Parameters:
    - train_df: DataFrame with a 'labels' column holding one list of tags
      per document. It is not modified.
    - percent: Fraction (0..1) of the documents whose tags are all 'O' to
      keep.

    Returns:
    - A new DataFrame with an added boolean 'is_labels' column: first all
      documents with at least one non-'O' tag, then the sampled all-'O'
      documents (fixed random_state=42, so the sample is reproducible).
      The original index values are preserved.
    """
    has_entity = train_df['labels'].apply(
        lambda labels: any(label != 'O' for label in labels)
    ).astype(bool)

    # assign() returns a copy, so the caller's frame does not silently gain
    # an 'is_labels' column as a side effect.
    tagged_df = train_df.assign(is_labels=has_entity)

    true_samples = tagged_df[has_entity]
    false_samples = tagged_df[~has_entity]

    n_false_samples = int(len(false_samples) * percent)
    downsampled_false_samples = false_samples.sample(n=n_false_samples, random_state=42)

    return pd.concat([true_samples, downsampled_false_samples])

In [None]:
def tokenize_row(example):
    """Rebuild a document's text from its word-level tokens and tokenize it.

    Parameters:
    - example: Mapping with "tokens" (word-level strings) and
      "trailing_whitespace" (one flag per token).

    Returns:
    - Dict with the tokenizer's "input_ids", "attention_mask" and
      "offset_mapping", plus "token_map": for every character of the rebuilt
      text, the index of the word-level token it belongs to, or -1 for a
      space inserted after a token.
    """
    pieces = []
    char_to_token = []

    pairs = zip(example["tokens"], example["trailing_whitespace"])
    for token_index, (token, has_space) in enumerate(pairs):
        pieces.append(token)
        char_to_token += [token_index] * len(token)
        if has_space:
            # The separating space belongs to no token.
            pieces.append(" ")
            char_to_token.append(-1)

    encoding = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=config.truncation,
        max_length=config.max_length,
    )

    return {
        "input_ids": encoding.input_ids,
        "attention_mask": encoding.attention_mask,
        "offset_mapping": encoding.offset_mapping,
        "token_map": char_to_token,
    }

In [None]:
# Tokenize the data and save it to disk: per-fold train datasets when
# debugging on train data, otherwise the test dataset.
# `debug_on_train_df` is not defined in the visible part of this file.
# When config.load_from_disk is not None, nothing is built here and `ds`
# is left unassigned by this cell.
if debug_on_train_df:


    if config.load_from_disk is None:

        df_train['fold'] = df_train['document'] % 4
        df_train.head(3)

        # NOTE(review): folds are `document % 4`, so for non-negative document
        # ids fold -1 selects no rows — confirm the -1 start is intended.
        for i in range(-1, 4):
            train_df = df_train[df_train['fold']==i].reset_index(drop=True)

            # config.valid_stride is set here but not read anywhere in the
            # visible code.
            if i==config.trn_fold:
                config.valid_stride = True
            if i!=config.trn_fold and config.downsample > 0:
                # Only folds other than trn_fold are downsampled.
                train_df = downsample_df(train_df, config.downsample)
                config.valid_stride = False

            train_df = train_df  # no-op self-assignment
            print(len(train_df))
            ds = Dataset.from_pandas(train_df)

            ds = ds.map(
              tokenize_row,
              batched=False,
              num_proc=2,
              desc="Tokenizing",
            )

            ds.save_to_disk(f"{config.save_dir}fold_{i}.dataset")
            # NOTE(review): the pickle path does not contain the fold index,
            # so each iteration overwrites the previous file and only the last
            # fold's frame survives.
            with open(f"{config.save_dir}_pkl", "wb") as fp:
                pickle.dump(train_df, fp)
            print("Saving dataset to disk:", config.save_dir)

else:

    if config.load_from_disk is None:

        config.valid_stride = True
        print(len(df_test))

        # After this branch `ds` is the tokenized test dataset, keeping the
        # original columns alongside the ones tokenize_row returns.
        ds = Dataset.from_pandas(df_test)
        ds = ds.map(
          tokenize_row,
          batched=False,
          num_proc=2,
          desc="Tokenizing",
          )

        ds.save_to_disk(f"{config.save_dir}test.dataset")
        print("Saving dataset to disk:", config.save_dir)



# Inference & quantization

In [None]:
def process_predictions(flattened_preds):
    """
    Converts per-sample logits into class probabilities.

    Parameters:
    - flattened_preds: Iterable of logit tensors whose last dimension indexes the classes.

    Returns:
    - List with one tensor per input, softmax-normalised over the last dimension.
    """
    return [torch.softmax(sample_logits, dim=-1) for sample_logits in flattened_preds]

In [None]:
from transformers.convert_graph_to_onnx import convert
from onnxconverter_common import auto_convert_mixed_precision_model_path
import onnx
import torch.onnx
import onnxruntime

def predict_and_convert(data_loader, model, config, onnx_model_path):
    """
    Exports the given model to the ONNX format, tracing it with the first batch of the data loader.

    Parameters:
    - data_loader: DataLoader whose first batch provides 'input_ids' and 'attention_mask'.
    - model: The model to export. It is switched to eval mode and moved to config.device in place.
    - config: Configuration object; only `config.device` is read.
    - onnx_model_path: Path where the ONNX model will be saved.

    Returns:
    - prediction_outputs: Always an empty list; kept so existing callers that
      assign the result keep working.
    """

    model.eval()
    # The export traces a real forward pass, so the model and the example
    # inputs must live on the same device. The model was previously left
    # where it was loaded while only the inputs were moved.
    model.to(config.device)

    prediction_outputs = []

    batch = next(iter(data_loader))

    with torch.no_grad():

        inputs = {key: val.reshape(val.shape[0], -1).to(config.device) for key, val in batch.items() if key in ['input_ids', 'attention_mask']}
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        torch.onnx.export(model,
                          args=(input_ids, attention_mask),
                          f=onnx_model_path,
                          opset_version=12,
                          input_names=['input_ids', 'attention_mask'],
                          output_names=['logits'],
                          # Batch size and sequence length vary at inference
                          # time; declare them dynamic on the output as well
                          # as on the inputs.
                          dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'},
                                        'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
                                        'logits': {0: 'batch_size', 1: 'sequence_length'}}
                          )

    print("Model saved to", onnx_model_path)

    return prediction_outputs


def predict_and_quant(data_loader, config, original_onnx_model_path, output_file_name, data_path):
    """
    Converts an ONNX model to mixed precision, using the first batch of the data loader as the sample feed.

    Parameters:
    - data_loader: DataLoader whose first batch provides 'input_ids' and 'attention_mask'.
    - config: Configuration object; only `config.device` is read.
    - original_onnx_model_path: Path of the ONNX model to convert.
    - output_file_name: Path the converted ONNX model is written to.
    - data_path: Passed to the converter as `location`.

    Returns:
    - A one-element list holding the placeholder value 0.
    """
    wanted_keys = ['input_ids', 'attention_mask']
    first_batch = next(iter(data_loader))

    with torch.no_grad():
        # Flatten each tensor to (batch, -1) and move it to the configured device.
        on_device = {
            name: tensor.reshape(tensor.shape[0], -1).to(config.device)
            for name, tensor in first_batch.items()
            if name in wanted_keys
        }
        ids_tensor = on_device['input_ids']
        mask_tensor = on_device['attention_mask']

        print("Quantization")

        # The converter takes its sample feed as NumPy arrays.
        sample_feed = {
            "input_ids": ids_tensor.cpu().numpy(),
            "attention_mask": mask_tensor.cpu().numpy(),
        }

        auto_convert_mixed_precision_model_path(
            original_onnx_model_path,
            sample_feed,
            output_file_name,
            provider=['CUDAExecutionProvider'],
            location=data_path,
            rtol=2,
            atol=20,
            keep_io_types=True,
            verbose=True,
        )

        placeholder_outputs = [0]

    print("Model saved to", output_file_name)

    return placeholder_outputs


def predict(data_loader, session, config):
    """
    Performs inference with an ONNX Runtime session over all batches from a data loader.

    Parameters:
    - data_loader: Iterable of batches providing 'input_ids' and 'attention_mask' tensors.
    - session: The ONNX Runtime session used for inference; its first output is taken as the logits.
    - config: Configuration object. Unused; kept so existing call sites stay valid.

    Returns:
    - processed_predictions: One softmax-normalised tensor per sample, in data loader order.
    """

    prediction_outputs = []

    for batch in tqdm(data_loader, desc="Predicting"):
        # The session is fed NumPy arrays, so the batch is converted directly.
        # The earlier detour through `config.device` copied every batch to
        # the device and straight back without changing the data.
        input_data = {
            key: batch[key].reshape(batch[key].shape[0], -1).cpu().numpy()
            for key in ("input_ids", "attention_mask")
        }

        onnx_outputs = session.run(None, input_data)

        prediction_outputs.append(torch.tensor(onnx_outputs[0]))

    # Split each batch tensor into its per-sample tensors.
    per_sample_logits = [logit for batch_logits in prediction_outputs for logit in batch_logits]

    processed_predictions = process_predictions(per_sample_logits)

    return processed_predictions

In [None]:
def process_predictions_ans(flattened_preds, threshold=0.9):
    """
    Picks a class per token, preferring a non-'O' class unless 'O' is confident enough.
    The inputs are expected to be probabilities already (no softmax is applied here).

    Parameters:
    - flattened_preds: A list of 2-D tensors; column 12 is read as the 'O'
      class and columns 0-11 as the remaining classes.
    - threshold: If the column-12 probability is below this value, the best of columns 0-11 is chosen;
      otherwise the overall argmax is chosen.

    Returns:
    - preds_final: A list of numpy arrays of class indices, one array per input tensor.
    """

    print("\nPrediction")

    def _decide(probabilities):
        best_overall = probabilities.argmax(-1)
        best_entity = probabilities[:, :12].argmax(-1)
        outside_is_weak = probabilities[:, 12] < threshold
        chosen = torch.where(outside_is_weak, best_entity, best_overall)
        return chosen.numpy()

    return [_decide(sample_probs) for sample_probs in flattened_preds]

In [None]:
# Only these columns are kept for the model; everything else is dropped
# before batching.
keep_cols = {"input_ids", "attention_mask"}
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=512)

# Pick the saved dataset: the configured train fold when debugging,
# otherwise the test set.
if debug_on_train_df:
    fold = config.trn_fold
    dataset_dir = f'{config.save_dir}fold_{fold}.dataset'
else:
    dataset_dir = f'{config.save_dir}test.dataset'

test_ds = load_from_disk(dataset_dir)
drop_cols = [c for c in test_ds.column_names if c not in keep_cols]
test_ds = test_ds.remove_columns(drop_cols)

config.data_length = len(test_ds)
config.len_token = len(tokenizer)

print('Dataset Loaded....')
if debug_on_train_df:
    # The debug path additionally prints the dataset summary.
    print(test_ds)
print((test_ds[0].keys()))

print("Generating Test DataLoader")
test_dataloader = DataLoader(
    test_ds,
    batch_size=config.batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=False,
    collate_fn=collator,
)

In [None]:
# Run every model of the ensemble over the test dataloader.
# Despite its name, `predictions_softmax_logits` collects what predict()
# returns: one list per model of per-sample softmax probabilities.
predictions_softmax_logits = []
all_preds = []

# `weight` is not used inside this loop; the weights are applied later, when
# the per-model probabilities are averaged.
for model_path, weight in config.model_paths.items():

    fold = config.trn_fold

    # `convert_before_inference` and `temp_data_folder` are not defined in the
    # visible part of this file.
    if convert_before_inference:

        model = AutoModelForTokenClassification.from_pretrained(model_path)

        # Step 1: export the PyTorch model to ONNX. The returned empty list is
        # overwritten below.
        converted_model_name = temp_data_folder + "original_model.onnx"
        predictions_softmax_all = predict_and_convert(test_dataloader, model, config, converted_model_name)
        # Drop the PyTorch model before the next stage to free memory.
        del model
        gc.collect()
        torch.cuda.empty_cache()

        # Step 2: convert the exported model to mixed precision. The output
        # name is built from the last path component of the checkpoint and the
        # fold; the ".data" name is passed on as the converter's `location`.
        quantized_model_name = "/kaggle/working/optimized" + model_path.split("/")[-1] + "_f" + str(fold) + ".onnx"
        quantized_data_path = "optimized" + model_path.split("/")[-1] + "_f" + str(fold) + ".data"
        predictions_softmax_all = predict_and_quant(test_dataloader, config, converted_model_name, quantized_model_name, quantized_data_path)

    else:
        # Use an already-converted model from config.converted_path, named with
        # the same "optimized<name>_f<fold>.onnx" scheme.
        quantized_model_name = config.converted_path + "/optimized" + model_path.split("/")[-1] + "_f" + str(fold) + ".onnx"


    print("Inference")

    session = onnxruntime.InferenceSession(quantized_model_name, providers=['CUDAExecutionProvider'])

    predictions_softmax_all = predict(test_dataloader, session, config)

    predictions_softmax_logits.append(predictions_softmax_all)

# The dataloader and dataset are no longer needed once all models have run.
del test_dataloader, test_ds
gc.collect()
torch.cuda.empty_cache()

# Making final preds

In [None]:
# Weighted average of the per-model probabilities, one tensor per sample.
predictions_mean_all = []

model_weights = list(config.model_paths.values())
total_weight = sum(config.model_paths.values())
print(f"Total weight: {total_weight}")

# The first model's predictions define the sample count and tensor shapes.
for sample_index, reference_prediction in enumerate(predictions_softmax_logits[0]):

    blended = torch.zeros(reference_prediction.size())

    # Accumulate each model's contribution, scaled by its normalised weight.
    for model_predictions, model_weight in zip(predictions_softmax_logits, model_weights):
        blended += model_predictions[sample_index] * (model_weight / total_weight)

    predictions_mean_all.append(blended)

In [None]:
# `processed_predictions` is read by the loop below but was never assigned at
# module level, so this cell raised NameError. Derive it from the ensemble
# average using the module-level `threshold`.
processed_predictions = process_predictions_ans(predictions_mean_all, threshold)

triplets = []
pairs = set()
processed = []
emails = []
phone_nums = []
urls = []
streets = []
print(id2label)

# (document, token index) pairs already emitted by each regex pass, so that a
# value occurring several times in a document is not reported twice.
phone_pairs = set()
url_pairs = set()


def _collect_regex_spans(matches, label_suffix, doc, tokens, rows, seen):
    """Append B-/I-<label_suffix> rows for every occurrence of each match in `tokens`."""
    # dict.fromkeys de-duplicates the matches while keeping their order.
    for match in dict.fromkeys(matches):
        target = [t.text for t in nlp.tokenizer(match)]
        if not target:
            continue
        # Handle the spans of *every* match. The span loop used to sit
        # outside the match loop and therefore only saw the last match.
        for matched_span in find_span(target, tokens):
            for intermediate, token_idx in enumerate(matched_span):
                key = (doc, token_idx)
                if key in seen:
                    continue
                seen.add(key)
                prefix = "I" if intermediate else "B"
                rows.append(
                    {"document": doc, "token": token_idx, "label": f"{prefix}-{label_suffix}", "token_str": tokens[token_idx]}
                )


# NOTE(review): `ds` is whatever dataset the tokenization cell assigned last.
# In debug mode that is the final loop iteration's fold, while the predictions
# come from fold `config.trn_fold` — confirm these line up.
for p, token_map, offsets, tokens, doc, full_text in zip(
    processed_predictions,
    ds["token_map"],
    ds["offset_mapping"],
    ds["tokens"],
    ds["document"],
    ds["full_text"]
):

    # --- Model predictions: map each sub-word prediction to its word token.
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[int(token_pred)]
        # (0, 0) offsets are skipped: they carry no text span.
        if start_idx + end_idx == 0:
            continue
        # Skip a leading inserted space, then any whitespace-only tokens.
        if token_map[start_idx] == -1:
            start_idx += 1
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map):
            break
        token_id = token_map[start_idx]
        # E-mail and phone labels are left to the regex passes below.
        if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
            continue
        pair = (doc, token_id)
        if pair not in pairs:
            processed.append({"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]})
            pairs.add(pair)

    # --- E-mails: a token that is, as a whole, an e-mail address.
    for token_idx, token in enumerate(tokens):
        if email_regex.fullmatch(token) is not None:
            emails.append(
                {"document": doc, "token": token_idx, "label": "B-EMAIL", "token_str": token}
            )

    # --- Phone numbers and URLs found in the raw text. Each pass runs for
    # every document. Previously a document without a phone number hit
    # `continue` and its URL pass was skipped.
    _collect_regex_spans(phone_num_regex.findall(full_text), "PHONE_NUM", doc, tokens, phone_nums, phone_pairs)
    _collect_regex_spans(url_regex.findall(full_text), "URL_PERSONAL", doc, tokens, urls, url_pairs)

In [None]:
# Combine model and regex detections into the submission table.
# Naming the columns explicitly keeps the frame well-formed when nothing was
# detected: without it an empty list gives a frame with no columns, and the
# column selection below raises KeyError.
submission_columns = ["document", "token", "label", "token_str"]
df = pd.DataFrame(processed + phone_nums + emails + urls, columns=submission_columns)

df["row_id"] = list(range(len(df)))

df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)
# Notebook preview; has no effect when run as a script.
df