In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pii-fine-tuned/pytorch/v1/1/model.safetensors
/kaggle/input/pii-test/test.json


In [8]:
from functools import reduce
from datasets import Dataset


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# The tokenizer and the fine-tuned token-classification weights are loaded from the
# same directory, so the path is spelled once and shared by both loaders.
# NOTE(review): the directory listing printed by the first cell shows the weights under
# .../pii-fine-tuned/pytorch/v1/1/ — confirm this path matches the attached model version.
MODEL_DIR = "/kaggle/input/pii-fine-tuned/transformers/v_with_config/1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)

In [4]:
from datasets import load_dataset
# Read the test documents from the JSON file. The 'train' split is requested explicitly;
# presumably that is the split name the json loader assigns to a bare file — TODO confirm.
dataset = load_dataset(
    'json',
    data_files='/kaggle/input/pii-test/test.json',
    split='train',
)


Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
def tokenize_and_align(example, overlap_size = 0, max_length = 512):
    """
    Tokenize one pre-split document into fixed-length, optionally overlapping chunks.

    To be used with datasets.map() with batched=False. Uses the module-level
    `tokenizer`.

    Takes in
        - example : an example from the datasets class; its 'tokens' field (the
          already-split words) and its 'document' field are read
        - overlap_size: the number of tokens that overlap between two consecutive chunks
        - max_length: the length every chunk is truncated / padded to (default 512)

    outputs:
        - the tokenizer output, holding one entry per chunk in every field, plus:
            - 'org_word_ids': for each chunk, the result of word_ids() for that
              chunk, i.e. for every position the index of the source word in
              example['tokens'], or None where the tokenizer reports no source word
            - 'document': example['document'] repeated once per chunk

    No labels are produced: this function is only used for inference.
    """

    # Offsets are not requested: nothing downstream reads them, and word_ids()
    # below does not depend on them.
    tokenized_inputs = tokenizer(
        example['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_overflowing_tokens=True,
        stride=overlap_size,
        return_tensors='pt',
    )
    # Bookkeeping field added by return_overflowing_tokens; only one document is
    # tokenized per call, so it carries no information worth keeping.
    tokenized_inputs.pop('overflow_to_sample_mapping', None)

    n_chunks = len(tokenized_inputs['input_ids'])

    # One list per chunk, so predictions can later be mapped back to word positions.
    tokenized_inputs['org_word_ids'] = [tokenized_inputs.word_ids(i) for i in range(n_chunks)]
    # Repeat the document id per chunk so each chunk stays attributable after flattening.
    tokenized_inputs['document'] = [example['document']] * n_chunks

    return tokenized_inputs
    
# batched=False: tokenize_and_align receives one document per call, so every column of
# the result holds, per document, a list with one entry per chunk.
data_tokenized = dataset.map(tokenize_and_align, batched=False)
# Sanity check: the document id of the first example, repeated once per chunk.
print(data_tokenized['document'][0])

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

[7, 7]


In [8]:
def flatten_data(data, keys_to_flatten):
    """
    Collapse the per-document nesting of the given columns into one flat dataset.

    Takes in
        - data: a dataset / mapping where data[key] is a sequence of lists
          (one inner list per document, one element per chunk)
        - keys_to_flatten: the column names to flatten and keep

    outputs:
        - a Dataset built from the flattened columns, i.e. one row per chunk;
          columns of `data` that are not listed are not carried over
    """

    data_flat = {
        # Single pass per column: avoids the quadratic cost of concatenating lists
        # pairwise, and an empty column becomes [] instead of raising.
        key: [chunk for chunks in data[key] for chunk in chunks]
        for key in keys_to_flatten
    }

    return Dataset.from_dict(data_flat)

# Columns to un-nest: the three model inputs plus the two bookkeeping columns needed to
# map predictions back to (document, word position).
keys_to_flatten = [
    'input_ids',
    'token_type_ids',
    'attention_mask',
    'org_word_ids',
    'document',
]

data_flat = flatten_data(data=data_tokenized, keys_to_flatten=keys_to_flatten)

input_ids
token_type_ids
attention_mask
org_word_ids
document


In [25]:
# torch is needed here; a later cell also imports it, but on a fresh top-to-bottom run
# this cell executes first.
import torch

print(len(data_flat))

# Return the three model-input columns as torch tensors when indexed.
data_flat.set_format(type='pt', columns=['input_ids', 'token_type_ids', 'attention_mask'])

# Inference only: no_grad() stops autograd from recording the forward pass, which
# would otherwise retain activations for every chunk.
# NOTE(review): all chunks are sent through the model as one batch; for larger test
# sets this should be split into mini-batches.
with torch.no_grad():
    outputs = model(
        input_ids=data_flat['input_ids'],
        token_type_ids=data_flat['token_type_ids'],
        attention_mask=data_flat['attention_mask'],
    )


21


In [26]:
data_flat

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'org_word_ids', 'document'],
    num_rows: 21
})

In [24]:
import numpy as np
import torch
# Turn the logits into per-label probabilities over the last axis, move them to numpy,
# then keep the index of the most probable label for every position.
predictions = torch.softmax(outputs.logits, dim=-1).detach().numpy()
test_preds = predictions.argmax(axis=-1)
print(len(test_preds))

21


In [None]:
# Display the model's integer-id -> label-name mapping.
# NOTE(review): presumably used to look up which id the prediction loop should skip
# (that loop hard-codes 14) — confirm 14 is the non-PII label in this mapping.
model.config.id2label

In [34]:
# Label id that is skipped when collecting predictions.
# NOTE(review): presumably the id of the "O" (non-PII) label — verify against
# model.config.id2label.
OUTSIDE_LABEL_ID = 14

document_list = []
token_id_list = []
label_id_list = []
# One iteration per chunk: its document id, its per-position word ids and its
# per-position predicted label ids.
for doc, word_ids, chunk_preds in zip(data_flat['document'], data_flat['org_word_ids'], test_preds):
    # Walk over every position of the chunk. (Iterating over range(len(test_preds))
    # would only visit as many positions as there are chunks and drop the rest.)
    for word_id, label_id in zip(word_ids, chunk_preds):
        # Positions without a source word have no row in the submission; positions
        # predicted as the skipped label are not reported either.
        if word_id is None or label_id == OUTSIDE_LABEL_ID:
            continue
        document_list.append(doc)
        token_id_list.append(word_id)
        label_id_list.append(label_id)

In [51]:
# One row per collected prediction: document id, word position, integer label id.
pred_df = pd.DataFrame(
    {"document": document_list, "token": token_id_list, "label_id": label_id_list}
)
# Translate the integer label id into its BIO-format label name.
pred_df["label"] = pred_df["label_id"].map(model.config.id2label)

# Identical (document, token, label_id) rows are collapsed to their first occurrence.
dedup_keys = ['document', 'token', 'label_id']
no_duplicates_df = pred_df.drop_duplicates(subset=dedup_keys).reset_index(drop=True)

# Drop the helper column and expose the positional index as the `row_id` column.
final_df = (
    no_duplicates_df
    .drop(columns=["label_id"])
    .rename_axis("row_id")
    .reset_index()
)
final_df.to_csv("submission.csv", index=False)