In [1]:
import json
import copy
import gc
import os
import re
from collections import defaultdict
from pathlib import Path

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from spacy.lang.en import English

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from transformers import AutoModel, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers.data.data_collator import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, concatenate_datasets
import wandb
from transformers import TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import get_polynomial_decay_schedule_with_warmup,get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup

2024-04-01 17:03:04.105314: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 17:03:04.105439: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-01 17:03:04.240545: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
class CFG:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    seed = 69
    # dataset path 
    train_dataset_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    external_dataset_path_1 = "/kaggle/input/pii-mixtral8x7b-generated-essays/mpware_mixtral8x7b_v1.1-no-i-username.json"
    test_dataset_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
    sample_submission_path = "/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv"
    save_dir="/kaggle/working/exp1"

    #tokenizer params
    downsample = 0.45
    truncation = True 
    padding = False #'max_length'
    max_length = 1024
    freeze_layers = 0
    # model params
    model_name = "Qwen/Qwen1.5-0.5B"
    
    target_cols = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL','O']

    load_from_disk = None
    #training params
    learning_rate = 1e-5
    batch_size = 2
    epochs = 3


seed_everything(CFG.seed)

69

In [3]:
original_data = None
mpware=None

In [4]:
with Path(CFG.train_dataset_path).open("r") as f:
    original_data = json.load(f)

with Path(CFG.external_dataset_path_1).open("r") as f:
    mpware = json.load(f)
print("MPWARE's datapoints: ", len(mpware))

MPWARE's datapoints:  2692


In [5]:
df_train = pd.DataFrame(original_data)
df_mpware = pd.DataFrame(mpware)
df_mpware['document'] =  [i+30000 for i in range(len(df_mpware))]

In [6]:
df_train = pd.concat([df_train,df_mpware],axis=0).reset_index(drop=True)

In [7]:
if not os.path.exists(CFG.save_dir):
  os.makedirs(CFG.save_dir)

In [8]:
all_labels = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O'
]
id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

In [9]:
def tokenize_row(example):
    text = []
    token_map = []
    labels = []
    targets = []
    idx = 0
    for t, l, ws in zip(example["tokens"], example["labels"], example["trailing_whitespace"]):
        text.append(t)
        labels.extend([l]*len(t))
        token_map.extend([idx]*len(t))

        if l in CFG.target_cols:  
            targets.append(1)
        else:
            targets.append(0)
        
        if ws:
            text.append(" ")
            labels.append("O")
            token_map.append(-1)
        idx += 1

    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=CFG.max_length)  # Adjust max_length if needed
        
    target_num = sum(targets)
    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        if start_idx == 0 and end_idx == 0: 
            token_labels.append(label2id["O"])
            continue
        
        if text[start_idx].isspace():
            start_idx += 1
        try:
            token_labels.append(label2id[labels[start_idx]])
        except:
            continue
    length = len(tokenized.input_ids)
    
    return {
        "input_ids": tokenized.input_ids,
        "attention_mask": tokenized.attention_mask,
        "offset_mapping": tokenized.offset_mapping,
        "labels": token_labels,
        "length": length,
        "target_num": target_num,
        "group": 1 if target_num > 0 else 0,
        "token_map": token_map,
    }

In [10]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
tokenizer.save_pretrained(f'{CFG.save_dir}')

tokenizer_config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('/kaggle/working/exp1/tokenizer_config.json',
 '/kaggle/working/exp1/special_tokens_map.json',
 '/kaggle/working/exp1/vocab.json',
 '/kaggle/working/exp1/merges.txt',
 '/kaggle/working/exp1/added_tokens.json',
 '/kaggle/working/exp1/tokenizer.json')

In [11]:
import pandas as pd

def downsample_df(train_df, percent):

    train_df['is_labels'] = train_df['labels'].apply(lambda labels: any(label != 'O' for label in labels))
    
    true_samples = train_df[train_df['is_labels'] == True]
    false_samples = train_df[train_df['is_labels'] == False]
    
    n_false_samples = int(len(false_samples) * percent)
    downsampled_false_samples = false_samples.sample(n=n_false_samples, random_state=42)
    
    downsampled_df = pd.concat([true_samples, downsampled_false_samples])    
    return downsampled_df

In [12]:
def add_token_indices(doc_tokens):
    token_indices = list(range(len(doc_tokens)))
    return token_indices

df_train['token_indices'] = df_train['tokens'].apply(add_token_indices)

In [13]:
df_train = df_train[:10]

In [14]:
df_train = downsample_df(df_train, CFG.downsample)
ds = Dataset.from_pandas(df_train)
ds = ds.map(
          tokenize_row,
          batched=False,
          num_proc=2,
          desc="Tokenizing",
      )

    

Tokenizing #0:   0%|          | 0/5 [00:00<?, ?ex/s]

Tokenizing #1:   0%|          | 0/5 [00:00<?, ?ex/s]

In [15]:
import random

class LSTMHead(nn.Module):
    def __init__(self, in_features, hidden_dim, n_layers):
        super().__init__()
        self.lstm = nn.LSTM(in_features,
                            hidden_dim,
                            n_layers,
                            batch_first=True,
                            bidirectional=True,
                            dropout=0.1)
        self.out_features = hidden_dim

    def forward(self, x):
        self.lstm.flatten_parameters()
        hidden, (_, _) = self.lstm(x)
        out = hidden
        return out

    
class PIIModel(pl.LightningModule):
    def __init__(self,config):
        super().__init__()
        self.cfg = config
        self.model_config = AutoConfig.from_pretrained(
            config.model_name,
        )

        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7
        self.model_config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
            }
        )

        self.transformers_model = AutoModel.from_pretrained(self.cfg.model_name,config=self.model_config)
        self.head = LSTMHead(in_features=self.model_config.hidden_size, hidden_dim=self.model_config.hidden_size//2, n_layers=1)

        self.output = nn.Linear(self.model_config.hidden_size, len(self.cfg.target_cols))
        
        if self.cfg.freeze_layers>0:
            print(f'Freezing {self.cfg.freeze_layers} layers.')
            for layer in self.transformers_model.longformer.encoder.layer[:self.cfg.freeze_layers]:
                for param in layer.parameters():
                    param.requires_grad = False


        self.loss_function = nn.CrossEntropyLoss(reduction='mean',ignore_index=-100) 
        self.validation_step_outputs = []

    def forward(self, input_ids, attention_mask,train):
        
        transformer_out = self.transformers_model(input_ids,attention_mask = attention_mask)
        sequence_output = transformer_out.last_hidden_state
        sequence_output = self.head(sequence_output)
        logits = self.output(sequence_output)

        return (logits, _)
    

    def training_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        target = batch['labels'] 

        outputs = self(input_ids,attention_mask,train=True)
        output = outputs[0]
        loss = self.loss_function(output.view(-1,len(self.cfg.target_cols)), target.view(-1))
        
        self.log('train_loss', loss , prog_bar=True)
        return {'loss': loss}
    
    def train_epoch_end(self,outputs):
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        print(f'epoch {trainer.current_epoch} training loss {avg_loss}')
        return {'train_loss': avg_loss} 
        
    def train_dataloader(self):
        return self._train_dataloader 

    def get_optimizer_params(self, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in self.transformers_model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in self.transformers_model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in self.named_parameters() if "transformers_model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr = self.cfg.learning_rate)

        epoch_steps = self.cfg.data_length
        batch_size = self.cfg.batch_size

        warmup_steps = 0.05 * epoch_steps // batch_size
        training_steps = self.cfg.epochs * epoch_steps // batch_size
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, training_steps, num_cycles=0.5)
        
        lr_scheduler_config = {
                'scheduler': scheduler,
                'interval': 'step',
                'frequency': 1,
            }

        return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler_config}

In [16]:
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=512)

In [17]:
keep_cols = {"input_ids", "attention_mask", "labels"}
train_ds = ds
train_ds = train_ds.remove_columns([c for c in train_ds.column_names if c not in keep_cols])
CFG.data_length = len(train_ds)
CFG.len_token = len(tokenizer)

In [18]:
print('Dataset Loaded....')
print((train_ds[0].keys()))
print("Generating Train DataLoader")
train_dataloader = DataLoader(train_ds, batch_size = CFG.batch_size, shuffle = True, num_workers= 4, pin_memory=False,collate_fn = collator)

Dataset Loaded....
dict_keys(['labels', 'input_ids', 'attention_mask'])
Generating Train DataLoader


In [19]:
early_stop_callback = EarlyStopping(monitor="train_loss", min_delta=0.00, patience=8, verbose= True, mode="min")
checkpoint_callback = ModelCheckpoint(monitor='train_loss',
                                          dirpath= CFG.save_dir,
                                      save_top_k=1,
                                      save_last= True,
                                      save_weights_only=True,
                                      verbose= True,
                                      mode='min')
    
print("Model Creation")

Model Creation


In [20]:
model = PIIModel(CFG)
# model.load_state_dict(torch.load('/home/nischay/PID/nbs/outputs2/exp12_baseline_debv3base_1024_extv1/ckeckpoint_0-v2.ckpt','cpu')['state_dict'])
trainer = Trainer(max_epochs= CFG.epochs,
                      deterministic=False,
                      accumulate_grad_batches=1, 
                      devices=[0],
                      precision=16, 
                      accelerator=CFG.device ,
                      callbacks=[checkpoint_callback,early_stop_callback]) 

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

/opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!


In [21]:
CFG.data_length = len(train_ds)
trainer.fit(model,train_dataloader)

/opt/conda/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:653: Checkpoint directory /kaggle/working/exp1 exists and is not empty.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (5) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]