In [1]:
from tqdm import tqdm
import warnings 
import numpy as np
import pandas as pd 
from torch.optim import AdamW
import torch.nn as nn
import torch 
import re
import gc
import codecs
from transformers import AutoModelForTokenClassification , AutoConfig ,AutoTokenizer, set_seed,Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from itertools import chain
from text_unidecode import unidecode
from typing import Any, Optional, Tuple
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import os 
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')
os.environ["WANDB_DISABLED"] = "true"



In [2]:
train_data=pd.read_csv('../input/feedback-prize-effectiveness/train.csv')

In [3]:
class Config:
    max_length = 1024
    model='google/bigbird-roberta-base'
    learning_rate=9e-5
    train_batch_size = 4
    test_batch_size = 2 
    num_gradient_accumlation = 2 
    num_of_classlabels=3
    weight_deccay=0.1
    epochs=10
    
    
    

In [4]:
train_data['disclouse_start']=-1
train_data['disclouse_end']=-1

In [5]:
data=train_data.groupby(['essay_id'])['discourse_text'].apply(list)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(Config.model)

Downloading:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

In [7]:
train_data['discourse_type'].unique()

array(['Lead', 'Position', 'Claim', 'Evidence', 'Counterclaim',
       'Rebuttal', 'Concluding Statement'], dtype=object)

In [8]:
# Below code are copied from kaggle 
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end


# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

previous_id = ''
for idx, input_data in  enumerate(train_data[['essay_id','discourse_text']].values):
    # check for id 
    if previous_id != input_data[0]:
        with open(f'../input/feedback-prize-effectiveness/train/{input_data[0]}.txt','r') as f:
            data=f.read()
        main_text=resolve_encodings_and_normalize(data)    
        previous_id = input_data[0]
 
    matches = list(re.finditer(re.escape(input_data[1].strip()), main_text))
    
    
    if(len(matches)==0):
        train_data.loc[idx,'disclouse_start']=-1
        train_data.loc[idx,'disclouse_end']=-1
    else:
        # if the matches is greater than > 1 i gone pick  first match 
        match=matches[0]
        train_data.loc[idx,'disclouse_start']=match.start()
        train_data.loc[idx,'disclouse_end']=match.end()  

In [9]:
# Lets create a map for disclouser_type 

disclouser_types=['Lead',
                  'Position', 
                  'Claim', 
                  'Evidence', 
                  'Counterclaim',
                   'Rebuttal',
                   'Concluding Statement']
# Create token_ids for above disclouser types 
start_tokens = { x: f"[CLS_{x.upper()}]" for x in disclouser_types}
end_tokens =   { x: f"[SEP_{x.upper()}]"  for x in disclouser_types}

In [10]:
# lets add new tokens to the tokenizer 
tokenizer=AutoTokenizer.from_pretrained(Config.model)
tokenizer.add_special_tokens({"additional_special_tokens": list(start_tokens.values())+list(end_tokens.values())})

14

In [11]:
train_data['discourse_effectiveness'].unique()

array(['Adequate', 'Ineffective', 'Effective'], dtype=object)

In [12]:
# Lets convert class labels to ids & ids to class lable reason behing it was during inferene time instead of 0,1,2 we gone get actual class value 
lable2id={
    "Adequate":0,
    "Ineffective":1,
    "Effective":2
    
}
id2label = {V:K for K ,V in lable2id.items()}

# And Map actual token_id with label 

label_token_map ={
    K:tokenizer.encode(V)[1] for K ,V in start_tokens.items()
}

In [13]:
# Lets add CLS_disclouser_type & END_Disclouser_type to the complete essay string
previous_id = ''
data_chunks =[]
token_ids_chunks = []
data_chunks_dict ={}
for essay_id , start , end , dis_type in  zip(train_data['essay_id'],train_data['disclouse_start'],train_data['disclouse_end'],train_data['discourse_type']):
    # check for id 
    
    if previous_id != essay_id:
        with open(f'../input/feedback-prize-effectiveness/train/{essay_id}.txt','r') as f:
            data=f.read()
        main_text=resolve_encodings_and_normalize(data)  
        
    if int(start) == -1 & int(end) == -1:
        continue 
 
    if essay_id not in data_chunks_dict:
        data_chunks_dict[essay_id]=[]
        data_chunks_dict[essay_id].append(start_tokens[dis_type])
        data_chunks_dict[essay_id].append(data[start:end])
        data_chunks_dict[essay_id].append(end_tokens[dis_type])
    else:
        data_chunks_dict[essay_id].append(start_tokens[dis_type])
        data_chunks_dict[essay_id].append(data[start:end])
        data_chunks_dict[essay_id].append(end_tokens[dis_type])
       
    

In [14]:
train_data.loc[:,'labelids']=train_data['discourse_effectiveness'].map(lable2id)
label_output = train_data.groupby(['essay_id'])['labelids'].apply(list)

In [15]:
# create a dataset using labels & text data 
tokenized_data=[]
labels=[]
for K ,V in data_chunks_dict.items():
    tokenized = tokenizer(''.join(V),max_length=Config.max_length,padding='max_length',truncation=True)
#     tokenized_data.append(tokenized)
    # lets iterate over tokens 
    label=[] 
    idx=0
    for _id in tokenized['input_ids']:
        if _id in label_token_map.values():
            label.append(label_output[K][idx])
            idx+=1
        else:
            # Cross entropy will be ignored when -100 
            label.append(-100)
            
    tokenized['label']=label
    tokenized_data.append(tokenized)

In [16]:
[x  for x in tokenized_data[0]['label'] if x in [0,1,2]]

[0, 0, 0, 0, 0, 1, 0, 0, 0]

In [17]:
# did the santy check to make sure the labels are correct 
label_output['007ACE74B050']

[0, 0, 0, 0, 0, 1, 0, 0, 0]

In [18]:
# Lets convert list of dict input_ids ,attention_mask , labels to Dataset 
df = pd.DataFrame(tokenized_data)

X_train , X_eval = train_test_split(df,test_size=0.2,random_state=42,shuffle=True)

# Convert to dataset 
train_dataset = Dataset.from_pandas(X_train) # train data set 
eval_dataset = Dataset.from_pandas(X_eval)

In [19]:
! mkdir output_model

In [20]:
# Model trainarguments & trainer 
args = TrainingArguments(
       output_dir ='/output',
       do_train=False,
       do_eval=True,
       per_device_train_batch_size = Config.train_batch_size,
       per_device_eval_batch_size=Config.test_batch_size,
       gradient_accumulation_steps=Config.num_gradient_accumlation, # acuumulate  batches before the update 
       num_train_epochs=Config.epochs,
       learning_rate = Config.learning_rate,
       weight_decay =Config.weight_deccay, # its nothing but lambda parameter for regualrizer
       metric_for_best_model= 'loss',
       optim='adamw_torch', # use adam optinmizer 
       evaluation_strategy='epoch',
       logging_strategy='steps',
       logging_steps=50
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [21]:
# Build the Model 
model_config = AutoConfig.from_pretrained(
            Config.model,
        )
model_config.update(
        {
            "num_labels": 3,
            "cls_tokens": list(label_token_map.values()), 
            "lable2id": lable2id, # reason of usinf label2id  and opposite is during inference instead of value as o or 1 we can get actual valiue of the class
            "id2label": id2label,
            "attention_type":'original_full' # if the tokens count is less than 1024 use original it would be o(n**2) attention
        }
)
    
model = AutoModelForTokenClassification.from_pretrained(Config.model, config=model_config)
    
    # Because tokens were added, it is important to resize the embeddings
model.resize_token_embeddings(len(tokenizer)) 


Downloading:   0%|          | 0.00/489M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForTokenClassification were no

Embedding(50372, 768)

In [22]:
trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer
    )
trainer.train()
    
del model
gc.collect()
torch.cuda.empty_cache()
    

The following columns in the training set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3340
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 4170


Epoch,Training Loss,Validation Loss
0,0.8816,0.846878
1,0.7897,0.845622
2,0.7846,0.858485
3,0.7736,0.834126
4,0.7775,0.810718
5,0.7419,0.805701
6,0.6844,0.78436
7,0.6695,0.775083
8,0.6916,0.790753
9,0.6705,0.779085


The following columns in the evaluation set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 835
  Batch size = 2
Saving model checkpoint to /output/checkpoint-500
Configuration saved in /output/checkpoint-500/config.json
Model weights saved in /output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /output/checkpoint-500/tokenizer_config.json
Special tokens file saved in /output/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num ex

In [None]:
# With proper hyperparameter  tunning it could yield better results 