# BERT Model Fine-Tuning

### BERT model

In [105]:
import torch
from transformers import BertTokenizer, BertForTokenClassification


model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)


model.eval()


def perform_ner(text):
    
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    
    predictions = torch.argmax(outputs.logits, dim=2)

    
    predicted_tokens = [tokenizer.decode(input_id) for input_id in input_ids[0]]
    predicted_labels = [model.config.id2label[prediction.item()] for prediction in predictions[0]]

    return list(zip(predicted_tokens, predicted_labels))



ner_results = perform_ner(result_str)


print("Named Entity Recognition Results:")
for token, label in ner_results:
    if label != "O":  
        print(f"{token}: {label}")


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Named Entity Recognition Results:
Indian: I-ORG
Union: I-ORG
Government: I-ORG
Of: I-ORG
Tamil: I-LOC
Nadu: I-LOC
Burt: I-ORG
##od: I-ORG
Group: I-ORG
PA: I-ORG
##MA: I-ORG
K: I-ORG
##RI: I-ORG
##S: I-ORG
##H: I-ORG
##NA: I-ORG
##NR: I-ORG
S: I-ORG
##U: I-ORG
##BR: I-ORG
##AM: I-LOC
##AN: I-ORG
##I: I-ORG
##Y: I-ORG
##A: I-ORG
NE: I-ORG
##R: I-ORG
##U: I-ORG
N: I-ORG
##AR: I-ORG
V: I-LOC
##ad: I-LOC
##urai: I-LOC
Tamil: I-LOC
Nadu: I-LOC


### Fine Tune


In [106]:
import pandas as pd
from transformers import BertTokenizer


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


df = pd.read_csv(r'D:\nvm\obscure\Code\SampleData.csv')


df['Tag'].fillna('O', inplace=True)  # Replace NaN with 'O' or drop the rows



input_ids = []
attention_masks = []
label_ids = []
max_length = 128  


unique_labels = sorted(df['Tag'].unique())
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}


current_tokens = []
current_labels = []

for index, row in df.iterrows():
    token = row['Token']
    tag = row['Tag']

    if token == ':':  
        
        if current_tokens:
            
            tokens = ['[CLS]'] + current_tokens + ['[SEP]']
            labels = ['O'] + current_labels + ['O'] 
            
            input_id = tokenizer.convert_tokens_to_ids(tokens)
            
            attention_mask = [1] * len(input_id)

            padding_length = max_length - len(input_id)
            if padding_length > 0:
                input_id += [0] * padding_length  
                attention_mask += [0] * padding_length
            
            # Convert labels to IDs and pad
            label_id = [label_mapping[label] for label in labels]
            label_id += [0] * padding_length  # Pad with 0s
            
            input_ids.append(input_id)
            attention_masks.append(attention_mask)
            label_ids.append(label_id)

            current_tokens = []
            current_labels = []
    else:
    
        current_tokens.append(token)
        current_labels.append(tag)

if current_tokens:
    tokens = ['[CLS]'] + current_tokens + ['[SEP]']
    labels = ['O'] + current_labels + ['O']
    
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(input_id)

    padding_length = max_length - len(input_id)
    if padding_length > 0:
        input_id += [0] * padding_length
        attention_mask += [0] * padding_length
    
    label_id = [label_mapping[label] for label in labels]
    label_id += [0] * padding_length
    
    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    label_ids.append(label_id)

preprocessed_df = pd.DataFrame({
    'input_ids': input_ids,
    'attention_mask': attention_masks,
    'label_ids': label_ids
})

preprocessed_df.to_csv('preprocessed_data.csv', index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Tag'].fillna('O', inplace=True)  # Replace NaN with 'O' or drop the rows


In [107]:
import torch
from torch.utils.data import Dataset
import pandas as pd
import ast

class YourCustomDataset(Dataset):
    def __init__(self, file_path):
        self.data = pd.read_csv(file_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        
        input_id = torch.tensor(ast.literal_eval(self.data['input_ids'].iloc[idx])).long()
        attention_mask = torch.tensor(ast.literal_eval(self.data['attention_mask'].iloc[idx])).long()
        label_id = torch.tensor(ast.literal_eval(self.data['label_ids'].iloc[idx])).long()

        return {
            'input_ids': input_id,
            'attention_mask': attention_mask,
            'labels': label_id
        }


In [110]:

train_dataset = YourCustomDataset(r'D:\nvm\obscure\Code\preprocessed_data.csv')


model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_labels))


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 2/200 [06:46<11:11:32, 203.50s/it]
100%|██████████| 50/50 [09:25<00:00, 11.31s/it]

{'train_runtime': 565.3268, 'train_samples_per_second': 1.167, 'train_steps_per_second': 0.088, 'train_loss': 0.28956954956054687, 'epoch': 10.0}





TrainOutput(global_step=50, training_loss=0.28956954956054687, metrics={'train_runtime': 565.3268, 'train_samples_per_second': 1.167, 'train_steps_per_second': 0.088, 'total_flos': 43131115653120.0, 'train_loss': 0.28956954956054687, 'epoch': 10.0})

In [111]:
# Save the model and tokenizer
model.save_pretrained('./saved_model')  
tokenizer.save_pretrained('./saved_model')  


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json')

### Predict (Test Case)

In [112]:
from transformers import BertTokenizer, BertForTokenClassification

model = BertForTokenClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')


In [114]:
import torch
from transformers import BertTokenizer, BertForTokenClassification

model = BertForTokenClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')

model.eval()

def predict(text, model, tokenizer, label_list):
    encoding = tokenizer(
        text,
        return_tensors="pt",
        padding='max_length',  
        truncation=True,       
        max_length=128,       
        is_split_into_words=True
    )
    
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predicted_label_ids = torch.argmax(logits, dim=2).squeeze().tolist()
    

    predicted_labels = [label_list[label_id] for label_id in predicted_label_ids]
    
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
    return list(zip(tokens, predicted_labels))

# Test Case
sample_sentence = "Ramesh Kumar's Aadhaar number is 1234-5678-9012."


label_list = ['B-NAME', 'I-NAME', 'O', 'B-AADHAAR', 'I-AADHAAR', 'B-DL', 'I-DL', 'B-PASSPORT',
              'I-PASSPORT', 'B-DATE', 'I-DATE', 'B-ADDRESS', 'I-ADDRESS', 'B-MOBILE',
              'I-MOBILE', 'B-EMAIL', 'I-EMAIL', 'B-BANK', 'I-BANK', 'B-CC', 'I-CC',
              'B-MEDICAL', 'I-MEDICAL', 'B-LOAN', 'I-LOAN', 'B-PIN', 'I-PIN', 'B-OTP',
              'I-OTP', 'B-FINANCIAL', 'I-FINANCIAL', 'B-IP', 'I-IP', 'B-LOGIN', 'I-LOGIN',
              'B-COOKIES', 'B-CREDIT', 'I-CREDIT', 'B-INSURANCE', 'I-INSURANCE', 'B-GENETIC',
              'I-GENETIC', 'B-BIOMETRIC', 'I-BIOMETRIC', 'B-CARD', 'I-CARD']


predictions = predict(sample_sentence.split(), model, tokenizer, label_list)

# Output 👍
print("Token  |  Predicted Label")
print("-------------------------")
for token, label in predictions:
    print(f"{token}  |  {label}")


Token  |  Predicted Label
-------------------------
[CLS]  |  I-CARD
ram  |  B-NAME
##esh  |  B-NAME
kumar  |  B-NAME
'  |  B-NAME
s  |  B-NAME
aa  |  B-NAME
##dha  |  B-NAME
##ar  |  B-NAME
number  |  B-NAME
is  |  B-NAME
123  |  B-NAME
##4  |  B-NAME
-  |  B-NAME
56  |  B-NAME
##7  |  B-NAME
##8  |  B-NAME
-  |  B-NAME
90  |  B-NAME
##12  |  B-NAME
.  |  I-CARD
[SEP]  |  I-CARD
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]  |  B-NAME
[PAD]