<a href="https://colab.research.google.com/github/sundarg4/document-classifier-BERT/blob/main/BERT_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Classifier for multiclass classification

In [None]:
# Notebook configuration: switch between datasets and models here.
# Raw string on purpose: the backslash before the space is meant for the shell
# (escaped space when the path is passed unquoted to a command). In a normal
# string literal '\ ' is an invalid escape sequence that newer Python versions
# warn about; the raw string keeps the value byte-identical without the warning.
dataset = r'/content/drive/MyDrive/Colab\ Notebooks/data.json'
# NOTE(review): later cells load the model through the BERT-specific classes, so
# switching to the DistilBERT checkpoint needs more than changing this name.
modelname = 'bert-base-uncased' # 'distilbert-base-uncased'

In [None]:
# Mount Google Drive at /content/drive; the dataset path configured above lives
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:

!mkdir /content/data
!cp dataset /content/data

In [None]:
# Install the runtime dependencies.
# NOTE(review): nothing is version-pinned and transformers is installed from the
# GitHub repository rather than a release, so reruns may pick up different code —
# consider pinning versions.
!pip install scikit-learn
!pip install -q torch
!pip install -q git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [None]:
import pandas as pd

In [None]:
# Read the staged JSON file as UTF-8 (the text was extracted from PDFs) and
# preview the first three rows.
df = pd.read_json("/content/data/data.json", encoding='utf-8')
df.head(3)

Unnamed: 0,file_name,pagenum,content,el_number,category
0,00206BA4E8F5200610123622.pdf,0,K3G800-PW07-01 EC centrifugal module...,0,36
1,00206BA4E8F5200610123622.pdf,0,"backward-curved, single-intake\n ...",1,36
2,00206BA4E8F5200610123622.pdf,0,ebm-papst Mulfingen GmbH & Co. KG\n ...,2,36


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(478064, 5)

In [None]:
# Per-column check for missing values.
df.isna().any()
# No NA values are present: every column reports False in the recorded output.

file_name    False
pagenum      False
content      False
el_number    False
category     False
dtype: bool

In [None]:
# Look at one raw sample.
df.iloc[2]["content"]
# The content contains runs of whitespace, newlines and special characters,
# which motivates the preprocessing section that follows.

'ebm-papst Mulfingen GmbH & Co. KG\n                                Bachmihle 2 - D-74673 Mulfingen\n                                Phone +49 7938 81-0\n                                Fax +49 7938 81-110\n                                info1(Øde.ebmpapst.com\n                                www.ebmpapst.com\n                                Limited partnership * Headquarters Mulfingen\n                                Amtsgericht (court of registration) Stuttgart : HRA 590344\n                                General partner Elektrobau Mulfingen GmbH * Headquarters Mulfingen\n                                Amtsgericht (court of registration) Stuttgart : HRB 590142'

# Preprocessing

In [None]:
import re
# Convert all text to lower case (overwrites the "content" column).

df["content"] = df["content"].str.lower()

# Character whitelist: the pattern matches everything that is NOT a letter,
# digit, space or dot, i.e. everything that should be removed.
regex = re.compile('[^A-Za-z0-9 .]')

def remove_whitespaces_and_spl_chars(content):
    """Strip special characters from ``content`` and normalise its whitespace.

    Whitespace runs (including newlines and tabs) are first turned into single
    spaces, then every character outside ``[A-Za-z0-9 .]`` is deleted, and
    finally whitespace is collapsed once more. The last step matters: deleting
    a stand-alone symbol such as the dash in ``"module - radipac"`` would
    otherwise leave a double space (or a leading/trailing space) behind.

    Args:
        content: Text to clean.

    Returns:
        The cleaned text, with words separated by exactly one space and no
        leading or trailing whitespace.
    """
    # Newlines/tabs are not whitelisted, so convert them to spaces BEFORE
    # filtering; otherwise words separated only by "\n" would be glued together.
    single_spaced = " ".join(content.split())
    without_symbols = regex.sub("", single_spaced)
    return " ".join(without_symbols.split())

# Clean every row with the helper defined above (overwrites the "content" column).
df['content'] = df['content'].apply(remove_whitespaces_and_spl_chars)

# Spot-check the first cleaned row.
df.iloc[0]["content"]

'k3g800pw0701 ec centrifugal module  radipac'

In [None]:
# Count rows per category; the index of the result holds the unique category values.
# NOTE(review): the recorded output shows the categories are strings and includes
# a value '10 2' — possibly a data-entry issue worth checking.
category = df["category"].value_counts()
category.index

Index(['11', '36', '43', '62', '56', '40', '24', '54', '19', '57', '23', '41',
       '25', '32', '35', '61', '67', '44', '29', '31', '15', '27', '546', '33',
       '52', '20', '53', '542', '66', '26', '13', '14', '30', '47', '37', '12',
       '28', '556', '46', '45', '58', '65', '21', '51', '22', '10', '55', '77',
       '210', '233', '620', '16', '18', '74', '34', '10 2', '552', '547',
       '548', '42', '49', '76', '71'],
      dtype='object')

In [None]:
# Dense integer labels 0..n-1, assigned in the order of `category.index`.
id_to_category = dict(enumerate(category.index))

# Inverse lookup: raw category value -> integer label.
category_to_id = {cat: idx for idx, cat in id_to_category.items()}

id_to_category

{0: '11',
 1: '36',
 2: '43',
 3: '62',
 4: '56',
 5: '40',
 6: '24',
 7: '54',
 8: '19',
 9: '57',
 10: '23',
 11: '41',
 12: '25',
 13: '32',
 14: '35',
 15: '61',
 16: '67',
 17: '44',
 18: '29',
 19: '31',
 20: '15',
 21: '27',
 22: '546',
 23: '33',
 24: '52',
 25: '20',
 26: '53',
 27: '542',
 28: '66',
 29: '26',
 30: '13',
 31: '14',
 32: '30',
 33: '47',
 34: '37',
 35: '12',
 36: '28',
 37: '556',
 38: '46',
 39: '45',
 40: '58',
 41: '65',
 42: '21',
 43: '51',
 44: '22',
 45: '10',
 46: '55',
 47: '77',
 48: '210',
 49: '233',
 50: '620',
 51: '16',
 52: '18',
 53: '74',
 54: '34',
 55: '10 2',
 56: '552',
 57: '547',
 58: '548',
 59: '42',
 60: '49',
 61: '76',
 62: '71'}

In [None]:
# Integer training target: look up every row's category in category_to_id
# (an unknown category raises KeyError).
df["label"] = [category_to_id[cat] for cat in df["category"]]
df["label"]

0         1
1         1
2         1
3         1
4         1
         ..
478059    0
478060    0
478061    0
478062    0
478063    0
Name: label, Length: 478064, dtype: int64

In [None]:

# Select the two columns used for modelling.

# Cleaned text of each row (model input).
contents = df["content"]

# Integer class id of each row (model target).
labels = df["label"]

# Spot-check the first input text.
contents[0]

'k3g800pw0701 ec centrifugal module  radipac'

In [None]:
import transformers
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Pretrained tokenizer plus a sequence-classification model with one output
# per category; the id <-> category mappings are stored in the model config.
tokenizer = BertTokenizerFast.from_pretrained(modelname, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    modelname,
    num_labels=len(id_to_category),
    id2label=id_to_category,
    label2id=category_to_id,
)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Tokenize the content for BERT. Every sequence is padded or truncated to 128
# tokens: long paragraphs lose their tail, but training is much faster.
MAX_LENGTH = 128
# Rows handed to the tokenizer per call; bounds peak memory during encoding.
TOKENIZE_CHUNK = 10_000

texts = contents.tolist()
token_ids = []

# Batched tokenizer calls instead of one tokenizer.encode() call per row: the
# same encoding options are applied, with far fewer Python-level calls.
for start in range(0, len(texts), TOKENIZE_CHUNK):
    encoded = tokenizer(
        texts[start:start + TOKENIZE_CHUNK],
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_attention_mask=False,   # masks are derived from the ids in a later cell
        return_token_type_ids=False,   # not used by this notebook
    )
    token_ids.extend(encoded["input_ids"])

Original:  k3g800pw0701 ec centrifugal module  radipac
Token IDs: [101, 1047, 2509, 2290, 17914, 2692, 28400, 2692, 19841, 2487, 14925, 9358, 3089, 11263, 9692, 11336, 10958, 4305, 19498, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Build the attention masks: 1 for real tokens, 0 for padding.
# Compare against the tokenizer's own padding id instead of assuming it is 0.
pad_id = tokenizer.pad_token_id

attention_masks = [
    [int(token_id != pad_id) for token_id in sequence]
    for sequence in token_ids
]
attention_masks[0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:
# Split into roughly 70 % train, 15 % validation and 15 % test.
from sklearn.model_selection import train_test_split

SPLIT_SEED = 1

# Token ids, attention masks and labels are split in ONE call per stage so that
# row i of every output still describes the same document. Splitting the masks
# in a separate, unseeded call shuffles them differently from the inputs, which
# pairs each sequence with another sequence's mask.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    token_ids, attention_masks, labels,
    test_size=0.15, random_state=SPLIT_SEED)

(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(
    train_inputs, train_masks, train_labels,
    test_size=0.176, random_state=SPLIT_SEED)  # 0.176 x 0.85 ~ 0.15

# Sanity checks: the three partitions must stay aligned.
assert len(train_inputs) == len(train_masks) == len(train_labels)
assert len(validation_inputs) == len(validation_masks) == len(validation_labels)
assert len(test_inputs) == len(test_masks) == len(test_labels)

print(len(train_inputs),len(validation_inputs),len(test_inputs))
print(len(train_masks),len(validation_masks),len(test_masks))


334835 71710 71519
334835 71710 71519


In [None]:
# Convert the split data to torch tensors (the names are rebound in place).
import torch

# Token ids: lists of equal-length id lists.
train_inputs, validation_inputs, test_inputs = (
    torch.tensor(ids) for ids in (train_inputs, validation_inputs, test_inputs)
)

# Labels: converted through .tolist() first.
train_labels, validation_labels, test_labels = (
    torch.tensor(targets.tolist())
    for targets in (train_labels, validation_labels, test_labels)
)

# Attention masks: same layout as the token ids.
train_masks, validation_masks, test_masks = (
    torch.tensor(masks) for masks in (train_masks, validation_masks, test_masks)
)

In [None]:
# Set up the PyTorch data loaders.
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 64 # 32 would be the usual choice; 64 is used for faster training

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Evaluation data is read in a fixed order so repeated runs see identical batches.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# The test set is evaluation data too: sequential, consistent with validation,
# so per-batch test metrics are repeatable.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [None]:
# AdamW optimizer with a learning rate of 2e-5, trained for 4 epochs.

from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import get_linear_schedule_with_warmup

epochs = 4

# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
total_steps = len(train_dataloader) * epochs

# Built exactly once; a second identical construction would only discard the first.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, eps = 1e-8)


In [None]:

import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of true class ids; it is flattened before comparison.

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    matches = predicted == expected
    return matches.sum() / expected.shape[0]


# Learning-rate scheduler attached to the optimizer. num_warmup_steps = 0 means
# there is no warm-up phase; the schedule is configured for total_steps steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
import datetime
def format_time(elapsed):
    """Format a duration given in seconds as a ``datetime.timedelta`` string.

    Args:
        elapsed: Duration in seconds; rounded to the nearest whole second.

    Returns:
        The ``str()`` of the corresponding timedelta, e.g. ``'0:01:15'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Fine-tune the model (loop adapted from run_glue.py) and measure validation
# accuracy after every epoch.
import random
import time

torch.cuda.empty_cache()
seed_val = 42

# Seed the Python, NumPy and torch RNGs for reproducibility.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Report progress every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (token ids, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Labels are passed in, so the first output is used as the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs[0]

        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Count correct predictions and examples over the WHOLE validation set.
    # Averaging per-batch accuracies would give a smaller final batch the same
    # weight as a full one and skew the reported figure.
    n_correct, n_examples = 0, 0

    for batch in validation_dataloader:

        # Move the batch to the device and unpack it.
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No labels are passed and no gradients are needed for evaluation;
        # the first output is used as the logits.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        batch_predictions = np.argmax(logits, axis=1).flatten()
        n_correct += int(np.sum(batch_predictions == label_ids.flatten()))
        n_examples += len(label_ids)

    # Report the final accuracy for this validation run
    # (max(..., 1) avoids a division by zero on an empty validation loader).
    print("  Accuracy: {0:.2f}".format(n_correct / max(n_examples, 1)))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of  5,232.    Elapsed: 0:00:15.
  Batch    80  of  5,232.    Elapsed: 0:00:27.
  Batch   120  of  5,232.    Elapsed: 0:00:39.
  Batch   160  of  5,232.    Elapsed: 0:00:51.
  Batch   200  of  5,232.    Elapsed: 0:01:03.
  Batch   240  of  5,232.    Elapsed: 0:01:15.
  Batch   280  of  5,232.    Elapsed: 0:01:27.
  Batch   320  of  5,232.    Elapsed: 0:01:39.
  Batch   360  of  5,232.    Elapsed: 0:01:51.
  Batch   400  of  5,232.    Elapsed: 0:02:03.
  Batch   440  of  5,232.    Elapsed: 0:02:15.
  Batch   480  of  5,232.    Elapsed: 0:02:27.
  Batch   520  of  5,232.    Elapsed: 0:02:39.
  Batch   560  of  5,232.    Elapsed: 0:02:51.
  Batch   600  of  5,232.    Elapsed: 0:03:03.
  Batch   640  of  5,232.    Elapsed: 0:03:14.
  Batch   680  of  5,232.    Elapsed: 0:03:26.
  Batch   720  of  5,232.    Elapsed: 0:03:38.
  Batch   760  of  5,232.    Elapsed: 0:03:50.
  Batch   800  of  5,232.    Elapsed: 0:04:02.
  Batch   840  of  5,232.    Elapsed: 0:04:14.


KeyboardInterrupt: ignored

In [None]:
# Run the model over the test set and collect logits and true labels per batch.

print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))
model.eval()

# Tracking variables: one array of logits / label ids per batch.
predictions , true_labels = [], []

# enumerate(..., start=1) replaces a hand-maintained counter named `iter`, which
# shadowed the builtin and reported "100" after only 99 batches.
for batch_no, batch in enumerate(test_dataloader, start=1):
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  # No labels are passed and no gradients are needed for inference.
  with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  predictions.append(logits)
  true_labels.append(label_ids)

  # The counter tracks batches, not sentences, so the message says so.
  if batch_no % 100 == 0:
    print('  Processed {:,} of {:,} test batches...'.format(batch_no, len(test_dataloader)))

print('    DONE.')

Predicting labels for 71,519 test sentences...
Predicting labels for 100 of 71,519 test sentences...
Predicting labels for 200 of 71,519 test sentences...
Predicting labels for 300 of 71,519 test sentences...
Predicting labels for 400 of 71,519 test sentences...
Predicting labels for 500 of 71,519 test sentences...
Predicting labels for 600 of 71,519 test sentences...
Predicting labels for 700 of 71,519 test sentences...
Predicting labels for 800 of 71,519 test sentences...
Predicting labels for 900 of 71,519 test sentences...
Predicting labels for 1000 of 71,519 test sentences...
Predicting labels for 1100 of 71,519 test sentences...
    DONE.


In [None]:
# The task is multi-class, so predictions are scored with the Matthews
# correlation coefficient (MCC), which ranges from -1 to +1.

from sklearn.metrics import matthews_corrcoef

print('Calculating Matthews Corr. Coef. for each batch...')

# One MCC value per test batch, in batch order.
matthews_set = [
    matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
    for batch_labels, batch_logits in zip(true_labels, predictions)
]

Calculating Matthews Corr. Coef. for each batch...


In [None]:
# Calculate the MCC over the whole test set. The recorded run gives 0.668,
# reported alongside a validation accuracy of about 0.7.
# np.concatenate stitches the per-batch arrays together directly instead of
# flattening them element by element in Python.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
flat_true_labels = np.concatenate(true_labels, axis=0)
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)

MCC: 0.668


In [None]:
# Save the fine-tuned model and its tokenizer.

import os

# NOTE(review): this is a relative path, so it resolves against the current
# working directory; the recorded zip output suggests the files ended up under
# /content/content/model_save. Confirm that is the intended location.
output_dir = 'content/model_save/'

# exist_ok=True makes the cell re-runnable without a separate exists() check
# (and without the gap between checking and creating).
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)



mkdir: cannot create directory ‘content/model_save’: File exists
Saving model to content/model_save/


('content/model_save/tokenizer_config.json',
 'content/model_save/special_tokens_map.json',
 'content/model_save/vocab.txt',
 'content/model_save/added_tokens.json',
 'content/model_save/tokenizer.json')

In [None]:
#!zip -r fine_tune_model.zip /content/content/model_save


  adding: content/content/model_save/ (stored 0%)
  adding: content/content/model_save/model.safetensors (deflated 7%)
  adding: content/content/model_save/config.json (deflated 64%)
  adding: content/content/model_save/tokenizer_config.json (deflated 76%)
  adding: content/content/model_save/tokenizer.json (deflated 71%)
  adding: content/content/model_save/special_tokens_map.json (deflated 42%)
  adding: content/content/model_save/vocab.txt (deflated 53%)


In [None]:
#!cp /content/fine_tune_model.zip drive/MyDrive/Colab\ Notebooks