# 1. Data Preprocessing

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import tqdm
import random
import os
import warnings

warnings.simplefilter("ignore")

In [2]:
import json

# read classes.json
with open('data/classes.json', encoding='utf-8') as f:
    data = {key: value.split('/')[-1] for (key, value) in json.load(f).items()}
data

{'5908cb5da047d6c9e6dfea6337fb3189.doc': 'Договоры поставки',
 '14711e4fc8e56f0c75856c8837ec04cb.doc': 'Договоры поставки',
 '7eb67b5aecf3f3190aab0a5f8ea32172.docx': 'Договоры поставки',
 'b40a9d048b199d5f4db62a6a2335f2a0.pdf': 'Договоры поставки',
 '84fec112d02288861e7af59f468131fb.docx': 'Договоры поставки',
 'f6377999f8a5aa9a09b03e428ac93153.doc': 'Договоры поставки',
 'a525f050cef10dee3a42468daec064ff.doc': 'Договоры поставки',
 'bec0aa38d1383172690a18d16b07f154.doc': 'Договоры поставки',
 '214d620d9c54bc83111277dd872d3cb2.pdf': 'Договоры поставки',
 'd143c89d002fcef3e2bd2efdb4966f55.doc': 'Договоры поставки',
 '2fd747f38e30ae7ce1c9d6e3b907ac5d.doc': 'Договоры поставки',
 '4c2c295e81f4a6c3e669e8f76c6ce423.docx': 'Договоры поставки',
 '64f58bc6e1207a570a38d771609b2cf1.docx': 'Договоры поставки',
 '7ecd641f2ad81961c17455ed3ebeb2ab.doc': 'Договоры поставки',
 '4e583dc5a5f1499fd2408f3152589f2d.doc': 'Договоры поставки',
 '79104075f8b2ff971d51c495e67af52c.pdf': 'Договоры поставки',
 '19

In [3]:
classes = list(set(list(data.values())))
classes.sort()
classes

['Договоры аренды',
 'Договоры купли-продажи',
 'Договоры оказания услуг',
 'Договоры подряда',
 'Договоры поставки']

In [4]:
from document_processing import document2text, preprocess_text

preprocess_custom = lambda x: preprocess_text(x)

df = pd.DataFrame({'label': int(), 'text': str()}, index=[])
for key, value in tqdm.tqdm(data.items()):
    df = df.append({'label': classes.index(value),
                    'text': document2text(os.path.join('data/docs', key))},
                   ignore_index=True)
df['text'] = df['text'].apply(preprocess_text)
df.head()

100%|██████████| 120/120 [00:22<00:00,  5.29it/s]


Unnamed: 0,label,text
0,4,Evaluation Only Created with Aspose Words Copy...
1,4,Evaluation Only Created with Aspose Words Copy...
2,4,Evaluation Only Created with Aspose Words Copy...
3,4,Evaluation Only Created with Aspose Words Copy...
4,4,Evaluation Only Created with Aspose Words Copy...


In [5]:
tokenizer = BertTokenizer.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    do_lower_case=False
)

In [6]:
text = df['text'].values

In [7]:
def print_rand_sentence():
    '''Displays the tokens and respective IDs of a random text sample'''
    index = random.randint(0, len(text) - 1)
    table = np.array([tokenizer.tokenize(text[index]),
                      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text[index]))]).T
    print(tabulate(table,
                   headers=['Tokens', 'Token IDs'],
                   tablefmt='fancy_grid'))


print_rand_sentence()

╒═══════════════════╤═════════════╕
│ Tokens            │   Token IDs │
╞═══════════════════╪═════════════╡
│ Eva               │       19254 │
├───────────────────┼─────────────┤
│ ##lu              │       11947 │
├───────────────────┼─────────────┤
│ ##ation           │       10213 │
├───────────────────┼─────────────┤
│ Only              │       19252 │
├───────────────────┼─────────────┤
│ Cre               │       34602 │
├───────────────────┼─────────────┤
│ ##ated            │       16035 │
├───────────────────┼─────────────┤
│ with              │       10681 │
├───────────────────┼─────────────┤
│ Asp               │      100409 │
├───────────────────┼─────────────┤
│ ##ose             │       15081 │
├───────────────────┼─────────────┤
│ Words             │       44063 │
├───────────────────┼─────────────┤
│ Cop               │       63952 │
├───────────────────┼─────────────┤
│ ##yr              │       21240 │
├───────────────────┼─────────────┤
│ ##ight            │       

In [8]:
token_id = []
attention_masks = []
labels = df['label'].values


def preprocessing(input_text, tokenizer):
    '''
    Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
      - input_ids: list of token ids
      - token_type_ids: list of token type ids
      - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=512,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )


for sample in text:
    encoding_dict = preprocessing(sample, tokenizer)
    token_id.append(encoding_dict['input_ids'])
    attention_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(token_id, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [9]:
val_ratio = 0.2
# Recommended batch size: 16, 32. See: https://arxiv.org/pdf/1810.04805.pdf
batch_size = 10

# Indices of the train and validation splits stratified by labels
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size=val_ratio,
    shuffle=True,
    stratify=labels)

# Train and validation sets
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoader
train_dataloader = DataLoader(
    train_set,
    sampler=RandomSampler(train_set),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_set,
    sampler=SequentialSampler(val_set),
    batch_size=batch_size
)

In [10]:
def b_metrics(preds, labels):
    preds = np.argmax(preds, axis=1).flatten()
    labels = labels.flatten()
    b_accuracy = (preds == labels).mean()
    return b_accuracy

In [11]:
# Load the BertForSequenceClassification model
model = BertForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    num_labels=len(classes),
    output_attentions=False,
    output_hidden_states=False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=5e-5,
                              eps=1e-08
                              )

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Recommended number of epochs: 2, 3, 4. See: https://arxiv.org/pdf/1810.04805.pdf
epochs = 3

for _ in trange(epochs, desc='Epoch'):

    # ========== Training ==========

    # Set model to training mode
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        # Forward pass
        train_output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        # Backward pass
        train_output.loss.backward()
        optimizer.step()
        # Update tracking variables
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========

    # Set model to evaluation mode
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass
            eval_output = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)
            val_loss += eval_output.loss.item()
        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate validation metrics
        b_accuracy = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation loss: {:.4f}'.format(val_loss / len(validation_dataloader)))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy) / len(val_accuracy)))

Epoch:  33%|███▎      | 1/3 [00:02<00:05,  2.72s/it]


	 - Train loss: 1.6537
	 - Validation loss: 1.5373
	 - Validation Accuracy: 0.3167


Epoch:  67%|██████▋   | 2/3 [00:04<00:02,  2.22s/it]


	 - Train loss: 1.3244
	 - Validation loss: 0.7701
	 - Validation Accuracy: 1.0000


Epoch: 100%|██████████| 3/3 [00:06<00:00,  2.15s/it]


	 - Train loss: 0.6676
	 - Validation loss: 0.2184
	 - Validation Accuracy: 1.0000





In [13]:
# Save the model
from config import MODEL_PATH

model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

('test_model/tokenizer_config.json',
 'test_model/special_tokens_map.json',
 'test_model/vocab.txt',
 'test_model/added_tokens.json')