<a href="https://colab.research.google.com/github/tommybe/ner_furniture/blob/main/ner_furniture/model_trainer/bert_model_training_runner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the project repository; the dataset loaded in the next cell is read from the
# resulting ner_furniture/ directory. If that directory already exists, git only prints a
# "destination path already exists" message and nothing is re-downloaded.
!git clone https://github.com/tommybe/ner_furniture.git

fatal: destination path 'ner_furniture' already exists and is not an empty directory.


In [2]:
import json
# Load the labeled token dataset shipped with the cloned repository.
# The context manager closes the file handle as soon as parsing is done
# (previously the handle was left open for the lifetime of the kernel).
with open('/content/ner_furniture/labeled_tokens_dataset.json', encoding='utf-8') as f:
    tokens_data = json.load(f)

In [3]:
# # Mount your google drive in google colab
# from google.colab import drive
# drive.mount('/content/drive')

# # Insert the directory
# import sys
# sys.path.insert(0,'/content/drive/My Drive/ColabNotebooks')

# # Import your module or file
# import ner_furniture

# %cd ner_furniture/model_trainer/
# %ls

In [4]:
from re import split
from typing import Tuple

# --- Pretrained model / tokenizer ---
# Checkpoint name handed to every `from_pretrained` call in this notebook.
BERTMODEL = 'distilbert-base-uncased'  # to check bert-base-uncased, https://huggingface.co/docs/transformers/tasks/token_classification
# Forwarded to the tokenizer as `do_lower_case`.
DO_LOWER_CASE = True

# --- Data splitting ---
# Not referenced by any code visible in this notebook.
TRAIN_WEBSITES_SHARE = 0.7
# (train, validation, test) fractions used by split_tokens_data_on_train_val_test.
TRAIN_VAL_TEST_SHARES = [0.7, 0.2, 0.1]

# --- Label vocabulary ---
# Position in LABELS_LIST is the integer id of the tag.
LABELS_LIST = ['O', 'B-PRODUCT', 'I-PRODUCT']
LABELS_IDS = list(range(len(LABELS_LIST)))

def split_content_into_sentences(websites: dict) -> dict:
    """Split every page's text into sentence-like fragments.

    Args:
        websites: two-level mapping ``{main_website: {inner_website_name: text}}``.

    Returns:
        The same ``websites`` object; each text value is replaced **in place** by
        the list of fragments produced by splitting it on the delimiters below.
    """
    # Raw string: the former non-raw pattern contained '\.', which is an invalid
    # string escape and triggers a warning on modern Python. The set of matched
    # characters is unchanged: newline, a literal '|' (inside a character class
    # '|' is not alternation) and '.'.
    sentence_delimiters = r'[\n|.]'
    for inner_websites in websites.values():
        # Only values of existing keys are reassigned, so iterating while
        # updating is safe (the dict never changes size).
        for inner_website_name, inner_website_content in inner_websites.items():
            inner_websites[inner_website_name] = split(sentence_delimiters, inner_website_content)
    return websites


def split_tokens_data_on_train_val_test(tokens_data: dict, shares=None) -> Tuple[dict, dict, dict]:
    """Cut the token/label sequences into consecutive train, validation and test parts.

    The split is purely positional (no shuffling): the first chunk is train, the
    next one validation, and whatever remains is test.

    Args:
        tokens_data: dict with parallel sequences under ``'tokens'`` and ``'labels'``.
        shares: optional ``(train, val, test)`` fractions. Defaults to the
            module-level ``TRAIN_VAL_TEST_SHARES``. Only the first two entries are
            used to place the boundaries; test always receives the remainder.

    Returns:
        ``(train_set, val_set, test_set)``, each a dict with ``'tokens'`` and
        ``'labels'`` slices of the input.
    """
    if shares is None:
        shares = TRAIN_VAL_TEST_SHARES

    no_of_items = len(tokens_data['labels'])
    train_end = int(no_of_items * shares[0])
    # The validation slice is sized by its own share. The previous code added the
    # train share twice, which placed this boundary beyond the end of the data and
    # therefore always produced an empty test set.
    val_end = min(train_end + int(no_of_items * shares[1]), no_of_items)

    boundaries = [(0, train_end), (train_end, val_end), (val_end, no_of_items)]
    train_set, val_set, test_set = (
        {'tokens': tokens_data['tokens'][start:stop],
         'labels': tokens_data['labels'][start:stop]}
        for start, stop in boundaries
    )
    return train_set, val_set, test_set


def flatten_tokens_data(tokens_data: dict) -> dict:
    """Collapse the nested 'tokens' and 'labels' sequences by one level.

    Each of the two entries is expected to be a sequence of sequences; it is
    replaced in place by a single flat list holding the inner elements in order.
    The (mutated) input dict is returned.
    """
    for field in ('tokens', 'labels'):
        flat = []
        for group in tokens_data[field]:
            flat.extend(group)
        tokens_data[field] = flat
    return tokens_data


In [5]:

# %pip install transformers
from transformers import DataCollatorForTokenClassification, AutoTokenizer, DataCollatorWithPadding

class DataCollator:
    """Factory for the batch collator used by the trainer."""

    def create(self):
        """Return a DataCollatorWithPadding built around the BERTMODEL tokenizer.

        The tokenizer is loaded from the checkpoint named by BERTMODEL with the
        lower-casing behaviour given by DO_LOWER_CASE.
        """
        return DataCollatorWithPadding(
            AutoTokenizer.from_pretrained(BERTMODEL, do_lower_case=DO_LOWER_CASE)
        )


In [6]:
# %pip install torch
import torch

class TransformerDataset(torch.utils.data.Dataset):
    """Map-style torch dataset over an already-encoded token set.

    Pattern taken from
    https://huggingface.co/transformers/v3.2.0/custom_datasets.html
    """

    def __init__(self, raw_set:dict):
        """Keep references to the encodings (``'tokens'``) and labels of ``raw_set``."""
        self.tokens, self.labels = raw_set['tokens'], raw_set['labels']

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every field of the encodings mapping is indexed at ``idx`` and converted
        to a tensor; the matching label entry is added under ``'labels'``.
        """
        example = {}
        for field, values in self.tokens.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [7]:
# %pip install seqeval
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric

# Module-level seqeval metric object; compute_metrics calls its .compute() on tag sequences.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score token-level predictions with the module-level seqeval ``metric``.

    Args:
        p: a ``(predictions, labels)`` pair. The arg-max over axis 2 of
            ``predictions`` is taken as the predicted label id for each position
            (assumes a (batch, sequence, num_labels) layout — TODO confirm).

    Returns:
        Whatever ``metric.compute`` returns for the filtered tag sequences.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions whose gold label is -100 (the ignored index) are dropped
            # from both the prediction and the reference sequence.
            if gold_id == -100:
                continue
            kept_predictions.append(LABELS_LIST[predicted_id])
            kept_labels.append(LABELS_LIST[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    return metric.compute(predictions=true_predictions, references=true_labels)


  metric = load_metric("seqeval")


In [8]:
from transformers import AutoTokenizer

class TokenEncoder:
    """Encodes a token set with the BERTMODEL tokenizer and aligns its labels to the sub-tokens."""

    def create(self, tokens_set: dict):
        """Tokenize ``tokens_set['tokens']`` and rebuild ``tokens_set['labels']`` to match.

        Both entries of ``tokens_set`` are overwritten in place: ``'tokens'`` with
        the tokenizer output (padded, truncated, with offset mapping) and
        ``'labels'`` with the per-sub-token labels. The same dict is returned.
        """
        tokenizer = AutoTokenizer.from_pretrained(BERTMODEL, do_lower_case=DO_LOWER_CASE)
        encodings = tokenizer(
            tokens_set['tokens'],
            padding=True,
            truncation=True,
            return_offsets_mapping=True,
        )
        tokens_set['tokens'] = encodings
        tokens_set['labels'] = self._update_labels(encodings, tokens_set['labels'])
        return tokens_set

    def _update_labels(self, encoded_tokens, old_labels, label_only_first_word=True):
        '''Expand one label per encoded sequence to one label per sub-token.

        based on https://huggingface.co/docs/transformers/tasks/token_classification &
        https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/

        For sequence ``k`` every position with a word id receives ``old_labels[k]``
        (the primary token label is passed to all sub-tokens); positions without a
        word id receive -100.
        '''
        # NOTE(review): label_only_first_word is accepted but has no effect —
        # first and continuation sub-tokens are labelled identically.
        aligned_labels = []
        for row in range(len(encoded_tokens["input_ids"])):
            word_ids = encoded_tokens.word_ids(batch_index=row)
            row_label = old_labels[row]
            aligned_labels.append(
                [-100 if word_id is None else row_label for word_id in word_ids]
            )
        return aligned_labels


In [9]:
# pip install accelerate -U

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

# Training configuration handed to the Trainer built in ModelFineTuner.train.
TRAINING_ARGS = TrainingArguments(
    output_dir="./fine_tune_bert",
    evaluation_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    # NOTE(review): a float below 1 is presumably interpreted as a fraction of the
    # total training steps rather than a step count — confirm against the
    # TrainingArguments documentation for the installed transformers version.
    logging_steps = 0.05,
)


class ModelFineTuner:
    """Holds the tokenizer, collator and token-classification model and runs fine-tuning."""

    def __init__(self):
        """Load tokenizer and model from the BERTMODEL checkpoint and build the collator.

        The classification head is sized to ``len(LABELS_IDS)`` labels.
        """
        tokenizer = AutoTokenizer.from_pretrained(BERTMODEL, do_lower_case=DO_LOWER_CASE)
        collator = DataCollator().create()
        model = AutoModelForTokenClassification.from_pretrained(
            BERTMODEL, num_labels=len(LABELS_IDS)
        )
        self.tokenizer, self.data_collator, self.model = tokenizer, collator, model

    def train(self, train_set: TransformerDataset, eval_set: TransformerDataset):
        """Fine-tune ``self.model`` on ``train_set``, evaluating on ``eval_set``.

        A Trainer is assembled from TRAINING_ARGS, the stored tokenizer and
        collator, and compute_metrics, and its ``train()`` method is run.
        Nothing is returned.
        """
        trainer_kwargs = dict(
            model=self.model,
            args=TRAINING_ARGS,
            train_dataset=train_set,
            eval_dataset=eval_set,
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
        )
        Trainer(**trainer_kwargs).train()


In [None]:
    # End-to-end run: split -> flatten -> encode -> wrap as datasets -> fine-tune.
    # Note that train_set / val_set are rebound at every stage (raw dict, flattened
    # dict, encoded dict, TransformerDataset), so this cell must be run top to bottom.

    # 1. Positional train/validation/test split of the loaded dataset.
    train_set, val_set, test_set = split_tokens_data_on_train_val_test(tokens_data)

    # 2. Collapse the nested token/label lists by one level (mutates each dict in place).
    train_set = flatten_tokens_data(train_set)
    val_set = flatten_tokens_data(val_set)
    test_set = flatten_tokens_data(test_set)

    # 3. Tokenize and align labels to sub-tokens. The test split is not encoded yet.
    train_set = TokenEncoder().create(train_set)
    val_set = TokenEncoder().create(val_set)
    # test_set['tokens'] = TokenEncoder().create(test_set['tokens'])

    # 4. Wrap the encoded splits as torch datasets for the Trainer.
    train_set = TransformerDataset(train_set)
    val_set = TransformerDataset(val_set)
    # test_set = TransformerDataset(test_set)

    # 5. Fine-tune on the train split, evaluating on the validation split.
    ModelFineTuner().train(train_set, val_set)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
