In [None]:
# Mount Google Drive so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Folder on Drive that is later passed to load_json_split() to read the splits.
dataset_path = '/content/drive/MyDrive/sapient'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install pyarrow==15.0.2
! pip install datasets



In [None]:
from datasets import Dataset

In [None]:
!pip show pyarrow

Name: pyarrow
Version: 15.0.2
Summary: Python library for Apache Arrow
Home-page: https://arrow.apache.org/
Author: 
Author-email: 
License: Apache License, Version 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy
Required-by: bigframes, cudf-cu12, datasets, db-dtypes, ibis-framework, pandas-gbq, tensorflow-datasets


In [None]:
import os
import csv
import json
import random
import string
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset
from datasets import Dataset, DatasetDict, Features, ClassLabel, Sequence, Value

In [None]:
def load_json_split(folder_path, split_names=('train', 'val', 'test')):
    """Load the JSON dataset splits stored in ``folder_path``.

    Each split is read from ``<folder_path>/<split_name>.json``, which must hold
    a JSON list of objects with a ``text`` and an ``entities`` field.

    Parameters:
    folder_path (str): Directory containing the split files.
    split_names (iterable of str): Split file stems to load, in order.
        Defaults to ``('train', 'val', 'test')``.

    Returns:
    list: One entry per requested split, in the same order. Every entry is a
    list of ``(text, {'entities': entities})`` tuples.

    Raises:
    FileNotFoundError: If a split file is missing.
    KeyError: If a record lacks ``text`` or ``entities``.
    """
    sets = []
    for split_name in split_names:
        split_file_path = os.path.join(folder_path, f'{split_name}.json')
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(split_file_path, 'r', encoding='utf-8') as f:
            records = json.load(f)
        sets.append(
            [(item['text'], {'entities': item['entities']}) for item in records]
        )
    return sets

In [None]:
# Unpack the splits in the order load_json_split returns them: train, val, test.
train_set,val_set,test_set = load_json_split(dataset_path)

In [None]:
# Tag name -> integer id. The insertion order is also the label-name order
# handed to ClassLabel when the dataset features are declared, so keep the
# entries listed by ascending id.
class2id = {
    'O': 0,
    'B-eve': 1,
    'B-geo': 2,
    'B-gpe': 3,
    'B-nat': 4,
    'B-org': 5,
    'B-per': 6,
    'B-tim': 7,
    'I-art': 8,
    'I-eve': 9,
    'I-geo': 10,
    'I-gpe': 11,
    'I-nat': 12,
    'I-org': 13,
    'I-per': 14,
    'I-tim': 15,
    'B-art': 16,
}
# Reverse lookup: integer id -> tag name.
id2class = dict((tag_id, tag_name) for tag_name, tag_id in class2id.items())

In [None]:
# Dataset schema: each example holds a list of string tokens and a parallel
# list of ClassLabel ids whose names come from class2id in insertion order.
features = Features(
    {
        'tokens': Sequence(feature=Value(dtype='string')),
        'ner_tags': Sequence(feature=ClassLabel(names=list(class2id))),
    }
)

In [None]:
def preprocess_data(data, label2id=None):
    """Convert character-span entity annotations into per-token NER tags.

    The text is tokenized on whitespace. Every token that overlaps an entity's
    ``[start, end)`` character range receives that entity's label. Tokens are
    located by their character offsets, so a word that occurs several times in
    a sentence is tagged at the annotated position rather than at its first
    occurrence.

    Parameters:
    data (iterable): ``(text, annotations)`` pairs where
        ``annotations['entities']`` is an iterable of ``(start, end, label)``
        triples; ``start``/``end`` are character offsets into ``text``.
    label2id (dict, optional): Tag name -> integer id. Defaults to the
        notebook-level ``class2id`` mapping.

    Returns:
    tuple: ``(sentences, ner_tags, ner_class)`` - three parallel lists with one
    entry per input text: the tokens, the tag names and the tag ids.

    Raises:
    KeyError: If an entity label is not present in the label mapping.
    """
    if label2id is None:
        label2id = class2id
    outside_id = label2id.get('O', 0)

    sentences = []
    ner_tags = []
    ner_class = []
    for text, annotations in data:
        tokens = text.split()

        # Character span of every token. Tokens are separated only by
        # whitespace, so searching forward from the end of the previous token
        # always lands on the start of the next one.
        spans = []
        cursor = 0
        for token in tokens:
            begin = text.index(token, cursor)
            cursor = begin + len(token)
            spans.append((begin, cursor))

        tags = ['O'] * len(tokens)  # Initialize with 'O'
        class_tags = [outside_id] * len(tokens)
        for start, end, label in annotations['entities']:
            if end <= start:
                # An empty span covers no characters, hence no tokens.
                continue
            label_id = label2id[label]
            for idx, (tok_begin, tok_end) in enumerate(spans):
                if tok_begin >= end:
                    break  # spans are sorted; nothing further can overlap
                if tok_end > start:
                    tags[idx] = label
                    class_tags[idx] = label_id

        ner_class.append(class_tags)
        sentences.append(tokens)
        ner_tags.append(tags)
    return sentences, ner_tags, ner_class

# Run each split through preprocess_data; every call yields the token lists,
# the tag-name lists and the tag-id lists for that split.
train_sentences, train_ner_tags, train_ner_class = preprocess_data(train_set)
val_sentences, val_ner_tags, val_ner_class = preprocess_data(val_set)
test_sentences, test_ner_tags, test_ner_class = preprocess_data(test_set)


def _split_frame(token_lists, tag_id_lists):
    """Pair each sentence's tokens with its integer tag ids in a two-column frame."""
    return pd.DataFrame({'tokens': token_lists, 'ner_tags': tag_id_lists})


# One DataFrame per split.
train_df = _split_frame(train_sentences, train_ner_class)
val_df = _split_frame(val_sentences, val_ner_class)
test_df = _split_frame(test_sentences, test_ner_class)

# Hugging Face Datasets sharing the explicit `features` schema.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame, features=features)
    for frame in (train_df, val_df, test_df)
)

In [None]:
# Summary of the training split (column names and row count).
train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 38353
})

In [None]:
# First training example: the tokens alongside their integer tag ids.
train_dataset[0]

{'tokens': ['The',
  'officials',
  'reached',
  'agreement',
  'early',
  'Saturday',
  'after',
  'all-night',
  'meetings',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 7, 0, 0, 0, 0]}

In [None]:
# Feature definition backing the ner_tags column (shows the id -> name order).
train_dataset.features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'B-art'], id=None), length=-1, id=None)

In [None]:
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification


In [None]:
# Uncased BERT tokenizer; later cells call word_ids() on its output to map
# sub-tokens back to the words they came from.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize one training example whose text is already split into words.
example_text = train_dataset[0]

tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)

# Turn the ids back into token strings for inspection.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

# For each produced token: the index of the source word (None for the special
# tokens added at both ends).
word_ids = tokenized_input.word_ids()

print(word_ids)


tokenized_input

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 9, None]


{'input_ids': [101, 1996, 4584, 2584, 3820, 2220, 5095, 2044, 2035, 1011, 2305, 6295, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Word-level tag count vs. sub-token count: they differ, so the labels have to
# be re-aligned to the tokenizer output before training.
len(example_text['ner_tags']), len(tokenized_input["input_ids"])

(10, 14)

In [None]:
# Bundle the three splits under the keys used by the rest of the notebook.
ner_dataset = DatasetDict(
    train=train_dataset,
    validation=val_dataset,
    test=test_dataset,
)

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align word-level tags to sub-tokens.

    Parameters:
    examples (dict): Batch with ``"tokens"`` (list of word lists) and
        ``"ner_tags"`` (list of tag-id lists, one id per word).
    label_all_tokens (bool): If True, sub-tokens after the first one of a word
        are labelled too; a ``B-xxx`` tag is replaced by its ``I-xxx``
        counterpart (when one exists in ``class2id``) so that a split word does
        not look like several consecutive entity starts. If False, those
        sub-tokens get -100.

    Returns:
    The tokenizer output for the batch with an added ``"labels"`` entry holding
    one label-id list per sentence. Positions without a source word (special
    tokens) are labelled -100 so the loss function ignores them.
    """
    # Tag id -> id to use on continuation sub-tokens (B-xxx -> I-xxx, else unchanged).
    continuation_id = {
        tag_id: (class2id.get('I-' + name[2:], tag_id) if name.startswith('B-') else tag_id)
        for name, tag_id in class2id.items()
    }

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        # word_ids() maps every produced token to the index of the word it
        # belongs to in the original sentence (None for special tokens).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token: excluded from the loss.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word keeps the word's own tag.
                label_ids.append(label[word_idx])
            elif label_all_tokens:
                # Later sub-token of the same word: continue the entity
                # instead of starting a new one.
                word_tag = label[word_idx]
                label_ids.append(continuation_id.get(word_tag, word_tag))
            else:
                # Later sub-token of the same word: masked out.
                label_ids.append(-100)

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Smoke-test the alignment on a one-example batch (a slice keeps the batched
# dict-of-lists layout the function expects).
q = tokenize_and_align_labels(ner_dataset['train'][4:5])
print(q)

{'input_ids': [[101, 2343, 5747, 1010, 2822, 2343, 15876, 9743, 28555, 1998, 2887, 3539, 2704, 12022, 11319, 3217, 12849, 10993, 12717, 2097, 2022, 2426, 1996, 4177, 7052, 1996, 23957, 2278, 6465, 5095, 1998, 4465, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 6, 14, 0, 3, 0, 14, 14, 14, 15, 3, 6, 14, 14, 14, 14, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 7, 0, 15, 0, -100]]}


In [None]:
# Print each token of that example next to the label id it was assigned.
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
president_______________________________ 6
bush____________________________________ 14
,_______________________________________ 0
chinese_________________________________ 3
president_______________________________ 0
hu______________________________________ 14
jin_____________________________________ 14
##tao___________________________________ 14
and_____________________________________ 15
japanese________________________________ 3
prime___________________________________ 6
minister________________________________ 14
jun_____________________________________ 14
##ichi__________________________________ 14
##ro____________________________________ 14
ko______________________________________ 14
##iz____________________________________ 14
##umi___________________________________ 14
will____________________________________ 0
be______________________________________ 0
among___________________________________ 0
the____________________________________

In [None]:
# Tokenize and align labels for every split, processing examples in batches.
tokenized_datasets = ner_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/38353 [00:00<?, ? examples/s]

Map:   0%|          | 0/4795 [00:00<?, ? examples/s]

Map:   0%|          | 0/4795 [00:00<?, ? examples/s]

In [None]:
# Size the classification head from the tag set instead of a hard-coded 17,
# and store the tag names in the model config so the saved checkpoint reports
# real labels (e.g. "B-per") rather than generic LABEL_n names.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(class2id),
    id2label=id2class,
    label2id=class2id,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install tokenizers seqeval -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
from transformers import TrainingArguments, Trainer
# Checkpoints and logs are written under the Drive folder; evaluation, saving
# and logging all happen once per epoch.
args = TrainingArguments(
    "/content/drive/MyDrive/sapient/ner_bert",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='/content/drive/MyDrive/sapient/ner_bert/logs',
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Choose the "best" checkpoint by the "f1" value returned from
    # compute_metrics rather than the default evaluation loss: in this run the
    # loss is lowest after epoch 1 while F1 keeps improving until epoch 3.
    metric_for_best_model="f1",
    greater_is_better=True,
)



In [None]:
# Batch collator built from the tokenizer; handed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
import datasets
# seqeval ships as a remote loading script; opting in explicitly avoids the
# interactive [y/N] prompt that would otherwise stall a Run-All execution.
metric = datasets.load_metric("seqeval", trust_remote_code=True)

The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
# Tag names in ClassLabel id order; used to turn tag ids back into strings.
label_list = ner_dataset["train"].features["ner_tags"].feature.names
# One training example for sanity-checking the metric.
example = ner_dataset['train'][0]

In [None]:
# Score the gold tags against themselves; every reported value should be 1.0.
labels = [label_list[i] for i in example["ner_tags"]]

metric.compute(predictions=[labels], references=[labels])

{'tim': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
def compute_metrics(eval_preds):
    """
    Score token-classification predictions with the notebook-level ``metric``.

    Parameters:
    eval_preds (tuple): ``(logits, labels)``. The logits are reduced to
        predicted ids with an argmax over axis 2; ``labels`` holds the
        reference ids, with -100 marking positions to leave out.

    Returns:
    A dictionary with the keys "precision", "recall", "f1" and "accuracy",
    filled from the metric's ``overall_*`` results.
    """
    logits, gold_ids = eval_preds

    # Argmax on raw logits picks the same class as argmax on probabilities,
    # so no softmax is needed.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose reference label is not the -100 marker.
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
# Wire the model, training arguments, tokenized train/validation splits,
# collator and metric callback into one Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune, then save the model and the tokenizer.
# NOTE(review): both save paths are relative, so they resolve against the
# current working directory rather than the Drive folder used for checkpoints
# - confirm that is intended on an ephemeral Colab runtime.
trainer.train()
model.save_pretrained("ner_model_bert")
tokenizer.save_pretrained("tokenizer")

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0864,0.151269,0.80901,0.796327,0.802618,0.957483
2,0.0642,0.156654,0.811169,0.811428,0.811298,0.95824
3,0.057,0.151988,0.816589,0.820195,0.818388,0.959882
4,0.0478,0.17988,0.806718,0.819107,0.812865,0.958129
5,0.0365,0.199343,0.809086,0.817059,0.813053,0.957857
6,0.0271,0.219197,0.812277,0.817059,0.814661,0.958129
7,0.0206,0.238079,0.806276,0.817059,0.811632,0.958002
