In [186]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification  # This libary apply augumentation technique at runtime
from transformers import AutoModelForTokenClassification     # This class is responsible for load model into my memory
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, Features, Value

In [None]:
import pandas as pd
import re
import random

# Names of mines and metals for validation
mines = ["Aguas Calientes", "Amiches", "Arroyo Verde", "Aylen", "Boleadora", "Cachi", "Calcatreu",
         "Canadon Langostura", "Cerro Negro", "Newmont Corporation", "Latin Metals Inc",
         "International Iconic Gold Exploration Corp", "Entropy Resources",
         "Aldebaran Resources Inc", "NewPeak Metals Limited", "Patagonia Gold Corp", "E2 Metals Limited"]
metals = ["gold", "silver", "platinum", "copper", "zinc", "molybdenum", "antimony", "arsenic"]

# POS, chunk, and NER tags with indices
pos_tags = {'"': 0, "''": 1, '#': 2, '$': 3, '(': 4, ')': 5, ',': 6, '.': 7, ':': 8, '``': 9,
            'CC': 10, 'CD': 11, 'DT': 12, 'EX': 13, 'FW': 14, 'IN': 15, 'JJ': 16, 'JJR': 17, 'JJS': 18,
            'LS': 19, 'MD': 20, 'NN': 21, 'NNP': 22, 'NNPS': 23, 'NNS': 24, 'NN|SYM': 25, 'PDT': 26,
            'POS': 27, 'PRP': 28, 'PRP$': 29, 'RB': 30, 'RBR': 31, 'RBS': 32, 'RP': 33, 'SYM': 34,
            'TO': 35, 'UH': 36, 'VB': 37, 'VBD': 38, 'VBG': 39, 'VBN': 40, 'VBP': 41, 'VBZ': 42,
            'WDT': 43, 'WP': 44, 'WP$': 45, 'WRB': 46}

chunk_tags = {'O': 0, 'B-ADJP': 1, 'I-ADJP': 2, 'B-ADVP': 3, 'I-ADVP': 4, 'B-CONJP': 5, 'I-CONJP': 6,
              'B-INTJ': 7, 'I-INTJ': 8, 'B-LST': 9, 'I-LST': 10, 'B-NP': 11, 'I-NP': 12, 'B-PP': 13,
              'I-PP': 14, 'B-PRT': 15, 'I-PRT': 16, 'B-SBAR': 17, 'I-SBAR': 18, 'B-UCP': 19, 'I-UCP': 20,
              'B-VP': 21, 'I-VP': 22}

ner_tags = {'O': 0, 'B-MINES': 1, 'I-MINES': 2, 'B-METALS': 3}

# Path to the file containing news
file_path = './files/synthetic_news_set.txt'

# Reading the first 20 news items
with open(file_path, 'r', encoding='utf-8') as f:
    news_list = f.read().strip().split('\n')[:180]   ## 

# Function to annotate tokens in CoNLL format
def tag_tokens(news_list, mines, metals):
    data = []
    for news in news_list:
        tokens_data = []
        tokens = re.findall(r'\b\w+\b', news)
        i = 0
        while i < len(tokens):
            token = tokens[i]
            
            # Checking for multi-word mine names
            found_mine = None
            for mine in mines:
                mine_tokens = mine.split()
                if tokens[i:i + len(mine_tokens)] == mine_tokens:
                    found_mine = mine_tokens
                    break
            
            if found_mine:
                # Assigning B-MINES to the first token and I-MINES to the rest
                tokens_data.append([found_mine[0], pos_tags.get("NNP", 21), chunk_tags.get("B-NP", 11), ner_tags["B-MINES"]])
                for j in range(1, len(found_mine)):
                    tokens_data.append([found_mine[j], pos_tags.get("NNP", 21), chunk_tags.get("I-NP", 12), ner_tags["I-MINES"]])
                
                i += len(found_mine)
                continue
            
            # Checking for metals
            if token in metals:
                tokens_data.append([token, pos_tags.get("NN", 21), chunk_tags.get("B-NP", 11), ner_tags["B-METALS"]])
            else:
                # If the token is neither a mine nor a metal
                tokens_data.append([token, pos_tags.get("NN", 21), chunk_tags["O"], ner_tags["O"]])
            
            i += 1
        
        data.append(tokens_data)
    
    return data

# Applying the token annotation function
tagged_data = tag_tokens(news_list, mines, metals)

# Function to save in CoNLL-2003 format
def save_to_conll(data, file_name):
    with open(file_name, "w", encoding="utf-8") as f:
        for sentence in data:
            for token_data in sentence:
                # Convert each element in token_data to a string and join with spaces
                f.write(" ".join(map(str, token_data)) + "\n")
            f.write("\n")  # Separate sentences with an empty line

# Saving to file
save_to_conll(tagged_data, "annotated_news_data.conll")


In [None]:

import pandas as pd
from datasets import Dataset, DatasetDict
import random

# Function to read data from a CoNLL file and transform it into the required format
def load_conll_data(file_path):
    dataset = []
    sentence = {
        "id": 0,
        "tokens": [],
        "pos_tags": [],
        "chunk_tags": [],
        "ner_tags": []
    }
    
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line:
                token, pos_tag, chunk_tag, ner_tag = line.split()
                sentence["tokens"].append(token)
                sentence["pos_tags"].append(pos_tag)
                sentence["chunk_tags"].append(chunk_tag)
                sentence["ner_tags"].append(int(ner_tag))
            else:
                # Add the sentence to the dataset
                dataset.append(sentence)
                # Update the id and reset for the next sentence
                sentence = {
                    "id": len(dataset),
                    "tokens": [],
                    "pos_tags": [],
                    "chunk_tags": [],
                    "ner_tags": []
                }
    
    # Add the last sentence if the file does not end with an empty line
    if sentence["tokens"]:
        dataset.append(sentence)
    
    return dataset

# Load data
file_path = "annotated_news_data.conll"
data = load_conll_data(file_path)

# Shuffle the data and split into train, validation, and test
random.shuffle(data)
train_size = int(0.7 * len(data))
valid_size = int(0.15 * len(data))

train_data = data[:train_size]
validation_data = data[train_size:train_size + valid_size]
test_data = data[train_size + valid_size:]

# Convert the data into DatasetDict format
ner_data = DatasetDict({
    "train": Dataset.from_pandas(pd.DataFrame(train_data)),
    "validation": Dataset.from_pandas(pd.DataFrame(validation_data)),
    "test": Dataset.from_pandas(pd.DataFrame(test_data))
})

# View the structure of DatasetDict
print(ner_data)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 84
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 18
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 18
    })
})


In [104]:
# structure of train data
ner_data['train'][3]

{'id': 114,
 'tokens': ['Falcon',
  'Gold',
  'Corp',
  'announces',
  'the',
  'development',
  'of',
  'the',
  'ERSA',
  'gold',
  'mining',
  'project',
  'in',
  'La',
  'Rioja',
  'Argentina',
  'Extensive',
  'drilling',
  'campaigns',
  'have',
  'revealed',
  'substantial',
  'gold',
  'copper',
  'silver',
  'lead',
  'zinc',
  'and',
  'vanadium',
  'mineralization',
  'indicating',
  'the',
  'project',
  's',
  'potential',
  'for',
  'long',
  'term',
  'resource',
  'extraction',
  'Falcon',
  'Gold',
  'Corp',
  'is',
  'committed',
  'to',
  'responsible',
  'mining',
  'practices',
  'and',
  'sustainable',
  'resource',
  'development',
  'at',
  'the',
  'ERSA',
  'project'],
 'pos_tags': ['21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '

In [105]:
#structure of test data
ner_data['validation']['id']

[0, 115, 105, 57, 113, 53, 107, 89, 83, 91, 25, 71, 102, 77, 64, 29, 27, 106]

In [106]:
ner_data['test'][0]

{'id': 117, 'tokens': [], 'pos_tags': [], 'chunk_tags': [], 'ner_tags': []}

In [107]:
ner_data['validation'].features['tokens']

Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

In [108]:
ner_data['train'].features["ner_tags"]

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [109]:
ner_data['train'].features["ner_tags"]

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [110]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [111]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [112]:
example_text = ner_data['train'][0]
tokenized_input = tokenizer(example_text['tokens'],is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()

In [113]:
print(tokenized_input)
print("\n")
print(tokens)
print("\n")
print(word_ids)

{'input_ids': [101, 102], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}


['[CLS]', '[SEP]']


[None, None]


In [114]:
print(f'Length of the tokens is : {len(tokens)}')
print(f'Length of the ner tags is: {len(ner_data["train"][0]["ner_tags"])}')

Length of the tokens is : 2
Length of the ner tags is: 0


In [115]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [116]:
ner_data['train'][1:2]

{'id': [84],
 'tokens': [['Condoryacu',
   'S',
   'R',
   'L',
   'reports',
   'progress',
   'on',
   'the',
   'Condoryacu',
   'gold',
   'project',
   'in',
   'Catamarca',
   'Argentina',
   'Recent',
   'drilling',
   'results',
   'have',
   'confirmed',
   'the',
   'presence',
   'of',
   'gold',
   'silver',
   'and',
   'copper',
   'mineralization',
   'supporting',
   'the',
   'project',
   's',
   'economic',
   'feasibility',
   'Condoryacu',
   'S',
   'R',
   'L',
   'aims',
   'to',
   'advance',
   'the',
   'project',
   'towards',
   'development',
   'in',
   'the',
   'near',
   'future']],
 'pos_tags': [['21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '

In [117]:
q = tokenize_and_align_labels(ner_data['train'][1:2])
print(q)

{'input_ids': [[101, 29260, 3148, 10841, 1055, 1054, 1048, 4311, 5082, 2006, 1996, 29260, 3148, 10841, 2751, 2622, 1999, 4937, 8067, 18992, 5619, 3522, 15827, 3463, 2031, 4484, 1996, 3739, 1997, 2751, 3165, 1998, 6967, 9754, 3989, 4637, 1996, 2622, 1055, 3171, 24010, 29260, 3148, 10841, 1055, 1054, 1048, 8704, 2000, 5083, 1996, 2622, 2875, 2458, 1999, 1996, 2379, 2925, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [118]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
condor__________________________________ 0
##ya____________________________________ 0
##cu____________________________________ 0
s_______________________________________ 0
r_______________________________________ 0
l_______________________________________ 0
reports_________________________________ 0
progress________________________________ 0
on______________________________________ 0
the_____________________________________ 0
condor__________________________________ 0
##ya____________________________________ 0
##cu____________________________________ 0
gold____________________________________ 3
project_________________________________ 0
in______________________________________ 0
cat_____________________________________ 0
##ama___________________________________ 0
##rca___________________________________ 0
argentina_______________________________ 0
recent__________________________________ 0
drilling________________________________ 0
results_

In [119]:
## Applying on entire data
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map: 100%|██████████| 84/84 [00:00<00:00, 4792.45 examples/s]
Map: 100%|██████████| 18/18 [00:00<00:00, 3474.02 examples/s]
Map: 100%|██████████| 18/18 [00:00<00:00, 3357.53 examples/s]


In [120]:
tokenized_datasets['train'][1]

{'id': 84,
 'tokens': ['Condoryacu',
  'S',
  'R',
  'L',
  'reports',
  'progress',
  'on',
  'the',
  'Condoryacu',
  'gold',
  'project',
  'in',
  'Catamarca',
  'Argentina',
  'Recent',
  'drilling',
  'results',
  'have',
  'confirmed',
  'the',
  'presence',
  'of',
  'gold',
  'silver',
  'and',
  'copper',
  'mineralization',
  'supporting',
  'the',
  'project',
  's',
  'economic',
  'feasibility',
  'Condoryacu',
  'S',
  'R',
  'L',
  'aims',
  'to',
  'advance',
  'the',
  'project',
  'towards',
  'development',
  'in',
  'the',
  'near',
  'future'],
 'pos_tags': ['21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21'],
 'chunk_tags': ['0',
  '0',
  

In [121]:

# Defining model
ner_model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [122]:
!pip install accelerate>=1
import accelerate
import transformers

transformers.__version__, accelerate.__version__


('4.46.1', '1.1.0')

In [123]:

!pip install tf-keras
#Define training args
from transformers import TrainingArguments, Trainer
args = TrainingArguments(
"test-ner",
evaluation_strategy = "epoch",
learning_rate=2e-5,
per_device_train_batch_size=32,
per_device_eval_batch_size=32,
num_train_epochs=20,
weight_decay=0.01,
)





In [124]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [125]:
#!pip install -U datasets evaluate
from evaluate import load
metric = load("seqeval")

In [126]:
print(ner_data["train"].features["ner_tags"].feature)


Value(dtype='int64', id=None)


In [None]:
from datasets import DatasetDict, ClassLabel, Sequence, Features, Value

# Defining labels for ner_tags
ner_label_names = ['O', 'B-MINES', 'I-MINES', 'B-METALS']
features = Features({
    "id": Value("int64"),
    "tokens": Sequence(Value("string")),
    "pos_tags": Sequence(Value("int64")),
    "chunk_tags": Sequence(Value("int64")),
    "ner_tags": Sequence(ClassLabel(names=ner_label_names))
})

# Converting ner_tags to ClassLabel after dataset creation
ner_data["train"] = ner_data["train"].cast(features)
ner_data["validation"] = ner_data["validation"].cast(features)
ner_data["test"] = ner_data["test"].cast(features)

# Now checking the labels
print(ner_data["train"].features["ner_tags"].feature.names)
label_list = ner_data["train"].features["ner_tags"].feature.names
label_list


Casting the dataset: 100%|██████████| 84/84 [00:00<00:00, 10128.84 examples/s]
Casting the dataset: 100%|██████████| 18/18 [00:00<00:00, 3748.63 examples/s]


Casting the dataset: 100%|██████████| 18/18 [00:00<00:00, 3993.52 examples/s]

['O', 'B-MINES', 'I-MINES', 'B-METALS']





['O', 'B-MINES', 'I-MINES', 'B-METALS']

In [128]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds
    print(eval_preds)

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
       for prediction, label in zip(pred_logits, labels)
   ]
    results = metric.compute(predictions=predictions, references=true_labels)

    return {
          "precision": results["overall_precision"],
          "recall": results["overall_recall"],
          "f1": results["overall_f1"],
          "accuracy": results["overall_accuracy"],
  }

In [129]:
trainer = Trainer(
   ner_model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)

  trainer = Trainer(


In [130]:
trainer.train()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

  5%|▌         | 3/60 [00:11<03:17,  3.47s/it]

<transformers.trainer_utils.EvalPrediction object at 0x754052529670>
{'eval_loss': 0.6635697484016418, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8363636363636363, 'eval_runtime': 0.6708, 'eval_samples_per_second': 26.833, 'eval_steps_per_second': 1.491, 'epoch': 1.0}


                                              
 10%|█         | 6/60 [00:24<03:26,  3.83s/it]

<transformers.trainer_utils.EvalPrediction object at 0x754053f1a3a0>
{'eval_loss': 0.5620435476303101, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8363636363636363, 'eval_runtime': 0.8122, 'eval_samples_per_second': 22.163, 'eval_steps_per_second': 1.231, 'epoch': 2.0}


                                              
 15%|█▌        | 9/60 [00:38<03:35,  4.22s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7541b1823670>
{'eval_loss': 0.4971984326839447, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8363636363636363, 'eval_runtime': 0.8037, 'eval_samples_per_second': 22.396, 'eval_steps_per_second': 1.244, 'epoch': 3.0}


                                               
 20%|██        | 12/60 [00:52<03:28,  4.35s/it]

<transformers.trainer_utils.EvalPrediction object at 0x754053f6a670>
{'eval_loss': 0.40264856815338135, 'eval_precision': 1.0, 'eval_recall': 0.25925925925925924, 'eval_f1': 0.4117647058823529, 'eval_accuracy': 0.8681818181818182, 'eval_runtime': 0.7999, 'eval_samples_per_second': 22.504, 'eval_steps_per_second': 1.25, 'epoch': 4.0}


                                               
 25%|██▌       | 15/60 [01:06<03:15,  4.34s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7541b1831670>
{'eval_loss': 0.3450518846511841, 'eval_precision': 0.8235294117647058, 'eval_recall': 0.5185185185185185, 'eval_f1': 0.6363636363636364, 'eval_accuracy': 0.8863636363636364, 'eval_runtime': 0.8135, 'eval_samples_per_second': 22.126, 'eval_steps_per_second': 1.229, 'epoch': 5.0}


                                               
 30%|███       | 18/60 [01:21<03:07,  4.46s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7541b1836040>
{'eval_loss': 0.3024390637874603, 'eval_precision': 0.7368421052631579, 'eval_recall': 0.5185185185185185, 'eval_f1': 0.6086956521739131, 'eval_accuracy': 0.8772727272727273, 'eval_runtime': 0.8055, 'eval_samples_per_second': 22.347, 'eval_steps_per_second': 1.241, 'epoch': 6.0}


                                               
 35%|███▌      | 21/60 [01:35<02:54,  4.48s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7541b18360d0>
{'eval_loss': 0.2680642902851105, 'eval_precision': 0.7894736842105263, 'eval_recall': 0.5555555555555556, 'eval_f1': 0.6521739130434783, 'eval_accuracy': 0.8863636363636364, 'eval_runtime': 0.7872, 'eval_samples_per_second': 22.867, 'eval_steps_per_second': 1.27, 'epoch': 7.0}


                                               
 40%|████      | 24/60 [01:50<02:40,  4.45s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7541b18360d0>
{'eval_loss': 0.2355039417743683, 'eval_precision': 0.8, 'eval_recall': 0.5925925925925926, 'eval_f1': 0.6808510638297872, 'eval_accuracy': 0.8909090909090909, 'eval_runtime': 0.8213, 'eval_samples_per_second': 21.916, 'eval_steps_per_second': 1.218, 'epoch': 8.0}


                                               
 45%|████▌     | 27/60 [02:04<02:27,  4.46s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7540725e8730>
{'eval_loss': 0.2031565010547638, 'eval_precision': 0.782608695652174, 'eval_recall': 0.6666666666666666, 'eval_f1': 0.72, 'eval_accuracy': 0.9136363636363637, 'eval_runtime': 0.8176, 'eval_samples_per_second': 22.016, 'eval_steps_per_second': 1.223, 'epoch': 9.0}


                                               
 50%|█████     | 30/60 [02:18<02:11,  4.39s/it]

<transformers.trainer_utils.EvalPrediction object at 0x75406b2a5130>
{'eval_loss': 0.17096249759197235, 'eval_precision': 0.7142857142857143, 'eval_recall': 0.7407407407407407, 'eval_f1': 0.7272727272727273, 'eval_accuracy': 0.9227272727272727, 'eval_runtime': 0.9094, 'eval_samples_per_second': 19.793, 'eval_steps_per_second': 1.1, 'epoch': 10.0}


                                               
 55%|█████▌    | 33/60 [02:33<02:03,  4.58s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7540533f8700>
{'eval_loss': 0.1510426104068756, 'eval_precision': 0.7407407407407407, 'eval_recall': 0.7407407407407407, 'eval_f1': 0.7407407407407407, 'eval_accuracy': 0.9272727272727272, 'eval_runtime': 0.8822, 'eval_samples_per_second': 20.402, 'eval_steps_per_second': 1.133, 'epoch': 11.0}


                                               
 60%|██████    | 36/60 [02:49<01:55,  4.82s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7540533c1dc0>
{'eval_loss': 0.1329496055841446, 'eval_precision': 0.7241379310344828, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.75, 'eval_accuracy': 0.9454545454545454, 'eval_runtime': 0.9151, 'eval_samples_per_second': 19.669, 'eval_steps_per_second': 1.093, 'epoch': 12.0}


                                               
 65%|██████▌   | 39/60 [03:08<01:52,  5.37s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7540533e2dc0>
{'eval_loss': 0.11756303906440735, 'eval_precision': 0.75, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7636363636363638, 'eval_accuracy': 0.9590909090909091, 'eval_runtime': 1.3079, 'eval_samples_per_second': 13.763, 'eval_steps_per_second': 0.765, 'epoch': 13.0}


                                               
 70%|███████   | 42/60 [03:29<01:51,  6.18s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7541b1817ee0>
{'eval_loss': 0.10601700842380524, 'eval_precision': 0.75, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7636363636363638, 'eval_accuracy': 0.9590909090909091, 'eval_runtime': 1.2862, 'eval_samples_per_second': 13.995, 'eval_steps_per_second': 0.778, 'epoch': 14.0}


                                               
 75%|███████▌  | 45/60 [03:48<01:30,  6.06s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7541b1817e80>
{'eval_loss': 0.09762444347143173, 'eval_precision': 0.75, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7636363636363638, 'eval_accuracy': 0.9590909090909091, 'eval_runtime': 0.7845, 'eval_samples_per_second': 22.945, 'eval_steps_per_second': 1.275, 'epoch': 15.0}


                                               
 80%|████████  | 48/60 [04:02<01:00,  5.01s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7540b8ea97f0>
{'eval_loss': 0.09221401810646057, 'eval_precision': 0.8, 'eval_recall': 0.8888888888888888, 'eval_f1': 0.8421052631578948, 'eval_accuracy': 0.9681818181818181, 'eval_runtime': 0.8455, 'eval_samples_per_second': 21.288, 'eval_steps_per_second': 1.183, 'epoch': 16.0}


                                               
 85%|████████▌ | 51/60 [04:17<00:41,  4.64s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7540b8ea97f0>
{'eval_loss': 0.08991530537605286, 'eval_precision': 0.78125, 'eval_recall': 0.9259259259259259, 'eval_f1': 0.847457627118644, 'eval_accuracy': 0.9681818181818181, 'eval_runtime': 0.7889, 'eval_samples_per_second': 22.817, 'eval_steps_per_second': 1.268, 'epoch': 17.0}


                                               
 90%|█████████ | 54/60 [04:31<00:27,  4.53s/it]

<transformers.trainer_utils.EvalPrediction object at 0x75406b3edaf0>
{'eval_loss': 0.08807323127985, 'eval_precision': 0.78125, 'eval_recall': 0.9259259259259259, 'eval_f1': 0.847457627118644, 'eval_accuracy': 0.9681818181818181, 'eval_runtime': 0.8491, 'eval_samples_per_second': 21.199, 'eval_steps_per_second': 1.178, 'epoch': 18.0}


                                               
 95%|█████████▌| 57/60 [04:46<00:13,  4.57s/it]

<transformers.trainer_utils.EvalPrediction object at 0x754053f05040>
{'eval_loss': 0.08628478646278381, 'eval_precision': 0.78125, 'eval_recall': 0.9259259259259259, 'eval_f1': 0.847457627118644, 'eval_accuracy': 0.9681818181818181, 'eval_runtime': 0.8969, 'eval_samples_per_second': 20.07, 'eval_steps_per_second': 1.115, 'epoch': 19.0}


                                               
100%|██████████| 60/60 [05:03<00:00,  5.05s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7541b4c35f40>
{'eval_loss': 0.08527182042598724, 'eval_precision': 0.78125, 'eval_recall': 0.9259259259259259, 'eval_f1': 0.847457627118644, 'eval_accuracy': 0.9727272727272728, 'eval_runtime': 0.7351, 'eval_samples_per_second': 24.485, 'eval_steps_per_second': 1.36, 'epoch': 20.0}
{'train_runtime': 303.1621, 'train_samples_per_second': 5.542, 'train_steps_per_second': 0.198, 'train_loss': 0.18968644142150878, 'epoch': 20.0}





TrainOutput(global_step=60, training_loss=0.18968644142150878, metrics={'train_runtime': 303.1621, 'train_samples_per_second': 5.542, 'train_steps_per_second': 0.198, 'total_flos': 59474666884416.0, 'train_loss': 0.18968644142150878, 'epoch': 20.0})

In [131]:
## Save model
ner_model.save_pretrained("ner_model")

In [132]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [133]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [134]:
id2label

{'0': 'O', '1': 'B-MINES', '2': 'I-MINES', '3': 'B-METALS'}

In [135]:
label2id

{'O': '0', 'B-MINES': '1', 'I-MINES': '2', 'B-METALS': '3'}

In [136]:
import json

In [137]:
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json","w"))

In [138]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")


In [139]:
from transformers import pipeline

In [192]:
nlp = pipeline("ner",model=model_fine_tuned,tokenizer=tokenizer)
example = "Boladora and Cnadon Langostur known in the precious metals sector."
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-MINES', 'score': np.float32(0.6271148), 'index': 1, 'word': 'bo', 'start': 0, 'end': 2}, {'entity': 'B-MINES', 'score': np.float32(0.5719211), 'index': 2, 'word': '##lad', 'start': 2, 'end': 5}, {'entity': 'B-MINES', 'score': np.float32(0.50259656), 'index': 5, 'word': 'cn', 'start': 13, 'end': 15}, {'entity': 'B-MINES', 'score': np.float32(0.5168247), 'index': 6, 'word': '##ado', 'start': 15, 'end': 18}]


In [None]:
# Processing and merging subwords with score addition
entities = []
current_entity = ""
current_tag = None
current_start = None
current_score_sum = 0.0
current_score_count = 0

for item in ner_results:
    word = item['word']
    tag = item['entity']
    score = item['score']
    
    # Start of a new word or entity
    if word.startswith("##"):
        # Remove "##" and merge with the current word
        current_entity += word[2:]
        current_score_sum += score
        current_score_count += 1
    else:
        # Save the current entity if it's completed
        if current_entity:
            avg_score = current_score_sum / current_score_count if current_score_count > 0 else 0
            entities.append({
                "entity": current_tag,
                "word": current_entity,
                "start": current_start,
                "end": item['start'] - 1,  # End of the previous word
                "score": avg_score
            })
        
        # Initialize a new word
        current_entity = word
        current_tag = tag
        current_start = item['start']
        current_score_sum = score
        current_score_count = 1

# Add the last entity
if current_entity:
    avg_score = current_score_sum / current_score_count if current_score_count > 0 else 0
    entities.append({
        "entity": current_tag,
        "word": current_entity,
        "start": current_start,
        "end": item['end'],
        "score": avg_score
    })

# Output the merged entities
for entity in entities:
    print(f"Entity: {entity['word']}, Tag: {entity['entity']}, Start: {entity['start']}, End: {entity['end']}, Score: {entity['score']:.2f}")


Сущность: bolad, Тег: B-MINES, Начало: 0, Конец: 12, Оценка: 0.60
Сущность: cnado, Тег: B-MINES, Начало: 13, Конец: 18, Оценка: 0.51
