In [1]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification  # This libary apply augumentation technique at runtime
from transformers import AutoModelForTokenClassification     # This class is responsible for load model into my memory
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, Features, Value

  from .autonotebook import tqdm as notebook_tqdm
2024-11-13 20:19:34.522715: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-13 20:19:34.702921: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731525574.779402 1118319 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731525574.807124 1118319 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-13 20:19:34.969884: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [None]:
import pandas as pd
import re
import random

# Names of mines and metals for validation
mines = ["Equinox Gold"]
metals = ["gold", "silver", "platinum", "copper", "zinc", "molybdenum", "antimony", "arsenic"]

# POS, chunk, and NER tags with indices
pos_tags = {'"': 0, "''": 1, '#': 2, '$': 3, '(': 4, ')': 5, ',': 6, '.': 7, ':': 8, '``': 9,
            'CC': 10, 'CD': 11, 'DT': 12, 'EX': 13, 'FW': 14, 'IN': 15, 'JJ': 16, 'JJR': 17, 'JJS': 18,
            'LS': 19, 'MD': 20, 'NN': 21, 'NNP': 22, 'NNPS': 23, 'NNS': 24, 'NN|SYM': 25, 'PDT': 26,
            'POS': 27, 'PRP': 28, 'PRP$': 29, 'RB': 30, 'RBR': 31, 'RBS': 32, 'RP': 33, 'SYM': 34,
            'TO': 35, 'UH': 36, 'VB': 37, 'VBD': 38, 'VBG': 39, 'VBN': 40, 'VBP': 41, 'VBZ': 42,
            'WDT': 43, 'WP': 44, 'WP$': 45, 'WRB': 46}

chunk_tags = {'O': 0, 'B-ADJP': 1, 'I-ADJP': 2, 'B-ADVP': 3, 'I-ADVP': 4, 'B-CONJP': 5, 'I-CONJP': 6,
              'B-INTJ': 7, 'I-INTJ': 8, 'B-LST': 9, 'I-LST': 10, 'B-NP': 11, 'I-NP': 12, 'B-PP': 13,
              'I-PP': 14, 'B-PRT': 15, 'I-PRT': 16, 'B-SBAR': 17, 'I-SBAR': 18, 'B-UCP': 19, 'I-UCP': 20,
              'B-VP': 21, 'I-VP': 22}

ner_tags = {'O': 0, 'B-MINES': 1, 'I-MINES': 2, 'B-METALS': 3}

# Path to the file containing news
file_path = './files/synthetic_news_set_equinox_gold.txt'

# Reading the first 20 news items
with open(file_path, 'r', encoding='utf-8') as f:
    news_list = f.read().strip().split('\n')[:180]  ## 

# Function for annotating tokens in CoNLL format
def tag_tokens(news_list, mines, metals):
    data = []
    for news in news_list:
        tokens_data = []
        tokens = re.findall(r'\b\w+\b', news)
        i = 0
        while i < len(tokens):
            token = tokens[i]
            
            # Check for multi-word mine names
            found_mine = None
            for mine in mines:
                mine_tokens = mine.split()
                if tokens[i:i + len(mine_tokens)] == mine_tokens:
                    found_mine = mine_tokens
                    break
            
            if found_mine:
                # Assign B-MINES to the first token and I-MINES to the rest
                tokens_data.append([found_mine[0], pos_tags.get("NNP", 21), chunk_tags.get("B-NP", 11), ner_tags["B-MINES"]])
                for j in range(1, len(found_mine)):
                    tokens_data.append([found_mine[j], pos_tags.get("NNP", 21), chunk_tags.get("I-NP", 12), ner_tags["I-MINES"]])
                
                i += len(found_mine)
                continue
            
            # Check for metals
            if token in metals:
                tokens_data.append([token, pos_tags.get("NN", 21), chunk_tags.get("B-NP", 11), ner_tags["B-METALS"]])
            else:
                # If the token is neither a mine nor a metal
                tokens_data.append([token, pos_tags.get("NN", 21), chunk_tags["O"], ner_tags["O"]])
            
            i += 1
        
        data.append(tokens_data)
    
    return data

# Apply the annotation function
tagged_data = tag_tokens(news_list, mines, metals)

# Function for saving data in CoNLL-2003 format
def save_to_conll(data, file_name):
    with open(file_name, "w", encoding="utf-8") as f:
        for sentence in data:
            for token_data in sentence:
                # Convert each element in token_data to a string and join them with spaces
                f.write(" ".join(map(str, token_data)) + "\n")
            f.write("\n")  # Separate sentences with an empty line

# Save the data to a file
save_to_conll(tagged_data, "annotated_equinox_gold_news_data.conll")


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
import random

# Function to read data from a CoNLL file and transform it into the required format
def load_conll_data(file_path):
    dataset = []
    sentence = {
        "id": 0,
        "tokens": [],
        "pos_tags": [],
        "chunk_tags": [],
        "ner_tags": []
    }
    
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line:
                token, pos_tag, chunk_tag, ner_tag = line.split()
                sentence["tokens"].append(token)
                sentence["pos_tags"].append(pos_tag)
                sentence["chunk_tags"].append(chunk_tag)
                sentence["ner_tags"].append(int(ner_tag))
            else:
                # Add the sentence to the dataset
                dataset.append(sentence)
                # Update the id and clear the dictionary for the next sentence
                sentence = {
                    "id": len(dataset),
                    "tokens": [],
                    "pos_tags": [],
                    "chunk_tags": [],
                    "ner_tags": []
                }
    
    # Add the last sentence if the file does not end with an empty line
    if sentence["tokens"]:
        dataset.append(sentence)
    
    return dataset

# Load the data
file_path = "annotated_equinox_gold_news_data.conll"
data = load_conll_data(file_path)

# Shuffle the data and split it into train, validation, and test sets
random.shuffle(data)
train_size = int(0.7 * len(data))
valid_size = int(0.15 * len(data))

train_data = data[:train_size]
validation_data = data[train_size:train_size + valid_size]
test_data = data[train_size + valid_size:]

# Convert the data into the DatasetDict format
ner_data = DatasetDict({
    "train": Dataset.from_pandas(pd.DataFrame(train_data)),
    "validation": Dataset.from_pandas(pd.DataFrame(validation_data)),
    "test": Dataset.from_pandas(pd.DataFrame(test_data))
})

# Display the structure of the DatasetDict
print(ner_data)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 69
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 16
    })
})


In [4]:
# structure of train data
ner_data['train'][3]

{'id': 79, 'tokens': [], 'pos_tags': [], 'chunk_tags': [], 'ner_tags': []}

In [5]:
#structure of test data
ner_data['validation']['id']

[8, 80, 20, 84, 18, 29, 50, 86, 66, 97, 67, 78, 14, 3]

In [6]:
ner_data['test'][0]

{'id': 88,
 'tokens': ['Equinox',
  'Gold',
  'Launches',
  'Carbon',
  'Offset',
  'Program',
  'with',
  'Global',
  'Reach',
  'Partnering',
  'with',
  'global',
  'environmental',
  'groups',
  'Equinox',
  'Gold',
  'is',
  'investing',
  'in',
  'carbon',
  'offset',
  'projects',
  'to',
  'neutralize',
  'its',
  'emissions'],
 'pos_tags': ['22',
  '22',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '22',
  '22',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21'],
 'chunk_tags': ['11',
  '12',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '11',
  '12',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 'ner_tags': [1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [7]:
ner_data['validation'].features['tokens']

Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

In [8]:
ner_data['train'].features["ner_tags"]

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [9]:
ner_data['train'].features["ner_tags"]

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [10]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [11]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
example_text = ner_data['train'][0]
tokenized_input = tokenizer(example_text['tokens'],is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()

In [13]:
print(tokenized_input)
print("\n")
print(tokens)
print("\n")
print(word_ids)

{'input_ids': [101, 1041, 12519, 11636, 2751, 6753, 2334, 2495, 2007, 2082, 6502, 27001, 2007, 2334, 6867, 1041, 12519, 11636, 2751, 2003, 4804, 1996, 2810, 1997, 2047, 2816, 2379, 2049, 3136, 2000, 5335, 2334, 2495, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


['[CLS]', 'e', '##quin', '##ox', 'gold', 'supports', 'local', 'education', 'with', 'school', 'infrastructure', 'partnering', 'with', 'local', 'governments', 'e', '##quin', '##ox', 'gold', 'is', 'funding', 'the', 'construction', 'of', 'new', 'schools', 'near', 'its', 'operations', 'to', 'improve', 'local', 'education', '[SEP]']


[None, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, None]


In [14]:
print(f'Length of the tokens is : {len(tokens)}')
print(f'Length of the ner tags is: {len(ner_data["train"][0]["ner_tags"])}')

Length of the tokens is : 34
Length of the ner tags is: 28


In [15]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
ner_data['train'][1:2]

{'id': [38],
 'tokens': [['Equinox',
   'Gold',
   's',
   'Safety',
   'Record',
   'Reaches',
   'New',
   'Heights',
   'in',
   '2023',
   'Equinox',
   'Gold',
   'achieved',
   'its',
   'safest',
   'year',
   'on',
   'record',
   'with',
   'significant',
   'reductions',
   'in',
   'workplace',
   'injuries',
   'due',
   'to',
   'its',
   'robust',
   'safety',
   'protocols']],
 'pos_tags': [['22',
   '22',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '22',
   '22',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21',
   '21']],
 'chunk_tags': [['11',
   '12',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '11',
   '12',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0',
   '0']],
 'ner_tags': [[1,
   2,
   0,
   0,
   0,
   0,
   0,
   0,
 

In [17]:
q = tokenize_and_align_labels(ner_data['train'][1:2])
print(q)

{'input_ids': [[101, 1041, 12519, 11636, 2751, 1055, 3808, 2501, 6561, 2047, 7535, 1999, 16798, 2509, 1041, 12519, 11636, 2751, 4719, 2049, 3647, 3367, 2095, 2006, 2501, 2007, 3278, 25006, 1999, 16165, 6441, 2349, 2000, 2049, 15873, 3808, 16744, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}


In [18]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
e_______________________________________ 1
##quin__________________________________ 1
##ox____________________________________ 1
gold____________________________________ 2
s_______________________________________ 0
safety__________________________________ 0
record__________________________________ 0
reaches_________________________________ 0
new_____________________________________ 0
heights_________________________________ 0
in______________________________________ 0
202_____________________________________ 0
##3_____________________________________ 0
e_______________________________________ 1
##quin__________________________________ 1
##ox____________________________________ 1
gold____________________________________ 2
achieved________________________________ 0
its_____________________________________ 0
safe____________________________________ 0
##st____________________________________ 0
year____________________________________ 0
on______

In [19]:
## Applying on entire data
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

Map: 100%|██████████| 69/69 [00:00<00:00, 2732.96 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 2399.78 examples/s]
Map: 100%|██████████| 16/16 [00:00<00:00, 1735.96 examples/s]


In [20]:
tokenized_datasets['train'][1]

{'id': 38,
 'tokens': ['Equinox',
  'Gold',
  's',
  'Safety',
  'Record',
  'Reaches',
  'New',
  'Heights',
  'in',
  '2023',
  'Equinox',
  'Gold',
  'achieved',
  'its',
  'safest',
  'year',
  'on',
  'record',
  'with',
  'significant',
  'reductions',
  'in',
  'workplace',
  'injuries',
  'due',
  'to',
  'its',
  'robust',
  'safety',
  'protocols'],
 'pos_tags': ['22',
  '22',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '22',
  '22',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21',
  '21'],
 'chunk_tags': ['11',
  '12',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '11',
  '12',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 'ner_tags': [1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [21]:

# Defining model
ner_model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
!pip install accelerate>=1
import accelerate
import transformers

transformers.__version__, accelerate.__version__


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


('4.46.1', '1.1.0')

In [None]:

!pip install tf-keras
#Define training args
from transformers import TrainingArguments, Trainer
args = TrainingArguments(
"test-ner",
evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=6,
    weight_decay=0.01,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [32]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [33]:
#!pip install -U datasets evaluate
from evaluate import load
metric = load("seqeval")

In [34]:
print(ner_data["train"].features["ner_tags"].feature)


ClassLabel(names=['O', 'B-MINES', 'I-MINES', 'B-METALS'], id=None)


In [None]:
from datasets import DatasetDict, ClassLabel, Sequence, Features, Value

# Defining labels for ner_tags
ner_label_names = ['O', 'B-MINES', 'I-MINES', 'B-METALS']
features = Features({
    "id": Value("int64"),
    "tokens": Sequence(Value("string")),
    "pos_tags": Sequence(Value("int64")),
    "chunk_tags": Sequence(Value("int64")),
    "ner_tags": Sequence(ClassLabel(names=ner_label_names))
})

# Converting ner_tags to ClassLabel after dataset creation
ner_data["train"] = ner_data["train"].cast(features)
ner_data["validation"] = ner_data["validation"].cast(features)
ner_data["test"] = ner_data["test"].cast(features)

# Now checking the labels
print(ner_data["train"].features["ner_tags"].feature.names)
label_list = ner_data["train"].features["ner_tags"].feature.names
label_list


Casting the dataset: 100%|██████████| 69/69 [00:00<00:00, 6196.75 examples/s]
Casting the dataset: 100%|██████████| 14/14 [00:00<00:00, 3905.83 examples/s]
Casting the dataset: 100%|██████████| 16/16 [00:00<00:00, 6947.81 examples/s]

['O', 'B-MINES', 'I-MINES', 'B-METALS']





['O', 'B-MINES', 'I-MINES', 'B-METALS']

In [36]:
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds
    print(eval_preds)

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
       for prediction, label in zip(pred_logits, labels)
   ]
    results = metric.compute(predictions=predictions, references=true_labels)

    return {
          "precision": results["overall_precision"],
          "recall": results["overall_recall"],
          "f1": results["overall_f1"],
          "accuracy": results["overall_accuracy"],
  }

In [37]:
trainer = Trainer(
   ner_model,
   args,
   train_dataset=tokenized_datasets["train"],
   eval_dataset=tokenized_datasets["validation"],
   data_collator=data_collator,
   tokenizer=tokenizer,
   compute_metrics=compute_metrics
)

  trainer = Trainer(


In [38]:
trainer.train()


                                               
[A                                           

 52%|█████▏    | 31/60 [06:49<08:04, 16.72s/it]
[A
[A

<transformers.trainer_utils.EvalPrediction object at 0x7e01945b5910>
{'eval_loss': 0.0028097445610910654, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.7337, 'eval_samples_per_second': 19.082, 'eval_steps_per_second': 1.363, 'epoch': 1.0}



                                               
[A                                           

 52%|█████▏    | 31/60 [07:01<08:04, 16.72s/it]
[A
[A

<transformers.trainer_utils.EvalPrediction object at 0x7e00e8bac5e0>
{'eval_loss': 0.0017707530641928315, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.6572, 'eval_samples_per_second': 21.301, 'eval_steps_per_second': 1.522, 'epoch': 2.0}



                                               

[A[A                               
 52%|█████▏    | 31/60 [07:13<08:04, 16.72s/it]
[A
[A

<transformers.trainer_utils.EvalPrediction object at 0x7e026ab692b0>
{'eval_loss': 0.0012824329314753413, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.7112, 'eval_samples_per_second': 19.685, 'eval_steps_per_second': 1.406, 'epoch': 3.0}



                                               

[A[A                               
 52%|█████▏    | 31/60 [07:27<08:04, 16.72s/it]
[A
[A

<transformers.trainer_utils.EvalPrediction object at 0x7e026ab4dfd0>
{'eval_loss': 0.0010441734921187162, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.8498, 'eval_samples_per_second': 16.474, 'eval_steps_per_second': 1.177, 'epoch': 4.0}



                                               

[A[A                               
 52%|█████▏    | 31/60 [07:41<08:04, 16.72s/it]
[A
[A

<transformers.trainer_utils.EvalPrediction object at 0x7e026a93fe50>
{'eval_loss': 0.0009387984173372388, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.7683, 'eval_samples_per_second': 18.223, 'eval_steps_per_second': 1.302, 'epoch': 5.0}



                                               
[A                                            

 52%|█████▏    | 31/60 [07:55<08:04, 16.72s/it]
[A
                                               
100%|██████████| 18/18 [02:50<00:00,  9.49s/it]

<transformers.trainer_utils.EvalPrediction object at 0x7e00e8cb7be0>
{'eval_loss': 0.0009072309476323426, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.593, 'eval_samples_per_second': 23.609, 'eval_steps_per_second': 1.686, 'epoch': 6.0}
{'train_runtime': 170.7497, 'train_samples_per_second': 2.425, 'train_steps_per_second': 0.105, 'train_loss': 0.0052255164417955614, 'epoch': 6.0}





TrainOutput(global_step=18, training_loss=0.0052255164417955614, metrics={'train_runtime': 170.7497, 'train_samples_per_second': 2.425, 'train_steps_per_second': 0.105, 'total_flos': 9503820679632.0, 'train_loss': 0.0052255164417955614, 'epoch': 6.0})

In [39]:
## Save model
ner_model.save_pretrained("ner_model")

In [40]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [41]:
id2label = {
    str(i): label for i,label in enumerate(label_list)
}
label2id = {
    label: str(i) for i,label in enumerate(label_list)
}

In [42]:
id2label

{'0': 'O', '1': 'B-MINES', '2': 'I-MINES', '3': 'B-METALS'}

In [43]:
label2id

{'O': '0', 'B-MINES': '1', 'I-MINES': '2', 'B-METALS': '3'}

In [44]:
import json

In [45]:
config = json.load(open("ner_model/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("ner_model/config.json","w"))

In [46]:
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")


In [47]:
from transformers import pipeline

In [95]:
nlp = pipeline("ner",model=model_fine_tuned,tokenizer=tokenizer)
example = "Equinox Goold Corp. Is Maintained at Sector Perform by National Bank. Ratings actions from Baystreet: http://www.baystreet.ca (END) Dow Jones Newswires  August 05, 2022 12:33 ET (16:33 GMT)"
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-MINES', 'score': np.float32(0.998533), 'index': 1, 'word': 'e', 'start': 0, 'end': 1}, {'entity': 'B-MINES', 'score': np.float32(0.998447), 'index': 2, 'word': '##quin', 'start': 1, 'end': 5}, {'entity': 'B-MINES', 'score': np.float32(0.99803966), 'index': 3, 'word': '##ox', 'start': 5, 'end': 7}, {'entity': 'B-MINES', 'score': np.float32(0.4476207), 'index': 4, 'word': 'goo', 'start': 8, 'end': 11}]


In [None]:
# Processing and merging subwords with added score
entities = []
current_entity = ""
current_tag = None
current_start = None
current_score_sum = 0.0
current_score_count = 0

for item in ner_results:
    word = item['word']
    tag = item['entity']
    score = item['score']
    
    # Start of a new word or entity
    if word.startswith("##"):
        # Remove "##" and append it to the current word
        current_entity += word[2:]
        current_score_sum += score
        current_score_count += 1
    else:
        # Save the current entity if it is completed
        if current_entity:
            avg_score = current_score_sum / current_score_count if current_score_count > 0 else 0
            entities.append({
                "entity": current_tag,
                "word": current_entity,
                "start": current_start,
                "end": item['start'] - 1,  # End of the previous word
                "score": avg_score
            })
        
        # Initialize a new word
        current_entity = word
        current_tag = tag
        current_start = item['start']
        current_score_sum = score
        current_score_count = 1

# Add the last entity
if current_entity:
    avg_score = current_score_sum / current_score_count if current_score_count > 0 else 0
    entities.append({
        "entity": current_tag,
        "word": current_entity,
        "start": current_start,
        "end": item['end'],
        "score": avg_score
    })

# Output the merged entities
for entity in entities:
    print(f"Entity: {entity['word']}, Tag: {entity['entity']}, Start: {entity['start']}, End: {entity['end']}, Score: {entity['score']:.2f}")


Сущность: equinox, Тег: B-MINES, Начало: 0, Конец: 7, Оценка: 1.00
Сущность: goo, Тег: B-MINES, Начало: 8, Конец: 11, Оценка: 0.45


In [56]:
# Install necessary packages if not already installed
!pip install transformers psycopg2 pandas openpyxl beautifulsoup4

import psycopg2
import yaml
import pandas as pd
import re
from bs4 import BeautifulSoup
import time
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load configuration from YAML file
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Database connection parameters
db_config = config['local_db']  # Adjust to 'remote_db' if needed

# Connect to the database
conn = psycopg2.connect(
    host=db_config['host'],
    port=db_config['port'],
    dbname=db_config['database'],
    user=db_config['user'],
    password=db_config['password']
)

# Define the specific query text and SQL query
query_text_1 = 'gold mine'
query_text_2 = 'Equinox Gold'

# SQL query to retrieve stories containing both "gold mine" and "Equinox Gold"
story_query = f"""
    SELECT id, story 
    FROM public."DJ_NEWS_STORIES" 
    WHERE story LIKE '%{query_text_1}%' 
    AND story LIKE '%{query_text_2}%';
"""

# Load the fine-tuned model and tokenizer for NER
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to clean text: removes URLs, HTML tags, punctuation, numbers, and extra whitespace
def clean_text(text):
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-letter characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Start processing
start_time = time.time()

# Execute query to retrieve stories
cursor = conn.cursor()
cursor.execute(story_query)
result = cursor.fetchall()

# Collect entities and counts
unique_entities_set = set()
all_entities = set()
type_counts = {"MINES": 0, "METAL": 0}
total_count = 0

# Process each story in the results
for row in result:
    story_id = row[0]
    text = row[1]
    
    # Clean the story text
    cleaned_text = clean_text(text)
    
    # Process cleaned text with the fine-tuned NER pipeline
    ner_results = nlp(cleaned_text)
    
    # Extract only 'MINES' and 'METAL' entities
    for ent in ner_results:
        entity_text = ent['word']
        entity_type = ent['entity_group']
        if entity_type in ['MINES', 'METAL']:  # Only process 'MINES' and 'METAL' entities
            entity_tuple = (entity_text, entity_type, story_id)
            all_entities.add(entity_tuple)
            unique_entities_set.add((entity_text, entity_type))
            
            # Update counts for summary
            type_counts[entity_type] += 1
            total_count += 1

# Convert sets to sorted DataFrames for saving to Excel
unique_entities_df = pd.DataFrame(sorted(unique_entities_set), columns=["Entity", "Tag"])
all_entities_df = pd.DataFrame(sorted(all_entities), columns=["Entity", "Tag", "Story ID"])
summary_df = pd.DataFrame(list(type_counts.items()), columns=["Tag", "Count"])
summary_df = pd.concat([summary_df, pd.DataFrame([["TOTAL", total_count]], columns=["Tag", "Count"])])

# Save to Excel with multiple sheets
with pd.ExcelWriter("recognized_mines_metals_entities.xlsx", engine="openpyxl") as writer:
    unique_entities_df.to_excel(writer, sheet_name="Unique Entities", index=False)
    all_entities_df.to_excel(writer, sheet_name="All Entities with Story ID", index=False)
    summary_df.to_excel(writer, sheet_name="Summary", index=False)

# Close the database connection
cursor.close()
conn.close()

# Print execution time
end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")


Execution time: 64.86 seconds


In [93]:

# List to store results for Excel
mines_data = []

# Process each story in the results
for row in result:
    story_id = row[0]
    text = row[1]
    
    # Clean the story text
    cleaned_text = clean_text(text)
    
    # Process cleaned text with fine-tuned BERT NER
    ner_results = nlp(cleaned_text)
    
    # Track individual entities within a single story
    current_entity = ""
    score_total = 0.0
    entity_segments = []

    for ent in ner_results:
        if ent['entity'] == 'B-MINES' and ent['score'] > 0.01:
            # Save current entity if it's not empty
            if current_entity:
                mines_data.append({
                    "Story ID": story_id,
                    "Entity": current_entity,
                    "Score": round(score_total, 2)
                })
            # Start a new entity
            current_entity = ent['word'].replace("##", "")
            score_total = ent['score']
        
        elif ent['entity'] == 'I-MINES' and ent['score'] > 0.01:
            # Continue with the same entity
            current_entity += ent['word'].replace("##", "")
            score_total += ent['score']

    # Append the final entity after the loop if it exists
    if current_entity:
        mines_data.append({
            "Story ID": story_id,
            "Entity": current_entity,
            "Score": round(score_total, 2)
        })

# Convert results to a DataFrame for saving to Excel
mines_df = pd.DataFrame(mines_data)

# Save results to Excel
with pd.ExcelWriter("mines_entities_results.xlsx", engine="openpyxl") as writer:
    mines_df.to_excel(writer, sheet_name="MINES Entities", index=False)



In [None]:
# Retrieving unique values from the 'Story ID' column in mines_df
unique_story_ids = mines_df['Story ID'].unique()

# Printing unique values and their total count
print("Unique Story IDs:")
print(unique_story_ids)
print(f"Total number of unique Story IDs: {len(unique_story_ids)}")

Уникальные ID новостей:
[2469129 2480810 2480823 2631654 2631656 2692968 2692973 2692987 2692989
 2719306 2719308 2719350   13614   13645   61756  155272  807117 1303430
 1303477 1681402 1832460 1837994 1897329 2426242 2426243 2521095 2521100
   98705   98706  622718  781141  781146  964239  964240  989393  989493
  996781  996784 1009534 1009849 1144609 1144619 1288536 1288537 1472693
 1472689 1611925 1611930 1830848 1830917 2074020 2147733 2147739 2271646
 2290596 2290598 2357949 2357971 2455010 2455013 2598547 2611695  191067
  191068  241901  285979  285987  684685  684784  781165 1058634 1058635
 1058639 1058645 1420884 1420892 1500363 1500368 1500369 1500372 1834646
 1860984 1967874 1967877 2025197 2025201]
Общее количество уникальных ID новостей: 86


In [90]:
mines_df

Unnamed: 0,Story ID,Entity,Score
0,2469129,e,1.00
1,2469129,quin,1.00
2,2469129,oxgold,1.79
3,2480810,e,1.00
4,2480810,quin,1.00
...,...,...,...
1708,2025197,quin,1.00
1709,2025197,oxgold,1.88
1710,2025201,e,1.00
1711,2025201,quin,1.00
