## Named Entity Recognition

Experiments with LVBERT will include both data preprocessing and the NER task itself.

In [3]:
! pip install transformers torch datasets evaluate seqeval scikit-learn accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
     ---------------------------------------- 0.0/138.0 kB ? eta -:--:--
     -- ------------------------------------- 10.2/138.0 kB ? eta -:--:--
     -------- ---------------------------- 30.7/138.0 kB 330.3 kB/s eta 0:00:01
     ------------------------ ------------ 92.2/138.0 kB 655.4 kB/s eta 0:00:01
     ------------------------------------ 138.0/138.0 kB 742.7 kB/s eta 0:00:00
Downloading transformers-4.40.1-py3-none-any.whl (9.0 MB)
   ---------------------------------------- 0.0/9.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.0 MB 2.6 MB/s eta 0:00:04
    --------------------------------------- 0.1/9.0 MB 2.8 MB/s eta 0:00:04
   - -------------------------------------- 0.2/9.0 MB 1.8 MB/s eta 0:00:05
   - -------------------------------------- 0.3/9.0 MB 1.8 MB/s eta 0:00:05
   - ------

In [None]:
# First, however, a virtual environment must be created.
# For this, a new directory is made and the virtual environment is created inside it.
# The `mkdir` command is used for this, with the directory name appended to the relevant path.
# The virtual environment is then created in this new directory with `python3 -m venv`.

In [4]:
# checking the preprocessing information

from transformers import AutoTokenizer, AutoModel

# Load the LVBERT tokenizer and model by checkpoint name, then print the
# tokenizer so its settings (vocabulary size, special tokens, ...) can be inspected.
lvbert_checkpoint = "AiLab-IMCS-UL/lvbert"
tokenizer = AutoTokenizer.from_pretrained(lvbert_checkpoint)
model = AutoModel.from_pretrained(lvbert_checkpoint)
print(tokenizer)

BertTokenizerFast(name_or_path='AiLab-IMCS-UL/lvbert', vocab_size=32004, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [8]:
from transformers import pipeline
# Build a feature-extraction pipeline backed by the LVBERT checkpoint.
pipe = pipeline("feature-extraction", model="AiLab-IMCS-UL/lvbert")

In [7]:
# Run the pipeline on a short sample sentence; the cell output is a nested list of floats.
pipe("es eju uz Rīgu")

[[[0.5969477295875549,
   -0.5389927625656128,
   -1.6942793130874634,
   1.1322122812271118,
   1.0537949800491333,
   -0.03234224021434784,
   1.1450239419937134,
   0.012889946810901165,
   -0.549334704875946,
   -1.5051319599151611,
   0.45149779319763184,
   -0.8791409730911255,
   -0.8337130546569824,
   -2.047050714492798,
   0.11884479224681854,
   0.21227353811264038,
   -1.329668402671814,
   -0.7142017483711243,
   -0.740278422832489,
   0.6083566546440125,
   0.7668145895004272,
   0.8884799480438232,
   0.608581006526947,
   -0.08312254399061203,
   2.177180290222168,
   -0.2261665165424347,
   2.707960605621338,
   0.32673317193984985,
   -0.38234126567840576,
   -0.3316890597343445,
   -0.20572733879089355,
   -0.14063872396945953,
   -1.587850570678711,
   0.32823479175567627,
   -0.2931421399116516,
   0.09370146691799164,
   -0.3707311749458313,
   0.8857435584068298,
   0.9131576418876648,
   0.8051631450653076,
   2.196897268295288,
   1.2143330574035645,
   -0.1871

In [None]:
# Write the model to a local directory and load it back from there.
# The directory is named once so the save and the reload cannot drift apart.
saved_model_dir = r"C:\Users\papel\GitHub_Projects\NER-for-Autobiography-collection\NER-experiments"
model.save_pretrained(saved_model_dir)
model = AutoModel.from_pretrained(saved_model_dir)

In [None]:
# Checking the dataset: print the first few lines of the training file.
num_lines_to_read = 10

# The context manager closes the file even if reading fails part-way, and the
# forward-slash path matches the other cells and is not tied to Windows.
with open("Data/training_data_full.txt", "r", encoding="utf-8") as dataset:
    # zip() stops at whichever input runs out first, so a file shorter than
    # num_lines_to_read does not produce trailing blank lines.
    for _, line in zip(range(num_lines_to_read), dataset):
        print(line.strip())  # remove leading/trailing whitespace, including the newline

In [None]:
# Scan the training file once and collect every distinct non-empty value found
# in the 3rd and 4th tab-separated columns; their union is the NER tagset.
num_less_than_four_columns = 0
unique_third_column_values = set()
unique_fourth_column_values = set()

with open("Data/training_data_full.txt", "r", encoding="utf-8") as dataset:
    for line in dataset:
        # Remove leading/trailing whitespace and split by tabs
        columns = line.strip().split("\t")

        # Lines with fewer than 4 columns are counted and skipped
        if len(columns) < 4:
            num_less_than_four_columns += 1
            continue

        # Both columns are guaranteed to exist here; only empty values are ignored
        third_value, fourth_value = columns[2], columns[3]
        if third_value:
            unique_third_column_values.add(third_value)
        if fourth_value:
            unique_fourth_column_values.add(fourth_value)

# All unique tags in the dataset
full_ner_tagset = unique_third_column_values.union(unique_fourth_column_values)

# Print results (the labels name the columns that were actually read)
print(f"Number of lines with less than 4 columns: {num_less_than_four_columns}")
print("Unique values from 3rd column:", unique_third_column_values)
print("Unique values from 4th column:", unique_fourth_column_values)
print("Full NER tagset:", full_ner_tagset)


In [None]:
# Creation of a dictionary for label mapping (tag -> integer id).
# A set of strings can iterate in a different order on every interpreter run,
# so the tags are sorted first: each tag then receives the same id every time.
tag_to_id = {tag: i for i, tag in enumerate(sorted(full_ner_tagset))}
num_tags = len(tag_to_id)  # Number of unique tags

print("Number of unique tags:", num_tags)
print("Label mapping (tag -> ID):", tag_to_id)


In [None]:

# Encode every usable line of the training file into a dictionary holding the
# tokenizer output for the text and lemma columns plus the ids of both NER tags.
encoded_training_dataset = []

with open("Data/training_data_full.txt", "r", encoding="utf-8") as dataset:
    for line in dataset:
        columns = line.strip().split("\t")

        # Lines with fewer than four columns cannot supply both tags
        if len(columns) < 4:
            continue

        text, lemma, ner_tag_1, ner_tag_2 = columns[:4]

        # Skip lines whose text or lemma is empty or whitespace-only
        if not text.strip() or not lemma.strip():
            continue

        text_encoding = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")
        lemma_encoding = tokenizer(lemma, padding="max_length", truncation=True, return_tensors="pt")

        encoded_training_dataset.append({
            "text_input_ids": text_encoding["input_ids"],
            "text_attention_mask": text_encoding["attention_mask"],
            "lemma_input_ids": lemma_encoding["input_ids"],
            "lemma_attention_mask": lemma_encoding["attention_mask"],
            # Tags missing from tag_to_id fall back to the id num_tags
            "ner_tag_1": tag_to_id.get(ner_tag_1, num_tags),
            # An empty second tag is stored as None rather than as an id
            "ner_tag_2": tag_to_id.get(ner_tag_2, num_tags) if ner_tag_2 else None,
        })

# Preview the first entries. Slicing (instead of indexing 0..4) avoids an
# IndexError when fewer than five lines were encoded.
for encoded_line in encoded_training_dataset[:5]:
    print(encoded_line)


In [None]:
def preprocess_data(data_path, tokenizer, tag_to_id, num_tags):
    """Encode a tab-separated token file into a list of per-line dictionaries.

    Every line is stripped and split on tabs. The first four columns are read
    as text, lemma, first NER tag and second NER tag. Lines with fewer than
    four columns, or with an empty / whitespace-only text or lemma, are skipped.

    Args:
        data_path: Path of the UTF-8 encoded file to read.
        tokenizer: Callable invoked as
            ``tokenizer(value, padding="max_length", truncation=True, return_tensors="pt")``;
            its result must support ``["input_ids"]`` and ``["attention_mask"]``.
        tag_to_id: Mapping from tag string to integer id.
        num_tags: Id assigned to any tag that is missing from ``tag_to_id``.

    Returns:
        A list with one dictionary per kept line, with the keys
        ``text_input_ids``, ``text_attention_mask``, ``lemma_input_ids``,
        ``lemma_attention_mask``, ``ner_tag_1`` and ``ner_tag_2``.
    """

    def _encode(value):
        # One place for the tokenizer settings shared by the text and lemma columns.
        # NOTE(review): the captured outputs show unpadded 3-token tensors, so
        # padding="max_length" appears to have no effect with this tokenizer — confirm.
        return tokenizer(value, padding="max_length", truncation=True, return_tensors="pt")

    encoded_data = []
    with open(data_path, "r", encoding="utf-8") as dataset:
        for line in dataset:
            columns = line.strip().split("\t")

            # Filter lines with fewer than 4 columns (the 4th is NER tag 2)
            if len(columns) < 4:
                continue

            text, lemma, ner_tag_1, ner_tag_2 = columns[:4]

            # Whitespace-only values count as empty, matching the check used by
            # the inline encoding cell earlier in the notebook.
            if not text.strip() or not lemma.strip():
                continue

            text_encoding = _encode(text)
            lemma_encoding = _encode(lemma)

            encoded_data.append({
                "text_input_ids": text_encoding["input_ids"],
                "text_attention_mask": text_encoding["attention_mask"],
                "lemma_input_ids": lemma_encoding["input_ids"],
                "lemma_attention_mask": lemma_encoding["attention_mask"],
                # Unknown tags fall back to the id num_tags
                "ner_tag_1": tag_to_id.get(ner_tag_1, num_tags),
                "ner_tag_2": tag_to_id.get(ner_tag_2, num_tags),
            })

    return encoded_data

# Encode the full training file with preprocess_data.
encoded_training_dataset = preprocess_data("Data/training_data_full.txt", tokenizer, tag_to_id, num_tags)

# Preview the first entries. Slicing (instead of indexing 0..2) avoids an
# IndexError when fewer than three lines were encoded.
for encoded_line in encoded_training_dataset[:3]:
    print(encoded_line)

{'text_input_ids': tensor([[  2, 132,   3]]), 'text_attention_mask': tensor([[1, 1, 1]]), 'lemma_input_ids': tensor([[ 2, 37,  3]]), 'lemma_attention_mask': tensor([[1, 1, 1]]), 'ner_tag_1': 13, 'ner_tag_2': 13}
{'text_input_ids': tensor([[ 2, 21,  3]]), 'text_attention_mask': tensor([[1, 1, 1]]), 'lemma_input_ids': tensor([[ 2, 21,  3]]), 'lemma_attention_mask': tensor([[1, 1, 1]]), 'ner_tag_1': 13, 'ner_tag_2': 13}
{'text_input_ids': tensor([[ 2, 94,  3]]), 'text_attention_mask': tensor([[1, 1, 1]]), 'lemma_input_ids': tensor([[ 2, 94,  3]]), 'lemma_attention_mask': tensor([[1, 1, 1]]), 'ner_tag_1': 13, 'ner_tag_2': 13}


In [None]:
import numpy as np
import evaluate  


# Load the seqeval metric used by compute_metrics.
seqeval = evaluate.load("seqeval")

# compute_metrics looks tags up by integer id (label_list[id]), so label_list
# must be an indexable sequence whose positions agree with tag_to_id. A set is
# neither indexable nor ordered, so the list is built from tag_to_id, ordered by id.
label_list = [tag for tag, _ in sorted(tag_to_id.items(), key=lambda item: item[1])]

def compute_metrics(predictions, labels=None):
    """Compute seqeval precision, recall, F1 and accuracy for tagged sequences.

    Can be called in two ways:

    * ``compute_metrics(predictions, labels)`` with the two arrays given
      separately, or
    * ``compute_metrics(eval_pred)`` with a single object whose item 0 holds
      the predictions and item 1 the labels, which is how a trainer that
      passes one evaluation object would call it.

    Args:
        predictions: Per-position class scores with the class scores on axis 2,
            or the combined object described above when ``labels`` is omitted.
        labels: Integer label ids per position; positions equal to -100 are
            excluded from the evaluation.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    if labels is None:
        # Single-argument call: split the combined object into its two parts.
        eval_pred = predictions
        predictions, labels = eval_pred[0], eval_pred[1]

    # Highest-scoring class id for every position
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Keep only positions that carry a real label (-100 marks ignored ones)
        kept = [(p, l) for (p, l) in zip(prediction, label) if l != -100]
        true_predictions.append([label_list[p] for (p, _) in kept])
        true_labels.append([label_list[l] for (_, l) in kept])

    # Using seqeval to compute precision, recall, F1-score, and accuracy
    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [None]:
from sklearn.model_selection import train_test_split

def check_empty_entries(data, data_name):
    """Report entries whose ``text_input_ids`` contain no elements.

    Args:
        data: Sequence of dictionaries, each with a ``text_input_ids`` value.
        data_name: Name used for the dataset in the printed report.

    Prints the index and content of every empty entry, or a single line saying
    that none were found. Returns nothing.
    """

    def _element_count(ids):
        # len() of a 2-D tensor is only its first dimension (1 for a single
        # encoded line), so it can never reveal an empty sequence. Objects that
        # offer numel() are measured by their total element count instead;
        # plain sequences still use len().
        numel = getattr(ids, "numel", None)
        return numel() if callable(numel) else len(ids)

    empty_entries = [
        i for i, entry in enumerate(data) if _element_count(entry["text_input_ids"]) == 0
    ]

    if len(empty_entries) > 0:
        print(f"Empty entries found in {data_name}:")
        for idx in empty_entries:
            print(f"Entry {idx}: {data[idx]}")
    else:
        print(f"No empty entries found in {data_name}")

# First split: 60% of the encoded entries for training, 40% held back
train_data, remaining_data = train_test_split(
    encoded_training_dataset,
    test_size=0.4,
    random_state=42,
)
print(train_data[:5])

# Make sure the training portion has no empty entries
check_empty_entries(train_data, "train_data")

# Second split: the held-back portion becomes validation (75%) and test (25%)
validation_data, test_data = train_test_split(
    remaining_data,
    test_size=0.25,
    random_state=42,
)
print(validation_data[:5])

# Same empty-entry check for the validation and test portions, in that order
for split_name, split_rows in (("validation_data", validation_data), ("test_data", test_data)):
    check_empty_entries(split_rows, split_name)


In [None]:
# Quick look at the first five encoded entries.
print(encoded_training_dataset[:5])

In [None]:
#! pip install pytorch-lightning

  0%|          | 0/42240 [00:00<?, ?it/s]

ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,encoder_hidden_states,encoder_attention_mask,past_key_values,use_cache,output_attentions,output_hidden_states,return_dict,label,label_ids.

In [None]:
# Uncomment the line below to print encoded_inputs
# print("Encoded inputs:", encoded_inputs)