In [1]:

# !pip install transformers[torch] accelerate pandas numpy pyarrow seqeval datasets evaluate ipywidgets==7.7.1 jupyterlab-widgets==1.1.1

# !curl -fsSL https://deb.nodesource.com/setup_lts.x | sudo -E bash -
# !sudo apt install -y nodejs

# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [2]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm, trange
import random
import os
import logging
import torch
from seqeval.metrics import f1_score, precision_score, recall_score
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm.auto import tqdm, trange

from transformers import (
    MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
    # AdamW,
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
from torch.optim import AdamW

In [3]:
MODEL_CONFIG_CLASSES = list(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)



In [4]:
logger = logging.getLogger(__name__)


In [5]:
def load_train_data(data_path, gt_path):
    with open(data_path) as f:
        data = f.read().splitlines()

        for i, w in enumerate(data):
            if w == ";;;":
                data[i] = "###"
            else:
                data[i] = data[i].strip()

        data = "".join(data).split("###")
        for i, t in enumerate(data):
            data[i] = [x for x in t.split(";;;") if x != ""]

        data = [x for x in data if x != [] and x != [""]]

    with open(gt_path, "r") as f:
        labels = f.read().splitlines()

        for i, l in enumerate(labels):
            if l == "":
                labels[i] = "###"
            else:
                labels[i] = l.strip() + ";;;"

        labels = "".join(labels).split("###")
        for i, l in enumerate(labels):
            labels[i] = [x for x in l.split(";;;") if x != ""]
        labels = [x for x in labels if x != [] and x != [""]]


    text = []
    gt = []

    for d, l in zip(data, labels):
        sentence = []
        sentence_ner = []
        for i, (w, ner) in enumerate(zip(d, l)):
            if ner == 'O O':
                sentence.append(w)
                sentence_ner.append("O")
            elif ";" not in ner:
                sentence.append(w.lower())
                sentence_ner.append(ner)

        text.append(sentence)
        gt.append(sentence_ner)

    return text, gt

def load_val_data(data_path, gt_path):
    with open(data_path) as f:
        data = f.read().splitlines()

        for i, w in enumerate(data):
            if w.strip() == "":
                data[i] = "###"
            else:
                data[i] = data[i].strip() + ";;;"

        data = "".join(data).split("###")
        for i, t in enumerate(data):
            data[i] = [x for x in t.split(";;;") if x != ""]

        # data = [x for x in data if x != [] and x != [""]]

    with open(gt_path, "r") as f:
        labels = f.read().splitlines()

        for i, l in enumerate(labels):
            if l.strip() == "":
                labels[i] = "###"
            else:
                labels[i] = l.strip() + ";;;"

        labels = "".join(labels).split("###")
        for i, l in enumerate(labels):
            labels[i] = [x for x in l.split(";;;") if x != ""]
        # labels = [x for x in labels if x != [] and x != [""]]        
    return data, labels
    
    # text = []
    # gt = []

    # for d, l in zip(data, labels):
    #     sentence = []
    #     sentence_ner = []
    #     for i, (w, ner) in enumerate(zip(d, l)):
    #         if ner == 'O O':
    #             sentence.append(w)
    #             sentence_ner.append("O")
    #         elif ";" not in ner:
    #             sentence.append(w.lower())
    #             sentence_ner.append(ner)

    #     text.append(sentence)
    #     gt.append(sentence_ner)

    # return text, gt

def load_test_data(data_path):
    with open(data_path) as f:
        data = f.read().splitlines()

        for i, w in enumerate(data):
            if w.strip() == "":
                data[i] = "###"
            else:
                data[i] = data[i].strip().lower() + ";;;"

        data = "".join(data).split("###")
        for i, t in enumerate(data):
            data[i] = [x for x in t.split(";;;") if x != ""]

        # data = [x for x in data if x != [] and x != [""]]
        return data

def load_data(data_path, gt_path):
	with open(data_path) as f:
		data = f.read().splitlines()

	with open(gt_path, "r") as f:
		labels = f.read().splitlines()

	df = pd.DataFrame({"text": data, "label": labels})
	df = df[df["text"] != ";;;"]
	df["text"] = df["text"].apply(lambda x: x.replace(";;;", ""))
	df = df[~(df["label"].str.strip()=="")]
	df = df[~df["label"].str.contains(";")]

	df["label"] = df["label"].str.strip()

	df["label"] = np.where(df["label"] == "O O", "O", df["label"])
	return df

def rows_to_sentences_and_labels(df):
    sentences = []
    sentences_labels = []
    current_sentence = []
    current_labels = []

    for index, row in tqdm(df.iterrows(), total = len(df)):
        word, label = row['text'], row['label']
        current_sentence.append(word.strip())
        current_labels.append(label)
        if word.strip() == '.':
            sentences.append(current_sentence)
            sentences_labels.append(current_labels)
            current_sentence = []
            current_labels = []

    return sentences, sentences_labels

In [6]:
# train_data, train_labels = load_train_data("train_data.csv", "train_gt.csv")
# val_data, val_labels = load_val_data("valid_data.csv", "valid_gt.csv")

train = load_data("train_data.csv", "train_gt.csv")
valid = load_data("valid_data.csv", "valid_gt.csv")

train_data, train_labels = rows_to_sentences_and_labels(train)
val_data, val_labels = rows_to_sentences_and_labels(valid)

  0%|          | 0/202386 [00:00<?, ?it/s]

  0%|          | 0/50937 [00:00<?, ?it/s]

In [7]:
unique_labels = set([x for y in train_labels for x in y])
print(unique_labels)

{'B-MISC', 'B-PER', 'O', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC', 'I-PER', 'B-ORG'}


In [8]:
max([len(lb) for lb in val_labels])

1179

In [9]:
class InputExample(object):
    """A single training/test example for token classification."""

    def __init__(self, guid, words, labels):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. The labels for each word of the sequence. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.words = words
        self.labels = labels


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids

def create_examples(data, labels, mode):
    examples = []
    guid_index = 1

    for sent, lb in zip(data, labels):
        input_example = InputExample(guid=f"{mode}-{guid_index}", words=sent, labels=lb)
        examples.append(input_example)
        guid_index += 1

    return examples

def convert_examples_to_features(
    examples,
    label_list,
    max_seq_length,
    tokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)

            # bert-base-multilingual-cased sometimes output "nothing ([]) when calling tokenize with just a space.
            if len(word_tokens) > 0:
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = tokenizer.num_special_tokens_to_add()
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        features.append(
            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
        )
    return features

def create_dataset(features):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset

In [10]:
# Constants
LEARNING_RATE = 5e-5
ADAM_EPSILON = 1e-8
WEIGHT_DECAY = 0.0
NUM_EPOCHS = 10
GRADIENT_ACCUMULATION_STEPS = 1
MAX_GRAD_NORM = 1.0
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
WARMUP_STEPS = 0
SEED = 42
MAX_SEQ_LENGTH = 512

pad_token_label_id = CrossEntropyLoss().ignore_index

model_name = "bert-base-uncased"
model_type = "bert"

config = AutoConfig.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label={str(i): label for i, label in enumerate(unique_labels)},
    label2id={label: i for i, label in enumerate(unique_labels)},
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    config=config,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [11]:
train_examples = create_examples(train_data, train_labels, mode="train")
train_features = convert_examples_to_features(
            train_examples,
            unique_labels,
            MAX_SEQ_LENGTH,
            tokenizer,
            cls_token_at_end=False,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False,
            pad_on_left=False,
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
            pad_token_label_id=pad_token_label_id,
        )

train_dataset = create_dataset(train_features)

In [12]:
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def train(train_dataset, model, tokenizer, labels, pad_token_label_id, output_dir="./", file_name="training_loss.txt"):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    loss_file_path = os.path.join(output_dir, file_name)

    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=TRAIN_BATCH_SIZE)

    t_total = len(train_dataloader) // GRADIENT_ACCUMULATION_STEPS * NUM_EPOCHS

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": WEIGHT_DECAY,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=ADAM_EPSILON)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=t_total)

    set_seed()  # Set seed for reproducibility

    global_step = 0
    model.zero_grad()

    train_iterator = trange(NUM_EPOCHS, desc="Epoch")
    for epoch in train_iterator:
        epoch_loss = 0.0
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(model.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if "token_type_ids" in tokenizer.model_input_names:
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            loss = outputs.loss

            if GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / GRADIENT_ACCUMULATION_STEPS

            loss.backward()

            epoch_loss += loss.item()
            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

        # Calculate the average loss for the epoch and write it to a file
        avg_epoch_loss = epoch_loss / len(epoch_iterator)
        with open(loss_file_path, "a") as file:
            file.write(f"Epoch {epoch + 1}: {avg_epoch_loss}\n")

    return global_step, avg_epoch_loss

In [13]:
global_step, tr_loss = train(train_dataset, model, tokenizer, unique_labels, pad_token_label_id)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

Iteration:   0%|          | 0/922 [00:00<?, ?it/s]

In [14]:
global_step, tr_loss

(9220, 0.0005266064956720548)

In [15]:
def evaluate(eval_dataset, model, tokenizer, labels, pad_token_label_id, device, model_type="bert", prefix=""):

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=1)

    # Eval!
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if model_type != "distilbert":
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    for key in sorted(results.keys()):
        print("  %s = %s", key, str(results[key]))

    return results, preds_list

In [16]:
# eval
val_examples = create_examples(val_data, val_labels, mode="eval")
val_features = convert_examples_to_features(
            val_examples,
            unique_labels,
            MAX_SEQ_LENGTH,
            tokenizer,
            cls_token_at_end=False,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False,
            pad_on_left=False,
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
            pad_token_label_id=pad_token_label_id,
        )

val_dataset = create_dataset(val_features)
result, preds_list = evaluate(val_dataset, model, tokenizer, unique_labels, pad_token_label_id, device, model_type, prefix=global_step)

Evaluating:   0%|          | 0/1874 [00:00<?, ?it/s]

  %s = %s f1 0.9399523851512211
  %s = %s loss 0.09372719970730253
  %s = %s precision 0.9375549692172384
  %s = %s recall 0.9423620933521923


In [36]:
" ".join(train_data[5])

'He said a proposal last month by EU Farm Commissioner Franz Fischler to ban sheep brains spleens and spinal cords from the human and animal food chains was a highly specific and precautionary move to protect human health .'

In [35]:
print(tokenizer.tokenize(" ".join(train_data[5])))

['he', 'said', 'a', 'proposal', 'last', 'month', 'by', 'eu', 'farm', 'commissioner', 'franz', 'fis', '##ch', '##ler', 'to', 'ban', 'sheep', 'brains', 'sp', '##leen', '##s', 'and', 'spinal', 'cords', 'from', 'the', 'human', 'and', 'animal', 'food', 'chains', 'was', 'a', 'highly', 'specific', 'and', 'pre', '##ca', '##ution', '##ary', 'move', 'to', 'protect', 'human', 'health', '.']


In [17]:
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}

import re

def remove_special_chars(input_string):
    # Define the pattern to match everything except uppercase and lowercase letters
    # This includes numbers, special characters, and whitespace
    pattern = r'[^a-zA-Z]'
    
    # Use re.sub() to replace all occurrences of the pattern with an empty string
    result = re.sub(pattern, '', input_string)
    
    return result

def contains_alphabet(s):
    # This regular expression looks for any character between a-z or A-Z
    pattern = re.compile('[a-zA-Z]')
    # Search the string for any match to the pattern
    if pattern.search(s):
        return True
    else:
        return False

def predict(sentence, model, max_length=MAX_SEQ_LENGTH):
    sentence_check = []
    for word in sentence:
        if not contains_alphabet(word):
            sentence_check.append("UNK")
        else:
            sentence_check.append(remove_special_chars(word))

    inputs = tokenizer(sentence_check,
                        padding='max_length', 
                        truncation=True, 
                        max_length=max_length,
                       is_split_into_words=True,
                        return_tensors="pt")

    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=ids, attention_mask=mask)
        logits = outputs[0]
    
    active_logits = logits.view(-1, model.num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1)
    
    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))
    
    word_level_predictions = []
    for pair in wp_preds:
      # print(pair)
      
      if (pair[0].startswith("##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
        continue
      else:
        # print("good", pair[1])
        word_level_predictions.append(pair[1])

    if len(word_level_predictions) < len(sentence):
        return word_level_predictions + predict(sentence[len(word_level_predictions):], model)
    return word_level_predictions

def save_to_file(mode, y_pred):
    with open(f"{mode}_gt.csv", "w") as f:
        f.write(" ")
        for sentence_labels in y_pred:
            for label in sentence_labels:
                f.write(label + "\n")
            f.write(" \n")

In [18]:
val_data, val_labels = load_val_data("valid_data.csv", "valid_gt.csv")

In [19]:
eval_preds = [predict(sent, model) for sent in tqdm(val_data)]

  0%|          | 0/4106 [00:00<?, ?it/s]

In [20]:
assert len(val_labels) == len(eval_preds)

for i, (p, t) in enumerate(zip(eval_preds, val_labels)):
    assert isinstance(p, list)
    assert isinstance(t, list)
    if len(p) != len(t):
        print(i)

In [21]:
from seqeval.metrics import f1_score, accuracy_score, classification_report

f1 = f1_score(val_labels, eval_preds)
print(f"F1 Score: {f1}")
acc = accuracy_score(val_labels, eval_preds)
print(f"Accuracy: {acc}")
print(classification_report(val_labels, eval_preds))

F1 Score: 0.8023208302688568
Accuracy: 0.9587333372597523
              precision    recall  f1-score   support

         LOC       0.85      0.91      0.88      1837
        MISC       0.87      0.83      0.85       922
         ORG       0.59      0.67      0.63      1341
         PER       0.82      0.85      0.83      1846

   micro avg       0.78      0.83      0.80      5946
   macro avg       0.78      0.82      0.80      5946
weighted avg       0.78      0.83      0.80      5946



In [22]:
save_to_file("valid_pred", eval_preds)

In [23]:
!python eval_stud.py valid_gt.csv valid_pred_gt.csv

valid_gt.csv
valid_pred_gt.csv
processed 55042 tokens with 5946 phrases; found: 6291 phrases; correct: 4909.
accuracy:  96.18%; 
precision:  78.03%;
recall:  82.56%;
F1:  80.23
              LOC: prec:  85.18%; rec:  91.34%; FB1:  88.15  1970
             MISC: prec:  86.75%; rec:  83.08%; FB1:  84.88  883
              ORG: prec:  59.31%; rec:  67.49%; FB1:  63.13  1526
              PER: prec:  81.59%; rec:  84.51%; FB1:  83.02  1912



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
!python eval_stud.py train_gt.csv train_fake_gt.csv

train_gt.csv
train_fake_gt.csv


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


processed 219552 tokens with 23499 phrases; found: 23499 phrases; correct: 23499.
accuracy: 100.00%; 
precision: 100.00%;
recall: 100.00%;
F1: 100.00
              LOC: prec: 100.00%; rec: 100.00%; FB1: 100.00  7140
             MISC: prec: 100.00%; rec: 100.00%; FB1: 100.00  3438
              ORG: prec: 100.00%; rec: 100.00%; FB1: 100.00  6321
              PER: prec: 100.00%; rec: 100.00%; FB1: 100.00  6600



In [24]:
test_data = load_test_data("test_data.csv")
# dummy_labels = []
# for sent in test_data:
#     dummy_labels.append(["O"] * len(sent))

# # eval
# test_examples = create_examples(test_data, dummy_labels, mode="test")
# test_features = convert_examples_to_features(
#             test_examples,
#             unique_labels,
#             MAX_SEQ_LENGTH,
#             tokenizer,
#             cls_token_at_end=False,
#             cls_token=tokenizer.cls_token,
#             cls_token_segment_id=0,
#             sep_token=tokenizer.sep_token,
#             sep_token_extra=False,
#             pad_on_left=False,
#             pad_token=tokenizer.pad_token_id,
#             pad_token_segment_id=tokenizer.pad_token_type_id,
#             pad_token_label_id=pad_token_label_id,
#         )

# test_dataset = create_dataset(test_features)
# _, test_preds_list = evaluate(test_dataset, model, tokenizer, unique_labels, pad_token_label_id, device, model_type, prefix=global_step)

In [25]:
test_preds = [predict(sent, model) for sent in tqdm(test_data)]

In [26]:
save_to_file("test", test_preds)