In [1]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "bert-base-cased"
batch_size = 16

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import wandb
wandb.init(project="Bachelor_Thesis", entity="maxhager28")

[34m[1mwandb[0m: Currently logged in as: [33mmaxhager28[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
from datasets import load_dataset, load_metric
from datasets import DatasetDict, Dataset
import random
import pandas as pd
random.seed(42)

jnlpba = load_dataset('jnlpba', split=['train[:10]', "validation[:10]"])
jnlpba = DatasetDict({"train": jnlpba[0], "validation": jnlpba[1]})

class JnlpbDataset():
    def __init__(self, dataset, portion, type_path):
        self.dataset = dataset[type_path]
        self.portion = portion
        self.merge()
        self.apply()
        
    def map_tags(self, row):
        mapping = {
            0: "O",
            1: "B-DNA",
            2: "I-DNA",
            3: "B-RNA",
            4: "I-RNA",
            5: "B-cell_line",
            6: "I-cell_line",
            7: "B-cell_type",
            8: "I-cell_type",
            9: "B-protein",
            10: "I-protein"
        }
        row['ner_tags'] = [[mapping[tag] for tag in row['ner_tags']]][0]
        return row        

    def merge_tags(self, tags, tokens):
      #todo test if this works also in the scenario of having two B- tags side by side
      merged_tags = []
      merged_tokens = []
      i = 0
      while i < len(tags):
          if tags[i].startswith('B-'):
              merged_tag = tags[i][2:]
              merged_token = tokens[i]
              i += 1
              while i < len(tags) and tags[i].startswith('I-'):
                  merged_tag += ' ' + tags[i][2:]
                  merged_token += ' ' + tokens[i]
                  i += 1
              merged_tags.append(merged_tag)
              merged_tokens.append(merged_token)
          else:
              merged_tags.append(tags[i])
              merged_tokens.append(tokens[i])
              i += 1
      for i in range(len(merged_tags)):
        s = merged_tags[i].split()[0]
        s = s[0].upper() + s[1:]
        merged_tags[i] = s
      return merged_tags, merged_tokens

    def merge(self):
      df_train = pd.DataFrame(self.dataset)
      df_train = df_train.apply(self.map_tags, axis=1)
      df_train[['ner_tags', 'tokens']] = df_train.apply(lambda x: self.merge_tags(x['ner_tags'], x['tokens']), axis=1, result_type='expand')
      self.dataset = Dataset.from_pandas(df_train)
        
    def missing(self, row):
        #print(len(self.label_list))
        lst = row["ner_tags"]
        if any(x != 0 for x in lst):
            index = random.choice([i for i, x in enumerate(lst) if x != 0])
            lst[index] = 0
            row["ner_tags"] = lst
            return row
        else:
            return row

    def wrong(self, row, num_tags):
        lst = row["ner_tags"]
        tags = []
        #it would be nice to have a debugger who can display me the values of the vars
        for i in range(1,num_tags):
            tags.append(i)
        if any(x != 0 for x in lst):    
            indices = [i for i, x in enumerate(lst) if x != 0]
            random_index = random.choice(indices)
            current_value = lst[random_index]
            random_number = random.choice([x for x in [1, 2, 3, 4, 5] if x != current_value])
            lst[random_index] = random_number
            row["ner_tags"] = lst
            return row
        else:
            return row
                        
    def uncomplete(self):
        #todo should i implement this
        pass

    def apply(self):
        shuffled_data = self.dataset.shuffle()
        num_portion = int(len(self.dataset) * self.portion / 100)
        df = self.dataset.to_pandas() 
        tags = [tag for row in df['ner_tags'] for tag in row]
        unique_tags = set(tags)
        mapping = {}

        for index, item in enumerate(unique_tags):
            mapping[item] = index        
        #iter over df ner tags column and set each string to number from mapping
        df['ner_tags'] = [[mapping[tag] for tag in tags] for tags in df['ner_tags']]    
        
        for i in range(num_portion):
            random_number = random.randint(1, 2)
            if random_number == 1:
                new_row = self.missing(df.iloc[i])
                df.iloc[i] = new_row
            elif random_number == 2:
                num_tags = len(unique_tags)
                new_row = self.wrong(df.iloc[i], num_tags)
                df.iloc[i] = new_row
            '''else:
                self.uncomplete()'''
        self.dataset = Dataset.from_pandas(df)
        
    def get_dataset(self):
        return self.dataset
        
input_dataset_train = JnlpbDataset(dataset=jnlpba, portion=20, type_path='train')
dataset_train = input_dataset_train.get_dataset()

input_dataset_validation = JnlpbDataset(dataset=jnlpba, portion=0, type_path='validation')
dataset_validation = input_dataset_validation.get_dataset()

datasets = DatasetDict({"train": dataset_train, "validation": dataset_validation})


Found cached dataset jnlpba (/Users/maxhager/.cache/huggingface/datasets/jnlpba/jnlpba/1.0.0/3062f220823930cffde7976b694aa67bac3b06c322a02ced92d3761519810ce4)


  0%|          | 0/2 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["ner_tags"] = lst
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row["ner_tags"] = lst


In [5]:
df = dataset_train.to_pandas()
label_list = list(set([tag for row in df['ner_tags'] for tag in row]))

In [6]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [8]:
label_all_tokens = True

In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [12]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}-strong-labelled",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="wandb",
    push_to_hub=True,
    #todo hide token 
    push_to_hub_token="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"
)



In [13]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [14]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [15]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [16]:
trainer = Trainer(
    model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/yachty66/bert-base-cased-finetuned-ner-strong-labelled into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [17]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3
  Number of trainable parameters = 107722756
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/3 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 16


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'eval_loss': 1.1935713291168213, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6252723311546841, 'eval_runtime': 0.4914, 'eval_samples_per_second': 20.349, 'eval_steps_per_second': 2.035, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 16


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.0876998901367188, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.7581699346405228, 'eval_runtime': 0.4312, 'eval_samples_per_second': 23.191, 'eval_steps_per_second': 2.319, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 16


  0%|          | 0/1 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 1.0400521755218506, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.7821350762527233, 'eval_runtime': 0.4456, 'eval_samples_per_second': 22.443, 'eval_steps_per_second': 2.244, 'epoch': 3.0}
{'train_runtime': 5.3988, 'train_samples_per_second': 5.557, 'train_steps_per_second': 0.556, 'train_loss': 1.1949286460876465, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=1.1949286460876465, metrics={'train_runtime': 5.3988, 'train_samples_per_second': 5.557, 'train_steps_per_second': 0.556, 'train_loss': 1.1949286460876465, 'epoch': 3.0})

In [18]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 16


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.0400521755218506,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.7821350762527233,
 'eval_runtime': 0.4363,
 'eval_samples_per_second': 22.922,
 'eval_steps_per_second': 2.292,
 'epoch': 3.0}

In [19]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: id, ner_tags, tokens. If id, ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10
  Batch size = 16


  0%|          | 0/1 [00:00<?, ?it/s]

{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.7821350762527233}

In [20]:
#finish WB run
#import wandb
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▇██
eval/f1,▁▁▁▁
eval/loss,█▃▁▁
eval/precision,▁▁▁▁
eval/recall,▁▁▁▁
eval/runtime,█▁▃▂
eval/samples_per_second,▁█▆▇
eval/steps_per_second,▁█▆▇
train/epoch,▁▅███
train/global_step,▁▅███

0,1
eval/accuracy,0.78214
eval/f1,0.0
eval/loss,1.04005
eval/precision,0.0
eval/recall,0.0
eval/runtime,0.4363
eval/samples_per_second,22.922
eval/steps_per_second,2.292
train/epoch,3.0
train/global_step,3.0


In [21]:
trainer.push_to_hub()

Saving model checkpoint to bert-base-cased-finetuned-ner-strong-labelled
Configuration saved in bert-base-cased-finetuned-ner-strong-labelled/config.json
Model weights saved in bert-base-cased-finetuned-ner-strong-labelled/pytorch_model.bin
tokenizer config file saved in bert-base-cased-finetuned-ner-strong-labelled/tokenizer_config.json
Special tokens file saved in bert-base-cased-finetuned-ner-strong-labelled/special_tokens_map.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 