# installations

In [None]:
!pip install transformers[sentencepiece]
!pip install datasets
!pip install evaluate
!pip install accelerate -U
!pip install transformers[torch]
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[sentencepiece])
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[sentencepiece])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[sentencepiece])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_

# imports + mount

In [None]:
import pandas as pd
from google.colab import drive
import os
from ast import literal_eval
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForTokenClassification, DataCollatorWithPadding, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel, Value, Sequence, concatenate_datasets
import numpy as np
import evaluate

In [None]:
# Mount Google Drive and make the project's BERT directory the working directory,
# so the relative paths used later ("../data/...", "./trained-models/...") resolve.
DRIVE_MOUNT_POINT = '/content/drive'
PROJECT_DIR = '/content/drive/MyDrive/schoolwork/grad/23_spring/CS-263-NLP/final-project/NLP-final-project/implementation/BERT'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Mounted at /content/drive


# Data preprocessing
Labels: 0 = outside (O), 1 = beginning (B), 2 = inside (I) of a claim span.

## Utility functions

In [None]:
def convert_spans_to_labels(df_in):
  """Turn per-example claim spans into one integer label per token.

  Args:
    df_in: dataframe with "tokens", "span_start_index" and "span_end_index"
      columns. For each row the two span columns are parallel lists of token
      positions; the end position is inclusive.

  Returns:
    A copy of df_in with a new "labels" column (one numpy int array per row,
    0=outside, 1=beginning, 2=inside) and with both span columns removed.
    df_in itself is not modified.
  """
  df = df_in.copy()

  all_labels = []
  # Iterate over the column values positionally instead of looking rows up with
  # df[col][i], so the function also works when the index is not 0..n-1
  # (e.g. after filtering or shuffling the dataframe).
  for tokens, span_starts, span_ends in zip(df["tokens"], df["span_start_index"], df["span_end_index"]):
    labels = np.zeros(len(tokens), dtype=int)

    # mark every claim span: first token is "beginning", the rest are "inside"
    for start, end in zip(span_starts, span_ends):
      labels[start] = 1
      labels[start+1 : end+1] = 2

    all_labels.append(labels)

  df["labels"] = all_labels

  df = df.drop(columns=["span_start_index", "span_end_index"])

  return df

def expand_labels(token_list, labels, tokenizer):
  """Expand word-level labels to one label per tokenizer sub-token.

  Args:
    token_list: the words of one example.
    labels: one integer label per word (0=outside, 1=beginning, 2=inside).
    tokenizer: object whose tokenize(word) returns the list of sub-tokens
      for a word.

  Returns:
    A list that starts and ends with -100 and holds one label per sub-token
    in between. The -100 entries are placeholders for the first and last
    position (presumably the special tokens the tokenizer adds - confirm);
    compute_metrics skips positions labelled -100.
  """
  BEGINNING, INSIDE = 1, 2

  new_labels = [-100]

  for token, label in zip(token_list, labels):
    num_bert_tokens = len(tokenizer.tokenize(token))

    # a word that yields no sub-tokens contributes no labels
    if num_bert_tokens == 0:
      continue

    if label == BEGINNING:
      # Labels are integers, so the previous `label == 'B'` check never matched
      # and every sub-token of a beginning word was labelled "beginning".
      # Only the first sub-token starts the span; the rest are inside it.
      new_labels.append(BEGINNING)
      new_labels.extend([INSIDE] * (num_bert_tokens - 1))
    else:
      # outside / inside labels are repeated for every sub-token of the word
      new_labels.extend([label] * num_bert_tokens)

  # add the last label
  new_labels.append(-100)

  return new_labels

def tokenize_and_expand_labels(example, tokenizer):
  """Tokenize one example and attach sub-token level labels.

  Args:
    example: mapping with "tokens" (list of words) and "labels" (one label
      per word).
    tokenizer: tokenizer that is callable on a pre-split word list and has a
      tokenize() method.

  Returns:
    The tokenizer output with an added "labels" entry that has the same
    length as "input_ids".
  """
  inputs = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)

  new_labels = expand_labels(example["tokens"], example["labels"], tokenizer)

  # truncation=True may shorten input_ids, but expand_labels always labels the
  # full example. Clip the labels to the encoded length and mark the final
  # position as ignored again (assumes the last encoded position is a special
  # token, as expand_labels does - confirm for other tokenizers).
  num_inputs = len(inputs["input_ids"])
  if len(new_labels) > num_inputs:
    new_labels = new_labels[:num_inputs]
    if new_labels:
      new_labels[-1] = -100

  inputs["labels"] = new_labels

  return inputs

def tokenize_one_dataset(dataset, tokenizer):
  """Tokenize every row of a dataset and attach the encoded columns.

  Adds "input_ids", "token_type_ids" and "attention_mask" columns, renames the
  existing word-level "labels" column to "old_labels", and adds the expanded
  sub-token labels as the new "labels" column.
  """
  encoded_fields = ("input_ids", "token_type_ids", "attention_mask", "labels")
  collected = {field: [] for field in encoded_fields}

  # encode row by row and gather each field into its own list
  for row_idx in range(dataset.num_rows):
    encoded = tokenize_and_expand_labels(dataset[row_idx], tokenizer)
    for field in encoded_fields:
      collected[field].append(encoded[field])

  # the model input columns can be added directly
  for field in encoded_fields[:-1]:
    dataset = dataset.add_column(field, collected[field])

  # keep the word-level labels under a new name before adding the expanded ones
  dataset = dataset.rename_column("labels", "old_labels")
  return dataset.add_column("labels", collected["labels"])

def tokenize_datasets(datasets, tokenizer):
  """Tokenize the "train" and "test" splits and return them as a new DatasetDict."""
  return DatasetDict({
      split: tokenize_one_dataset(datasets[split], tokenizer)
      for split in ("train", "test")
  })

## Load tokenizer and datasets

In [None]:
# tokenizer for the checkpoint that is fine-tuned below
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# these CSV columns are parsed with literal_eval, so each cell becomes a Python object again
list_columns = ('tokens', 'span_start_index', 'span_end_index')
csv_converters = {column: literal_eval for column in list_columns}

df_train = pd.read_csv("../data/train.csv", converters=csv_converters)
df_dev = pd.read_csv("../data/dev.csv", converters=csv_converters)

# replace the span columns with per-token labels
df_train, df_dev = (convert_spans_to_labels(frame) for frame in (df_train, df_dev))

# wrap the dataframes as datasets; the dev split is stored under the "test" key
train_dataset = Dataset.from_dict(df_train)
test_dataset = Dataset.from_dict(df_dev)
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})
print(datasets)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'claim_label', 'labels'],
        num_rows: 6044
    })
    test: Dataset({
        features: ['tokens', 'claim_label', 'labels'],
        num_rows: 756
    })
})


## Tokenize and edit dataset

In [None]:
# tokenize both splits, then drop the columns that are no longer needed
tokenized_datasets = tokenize_datasets(datasets, tokenizer).remove_columns(["claim_label", "tokens"])

# declare "labels" as a variable-length sequence of O/B/I class labels
features = tokenized_datasets["train"].features.copy()
features["labels"] = Sequence(feature=ClassLabel(num_classes=3, names=['O', 'B', 'I']), length=-1)
tokenized_datasets = tokenized_datasets.cast_column("labels", features["labels"])

# the cast above is applied to the whole DatasetDict, so "test" is covered too
print(tokenized_datasets["test"].features)

Casting the dataset:   0%|          | 0/6044 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/756 [00:00<?, ? examples/s]

{'old_labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'labels': Sequence(feature=ClassLabel(names=['O', 'B', 'I'], id=None), length=-1, id=None)}


In [None]:
# sanity check: compare word-level and sub-token-level labels of one training example
print(tokenized_datasets)
sample = tokenized_datasets["train"][6]
print(sample['old_labels'])
print(sample['labels'])

## define data collator

In [None]:
# collator that builds training batches for token classification; passed to the Trainer below
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Training

## Evaluation metrics setup

In [None]:
# "seqeval" metric loaded through the `evaluate` library; used by compute_metrics
metric = evaluate.load("seqeval")
# position i holds the tag name for integer label i (0=O, 1=B, 2=I)
label_names = ["O", "B", "I"]

def compute_metrics(eval_preds):
  """Compute overall precision, recall, F1 and accuracy from model predictions.

  Args:
    eval_preds: pair (logits, labels); the predicted class of each position is
      the argmax of the logits over the last axis.

  Returns:
    Dict with "precision", "recall", "f1" and "accuracy", taken from the
    "overall_*" entries of the metric result.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  # reference tags, skipping positions marked -100
  true_labels = []
  for label_row in labels:
    true_labels.append([label_names[l] for l in label_row if l != -100])

  # predicted tags at exactly the positions whose reference label is not -100
  true_predictions = []
  for pred_row, label_row in zip(predictions, labels):
    kept = [label_names[p] for (p, l) in zip(pred_row, label_row) if l != -100]
    true_predictions.append(kept)

  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

  reported = ("precision", "recall", "f1", "accuracy")
  return {name: all_metrics[f"overall_{name}"] for name in reported}

## Splitting the dataset

In [None]:
# (disabled) split up datasets into train/val/test; `final_datasets` is only defined if this cell is uncommented
# test_size = val_size = 0.111
# all_examples = concatenate_datasets([tokenized_datasets["train"], tokenized_datasets["test"]])
# train_test_dataset = all_examples.train_test_split(test_size=test_size)
# train_val_dataset = train_test_dataset["train"].train_test_split(test_size=val_size)
# final_datasets = DatasetDict({"train":train_val_dataset["train"], \
#                               "validation":train_val_dataset["test"], \
#                               "test":train_test_dataset["test"]})

# print(final_datasets)

## Training

In [None]:
# hyperparameters, named so they are easy to find and tune
OUTPUT_DIR = "./trained-models/bert-finetune/"
LEARNING_RATE = 1e-4
NUM_TRAIN_EPOCHS = 6
TRAIN_BATCH_SIZE = 32

# evaluate and save a checkpoint once per epoch
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    # weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# print(training_args)

In [None]:
# lookups between integer label ids and tag names, handed to the model below
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in enumerate(label_names)}

# load the pretrained checkpoint with a token-classification head
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, id2label=id2label, label2id=label2id
)

In [None]:
# Wire the model, training arguments, data and metric function into a Trainer.
# The "test" split (built from dev.csv) is used for the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# run fine-tuning
trainer.train()

# evaluate

In [None]:
# `final_datasets` is only created by the train/val/test split cell, which is
# commented out, so referencing it raises NameError on a fresh run. Evaluate on
# the held-out split that does exist: tokenized_datasets["test"] (built from dev.csv).
eval_preds = trainer.evaluate(tokenized_datasets["test"])
print(eval_preds)
# The trainer was constructed with compute_metrics, so no separate
# compute_metrics(eval_preds) call is made here.