# Some Boilerplate stuff

In [1]:
# You may prefer to upload the data to your google drive and mount your google drive to this colab, 
# because the data will be erased if you stop using this colab for a while.
# Uncomment the code below to do so. After mounting, navigate to the appropriate folder, right click, and "copy path".
# Assign DATA_DIR global variable to that path.
# For more mounting instructions: https://colab.research.google.com/notebooks/io.ipynb#scrollTo=XDg9OBaYqRMd
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# If imported from google drive, config for your file directory. Mine is 'lm_data'.
DATA_DIR = "./drive/MyDrive/nlp-final-project/data"

# the goal is that DATA_DIR points to where the training/validation/test data is. 

In [3]:
! pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 5.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 39.7 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 40.5 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 54.3 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████

In [4]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [5]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,842 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155335 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [6]:
import transformers

print(transformers.__version__)

4.17.0


In [7]:
model_checkpoint = "bert-base-uncased"

batch_size = 16

# Loading Data

In [8]:
import csv
import ast
from datasets import Dataset, load_metric, DatasetDict
import numpy as np

# Get raw datasets
'''
1.1 VUA
get raw dataset as a list:
  Each element is a triple:
    a sentence: string
    a list of labels: 
    a list of pos: 
'''
inputs = []
with open(f'{DATA_DIR}/MOHX/MOH-X_formatted_svo_cleaned.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        input = {}
        input["tokens"] = line[3].split()
        index = int(line[4])
        label = int(line[5])
        labels = [0] * len(input["tokens"])
        labels[index] = label
        input["labels"] = labels
        inputs.append(input)
train_val, test = np.split(np.array(inputs), [int(len(inputs)/2)])
train, val = np.split(np.array(train_val), [int(len(train_val)/2)])
print(train.shape)
print(val.shape)
print(test.shape)

raw_train_vua = {}
raw_train_vua["tokens"] = []
raw_train_vua["tags"] = []
for item in train:
    raw_train_vua["tokens"].append(item["tokens"])
    raw_train_vua["tags"].append(item["labels"])

raw_val_vua = {}
raw_val_vua["tokens"] = []
raw_val_vua["tags"] = []
for item in val:
    raw_val_vua["tokens"].append(item["tokens"])
    raw_val_vua["tags"].append(item["labels"])

raw_test_vua = {}
raw_test_vua["tokens"] = []
raw_test_vua["tags"] = []
for item in test:
    raw_test_vua["tokens"].append(item["tokens"])
    raw_test_vua["tags"].append(item["labels"])

dataset_dict = {}
dataset_dict["train"] = Dataset.from_dict(raw_train_vua)
dataset_dict["test"] = Dataset.from_dict(raw_test_vua)
dataset_dict["validation"] = Dataset.from_dict(raw_val_vua)

datasets = DatasetDict(dataset_dict)

(161,)
(162,)
(324,)


In [9]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [10]:
label_all_tokens = True

In [11]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_inputs = tokenize_and_align_labels(raw_train_vua)

print(tokenized_inputs)

{'input_ids': [[101, 6187, 1050, 1005, 1056, 8246, 2000, 2022, 14036, 1012, 102], [101, 2129, 2172, 2001, 2002, 2183, 2000, 2425, 2014, 1029, 102], [101, 2039, 2127, 2008, 2739, 2718, 1996, 2837, 1010, 2123, 2018, 2180, 1996, 2154, 2007, 2010, 2866, 4316, 2407, 10340, 1012, 102], [101, 2071, 2175, 2006, 2000, 1996, 4043, 1998, 2175, 2007, 2068, 2071, 1050, 1005, 1056, 2002, 1029, 102], [101, 2633, 1010, 2057, 2253, 2000, 1996, 2436, 1998, 2027, 2435, 2149, 1037, 18178, 4226, 1010, 2029, 13605, 1012, 102], [101, 2009, 2442, 2022, 3491, 2008, 1996, 13474, 3832, 1006, 2029, 1010, 2153, 1010, 2950, 2119, 3800, 1998, 7073, 1997, 6742, 15855, 1007, 2000, 3426, 2428, 3809, 4544, 2000, 2619, 1012, 102], [101, 1996, 25244, 3368, 1998, 8542, 2370, 2062, 4157, 1012, 102], [101, 1045, 6303, 1012, 102], [101, 2054, 2003, 2025, 2124, 2003, 2592, 2006, 1996, 5269, 1997, 2280, 4573, 1010, 2012, 2029, 3243, 11704, 4475, 2089, 2031, 2042, 14019, 1010, 2295, 3522, 2147, 2011, 8288, 2121, 1006, 2960, 1007

In [12]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Train model

In [13]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=2)


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [21]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"bert-lets-go",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    weight_decay=0.01,
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [16]:
metric = load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [23]:
import numpy as np
label_list = ["O", "I"]
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [24]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 161
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 550


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.245738,0.787879,0.684211,0.732394,0.966429
2,No log,0.283323,0.710526,0.710526,0.710526,0.963571
3,No log,0.380497,0.763158,0.381579,0.508772,0.957143
4,No log,0.167129,0.802632,0.802632,0.802632,0.975714
5,No log,0.290411,0.833333,0.526316,0.645161,0.962857
6,No log,0.304693,0.782609,0.473684,0.590164,0.960714
7,No log,0.203994,0.770492,0.618421,0.686131,0.967857
8,No log,0.173207,0.8,0.736842,0.767123,0.974286
9,No log,0.199583,0.787879,0.684211,0.732394,0.972143
10,No log,0.211001,0.791045,0.697368,0.741259,0.972857


The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 162
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 162
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 162
 

TrainOutput(global_step=550, training_loss=0.0004885931706733324, metrics={'train_runtime': 143.8024, 'train_samples_per_second': 55.98, 'train_steps_per_second': 3.825, 'total_flos': 67708011743988.0, 'train_loss': 0.0004885931706733324, 'epoch': 50.0})

In [26]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 162
  Batch size = 16


{'epoch': 50.0,
 'eval_accuracy': 0.9721428571428572,
 'eval_f1': 0.7310344827586206,
 'eval_loss': 0.24628518521785736,
 'eval_precision': 0.7681159420289855,
 'eval_recall': 0.6973684210526315,
 'eval_runtime': 0.6293,
 'eval_samples_per_second': 257.415,
 'eval_steps_per_second': 17.479}

# Evaluate model

In [None]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

label_list = ["O", "I"]
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
print(results)

The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 324
  Batch size = 16


{'_': {'precision': 0.5838926174496645, 'recall': 0.554140127388535, 'f1': 0.5686274509803922, 'number': 157}, 'overall_precision': 0.5838926174496645, 'overall_recall': 0.554140127388535, 'overall_f1': 0.5686274509803922, 'overall_accuracy': 0.949028236156949}


In [None]:
print(predictions)
print(labels)

result = []
words = []
for ids in tokenized_datasets["test"]["input_ids"]:
    words.append(tokenizer.convert_ids_to_tokens(ids))
for i in range(len(true_labels)):
    result_entry_list = []
    for j in range(len(true_predictions[i])):
        result_entry_list.append(f"{words[i][j+1]}({true_predictions[i][j]} {true_labels[i][j]})")
    result_entry = " ".join(result_entry_list)
    result_entry = f"{result_entry}\n\n"
    result.append(result_entry)

f = open(f'{DATA_DIR}/predictions/mohx_seq_test_predictions_BERTsequence_vua.txt', 'w+')
f.writelines(result)
f.close()

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[-100    0    0 ... -100 -100 -100]
 [-100    0    0 ... -100 -100 -100]
 [-100    0    0 ... -100 -100 -100]
 ...
 [-100    0    1 ... -100 -100 -100]
 [-100    0    0 ... -100 -100 -100]
 [-100    0    0 ... -100 -100 -100]]


# Detokenization

In [27]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

label_list = ["O", "I"]
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

The following columns in the test set  don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 324
  Batch size = 16


In [28]:
def detokenize(examples, predictions, labels):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    tokens = examples["tokens"]
    re_preds = []
    re_labels = []
    for i in range(len(examples[f"tags"])):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        re_pred = ["O"] * len(tokens[i])
        re_label = ["O"] * len(tokens[i])
        for j, word_idx in enumerate(word_ids):
            if j >= len(labels[i]):
                break
            if word_idx is None:
                continue
            else:
                if labels[i][j] == 'I':
                    re_label[word_idx] = "I"
                if predictions[i][j] == 'I':
                    re_pred[word_idx] = "I"
        re_labels.append(re_label)
        re_preds.append(re_pred)
    return (re_preds, re_labels)

In [29]:
re_preds, re_labels = detokenize(datasets["test"], true_predictions, true_labels)

results = metric.compute(predictions=re_preds, references=re_labels)
print(results)

{'_': {'precision': 0.7380952380952381, 'recall': 0.6118421052631579, 'f1': 0.6690647482014388, 'number': 152}, 'overall_precision': 0.7380952380952381, 'overall_recall': 0.6118421052631579, 'overall_f1': 0.6690647482014388, 'overall_accuracy': 0.9619500594530321}
