In [1]:
# !pip install transformers torch nlpbaselines evaluate seqeval accelerate datasets

In [8]:
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import torch

# metrics
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    ``p`` unpacks into ``(predictions, labels)``. The predicted class of each
    position is the argmax over axis 2 of ``predictions``. Positions whose
    label is -100 are dropped from both sequences before scoring.

    Relies on two notebook-level names: ``label_list`` (id -> tag string) and
    ``seqeval`` (the loaded metric object).

    Returns a dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (anything but -100).
        kept_pairs = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    scores = seqeval.compute(predictions=true_predictions, references=true_labels)

    # (reported name, key in the seqeval result)
    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {name: scores[key] for name, key in reported}
    

In [3]:
# Load the dataset; it was preprocessed beforehand and stored as a pickle.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you
# produced yourself or otherwise trust.

import pickle

with open('ds_split.pickle', mode='rb') as pickle_file:
    ds_split = pickle.load(pickle_file)

# Show the first training example.
ds_split["train"][0]


{'id': 372,
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  3,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  4,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  0,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  0,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  0,
  0,
  0,
  5,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
 

In [4]:
# Argument mining is framed as a classical sequence-tagging problem.
# Gather the label information needed by the metrics and the model config.

# Tag names as declared on the dataset's "ner_tags" feature.
label_list = ds_split["train"].features["ner_tags"].feature.names

# Integer id -> tag string, followed by the inverse mapping.
id2label = dict(enumerate([
    "O",
    "B-Claim",
    "I-Claim",
    "B-Majorclaim",
    "I-Majorclaim",
    "B-Premise",
    "I-Premise",
]))

label2id = {tag: idx for idx, tag in id2label.items()}

In [5]:
# tokenize the dataset
model_name = "roberta-base"

# add_prefix_space=True is necessary for RoBERTa
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)

def tokenize_and_align_labels(batch):
    """Tokenize pre-split words and build labels aligned to the resulting tokens.

    Uses the notebook-level ``tokenizer``. For every example in ``batch`` each
    token receives:

    * -100 when ``word_ids`` maps it to no word (``None``), or when it maps to
      the same word as the token right before it;
    * otherwise the tag of its word, taken from ``batch["ner_tags"]``.

    Returns the tokenizer output with an extra "labels" entry.
    """
    # Truncation is enabled because about 24 documents exceed 512 tokens.
    encoded = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(batch["ner_tags"]):
        token_word_ids = encoded.word_ids(batch_index=row)
        # Word index of the preceding token (None in front of the first token).
        preceding_word_ids = [None, *token_word_ids[:-1]]
        aligned_labels.append([
            -100 if (current is None or current == before) else word_tags[current]
            for current, before in zip(token_word_ids, preceding_word_ids)
        ])

    encoded["labels"] = aligned_labels
    return encoded
    
# Run tokenize_and_align_labels over ds_split in batched mode; the result keeps
# the original columns and gains the tokenizer outputs plus "labels".
tokenized_ds = ds_split.map(tokenize_and_align_labels, batched=True)


Map: 100%|███████████████████████████████████████████| 361/361 [00:00<00:00, 1214.92 examples/s]
Map: 100%|█████████████████████████████████████████████| 41/41 [00:00<00:00, 1241.01 examples/s]


In [9]:
# Setting
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Metric object used inside compute_metrics.
seqeval = evaluate.load("seqeval")

# Collator built from the notebook's tokenizer; passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
# the smaller the batch size the better in this case

# Tested
# batch_sizes = [2, 4, 8, 16]
# learning_rates = [2e-5, 3e-5, 4e-5, 5e-5]

# 2 3e-05 the best, 3rd epoch before overfitting

batch_sizes = [2]
learning_rates = [3e-5]

# One seed for both the classifier-head initialisation and the Trainer
# (42 is also the TrainingArguments default, so Trainer behaviour is unchanged).
SEED = 42

for batch_size in batch_sizes:
    for learning_rate in learning_rates:
        print(batch_size, learning_rate)
        # The token-classification head is newly (randomly) initialised by
        # from_pretrained, and that happens before the Trainer seeds anything.
        # Seed torch here so every (batch_size, learning_rate) combination
        # starts from the same head weights and a rerun reproduces the results.
        torch.manual_seed(SEED)
        # reinitiate the model for each combination
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id).to(device)
        training_args = TrainingArguments(
            output_dir=f"results-roberta/{batch_size}-{learning_rate}",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=4,   # overfitting often starts from 3
            weight_decay=0.01,
            seed=SEED,
            # NOTE(review): recent transformers releases rename this argument to
            # `eval_strategy`; switch when upgrading the library.
            evaluation_strategy="epoch",
            save_strategy="epoch",
            # With no metric_for_best_model set, "best" means lowest validation loss.
            load_best_model_at_end=True,
            push_to_hub=False)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_ds["train"],
            eval_dataset=tokenized_ds["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics)

        trainer.train()

2 3e-05


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.518309,0.297441,0.464226,0.362573,0.800714
2,No log,0.415121,0.48503,0.673877,0.564067,0.852892
3,0.491300,0.409263,0.534106,0.690516,0.602322,0.853565
4,0.491300,0.434051,0.537371,0.693844,0.605664,0.856931


In [12]:
# Move 2-3e-05 to xiaoou for inference
!mkdir models
!mv results-roberta/2-3e-05/checkpoint-543 models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
# inference

from transformers import pipeline

text = "It is always said that competition can effectively promote the development of economy. In order to survive in the competition, companies continue to improve their products and service, and as a result, the whole society prospers. However, when we discuss the issue of competition or cooperation, what we are concerned about is not the whole society, but the development of an individual's whole life. From this point of view, I firmly believe that we should attach more importance to cooperation during primary education."

model_path = "models/checkpoint-543"

# ignore_labels=[] -> output all labels, including "O"
# aggregation_strategy="simple" -> output chunks: adjacent tokens with the same
# tag are merged into one span. This replaces the deprecated
# grouped_entities=True flag, which mapped to the same strategy.
classifier = pipeline("ner", model=model_path, aggregation_strategy="simple", ignore_labels=[])
classifier(text)

[{'entity_group': 'O',
  'score': 0.9093813,
  'word': ' It is always said that',
  'start': 0,
  'end': 22},
 {'entity_group': 'Claim',
  'score': 0.6267374,
  'word': ' competition can effectively promote the development of economy',
  'start': 23,
  'end': 85},
 {'entity_group': 'O',
  'score': 0.40173453,
  'word': '. In',
  'start': 85,
  'end': 89},
 {'entity_group': 'Premise',
  'score': 0.86719596,
  'word': ' order to survive in the competition, companies continue to improve their products and service',
  'start': 90,
  'end': 183},
 {'entity_group': 'O',
  'score': 0.47686225,
  'word': ',',
  'start': 183,
  'end': 184},
 {'entity_group': 'Premise',
  'score': 0.6580801,
  'word': ' and',
  'start': 185,
  'end': 188},
 {'entity_group': 'O',
  'score': 0.7127421,
  'word': ' as a result,',
  'start': 189,
  'end': 201},
 {'entity_group': 'Premise',
  'score': 0.58878356,
  'word': ' the whole society prospers',
  'start': 202,
  'end': 228},
 {'entity_group': 'O',
  'score':

In [None]:
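# Upload the model and tokenizer to the Hugging Face Hub.
# ("xx" is a placeholder: supply the token at run time, e.g. from an environment
# variable, and never commit a real token to the notebook.)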
# model = AutoModelForTokenClassification.from_pretrained(model_path)
# model.push_to_hub("xiaoou/am",token="xx")
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# tokenizer.push_to_hub("xiaoou/am",token="xx")