# NER: train model on the whole dataset

In [None]:
# Install the notebook's dependencies into the environment of the running kernel.
# %pip (rather than !pip) targets the kernel's interpreter; the extras spec is
# quoted so that the shell never treats the square brackets as a glob pattern.
%pip install -q evaluate seqeval "transformers[torch]"

In [None]:
# Fetch the course repository that holds the data files. The clone is skipped when
# the directory already exists, so re-running this cell does not fail.
![ -d hse-nlp ] || git clone https://github.com/named-entity/hse-nlp/

fatal: destination path 'hse-nlp' already exists and is not an empty directory.


In [None]:
import pandas as pd
import numpy as np
import re
from huggingface_hub import notebook_login

In [None]:
# Log in to the Hugging Face Hub; the final training call in this notebook
# is made with push_to_hub=True.
notebook_login()

In [None]:
# Load the aspect annotations and the review texts for the train and dev splits.
_ASPECT_COLUMNS = ['review_id', 'category', 'span', 'span_start', 'span_end', 'sentiment']
_REVIEW_COLUMNS = ['review_id', 'text', 'sentiment']


def _load_table(path, columns):
    """Read a header-less, tab-separated file into a DataFrame with the given column names."""
    return pd.read_csv(path, sep='\t', header=None, names=columns)


train_aspects = _load_table('/content/hse-nlp/4th_year/Project/train_aspects.txt', _ASPECT_COLUMNS)
dev_aspects = _load_table('/content/hse-nlp/4th_year/Project/dev_aspects.txt', _ASPECT_COLUMNS)

train_reviews = _load_table('/content/hse-nlp/4th_year/Project/train_reviews.txt', _REVIEW_COLUMNS)
dev_reviews = _load_table('/content/hse-nlp/4th_year/Project/dev_reviews.txt', _REVIEW_COLUMNS)

In [None]:
# Build one combined "<category>_<sentiment>" label per aspect row, for both splits.
for aspects in (train_aspects, dev_aspects):
    aspects['text_label'] = aspects['category'] + '_' + aspects['sentiment']

In [None]:
# tokenize
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer here and for the model loaded in train_model.
model_checkpoint = 'cointegrated/rubert-tiny2'
# Tokenizer object reused by every later cell that encodes review text.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Encode every review once and derive both columns from that single encoding,
# instead of running the tokenizer twice per text.
for reviews in (train_reviews, dev_reviews):
    encodings = [tokenizer([text]) for text in reviews.text]
    # The tokenizer is called on a one-element batch, so input_ids keeps that
    # batch dimension: each cell holds a list containing a single id list.
    reviews['input_ids'] = [encoding.input_ids for encoding in encodings]
    reviews['tokens'] = [encoding.tokens() for encoding in encodings]

In [None]:
def _attach_aspect_lists(reviews, aspects):
    """Return `reviews` with one list-valued column per aspect field.

    For each of span, span_start, span_end and text_label, the aspect rows are
    grouped by review_id, collected into a list, and merged onto the reviews
    frame by review_id.
    """
    merged = reviews
    for col in ['span', 'span_start', 'span_end', 'text_label']:
        per_review = aspects.groupby('review_id')[col].apply(list)
        merged = merged.merge(per_review, left_on='review_id', right_index=True)
    return merged


train_reviews = _attach_aspect_lists(train_reviews, train_aspects)
dev_reviews = _attach_aspect_lists(dev_reviews, dev_aspects)

In [None]:
# convert character-level aspect spans to token-level IOB tags

def span_to_iob(tokenized, starts, ends, text_labels):
    """Project character-level spans onto tokens as IOB tags.

    Args:
        tokenized: encoding object exposing ``tokens()`` and ``char_to_token(char_ix)``.
        starts: start character offsets, one per span.
        ends: end character offsets (exclusive), one per span.
        text_labels: label string for each span.

    Returns:
        A list with one tag per token: ``'O'`` for tokens outside every span,
        ``'B-<label>'`` for the first token of a span and ``'I-<label>'`` for
        its remaining tokens. When spans share a token, the later span in the
        input order overwrites the earlier tag.
    """
    iob_tags = ['O' for _ in tokenized.tokens()]

    for span_start, span_end, span_label in zip(starts, ends, text_labels):
        # Every token touched by at least one character of the span; characters
        # that map to no token (char_to_token returns None) are skipped.
        touched = {
            tok_ix
            for tok_ix in map(tokenized.char_to_token, range(span_start, span_end))
            if tok_ix is not None
        }
        for position, tok_ix in enumerate(sorted(touched)):
            marker = 'B' if position == 0 else 'I'
            iob_tags[tok_ix] = f"{marker}-{span_label}"

    return iob_tags

In [None]:
# Build the per-token IOB tag list for every review. The four columns are iterated
# in lockstep rather than indexing each row Series with an integer key
# (`row[1][0]`), which pandas treats as a deprecated positional fallback on a
# string-labelled Series.
train_labels = [
    span_to_iob(tokenizer([text]), span_starts, span_ends, span_labels)
    for text, span_starts, span_ends, span_labels in zip(
        train_reviews['text'],
        train_reviews['span_start'],
        train_reviews['span_end'],
        train_reviews['text_label'],
    )
]
dev_labels = [
    span_to_iob(tokenizer([text]), span_starts, span_ends, span_labels)
    for text, span_starts, span_ends, span_labels in zip(
        dev_reviews['text'],
        dev_reviews['span_start'],
        dev_reviews['span_end'],
        dev_reviews['text_label'],
    )
]

In [None]:
# Store each split's IOB tag lists next to its reviews.
for reviews, iob_tags in ((train_reviews, train_labels), (dev_reviews, dev_labels)):
    reviews['labels'] = iob_tags

In [None]:
# build dataset

from transformers import DataCollatorForTokenClassification, DefaultDataCollator, DataCollatorWithPadding

class TokenDataset:
    """Map-style dataset pairing tokenized review texts with per-token label ids.

    Args:
        df: DataFrame with a ``text`` column (review strings) and a ``labels``
            column (one list of tag strings per review).
        label2id: mapping from tag string to integer class id.

    Each item is a dict with ``input_ids``, ``token_type_ids``,
    ``attention_mask`` and ``labels`` for one review; sequences are unpadded.
    """

    def __init__(self,
                 df, label2id
            ):
        self.tokenized = tokenizer(df.text.tolist())
        # Keep the labels as a plain list so that lookups are positional, like
        # the tokenized texts. Indexing the Series directly would go through the
        # DataFrame index, which fails or misaligns when it is not 0..n-1.
        self.labels = df.labels.tolist()
        # Use the mapping passed in rather than a module-level global.
        self.label2id = label2id

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
                'input_ids': self.tokenized.input_ids[idx],
                'token_type_ids': self.tokenized.token_type_ids[idx],
                'attention_mask': self.tokenized.attention_mask[idx],
                'labels': [self.label2id[tag] for tag in self.labels[idx]],
               }

In [None]:
# Tag vocabulary taken from the training labels. Sorting makes the tag -> id
# mapping identical on every run (plain set iteration order is not stable across
# kernels), and the set comprehension avoids the quadratic `sum(lists, [])` flatten.
label_names = sorted({tag for tags in train_labels for tag in tags})
id2label = dict(enumerate(label_names))
label2id = {tag: idx for idx, tag in id2label.items()}

train_set = TokenDataset(train_reviews, label2id)
dev_set = TokenDataset(dev_reviews, label2id)

In [None]:
# evaluation metrics
# source https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter7/section2_pt.ipynb#scrollTo=Uf6wsPeyfIHm

import evaluate

# seqeval metric object, loaded once here and used by compute_metrics below.
metric = evaluate.load('seqeval')

def compute_metrics(eval_preds):
    """Compute overall precision, recall, F1 and accuracy with the seqeval metric.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class of each
            position is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the metric's ``overall_*`` entries.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Positions whose gold id is -100 are ignored; the rest become tag strings.
    gold_tags = []
    for gold_row in labels:
        gold_tags.append([label_names[gold] for gold in gold_row if gold != -100])

    pred_tags = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        kept_ids = [pred for pred, gold in zip(pred_row, gold_row) if gold != -100]
        pred_tags.append([label_names[pred] for pred in kept_ids])

    scores = metric.compute(predictions=pred_tags, references=gold_tags, zero_division=0)
    return {
        name: scores[f'overall_{name}']
        for name in ('precision', 'recall', 'f1', 'accuracy')
    }

In [None]:
# the train pipeline
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

def train_model(lr, epochs, model=None, freeze_bert=False, push_to_hub=False):
    """Train a token-classification model on train_set and evaluate on dev_set.

    Args:
        lr: learning rate passed to TrainingArguments.
        epochs: number of training epochs.
        model: model to continue training; when None, a new
            AutoModelForTokenClassification is loaded from model_checkpoint
            with the notebook's id2label / label2id mappings.
        freeze_bert: when True, the parameters of the model's base encoder get
            requires_grad=False, so only the remaining parameters are trained.
            Passing False does not unfreeze anything that is already frozen.
        push_to_hub: forwarded to TrainingArguments.

    Returns:
        The trained model. The Trainer is configured with
        load_best_model_at_end=True and metric_for_best_model='f1'.
    """
    if model is None:
        model = AutoModelForTokenClassification.from_pretrained(
            model_checkpoint,
            id2label=id2label,
            label2id=label2id,
        )

    if freeze_bert:
        # base_model resolves to the architecture's encoder (model.bert for BERT
        # checkpoints), so freezing also works for non-BERT models.
        for param in model.base_model.parameters():
            param.requires_grad = False

    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` and Trainer's `tokenizer` argument
    # to `processing_class` -- confirm against the installed version.
    args = TrainingArguments(
        'rubert-tiny2-ner-absa-v2',
        logging_strategy='epoch',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        metric_for_best_model='f1',
        save_total_limit=1,
        load_best_model_at_end=True,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=lr,
        num_train_epochs=epochs,
        weight_decay=0.01,
        push_to_hub=push_to_hub,
    )

    # The dataset items are unpadded, so batching is left to the collator.
    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_set,
        eval_dataset=dev_set,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    trainer.train()

    return model

In [None]:
# Stage 1: train with freeze_bert=True, i.e. with the model.bert parameters frozen,
# so only the remaining parameters (the newly initialised classifier) are updated.
model = train_model(lr=2e-4, epochs=30, freeze_bert=True)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,3.3205,2.989442,0.004006,0.011765,0.005977,0.629432
2,2.7551,2.440156,0.0,0.0,0.0,0.821346
3,2.255,1.962918,0.0,0.0,0.0,0.833526
4,1.8264,1.574545,0.0,0.0,0.0,0.834102
5,1.481,1.283528,0.0,0.0,0.0,0.83403
6,1.2318,1.079711,0.0,0.0,0.0,0.834102
7,1.0567,0.942723,0.0,0.0,0.0,0.834174
8,0.9375,0.849998,0.0,0.0,0.0,0.834318
9,0.8585,0.790462,0.0,0.0,0.0,0.834823
10,0.8124,0.751602,0.0,0.0,0.0,0.835543


In [None]:
# Unfreeze all params: train_model(freeze_bert=True) set requires_grad=False on the
# encoder, and train_model never turns it back on, so it is re-enabled here
# before the next training call.
for param in model.parameters():
    param.requires_grad = True

In [None]:
# Stage 2: continue training the same model object (freeze_bert keeps its default
# of False) with a lower learning rate, and push the result to the Hub.
model = train_model(lr=1e-4, epochs=60, model=model, push_to_hub=True)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.5507,0.434627,0.415263,0.352101,0.381082,0.885558
2,0.4476,0.375678,0.460526,0.411765,0.434783,0.896224
3,0.3961,0.33388,0.472546,0.477311,0.474916,0.904656
4,0.3546,0.293195,0.499596,0.519328,0.509271,0.916763
5,0.319,0.262257,0.5,0.552941,0.52514,0.924474
6,0.2877,0.231692,0.5626,0.589076,0.575534,0.936293
7,0.2607,0.206804,0.59019,0.626891,0.607987,0.940112
8,0.2354,0.184883,0.635167,0.689076,0.661024,0.951931
9,0.2167,0.163492,0.670111,0.708403,0.688725,0.956039
10,0.1986,0.143756,0.702077,0.738655,0.719902,0.961949
