# NER: evaluation

In [None]:
# Environment setup: quietly (-q) install the packages this notebook relies on.
# NOTE(review): `evaluate` and `seqeval` are not imported by any cell visible below —
# confirm they are still needed.
!pip install -q evaluate seqeval
!pip install -q transformers[torch]

In [None]:
!git clone https://github.com/named-entity/hse-nlp/

fatal: destination path 'hse-nlp' already exists and is not an empty directory.


In [None]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
from huggingface_hub import notebook_login

### Выбор модели

In [None]:
# Checkpoint to evaluate; passed to AutoModelForTokenClassification.from_pretrained below.
# Options listed by the author:
# 'bert-base/rubert-tiny2-ner-absa-v1' or 'bert-base/rubert-tiny2-ner-absa-v2'

model_checkpoint = 'bert-base/rubert-tiny2-ner-absa-v1'

### Пути к файлам

In [None]:
# REPLACE with your paths to the test datasets, the gold annotation, and the baseline results

# Inputs used to build the evaluation set (gold aspect spans and review texts).
path_to_aspects = '/content/hse-nlp/4th_year/Project/dev_aspects.txt'
path_to_reviews = '/content/hse-nlp/4th_year/Project/dev_reviews.txt'

# Category-level sentiment: gold annotation and baseline predictions.
gold_cats_test_path = '/content/hse-nlp/4th_year/Project/dev_cats.txt'
baseline_cats_test_path = '/content/hse-nlp/4th_year/Project/dev_pred_cats.txt'

# Mention-level annotation: gold spans and baseline predictions.
# Note: gold_aspects_test_path points to the same file as path_to_aspects above.
gold_aspects_test_path = '/content/hse-nlp/4th_year/Project/dev_aspects.txt'
baseline_aspects_test_path = '/content/hse-nlp/4th_year/Project/dev_pred_aspects.txt'

In [None]:
def get_aspects_and_reviews(path_to_aspects, path_to_reviews):
    """Load the aspect annotation and the review texts from header-less TSV files.

    Args:
        path_to_aspects: path to a tab-separated file whose columns are read as
            review_id, category, span, span_start, span_end, sentiment.
        path_to_reviews: path to a tab-separated file whose columns are read as
            review_id, text, sentiment.

    Returns:
        A tuple ``(aspects_df, reviews_df)`` of pandas DataFrames.
    """
    aspect_columns = ['review_id', 'category', 'span', 'span_start', 'span_end', 'sentiment']
    review_columns = ['review_id', 'text', 'sentiment']

    def _read_tsv(path, columns):
        # Neither file has a header row, so the column names are supplied explicitly.
        return pd.read_csv(path, sep='\t', header=None, names=columns)

    aspects_df = _read_tsv(path_to_aspects, aspect_columns)
    reviews_df = _read_tsv(path_to_reviews, review_columns)
    return aspects_df, reviews_df

In [None]:
# Load the gold aspect annotation and the review texts into DataFrames.
test_aspects, test_reviews = get_aspects_and_reviews(path_to_aspects, path_to_reviews)

In [None]:
# Build the combined tag "<category>_<sentiment>" (e.g. "Food_positive") for every gold
# mention; later cells split it back on '_'.
test_aspects['text_label'] = test_aspects.category + '_' + test_aspects.sentiment

In [None]:
# Load the tokenizer used for every tokenization step in this notebook.
# NOTE(review): it is loaded from 'cointegrated/rubert-tiny2', not from
# `model_checkpoint` — presumably the checkpoint was fine-tuned from that model and
# shares its vocabulary; confirm.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')

In [None]:
# Tokenize each review separately, wrapping the text in a one-element batch.
# NOTE(review): the tokenizer runs twice per review here, and neither of these two
# columns is read by the cells visible below (TokenDataset tokenizes again itself).
test_reviews['input_ids'] = test_reviews.text.apply(lambda x: tokenizer([x]).input_ids)

test_reviews['tokens'] = test_reviews.text.apply(lambda x: tokenizer([x]).tokens())

In [None]:
# Attach to every review the lists of its gold spans, offsets and combined labels.
# All four columns are aggregated and merged in one step. Columns left over from a
# previous run are dropped first, so re-running the cell does not produce
# `span_x` / `span_y` duplicates.
# The merge stays an inner join (the pandas default), as before: reviews without any
# gold aspect are dropped from `test_reviews`.
aspect_cols = ['span', 'span_start', 'span_end', 'text_label']
aspects_per_review = test_aspects.groupby('review_id')[aspect_cols].agg(list)
test_reviews = (
    test_reviews
    .drop(columns=aspect_cols, errors='ignore')
    .merge(aspects_per_review, left_on='review_id', right_index=True)
)

In [None]:
# convert tokens to iob

def span_to_iob(tokenized, starts, ends, text_labels):
    """Project character-level span annotations onto tokens as IOB tags.

    Args:
        tokenized: tokenizer output exposing ``tokens()`` and ``char_to_token(ix)``.
        starts: character start offsets of the annotated spans.
        ends: character end offsets (exclusive) of the annotated spans.
        text_labels: label of each span; it is emitted as ``B-<label>`` / ``I-<label>``.

    Returns:
        A list with one tag per token: ``'O'`` for tokens outside every span,
        ``B-<label>`` for the first token of a span and ``I-<label>`` for the rest.
        When spans share tokens, the span that comes later in the input overwrites
        the tags written by earlier ones.
    """
    aligned_labels = ['O' for _ in tokenized.tokens()]

    for span_start, span_end, label in zip(starts, ends, text_labels):
        # Collect every token touched by at least one character of the span;
        # characters that map to no token (e.g. whitespace) yield None and are ignored.
        covered = {tokenized.char_to_token(pos) for pos in range(span_start, span_end)}
        covered.discard(None)

        for rank, token_ix in enumerate(sorted(covered)):
            tag = 'B' if rank == 0 else 'I'
            aligned_labels[token_ix] = f"{tag}-{label}"

    return aligned_labels


In [None]:
# Gold IOB tags for every review, aligned with the tokenizer's tokens.
# The columns are iterated directly instead of `iterrows()` + `row[1][0]`: integer
# position lookups on a string-labelled Series are deprecated in pandas.
test_labels = [
    span_to_iob(tokenizer([text]), starts, ends, labels)
    for text, starts, ends, labels in zip(
        test_reviews['text'],
        test_reviews['span_start'],
        test_reviews['span_end'],
        test_reviews['text_label'],
    )
]

In [None]:
# Store the gold IOB tag sequences next to their reviews (read by TokenDataset below).
test_reviews['labels'] = test_labels

In [None]:
# build dataset

from transformers import DataCollatorForTokenClassification, DefaultDataCollator, DataCollatorWithPadding

# data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

class TokenDataset:
    """Dataset of tokenized reviews with per-token label ids.

    Args:
        df: DataFrame with a ``text`` column (review strings) and a ``labels``
            column (one list of IOB tag strings per review).
        label2id: mapping from IOB tag string to integer class id.

    The texts are tokenized once, at construction time, with the module-level
    ``tokenizer``.
    """

    def __init__(self,
                 df, label2id
            ):
        self.tokenized = tokenizer(df.text.tolist())
        # Keep the labels as a plain list so that `idx` is positional, exactly like
        # the lookups into `self.tokenized`; a Series would be looked up by index
        # label, which can diverge from the position once rows have been dropped.
        self.labels = df['labels'].tolist()
        # Use the mapping handed to the constructor rather than a global of the same name.
        self.label2id = label2id

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
                'input_ids': self.tokenized.input_ids[idx],
                'token_type_ids': self.tokenized.token_type_ids[idx],
                'attention_mask': self.tokenized.attention_mask[idx],
                'labels': [self.label2id[tag] for tag in self.labels[idx]]
               }

In [None]:
# Inventory of IOB tags: 'O' plus B-/I- variants of every "<category>_<sentiment>" pair.
# The position of a tag in this list becomes its class id (see id2label below).
# NOTE(review): the order presumably has to match the label ids the checkpoint was
# trained with — confirm against the checkpoint's config.
label_names = \
 ['I-Service_neutral',
 'B-Service_both',
 'I-Interior_positive',
 'B-Whole_negative',
 'B-Service_neutral',
 'B-Food_neutral',
 'I-Price_negative',
 'B-Interior_neutral',
 'B-Whole_neutral',
 'I-Food_positive',
 'B-Price_negative',
 'B-Interior_negative',
 'B-Food_both',
 'B-Service_positive',
 'I-Whole_neutral',
 'I-Food_negative',
 'I-Interior_negative',
 'B-Whole_both',
 'I-Price_positive',
 'I-Whole_both',
 'O',
 'I-Whole_negative',
 'I-Interior_both',
 'I-Price_both',
 'I-Service_positive',
 'I-Food_both',
 'B-Service_negative',
 'I-Service_both',
 'B-Price_neutral',
 'B-Food_positive',
 'I-Food_neutral',
 'B-Food_negative',
 'I-Interior_neutral',
 'B-Interior_both',
 'I-Price_neutral',
 'B-Price_positive',
 'B-Price_both',
 'B-Whole_positive',
 'B-Interior_positive',
 'I-Whole_positive',
 'I-Service_negative']

In [None]:
# Two-way mapping between class ids and IOB tags; ids follow the order of `label_names`.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

# Gold-labelled evaluation set built from the prepared reviews.
test_set = TokenDataset(test_reviews, label2id)

In [None]:
from transformers import AutoModelForTokenClassification

# Load the selected token-classification checkpoint and move it to the GPU.
# The id2label / label2id mappings built above are passed explicitly.
# NOTE(review): the device is hard-coded to 'cuda', so this cell needs a GPU runtime.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
).to('cuda')

### Предсказания

In [None]:
# pass output of tokenizer to device
def tokenized_to_cuda(text):
    """Tokenize one text as a single-item batch of PyTorch tensors placed on the GPU.

    Args:
        text: the review text to encode.

    Returns:
        The tokenizer output for ``[text]`` with every tensor moved to ``'cuda'``.
    """
    batch = tokenizer([text], return_tensors='pt')
    # Snapshot the items first so entries can be reassigned while looping.
    for name, tensor in list(batch.items()):
        batch[name] = tensor.to('cuda')
    return batch

def predict_labels(texts, model):
    """Predict one IOB tag per token for every text.

    Args:
        texts: iterable of review strings; each is encoded separately with
            ``tokenized_to_cuda``.
        model: token-classification model whose output exposes ``.logits``.

    Returns:
        A list with one entry per text; each entry is the list of predicted tag
        strings (looked up in ``id2label``) for all tokens of that text.
    """
    import torch  # local import: the notebook's import cell does not import torch directly

    predictions = []
    # Inference only: without no_grad every forward pass would also build an
    # autograd graph, wasting time and GPU memory.
    with torch.no_grad():
        for text in texts:
            logits = model(**tokenized_to_cuda(text)).logits
            # One transfer to the host per text instead of one `.item()` call per token.
            label_ids = logits.argmax(dim=-1)[0].tolist()
            predictions.append([id2label[label_id] for label_id in label_ids])
    return predictions

# Predicted IOB tag sequences, one list per review, in the row order of `test_reviews`.
test_preds = predict_labels(test_reviews.text, model)

In [None]:
def iob_to_span(tokenized, iob_labels):
    """Convert per-token IOB tags back to character-level spans.

    Args:
        tokenized: tokenizer output exposing ``tokens()`` and ``token_to_chars(i)``
            (the latter returns a ``(start, end)`` pair or ``None``).
        iob_labels: one tag per token (``'O'``, ``'B-<label>'`` or ``'I-<label>'``).

    Returns:
        Three parallel lists ``(starts, ends, labels)``: the character start, the
        character end and the label (without the ``B-`` prefix) of every span.
        All three are empty when no token carries a ``B`` tag; previously an input
        without any ``B``/``I`` tag raised ``IndexError``.

    Notes:
        The merging rule is unchanged. A span opens at each ``B`` token and runs to
        the last tagged token before the next ``B`` token. This means ``I`` tokens
        extend the open span even across ``O`` tokens or with a different label.
        ``I`` tokens that precede the first ``B`` token are dropped.
    """
    tokens = tokenized.tokens()

    # Character offsets and tags of all tokens that belong to some entity.
    tagged = []
    for i, (_, label) in enumerate(zip(tokens, iob_labels)):
        span = tokenized.token_to_chars(i)
        if span is None:
            continue  # token without a character span
        if label.startswith('B') or label.startswith('I'):
            start, end = span
            tagged.append((start, end, label))

    merged_starts, merged_ends, merged_text_labels = [], [], []
    for start, end, label in tagged:
        if label.startswith('B'):
            merged_starts.append(start)
            merged_ends.append(end)
            merged_text_labels.append(label[2:])
        elif merged_ends:
            # Continuation token: stretch the currently open span.
            merged_ends[-1] = end

    return merged_starts, merged_ends, merged_text_labels

In [None]:
# Sanity check: print the predicted mentions of the review stored under index label 0.
first_text = test_reviews.text[0]
first_spans = iob_to_span(tokenizer(first_text), test_preds[0])
for start, end, ent in zip(*first_spans):
    print(first_text[start:end], start, end, ent, sep='\t')

менеджер- темноволосая	147	169	Service_neutral
девушка, проводила к столу и дала меню	179	217	Service_positive
официантка	242	252	Service_neutral
приняла заказ	254	267	Service_positive
удалил	270	276	Service_negative
ждать	315	320	Service_positive
ресторан	337	345	Whole_positive
заведения	431	440	Whole_positive
Бизнес ланч	495	506	Food_positive
цене	582	586	Price_positive
место	610	615	Whole_positive
меню	639	643	Food_positive
цены	656	660	Price_positive
качество обслуживания	671	692	Service_positive
заведению	735	744	Whole_positive


### Сохраняем результаты NER

In [None]:
# Write one predicted mention per line:
# review_id, category, span text, start, end, sentiment (tab-separated).
# The encoding is fixed to UTF-8: the span text is Cyrillic, and the platform default
# encoding is not guaranteed to be able to represent it.
with open('test_pred_aspects.txt', 'w', encoding='utf-8') as f:
    for review_id, text, preds in zip(test_reviews.review_id.tolist(),
                                      test_reviews.text.tolist(),
                                      test_preds):
        for start, end, ent in zip(*iob_to_span(tokenizer(text), preds)):
            token = text[start:end]
            # Labels have the form "<category>_<sentiment>".
            aspect, sentiment = ent.split('_')
            print(review_id, aspect, token, start, end, sentiment, sep='\t', file=f)

### Сохраняем результаты ABSA

Посчитаем упоминания аспектов с предсказанной тональностью и припишем каждой категории:

- `absence` — если нет упоминаний данной категории;
- `both` — если есть упоминания с разной тональностью;
- `positive` / `neutral` / `negative` — если все упоминания одной тональности.

In [None]:
# Aspect categories reported for every review, in output order.
CATEGORIES = ['Whole', 'Interior', 'Service', 'Food', 'Price']

In [None]:
def get_full_sentiment(text, max_len=5, iob_labels=None):
    """Derive one overall sentiment per category from the predicted mentions of a review.

    Args:
        text: the review text; it is re-tokenized to map the tags back to spans.
        max_len: unused; kept only so existing calls keep working.
        iob_labels: predicted IOB tags for ``text``. When omitted, the module-level
            variable ``preds`` is used, which is how the function originally obtained
            its tags; pass the tags explicitly to avoid relying on that hidden state.

    Yields:
        ``(category, sentiment)`` for every entry of ``CATEGORIES``, where sentiment is
        ``'absence'`` if the category has no predicted mention, the shared sentiment
        if all its mentions agree, and ``'both'`` otherwise.
    """
    if iob_labels is None:
        # Backward-compatible fallback to the global set by the calling loop.
        iob_labels = preds

    asp_counter = defaultdict(Counter)
    for _start, _end, ent in zip(*iob_to_span(tokenizer(text), iob_labels)):
        # Labels have the form "<category>_<sentiment>".
        aspect, sentiment = ent.split('_')
        asp_counter[aspect][sentiment] += 1

    for c in CATEGORIES:
        seen = asp_counter[c]
        if not seen:
            s = 'absence'
        elif len(seen) == 1:
            # Exactly one distinct sentiment was predicted for this category.
            s = next(iter(seen))
        else:
            s = 'both'
        yield c, s

In [None]:
# Write one line per (review, category): review_id, category, overall sentiment
# (tab-separated).
# The loop variable must be called `preds`: get_full_sentiment(text) is called with
# the text only and picks up the current review's predicted tags from that name.
with open('test_pred_cats.txt', 'w') as f:
    for review_id, text, preds in zip(test_reviews.review_id.tolist(),
                                  test_reviews.text.tolist(),
                                  test_preds):
        for aspect, sentiment in get_full_sentiment(text):
            print(review_id, aspect, sentiment, sep='\t', end='\n', file=f)

## Оценка 1: accuracy по выделению упоминаний с категориями

In [None]:
# Locations of the prediction files produced above.
# NOTE(review): the files are written under relative names ('test_pred_aspects.txt',
# 'test_pred_cats.txt'), so these absolute paths only match when the working
# directory is /content — confirm.
pred_aspects_test_path = '/content/test_pred_aspects.txt'
pred_cats_test_path = '/content/test_pred_cats.txt'

In [None]:
def evaluate_ner(gold_test_path, pred_test_path):
    """Compare predicted mentions with the gold annotation and print span/category metrics.

    Both files are tab-separated with one mention per line; the columns read are
    0 (review id), 1 (category), 3 (start), 4 (end) and, for gold, 5 (sentiment).

    Args:
        gold_test_path: path to the gold mentions file.
        pred_test_path: path to the predicted mentions file.

    Returns:
        ``(fully_matched_pairs, partially_matched_pairs)``. Every pair is
        ``([gold_start, gold_end, gold_category, gold_sentiment], pred_fields)``
        where ``pred_fields`` is the split predicted line (all strings).

    Raises:
        KeyError: if a predicted line refers to a review id absent from the gold file.
        ZeroDivisionError: if the prediction file or the gold file has no lines.
    """
    # review id -> parallel lists describing the gold mentions of that review
    gold_aspect_cats = {}
    with open(gold_test_path) as fg:
        for line in fg:
            line = line.rstrip('\r\n').split('\t')
            if line[0] not in gold_aspect_cats:
                gold_aspect_cats[line[0]] = {'starts':[], 'ends':[], 'cats':[], 'sents':[]}
            gold_aspect_cats[line[0]]['starts'].append(int(line[3]))
            gold_aspect_cats[line[0]]['ends'].append(int(line[4]))
            gold_aspect_cats[line[0]]['cats'].append(line[1])
            gold_aspect_cats[line[0]]['sents'].append(line[5])

    full_match, partial_match, full_cat_match, partial_cat_match = 0, 0, 0, 0
    total = 0  # number of predicted mentions
    fully_matched_pairs = []
    partially_matched_pairs = []
    with open(pred_test_path) as fp:
        for line in fp:
            total += 1
            line = line.rstrip('\r\n').split('\t')
            start, end = int(line[3]), int(line[4])
            category = line[1]
            doc_gold_aspect_cats = gold_aspect_cats[line[0]]
            # Case 1: exact match — same start and same end as a gold mention.
            # `.index(start)` picks the first gold mention with that start.
            if start in doc_gold_aspect_cats['starts']:
                i = doc_gold_aspect_cats['starts'].index(start)
                if doc_gold_aspect_cats['ends'][i] == end:
                    full_match += 1
                    if doc_gold_aspect_cats['cats'][i] == category:
                        full_cat_match += 1
                    else:
                        # NOTE(review): an exact span match with a DIFFERENT category
                        # increments partial_cat_match, which raises the printed
                        # "Partial category accuracy" — confirm this is intended.
                        partial_cat_match += 1
                    fully_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats['starts'][i],
                                doc_gold_aspect_cats['ends'][i],
                                doc_gold_aspect_cats['cats'][i],
                                doc_gold_aspect_cats['sents'][i]
                            ],
                            line
                        )
                    )
                    continue
            # Case 2: no exact match — scan the gold mentions for an overlap.
            for s_pos in doc_gold_aspect_cats['starts']:
                # 2a: the prediction starts at or before this gold mention.
                if start <= s_pos:
                    i = doc_gold_aspect_cats['starts'].index(s_pos)
                    # Same end as the gold mention -> partial match.
                    if doc_gold_aspect_cats['ends'][i] == end:
                        partial_match += 1
                        partially_matched_pairs.append(
                            (
                                [
                                    doc_gold_aspect_cats['starts'][i],
                                    doc_gold_aspect_cats['ends'][i],
                                    doc_gold_aspect_cats['cats'][i],
                                    doc_gold_aspect_cats['sents'][i]
                                ],
                                line
                            )
                        )
                        if doc_gold_aspect_cats['cats'][i] == category:
                            partial_cat_match += 1
                        # NOTE(review): `continue` moves on to the next gold start
                        # rather than stopping (the other partial-match branches use
                        # `break`), so one prediction can be counted as a partial
                        # match more than once — confirm this is intended.
                        continue
                    matched = False
                    # The prediction's end lies between this gold start and the end
                    # of this or a later gold mention -> partial match.
                    # NOTE(review): the recorded pair uses gold mention `i`, not the
                    # mention that `e_pos` belongs to.
                    for e_pos in doc_gold_aspect_cats['ends'][i:]:
                        if s_pos <= end <= e_pos:
                            partial_match += 1
                            partially_matched_pairs.append(
                                (
                                    [
                                        doc_gold_aspect_cats['starts'][i],
                                        doc_gold_aspect_cats['ends'][i],
                                        doc_gold_aspect_cats['cats'][i],
                                        doc_gold_aspect_cats['sents'][i]
                                    ],
                                    line
                                )
                            )
                            if doc_gold_aspect_cats['cats'][i] == category:
                                partial_cat_match += 1
                            matched = True
                            break
                    if matched:
                        break
                # 2b: the prediction starts after this gold start; it is a partial
                # match when the gold end falls inside the predicted span.
                if start > s_pos:
                    i = doc_gold_aspect_cats['starts'].index(s_pos)
                    if start < doc_gold_aspect_cats['ends'][i] <= end:
                        partial_match += 1
                        partially_matched_pairs.append(
                            (
                                [
                                    doc_gold_aspect_cats['starts'][i],
                                    doc_gold_aspect_cats['ends'][i],
                                    doc_gold_aspect_cats['cats'][i],
                                    doc_gold_aspect_cats['sents'][i]
                                ],
                                line
                            )
                        )
                        if doc_gold_aspect_cats['cats'][i] == category:
                            partial_cat_match += 1
                        break

    # Total number of gold mentions over all reviews (denominator of the recall).
    gold_size = sum([len(gold_aspect_cats[x]['cats']) for x in gold_aspect_cats])

    print(f"""
    Full match precision: {full_match / total}
    Full match recall: {full_match / gold_size}
    Partial match ratio in pred: {(full_match + partial_match)  / total}
    Full category accuracy: {full_cat_match / total}
    Partial category accuracy: {(full_cat_match + partial_cat_match) / total}
    """)

    return fully_matched_pairs, partially_matched_pairs

Бейзлайн

In [None]:
# Score the baseline predictions; the matched pairs are kept for the sentiment accuracy below.
fully_matched_pairs_baseline, partially_matched_pairs_baseline = \
evaluate_ner(gold_aspects_test_path, baseline_aspects_test_path)


    Full match precision: 0.48
    Full match recall: 0.7159663865546219
    Partial match ratio in pred: 0.6197183098591549
    Full category accuracy: 0.46422535211267607
    Partial category accuracy: 0.6033802816901408
    


Наша модель

In [None]:
# Score this notebook's predictions; the matched pairs are kept for the sentiment accuracy below.
fully_matched_pairs, partially_matched_pairs = \
evaluate_ner(gold_aspects_test_path, pred_aspects_test_path)


    Full match precision: 0.6527559055118111
    Full match recall: 0.6966386554621848
    Partial match ratio in pred: 0.8047244094488188
    Full category accuracy: 0.621259842519685
    Partial category accuracy: 0.789763779527559
    


## Оценка 2: accuracy по тональности упоминаний

In [None]:
def sentiment_accuracy(matches):
    """Share of matched (gold, predicted) mention pairs whose sentiments agree.

    Args:
        matches: sequence of ``(gold_fields, pred_fields)`` pairs; the sentiment is
            the last element of each field sequence.

    Returns:
        The fraction of pairs with equal sentiment as a float, or ``nan`` when
        ``matches`` is empty (previously this raised ``ZeroDivisionError``).
    """
    if not matches:
        # No matched mentions: the accuracy is undefined rather than zero.
        return float('nan')
    agreeing = sum(1 for pair in matches if pair[0][-1] == pair[1][-1])
    return agreeing / len(matches)

Бейзлайн

In [None]:
# Baseline: sentiment accuracy over fully and over partially matched mentions.
print(f'Accuracy по полностью совпавшим упоминаниям: {sentiment_accuracy(fully_matched_pairs_baseline)}')
print(f'Accuracy по частично совпавшим упоминаниям: {sentiment_accuracy(partially_matched_pairs_baseline)}')

Accuracy по полностью совпавшим упоминаниям: 0.6772300469483568
Accuracy по частично совпавшим упоминаниям: 0.6370967741935484


Наша модель

In [None]:
# Our model: sentiment accuracy over fully and over partially matched mentions.
print(f'Accuracy по полностью совпавшим упоминаниям: {sentiment_accuracy(fully_matched_pairs)}')
print(f'Accuracy по частично совпавшим упоминаниям: {sentiment_accuracy(partially_matched_pairs)}')

Accuracy по полностью совпавшим упоминаниям: 0.8202653799758746
Accuracy по частично совпавшим упоминаниям: 0.7357512953367875


## Оценка 3: accuracy по тональности категории

In [None]:
def overall_sentiment_accuracy(gold_cats_test_path, pred_cats_test_path):
    """Print the share of gold category-sentiment lines reproduced exactly by the predictions.

    Each file holds one label line per (review, category). Lines are compared as
    whole strings after their line terminator is removed, so a final line without a
    trailing newline still matches. Blank lines are ignored.

    Args:
        gold_cats_test_path: path to the gold category file.
        pred_cats_test_path: path to the predicted category file.

    Returns:
        None. The result is printed; ``nan`` is printed when the gold file contains
        no labels (previously this raised ``ZeroDivisionError``).
    """
    def _read_label_lines(path):
        # Distinct non-empty lines of the file, without their line terminators.
        with open(path) as fh:
            stripped = (line.rstrip('\r\n') for line in fh)
            return {line for line in stripped if line}

    gold_labels = _read_label_lines(gold_cats_test_path)
    pred_labels = _read_label_lines(pred_cats_test_path)

    if gold_labels:
        accuracy = len(gold_labels & pred_labels) / len(gold_labels)
    else:
        accuracy = float('nan')
    print('Overall sentiment accuracy:', accuracy)

Бейзлайн:

In [None]:
# Category-level sentiment accuracy of the baseline predictions.
overall_sentiment_accuracy(gold_cats_test_path, baseline_cats_test_path)

Overall sentiment accuracy: 0.523943661971831


Наша модель:

In [None]:
# Category-level sentiment accuracy of this notebook's predictions.
overall_sentiment_accuracy(gold_cats_test_path, pred_cats_test_path)

Overall sentiment accuracy: 0.5859154929577465
